In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from sklearn.model_selection import train_test_split


In [2]:
# Sanity check: show the working directory and what it contains, so the
# relative "data/..." path used when loading can be verified by eye.
working_dir = os.getcwd()
print(working_dir)
dir_entries = os.listdir()
print(dir_entries)

/Users/donaldmurataj/Documents/Development/Paidy/credit-risk-prediction/src/credit_risk_prediction
['.DS_Store', 'modeling.ipynb', '__init__.py', 'py.typed', 'data', 'analysis.ipynb']


In [3]:
# Read the training data, skipping the "Unnamed: 0" column at parse time
full_df = pd.read_csv(
    "data/cs-training.csv",
    usecols=lambda column_name: column_name != "Unnamed: 0",
)

# Columns that should hold whole numbers
int_columns = [
    'SeriousDlqin2yrs',
    'age',
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfOpenCreditLinesAndLoans',
    'NumberOfTimes90DaysLate',
    'NumberRealEstateLoansOrLines',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfDependents',
]

# Columns that should hold floating-point values
float_columns = [
    'RevolvingUtilizationOfUnsecuredLines',
    'DebtRatio',
    'MonthlyIncome',
]

# Cast both groups in a single pass. The nullable 'Int64' dtype is used so
# integer columns can still carry missing values.
dtype_by_column = {
    **dict.fromkeys(int_columns, 'Int64'),
    **dict.fromkeys(float_columns, float),
}
full_df = full_df.astype(dtype_by_column)

# Show the first rows to confirm the load and the casts
full_df.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0


### Assume a missing delinquency flag or dependents count means 0 (missing income is dealt with later)

In [4]:
# Treat a missing dependents count or delinquency flag as 0.
# NOTE(review): 'SeriousDlqin2yrs' looks like the outcome label — confirm that
# defaulting a missing label to 0 is intended rather than dropping those rows.
zero_fill_columns = ['NumberOfDependents', 'SeriousDlqin2yrs']
full_df[zero_fill_columns] = full_df[zero_fill_columns].fillna(0)

# Count the missing values that remain in each column
null_counts = full_df.isna().sum()
print(null_counts)

SeriousDlqin2yrs                            0
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           29731
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                          0
dtype: int64


### Variable Transformations

#### Dealing with late payers

In [5]:
# Collapse each past-due counter into an "ever late" (1) vs "never late" (0) flag.
# Maps new flag column -> counter column it is derived from.
# NOTE(review): any positive count is flagged, including sentinel codes if the
# counters contain them — confirm against the data dictionary.
late_flag_sources = {
    '30_59DaysPastDueNotWorse': 'NumberOfTime30-59DaysPastDueNotWorse',  # 30–59 days past due
    '60_89DaysPastDueNotWorse': 'NumberOfTime60-89DaysPastDueNotWorse',  # 60–89 days past due
    '90DaysLate': 'NumberOfTimes90DaysLate',                             # 90+ days late
}

for flag_name, count_column in late_flag_sources.items():
    full_df[flag_name] = full_df[count_column].gt(0).astype(int)

# Print the percentage split of each new flag as a quick sanity check
for col in late_flag_sources:
    print(f"\n{col} value counts (%):")
    flag_share = full_df[col].value_counts(normalize=True)
    print((flag_share * 100).round(2))


30_59DaysPastDueNotWorse value counts (%):
30_59DaysPastDueNotWorse
0    84.01
1    15.99
Name: proportion, dtype: float64

60_89DaysPastDueNotWorse value counts (%):
60_89DaysPastDueNotWorse
0    94.93
1     5.07
Name: proportion, dtype: float64

90DaysLate value counts (%):
90DaysLate
0    94.44
1     5.56
Name: proportion, dtype: float64


#### Dealing with Ratios

In [9]:
# Flag rows whose ratio is at or above 1.
# Maps new flag column -> ratio column it is derived from.
ratio_flag_sources = {
    'debtRatio_1_or_more': 'DebtRatio',
    'RevolvingUtilizationOfUnsecuredLines_1_or_more': 'RevolvingUtilizationOfUnsecuredLines',
}

for flag_name, ratio_column in ratio_flag_sources.items():
    full_df[flag_name] = full_df[ratio_column].ge(1).astype(int)

# Print the percentage split of each new flag as a quick sanity check
for col in ratio_flag_sources:
    print(f"\n{col} value counts (%):")
    flag_share = full_df[col].value_counts(normalize=True)
    print((flag_share * 100).round(2))



debtRatio_1_or_more value counts (%):
debtRatio_1_or_more
0    76.42
1    23.58
Name: proportion, dtype: float64

RevolvingUtilizationOfUnsecuredLines_1_or_more value counts (%):
RevolvingUtilizationOfUnsecuredLines_1_or_more
0    97.77
1     2.23
Name: proportion, dtype: float64


In [None]:
# Split full_df into 70% training / 20% validation / 10% test.
# Both splits are stratified on the target column so that every subset keeps
# the same class proportions as the full data; an unstratified random split
# can let the share of positive labels drift between the subsets.
target_column = 'SeriousDlqin2yrs'

# Step 1: Split into 70% training, 30% remaining.
# The label is cast to a plain int for stratification; this also fails loudly
# if any label is still missing at this point.
training_df, temp_df = train_test_split(
    full_df,
    test_size=0.3,
    random_state=42,
    stratify=full_df[target_column].astype(int),
)

# Step 2: Split the 30% portion into validation (20%) and test (10%)
# (⅔ of temp_df -> validation, ⅓ -> test)
validation_df, test_df = train_test_split(
    temp_df,
    test_size=(1/3),
    random_state=42,
    stratify=temp_df[target_column].astype(int),
)

# Reset indexes
training_df = training_df.reset_index(drop=True)
validation_df = validation_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# The three subsets must partition the full data set exactly
assert len(training_df) + len(validation_df) + len(test_df) == len(full_df)

# Check results
print(f"Training set: {training_df.shape}")
print(f"Validation set: {validation_df.shape}")
print(f"Test set: {test_df.shape}")