In [35]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from sklearn.model_selection import train_test_split


In [36]:
# Show the kernel's working directory followed by its entries, one per line,
# so the relative "data/" paths used when loading the CSVs can be checked.
print(os.getcwd(), os.listdir(), sep="\n")

/Users/donaldmurataj/Documents/Development/Paidy/credit-risk-prediction/src/credit_risk_prediction
['.DS_Store', 'modeling.ipynb', '__init__.py', 'py.typed', 'data', 'analysis.ipynb']


In [37]:
# Column groups used to normalise dtypes right after loading.
# Count-like columns are cast to pandas' nullable 'Int64' dtype.
int_columns = [
    'SeriousDlqin2yrs', 'age', 'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
    'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfDependents'
]

float_columns = ['RevolvingUtilizationOfUnsecuredLines', 'DebtRatio', 'MonthlyIncome']


def load_credit_data(csv_path):
    """Read one credit CSV and cast its columns to the notebook's dtypes.

    The "Unnamed: 0" column is skipped on read. Columns listed in
    ``int_columns`` are cast to ``Int64`` and columns listed in
    ``float_columns`` are cast to ``float``.

    Parameters
    ----------
    csv_path : str
        Path to the CSV file, relative to the working directory or absolute.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with the dtype conversions applied.
    """
    frame = pd.read_csv(csv_path, usecols=lambda c: c != "Unnamed: 0")
    # One mapping for every column keeps train and test conversions identical.
    dtype_map = {
        **dict.fromkeys(int_columns, 'Int64'),
        **dict.fromkeys(float_columns, float),
    }
    return frame.astype(dtype_map)


# Load both datasets through the same helper so their schemas cannot drift
training_df = load_credit_data("data/cs-training.csv")
test_df = load_credit_data("data/cs-test.csv")

# Preview result
training_df.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0


In [38]:
# (rows, columns) of the training frame, then of the test frame
print(training_df.shape, test_df.shape, sep="\n")

(150000, 11)
(101503, 11)


In [39]:
# Carve the loaded test file into a validation set and a final test set:
# roughly 2/3 of its rows go to validation, the remaining ~1/3 stay as test.
VALIDATION_ROWS = 67_000
SPLIT_SEED = 42

# This cell overwrites test_df, so running it a second time would try to sample
# from the already-reduced frame. Fail with an explicit message in that case.
if len(test_df) < VALIDATION_ROWS:
    raise ValueError(
        f"test_df has only {len(test_df)} rows; expected at least {VALIDATION_ROWS}. "
        "Re-run the data loading cell before splitting again."
    )

# Seeded sample so the same rows are selected on every fresh run
validation_df = test_df.sample(n=VALIDATION_ROWS, random_state=SPLIT_SEED)

# The rows that were not sampled become the final test set; reset both indexes
test_df = test_df.drop(validation_df.index).reset_index(drop=True)
validation_df = validation_df.reset_index(drop=True)

# Check
print(f"Training set: {training_df.shape}")
print(f"Validation set: {validation_df.shape}")
print(f"Test set (final): {test_df.shape}")

Training set: (150000, 11)
Validation set: (67000, 11)
Test set (final): (34503, 11)


### Merge data into one for easier null removal and overall data analysis

In [40]:
# Tag every row with the split it belongs to (train / validation / test).
# The 'dataset' column is added to the three source frames themselves.
for split_frame, split_label in (
    (training_df, 'train'),
    (validation_df, 'validation'),
    (test_df, 'test'),
):
    split_frame['dataset'] = split_label

# Stack the three splits into a single frame with a fresh 0..n-1 index
full_df = pd.concat([training_df, validation_df, test_df], ignore_index=True)
full_df.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,dataset
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2,train
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1,train
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0,train
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0,train
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0,train


### Assume a null delinquency flag or dependents count means 0 (we'll deal with income later)

In [41]:
# Replace missing values in these two columns with 0.
# NOTE(review): SeriousDlqin2yrs looks like the target label; zero-filling it
# also affects rows that came from the test file — confirm this is intended.
zero_fill_columns = ['NumberOfDependents', 'SeriousDlqin2yrs']
full_df[zero_fill_columns] = full_df[zero_fill_columns].fillna(0)

# Per-column count of the nulls that remain after the fill
null_counts = full_df.isna().sum()
print(null_counts)

SeriousDlqin2yrs                            0
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           49834
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                          0
dataset                                     0
dtype: int64
