In [12]:
# --- Standard library ---
import os
import random
import time
import warnings

# --- Third-party: data handling, statistics and plotting ---
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyodbc
import seaborn as sns
from scipy import stats
from statsmodels.stats.outliers_influence import variance_inflation_factor

# --- Third-party: scikit-learn ---
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    auc,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.utils import resample

# --- Notebook configuration ---
# NOTE: this silences every warning category for the rest of the session.
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

# Seed both the stdlib and the NumPy global generators. scikit-learn falls back
# to NumPy's global RNG whenever `random_state` is not passed, so seeding
# `random` alone does not make such calls repeatable.
random.seed(42)
np.random.seed(42)

# Show which ODBC drivers pyodbc can see on this machine.
print(pyodbc.drivers())

['SQL Server', 'ODBC Driver 17 for SQL Server', 'SQL Server Native Client RDA 11.0', 'Microsoft Access Driver (*.mdb, *.accdb)', 'Microsoft Excel Driver (*.xls, *.xlsx, *.xlsm, *.xlsb)', 'Microsoft Access Text Driver (*.txt, *.csv)', 'Microsoft Access dBASE Driver (*.dbf, *.ndx, *.mdx)']


In [3]:
# Load the interim processed dataset from its pickle file.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
processed_path = '../data/interim/processed_data.pkl'
df = pd.read_pickle(processed_path)
df.shape

(121799, 35)

In [5]:
# Work on an independent copy so `df` stays exactly as loaded
# (DataFrame.copy() is deep by default).
data = df.copy()
data.head(5)

Unnamed: 0,Client_Income,Car_Owned,Bike_Owned,Active_Loan,House_Own,Child_Count,Credit_Amount,Loan_Annuity,Accompany_Client,Client_Income_Type,Client_Education,Client_Marital_Status,Client_Gender,Loan_Contract_Type,Client_Housing_Type,Population_Region_Relative,Age_Days,Employed_Days,Registration_Days,ID_Days,Mobile_Tag,Homephone_Tag,Workphone_Working,Client_Family_Members,Cleint_City_Rating,Application_Process_Day,Application_Process_Hour,Client_Permanent_Match_Tag,Client_Contact_Work_Tag,Score_Source_2,Score_Source_3,Phone_Change,Credit_Bureau,ID,Default
0,20250.0,1.0,0.0,1.0,1.0,0.0,15282.0,1826.550049,Alone,Service,Graduation,M,Male,CL,Home,0.008575,14162.0,4129.0,7833.0,21.0,1,0,1,2.0,2.0,3.0,10,Yes,Yes,0.215068,0.511202,755.0,1.0,12138936,0
1,13500.0,0.0,0.0,1.0,1.0,0.0,60415.199219,3097.800049,Alone,Retired,Secondary,M,Male,CL,Home,0.009175,22493.0,12020.75,12617.0,5280.0,1,0,1,2.0,2.0,4.0,15,Yes,Yes,0.657508,0.549596,1687.0,4.0,12215264,0
2,12150.0,0.0,0.0,0.0,1.0,0.0,16320.150391,1294.650024,Alone,Retired,Secondary,W,Male,CL,Home,0.016612,20507.0,12020.75,2834.0,4053.0,1,0,0,1.0,2.0,3.0,9,Yes,Yes,0.063343,0.113294,533.0,5.0,12130547,0
3,15750.0,0.0,1.0,1.0,1.0,0.0,45000.0,3007.350098,Alone,Service,Secondary,S,Female,CL,Home,0.008866,14536.0,3465.0,5871.0,2594.0,1,1,0,1.0,2.0,2.0,17,Yes,Yes,0.129861,0.590233,340.0,2.0,12106559,0
4,9000.0,0.0,0.0,0.0,0.0,0.0,22500.0,2191.949951,Alone,Service,Secondary,S,Male,CL,Home,0.020892,15734.0,2504.0,6019.0,1191.0,1,0,0,1.0,2.0,2.0,11,Yes,Yes,0.41863,0.595456,2458.0,2.0,12186490,0


In [6]:
# Cast the identifier column to the default integer dtype; other columns keep their dtypes.
data = data.astype({'ID': 'int'})

In [7]:
# Columns kept for modelling, in the order they will appear in the subset frame.
# The 'ID' identifier and the 'Default' target are carried along with the predictors.
imp_cols = [
    # Income, assets and loan terms
    'Client_Income',
    'Car_Owned',
    'Active_Loan',
    'House_Own',
    'Credit_Amount',
    'Loan_Annuity',
    # Categorical client / contract descriptors
    'Client_Income_Type',
    'Client_Education',
    'Client_Gender',
    'Loan_Contract_Type',
    'Client_Housing_Type',
    # Day-count fields and city rating ("Cleint" spelling matches the data's column name)
    'Age_Days',
    'Employed_Days',
    'ID_Days',
    'Cleint_City_Rating',
    'Client_Permanent_Match_Tag',
    # External scores, phone-change and credit-bureau fields
    'Score_Source_2',
    'Score_Source_3',
    'Phone_Change',
    'Credit_Bureau',
    # Identifier and target
    'ID',
    'Default',
]

In [8]:
# Restrict the working frame to the selected columns (all rows are kept).
imp_data = data.loc[:, imp_cols]
imp_data.shape

(121799, 22)

### Undersampling

In [9]:
# Step 1: Separate majority ('0') and minority ('1') classes.
# Labels are matched on their string form so the split works whether `Default`
# is stored as text ('0'/'1') or as integers (0/1); comparing an integer column
# against the literals '0'/'1' would silently select no rows at all.
default_labels = imp_data['Default'].astype(str)
data_majority = imp_data[default_labels == '0']
data_minority = imp_data[default_labels == '1']

# Fail loudly instead of carrying an empty class into the resampling step.
if data_majority.empty or data_minority.empty:
    raise ValueError(
        "Expected both '0' and '1' labels in 'Default', found: "
        f"{sorted(default_labels.unique())}"
    )

# Step 2: Define the fraction of the majority class to retain (0.4 -> keep 40%)
undersample_percent = 0.4

# Number of majority-class records to keep (int() truncates the fractional part)
n_majority_to_keep = int(len(data_majority) * undersample_percent)
print(n_majority_to_keep)

44783


In [13]:

# Step 3: Draw the retained majority rows without replacement, with a fixed seed
data_majority_downsampled = resample(
    data_majority,
    n_samples=n_majority_to_keep,
    replace=False,
    random_state=42,
)

# Steps 4-5: Stack the downsampled majority rows on top of the minority rows,
# then shuffle every row (frac=1) and renumber the index from zero
data_balanced = (
    pd.concat([data_majority_downsampled, data_minority])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Step 6: Report how many rows of each class remain
print(
    "Class distribution after percentage-based undersampling:\n",
    data_balanced['Default'].value_counts(),
)

Class distribution after percentage-based undersampling:
 Default
0    44783
1     9841
Name: count, dtype: int64


In [14]:
# Split the balanced frame into the target vector and the predictor matrix.
# NOTE(review): only 'Default' is removed here, so the 'ID' identifier column
# remains in X as a feature -- confirm this is intended.
y = data_balanced['Default']
X = data_balanced.drop('Default', axis=1)

In [15]:
# One-hot encode the categorical predictors; drop_first omits the first level
# of each encoded feature.
X = pd.get_dummies(X, drop_first=True)

# Stratified 80/20 train-test split with a fixed seed.
# NOTE(review): X and y come from the undersampled frame, so the held-out set
# has the undersampled class ratio rather than the original one.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Report the size of each partition
for label, part in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train:", y_train),
    ("y_test:", y_test),
):
    print(label, part.shape)

X_train shape: (43699, 30)
X_test shape: (10925, 30)
y_train: (43699,)
y_test: (10925,)
