In [2]:
# Importing Required Libraries for Analysis
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from scipy.stats.contingency import association
import math
from sklearn.preprocessing import power_transform
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import TargetEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,roc_auc_score, precision_score,recall_score,confusion_matrix,fbeta_score
from sklearn.preprocessing import OrdinalEncoder,RobustScaler
from sklearn.impute import KNNImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.inspection import permutation_importance
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer,KNNImputer,SimpleImputer
from sklearn.decomposition import SparsePCA,PCA

In [2]:
## Load the raw loan data and preview the first rows
"""
Dataset : logistic_regression.csv

"""
# The location is a relative path, so it resolves against the notebook's working directory.
csv_path = '../data/logistic_regression.csv'
df = pd.read_csv(csv_path)
# Plain-text preview of the first five rows.
print(df.head())

   loan_amnt        term  int_rate  installment grade sub_grade  \
0    10000.0   36 months     11.44       329.48     B        B4   
1     8000.0   36 months     11.99       265.68     B        B5   
2    15600.0   36 months     10.49       506.97     B        B3   
3     7200.0   36 months      6.49       220.65     A        A2   
4    24375.0   60 months     17.27       609.33     C        C5   

                 emp_title emp_length home_ownership  annual_inc  ...  \
0                Marketing  10+ years           RENT    117000.0  ...   
1          Credit analyst     4 years       MORTGAGE     65000.0  ...   
2             Statistician   < 1 year           RENT     43057.0  ...   
3          Client Advocate    6 years           RENT     54000.0  ...   
4  Destiny Management Inc.    9 years       MORTGAGE     55000.0  ...   

  open_acc pub_rec revol_bal revol_util total_acc  initial_list_status  \
0     16.0     0.0   36369.0       41.8      25.0                    w   
1     17.0

In [3]:
# loan_status is the target variable: measure how many 'Fully Paid' loans exist
# for every 'Charged Off' loan to see how imbalanced the classes are.
loan_status_freq = df.loc[:, 'loan_status'].value_counts()
fully_paid_count = loan_status_freq.loc['Fully Paid']
charged_off_count = loan_status_freq.loc['Charged Off']
target_imbalance = fully_paid_count / charged_off_count
print(target_imbalance)

4.098682940017767


In [4]:
# Split off the final test set (10%) before any EDA. Because this is an imbalanced
# classification problem (roughly 4:1), the split is stratified on loan_status.
X_train_validation, X_test_final, y_train_validation, y_test_final = train_test_split(
    df.drop(columns=['loan_status']),
    df['loan_status'],
    test_size=0.1,
    random_state=40,
    stratify=df['loan_status'],
)


In [5]:
# Carve the validation set (10% of the remaining rows) out of the train+validation
# pool, stratified on the target in the same way as the first split.
X_train, X_validation, y_train, y_validation = train_test_split(
    X_train_validation,
    y_train_validation,
    test_size=0.1,
    random_state=40,
    stratify=y_train_validation,
)


In [6]:
# Report the shape of every split (features and target) as a sanity check.
for _split_label, _split in (
    ('X_train ', X_train),
    ('y_train ', y_train),
    ('X_validation ', X_validation),
    ('y_validation ', y_validation),
    ('X_test ', X_test_final),
    ('y_test ', y_test_final),
):
    print(_split_label, _split.shape)

X_train  (320784, 26)
y_train  (320784,)
X_validation  (35643, 26)
y_validation  (35643,)
X_test  (39603, 26)
y_test  (39603,)


In [7]:
# All feature engineering happens on a copy, so X_train itself is left as split.
loan_data = X_train.copy()

# Fit the label encoder, then overwrite its class order by hand. The intent is
# that transform() returns each label's position in this array, i.e.
# 'Fully Paid' -> 0 and 'Charged Off' -> 1 (the positive class).
y_encoder = LabelEncoder()
y_encoder.fit(y_train)
y_encoder.classes_ = np.array(['Fully Paid', 'Charged Off'])

# Align the targets with loan_data's row labels before encoding them.
y_train_encoded = y_encoder.transform(y_train.loc[loan_data.index])
def empl_length(x):
    """Convert an ``emp_length`` label into a whole number of years.

    Parameters
    ----------
    x : str or missing value
        A label such as ``'< 1 year'``, ``'4 years'`` or ``'10+ years'``.
        Missing values (``None``, float ``NaN``, ``pd.NA``) are accepted.

    Returns
    -------
    int or None
        ``10`` when the label starts with ``'10'``, ``0`` when it starts with
        ``'<'``, otherwise the leading character parsed as an integer.
        ``None`` when ``x`` is missing.
    """
    # pd.isna recognises None and pd.NA as well as float NaN. The previous
    # ``x != x`` test only caught NaN: None fell through to the slicing below
    # (TypeError) and pd.NA raised on the truthiness check.
    if pd.isna(x):
        return None
    if x[:2] == '10':
        return 10
    if x[0] == '<':
        return 0
    # Remaining labels are expected to start with a single digit 1-9.
    return int(x[0])
# ---------------------------------------------------------------------------
# Training-split feature engineering.
# Every encoder / scaler / imputer / PCA object fitted below is a notebook
# global that preprocess() re-applies to the validation and test splits.
# The order in which columns are added fixes the column order of X_train_prp,
# which scaler_1 is fitted on, so the statement order matters.
# ---------------------------------------------------------------------------
# Numeric employment length (via empl_length) and loan term.
loan_data['emp_length_num'] = loan_data['emp_length'].apply(empl_length)
# term: characters at positions 1-2 parsed as int.
# assumes term strings carry one leading space (e.g. ' 36 months') — TODO confirm
loan_data['term_num'] = loan_data['term'].apply(lambda x: int(x[1:3])).apply(int)
# Second-to-last whitespace token of the address (commas removed), one-hot encoded
# into sparse 'state_<value>' columns.
state_col = loan_data['address'].apply(lambda x: x.replace(',','').split()[-2])
state_encoder = OneHotEncoder()
state_encoded = state_encoder.fit_transform(pd.DataFrame(state_col))
state_df = pd.DataFrame.sparse.from_spmatrix(state_encoded,columns=['state_'+i for i in state_encoder.categories_[0]],index = loan_data.index)
loan_data = loan_data.join(state_df)
# Last whitespace token of the address, one-hot encoded into sparse 'pincode_<value>' columns.
pincode_col = loan_data['address'].apply(lambda x: x.replace(',','').split()[-1])
pincode_encoder = OneHotEncoder()
pincode_encoded = pincode_encoder.fit_transform(pd.DataFrame(pincode_col))
pincode_df = pd.DataFrame.sparse.from_spmatrix(pincode_encoded,columns=['pincode_'+i for i in pincode_encoder.categories_[0]],index = loan_data.index)
loan_data = loan_data.join(pincode_df)
# home_ownership one-hot; min_frequency=1000 asks the encoder to pool rare categories.
# NOTE(review): no handle_unknown is passed here (nor for the state, pincode,
# verification_status, initial_list_status and purpose encoders), so sklearn's
# default applies at transform time — confirm that unseen categories in the
# validation/test splits are handled as intended.
homeown = loan_data['home_ownership']
homeown_encoder = OneHotEncoder(min_frequency=1000)
homeown_prp = homeown_encoder.fit_transform(pd.DataFrame(homeown))
homeown_prp_df = pd.DataFrame.sparse.from_spmatrix(homeown_prp,columns=homeown_encoder.get_feature_names_out(),index = loan_data.index)
loan_data= loan_data.join(homeown_prp_df)
# verification_status one-hot.
verif_encoder = OneHotEncoder()
verif_prp = verif_encoder.fit_transform(pd.DataFrame(loan_data['verification_status']))
verif_prp_df = pd.DataFrame.sparse.from_spmatrix(verif_prp,columns=verif_encoder.get_feature_names_out(),index = loan_data.index)
loan_data= loan_data.join(verif_prp_df)
# initial_list_status one-hot.
ils_encoder = OneHotEncoder()
ils_prp = ils_encoder.fit_transform(pd.DataFrame(loan_data['initial_list_status']))
ils_prp_df = pd.DataFrame.sparse.from_spmatrix(ils_prp,columns=ils_encoder.get_feature_names_out(),index = loan_data.index)
loan_data= loan_data.join(ils_prp_df)
# sub_grade: ordinal code following the explicit category list below
# (first entry 'G5' -> 0, last entry 'A1' -> 34).
loan_subgrade = loan_data['sub_grade']
loan_subgrade_ordinal_encoder = OrdinalEncoder(categories=[['G5', 'G4', 'G3', 'G2', 'G1', 'F5', 'F4', 'F3', 'F2', 'F1', 'E5',
       'E4', 'E3', 'E2', 'E1', 'D5', 'D4', 'D3', 'D2', 'D1', 'C5', 'C4',
       'C3', 'C2', 'C1', 'B5', 'B4', 'B3', 'B2', 'B1', 'A5', 'A4', 'A3',
       'A2', 'A1']])
loan_data['sub_grade_enc'] = (loan_subgrade_ordinal_encoder.fit_transform(pd.DataFrame(loan_subgrade)))
# The final sub_grade_enc is (ordinal code ** k) divided by the target encoding
# of sub_grade fitted against y_train_encoded.
loan_subgrade_target_encoder = TargetEncoder(random_state=40)
# Exponent applied to the ordinal code; also read by preprocess().
# NOTE(review): the choice of 1.2 is not explained in the visible code.
k = 1.2
loan_data['sub_grade_enc'] = (loan_data['sub_grade_enc']**k)/pd.Series(loan_subgrade_target_encoder.fit_transform(pd.DataFrame(loan_subgrade),y_train_encoded).reshape(1,-1)[0],index = loan_subgrade.index)
# emp_title one-hot with min_frequency=1000 and handle_unknown='infrequent_if_exist'.
emp_title_encoder = OneHotEncoder(min_frequency=1000,handle_unknown='infrequent_if_exist')
emp_title_prp = emp_title_encoder.fit_transform(pd.DataFrame(loan_data['emp_title']))
emp_title_prp_df = pd.DataFrame.sparse.from_spmatrix(emp_title_prp,columns=emp_title_encoder.get_feature_names_out(),index = loan_data.index)
loan_data= loan_data.join(emp_title_prp_df)
# Ratio of open accounts to total accounts; the +1 keeps the denominator non-zero when total_acc is 0.
loan_data['acc_open_perc'] = loan_data['open_acc']/(loan_data['total_acc']+1)
# purpose one-hot with min_frequency=1000.
purpose_encoder = OneHotEncoder(min_frequency=1000)
purpose_prp = purpose_encoder.fit_transform(pd.DataFrame(loan_data['purpose']))
purpose_prp_df = pd.DataFrame.sparse.from_spmatrix(purpose_prp,columns=purpose_encoder.get_feature_names_out(),index = loan_data.index)
loan_data= loan_data.join(purpose_prp_df)
# Year and month components of earliest_cr_line and issue_d, both parsed with the "%b-%Y" format.
loan_data['credit_yr'] = pd.to_datetime(loan_data['earliest_cr_line'],format = "%b-%Y").apply(lambda x: x.year) 
loan_data['credit_month'] = pd.to_datetime(loan_data['earliest_cr_line'],format = "%b-%Y").apply(lambda x: x.month )
loan_data['issue_yr'] = pd.to_datetime(loan_data['issue_d'],format = "%b-%Y").apply(lambda x: x.year)  
loan_data['issue_month'] = pd.to_datetime(loan_data['issue_d'],format = "%b-%Y").apply(lambda x: x.month )
# title one-hot with min_frequency=1000 and handle_unknown='infrequent_if_exist'.
title_encoder = OneHotEncoder(min_frequency=1000,handle_unknown='infrequent_if_exist')
title_prp = title_encoder.fit_transform(pd.DataFrame(loan_data['title']))
title_prp_df = pd.DataFrame.sparse.from_spmatrix(title_prp,columns=title_encoder.get_feature_names_out(),index = loan_data.index)
loan_data= loan_data.join(title_prp_df)

# Drop the raw columns that were encoded above, plus installment, grade and
# application_type, which are removed without a derived replacement.
X_train_prp = loan_data.drop(columns = ['term','installment','grade','emp_length','sub_grade','emp_title','home_ownership','verification_status','issue_d','purpose','title','earliest_cr_line','address','initial_list_status','application_type'])
# Standardise every remaining column (scaling runs before imputation here).
scaler_1 = StandardScaler()
X_train_prp_s = pd.DataFrame(scaler_1.fit_transform(X_train_prp),index = X_train_prp.index,columns=X_train_prp.columns)
# Fill remaining missing values with the per-column mean of the scaled data.
imp_mean = SimpleImputer(strategy='mean')
X_train_prp_imputed = pd.DataFrame(imp_mean.fit_transform(X_train_prp_s),columns=imp_mean.get_feature_names_out(),index = X_train_prp_s.index)
# Project onto 120 principal components; the result is a plain NumPy array.
pca = PCA(n_components=120)
X_train_prp_pca = pca.fit_transform(X_train_prp_imputed)
# Bare expression so the notebook displays the transformed array.
X_train_prp_pca



array([[ 2.81021286,  1.41605861,  0.57898055, ..., -0.21475251,
         0.18593412, -0.25350122],
       [-0.9232165 ,  2.09387859, -1.92191432, ...,  0.08666308,
         0.07881045,  0.20948448],
       [ 1.19361143, -2.89680118, -1.20134756, ..., -0.0212977 ,
         0.03693095, -0.1419411 ],
       ...,
       [-1.76885635, -0.60523999,  0.66441857, ...,  0.08718215,
        -0.09399867, -0.0696243 ],
       [-1.05575514, -1.52390176,  0.19273947, ..., -0.42341593,
         0.17600811, -0.03045699],
       [ 0.94729087,  1.02876056,  0.51351647, ...,  0.1662929 ,
        -0.05906222,  0.04681927]])

In [9]:
def _join_one_hot(frame, encoder, values, column_names=None):
    """Append a fitted encoder's sparse one-hot columns for ``values`` to ``frame``.

    ``column_names`` overrides ``encoder.get_feature_names_out()``; it is used
    for the state / pincode columns, which carry their own prefixes.
    Returns a new DataFrame; ``frame`` itself is not modified.
    """
    encoded = encoder.transform(pd.DataFrame(values))
    if column_names is None:
        column_names = encoder.get_feature_names_out()
    encoded_df = pd.DataFrame.sparse.from_spmatrix(encoded,columns=column_names,index = frame.index)
    return frame.join(encoded_df)


def preprocess(X_test,y_test):
    """Apply the transformers fitted on the training split to another split.

    Relies on notebook globals created by the training feature-engineering
    cell: ``y_encoder``, ``empl_length``, the one-hot encoders, the sub_grade
    ordinal/target encoders, ``k``, ``scaler_1``, ``imp_mean`` and ``pca``.
    Nothing is re-fitted here; only ``transform`` is called.

    Parameters
    ----------
    X_test : pandas.DataFrame
        Raw feature rows with the same columns as the training split.
    y_test : pandas.Series
        Raw ``loan_status`` labels, indexable by ``X_test.index``.

    Returns
    -------
    tuple
        ``(X_test_prp_pca, y_test_encoded)``: the PCA-projected feature array
        and the integer-encoded labels.
    """
    y_test_encoded = y_encoder.transform(y_test[X_test.index])

    # Work on a copy. The column assignments below previously ran on the
    # caller's frame, so X_test_final / X_validation silently gained
    # 'emp_length_num' and 'term_num' columns as a side effect of this call.
    X_test = X_test.copy()

    # NOTE: the order in which columns are added must match the training cell,
    # because scaler_1 was fitted on that exact column order.
    X_test['emp_length_num'] = X_test['emp_length'].apply(empl_length)
    X_test['term_num'] = X_test['term'].apply(lambda x: int(x[1:3]))

    # Address-derived tokens: second-to-last and last whitespace-separated token.
    state_col = X_test['address'].apply(lambda x: x.replace(',','').split()[-2])
    X_test = _join_one_hot(X_test, state_encoder, state_col, ['state_'+i for i in state_encoder.categories_[0]])
    pincode_col = X_test['address'].apply(lambda x: x.replace(',','').split()[-1])
    X_test = _join_one_hot(X_test, pincode_encoder, pincode_col, ['pincode_'+i for i in pincode_encoder.categories_[0]])

    X_test = _join_one_hot(X_test, homeown_encoder, X_test['home_ownership'])
    X_test = _join_one_hot(X_test, verif_encoder, X_test['verification_status'])
    X_test = _join_one_hot(X_test, ils_encoder, X_test['initial_list_status'])

    # sub_grade: (ordinal code ** k) divided by the fitted target encoding.
    loan_subgrade = X_test['sub_grade']
    X_test['sub_grade_enc'] = (loan_subgrade_ordinal_encoder.transform(pd.DataFrame(loan_subgrade)))
    X_test['sub_grade_enc'] = (X_test['sub_grade_enc']**k)/pd.Series(loan_subgrade_target_encoder.transform(pd.DataFrame(loan_subgrade)).reshape(1,-1)[0],index = loan_subgrade.index)

    X_test = _join_one_hot(X_test, emp_title_encoder, X_test['emp_title'])
    X_test['acc_open_perc'] = X_test['open_acc']/(X_test['total_acc']+1)
    X_test = _join_one_hot(X_test, purpose_encoder, X_test['purpose'])

    # Parse each date column once and reuse it for both the year and the month.
    earliest_cr_line = pd.to_datetime(X_test['earliest_cr_line'],format = "%b-%Y")
    issue_date = pd.to_datetime(X_test['issue_d'],format = "%b-%Y")
    X_test['credit_yr'] = earliest_cr_line.dt.year
    X_test['credit_month'] = earliest_cr_line.dt.month
    X_test['issue_yr'] = issue_date.dt.year
    X_test['issue_month'] = issue_date.dt.month

    X_test = _join_one_hot(X_test, title_encoder, X_test['title'])

    # Same raw columns are dropped as in the training cell, then scale -> impute -> PCA.
    X_test_prp = X_test.drop(columns = ['term','installment','grade','emp_length','sub_grade','emp_title','home_ownership','verification_status','issue_d','purpose','title','earliest_cr_line','address','initial_list_status','application_type'])
    X_test_prp_s = pd.DataFrame(scaler_1.transform(X_test_prp),index = X_test_prp.index,columns=X_test_prp.columns)
    X_test_prp_imputed = pd.DataFrame(imp_mean.transform(X_test_prp_s),columns=imp_mean.get_feature_names_out(),index = X_test_prp_s.index)
    X_test_prp_pca = pca.transform(X_test_prp_imputed)
    return X_test_prp_pca,y_test_encoded

In [10]:
# Push the held-out splits through the transformers fitted on the training split.
X_test_prp_pca, y_test_encoded = preprocess(X_test_final, y_test_final)
X_validation_prp_pca, y_validation_encoded = preprocess(X_validation, y_validation)



In [11]:
# L2-penalised logistic regression on the PCA features. class_weight makes
# errors on class 1 count 2.5x as much as errors on class 0.
model_1 = LogisticRegression(n_jobs=-1,penalty='l2',solver='saga',random_state=40,class_weight={0:1,1:2.5})
model_1.fit(X_train_prp_pca,y_train_encoded)

# Predict once and reuse the result for every metric; the original called
# model_1.predict on the full training matrix four separate times.
train_pred = model_1.predict(X_train_prp_pca)
train_proba = model_1.predict_proba(X_train_prp_pca)[:,1]

print("Accuracy:", accuracy_score(y_train_encoded,train_pred))
print("Precision:", precision_score(y_train_encoded,train_pred))
print("ROC-AUC:", roc_auc_score(y_train_encoded,train_proba))
print("Recall", recall_score(y_train_encoded,train_pred))
print("Fbeta", fbeta_score(y_train_encoded,train_pred,beta = 0.75,))



Accuracy: 0.8623372736794853
Precision: 0.6440719630045015
ROC-AUC: 0.9077119601156327
Recall 0.6663275848366844
Fbeta 0.6519106352777548


In [12]:

# Validation-split metrics. Predictions are computed once and shared across
# the metrics instead of re-running model_1.predict for each one.
validation_pred = model_1.predict(X_validation_prp_pca)
validation_proba = model_1.predict_proba(X_validation_prp_pca)[:,1]

print("Accuracy:", accuracy_score(y_validation_encoded,validation_pred))
print("Precision:", precision_score(y_validation_encoded,validation_pred))
print("ROC-AUC:", roc_auc_score(y_validation_encoded,validation_proba))
print("Recall", recall_score(y_validation_encoded,validation_pred))
print("Fbeta", fbeta_score(y_validation_encoded,validation_pred,beta = 0.75,))

Accuracy: 0.8609544651123643
Precision: 0.6416144745998609
ROC-AUC: 0.9057915261425948
Recall 0.6594192533257045
Fbeta 0.6479123449086177


In [13]:

# Final test-split metrics. Predictions are computed once and shared across
# the metrics instead of re-running model_1.predict for each one.
test_pred = model_1.predict(X_test_prp_pca)
test_proba = model_1.predict_proba(X_test_prp_pca)[:,1]

print("Accuracy:", accuracy_score(y_test_encoded,test_pred))
print("Precision:", precision_score(y_test_encoded,test_pred))
print("ROC-AUC:", roc_auc_score(y_test_encoded,test_proba))
print("Recall", recall_score(y_test_encoded,test_pred))
print("Fbeta", fbeta_score(y_test_encoded,test_pred,beta = 0.75,))

Accuracy: 0.8621821579173296
Precision: 0.6438270835928741
ROC-AUC: 0.9066972207715824
Recall 0.6653791682760396
Fbeta 0.6514230972848968
