### Imports


In [None]:

# Numpy,Pandas
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
plt.style.use('ggplot') 
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline
import missingno as msno

import warnings
# Suppress every Python warning for the rest of the session. Note that this also
# hides deprecation and convergence warnings emitted by the libraries used below.
warnings.filterwarnings('ignore')  

# Display settings: show all columns and all rows of a DataFrame, and format
# floats with five decimal places.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows',None)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [None]:
# Data source (Kaggle):
# https://www.kaggle.com/datasets/jeandedieunyandwi/lending-club-dataset?datasetId=608703&sortBy=voteCount
# The CSV is read from the current working directory and decoded as latin-1.
loans = pd.read_csv('lending_club_loan_two.csv' , encoding='latin-1') 
# Preview the first rows.
loans.head() 

### Data Analysis

In [None]:
loans.info()

In [None]:
cat_cols = loans.select_dtypes(include=["object"]).columns.tolist()

cat_cols

In [None]:
# Text columns that are stored as pandas categoricals from here on.
cat_cols = [
    'term',
    'emp_title',
    'emp_length',
    'home_ownership',
    'verification_status',
    'purpose',
    'title',
    'initial_list_status',
    'application_type',
]
loans = loans.astype({name: "category" for name in cat_cols})

# Parse the two date-like text columns into datetime64[ns].
for date_col in ("issue_d", "earliest_cr_line"):
    loans[date_col] = pd.to_datetime(loans[date_col]).astype('datetime64[ns]')

In [None]:
loans.describe(include=['category']).T

#### Missing Value Detection

In [None]:
loans.isnull().sum(axis=0).sort_values(ascending=False)/float(len(loans)) 

In [None]:
cat_cols = loans.select_dtypes(include=["category"]).columns.tolist()
loans[cat_cols].nunique()

#### Outlier Detection

In [None]:
# Candidate columns: every float64 column except three that are removed by name.
num_cols = loans.select_dtypes(include=["float64"]).columns.tolist()
for excluded in ("pub_rec_bankruptcies", "mort_acc", "pub_rec"):
    num_cols.remove(excluded)
original_obs = loans.shape[0]

# Report, per column, how many values fall outside the 1.5*IQR fences.
for col in num_cols:
    values = loans[col]
    q25, q75 = np.nanquantile(values, 0.25), np.nanquantile(values, 0.75)
    IQR = q75 - q25
    upper_fence = q75 + 1.5*IQR
    drop_ind = (values < q25 - 1.5*IQR) | (values > upper_fence)
    n_outliers = loans[drop_ind][col].count()
    print("Percentage of outliers in column ", col ,"is : ", np.round(n_outliers*100/loans[col].count(), decimals=3), "  || n_obs =", n_outliers,"||  max =", upper_fence )

In [None]:
# Work on a copy so the original frame keeps every row.
loans_od = loans.copy()
for col in num_cols:
    column = loans_od[col]
    q25 = np.nanquantile(column, 0.25)
    q75 = np.nanquantile(column, 0.75)
    IQR = q75 - q25
    lower_fence, upper_fence = q25 - 1.5*IQR, q75 + 1.5*IQR
    # Comparisons against NaN are False, so rows with a missing value are kept.
    outside = (column < lower_fence) | (column > upper_fence)
    loans_od = loans_od[~outside]

print("Percentage of values dropped : ", 100 - loans_od.shape[0]*100/original_obs)

#### Numerical Features

In [None]:
from itertools import cycle
cycol = cycle('bgrcm')

# Numerical features to visualise (also reused by the outlier treatment further down).
num_cols = ['loan_amnt',
 'int_rate',
 'installment',
 'annual_inc',
 'dti',
 'open_acc',
 'revol_bal',
 'revol_util',
 'total_acc']

# One row per feature: distribution on the left, boxplot on the right.
fig, ax = plt.subplots(len(num_cols), 2, figsize=(16,40))

for i, col_name in enumerate(num_cols):
    color_next = next(cycol)
    # sns.distplot is deprecated; histplot with a KDE overlay on a density
    # scale is the maintained equivalent.
    sns.histplot(loans_od[col_name], kde=True, stat="density", ax=ax[i,0], color=color_next)
    ax[i,0].set_title(col_name)
    ax[i,0].set_xlabel('')
    # Pass the series explicitly as `x`: newer seaborn versions interpret the
    # first positional argument as `data`, which changes the orientation.
    sns.boxplot(x=loans_od[col_name], width=0.3, ax=ax[i,1], color=color_next)
    ax[i,1].set_title(col_name + "-Boxplot")
    ax[i,1].set_xlabel('')

plt.show()

#### Target feature

In [None]:
# Encode the target: 0 = 'Fully Paid', 1 = 'Charged Off'.
# Only map while the column is still non-numeric; re-running the cell on the
# already-encoded column would otherwise turn every value into NaN.
if not pd.api.types.is_numeric_dtype(loans['loan_status']):
    loans['loan_status'] = loans['loan_status'].map({'Fully Paid':0,'Charged Off':1})

In [None]:
fig, axs = plt.subplots(1, 2, figsize=(14,7))
count_ax, pie_ax = axs

# Left panel: absolute number of loans per status.
sns.countplot(x='loan_status', data=loans, ax=count_ax)
count_ax.set_title("Frequency of each Loan Status")

# Right panel: the same counts as shares of the total.
status_counts = loans['loan_status'].value_counts()
status_counts.plot.pie(ax=pie_ax, autopct='%1.2f%%')
pie_ax.set_title("Percentage of each Loan status")

plt.show()

#### Categorical Features

In [None]:
# Target vs. categorical features: for every level of a categorical column,
# plot the share of each loan_status value as a stacked bar.
low_cat_cols = ['term','grade','emp_length','home_ownership','verification_status','purpose', 'initial_list_status', 'application_type']
for col in low_cat_cols:
    status_by_level = pd.crosstab(loans[col], loans["loan_status"])
    row_totals = status_by_level.sum(axis=1).astype(float)
    # Normalise each row so the stacked bars sum to 1.
    status_by_level.div(row_totals, axis=0).plot(kind='bar', stacked=True, figsize=(4,4))

### Data Preprocessing

#### 1.Outlier Treatment


In [None]:
# Replace every value outside the 1.5*IQR fences with the column median.
for col in num_cols:
    q25, med, q75 = (np.nanquantile(loans[col], q) for q in (0.25, 0.50, 0.75))
    IQR = q75 - q25

    # The median always lies inside the fences, so both tails can be handled
    # with a single mask.
    replace_ind = (loans[col] < q25 - 1.5*IQR) | (loans[col] > q75 + 1.5*IQR)
    loans.loc[replace_ind, col] = med

#### 2. Missing Values

In [None]:
loans.isnull().sum(axis=0).sort_values(ascending=False)/float(len(loans)) 

##### title

In [None]:
loans.drop(columns=['title'], inplace=True)

##### emp_title

In [None]:
loans.drop(columns=['emp_title'], inplace=True)

##### emp_length

In [None]:
loans.drop(columns=['emp_length'], inplace=True)

##### pub_rec_bankruptcies, revol_util
The columns 'pub_rec_bankruptcies' and 'revol_util' have very few missing values. We replace those with the column medians.

In [None]:
# Impute the few missing values in these two columns with the column median.
for col in ("pub_rec_bankruptcies", "revol_util"):
    loans[col] = loans[col].fillna(loans[col].median())

##### mort_acc

We are now left with mort_acc. We fill up using the mode.

In [None]:
loans['mort_acc'].mode()

In [None]:
# Fill missing mort_acc values with the column's mode, computed from the data
# rather than hard-coded (mode() ignores NaN; the first mode is used on ties).
loans['mort_acc'] = loans['mort_acc'].fillna(loans['mort_acc'].mode().iloc[0])

In [None]:
loans.isnull().sum(axis=0).sort_values(ascending=False)/float(len(loans)) 

#### 3. feature engineering

In [None]:
def convert01(x):
    """Collapse a value to a binary flag: 0 if it equals 0, otherwise 1."""
    return 0 if x == 0 else 1
    
# Turn the three count columns into 0/1 indicators (0 stays 0, anything else becomes 1).
for flag_col in ('pub_rec', 'pub_rec_bankruptcies', 'mort_acc'):
    loans[flag_col] = loans[flag_col].apply(convert01)

##### address

In [None]:
loans['address'].head(3)

In [None]:
loans.drop(columns=['address'],inplace=True)

##### issue_d

In [None]:
loans['issue_d'].head(3)

In [None]:
loans.drop(columns=['issue_d'], inplace=True)

##### earliest_cr_line
We also extract the earliest issued credit line year out of our data.

In [None]:
print(loans['earliest_cr_line'] )

In [None]:
loans['earliest_cr_line'] = loans['earliest_cr_line'].dt.year

In [None]:
print(loans['earliest_cr_line'] )

##### Home Ownership
We see that 'OTHER', 'NONE' & 'ANY' are very few in number. We could club all three into one category 'Other'.

In [None]:
def make_other(x):
    """Return 'Other' for the labels 'ANY', 'OTHER' and 'NONE'; pass anything else through."""
    rare_labels = ('ANY', 'OTHER', 'NONE')
    return 'Other' if x in rare_labels else x
    
# Merge the rare home_ownership labels into a single 'Other' level.
loans['home_ownership'] = loans['home_ownership'].apply(make_other)

##### Grade and Sub Grade

In [None]:
loans.drop(columns=['sub_grade'],inplace=True)

In [None]:

# Ordinal encoding of the loan grade: A -> 1, B -> 2, ..., G -> 7.
grade_letters = "ABCDEFG"
mapping_dict = {
    "grade": {letter: rank for rank, letter in enumerate(grade_letters, start=1)}
}

loans = loans.replace(mapping_dict)
loans['grade'].head()

#### OHE / Dummy Encoding

In [None]:
# Categorical columns to expand into indicator (dummy) columns.
dummies = ['term', 'home_ownership', 'verification_status','purpose', 'application_type','initial_list_status']

# drop_first=True omits the indicator for the first level of each column, so a
# column with k levels yields k-1 indicator columns.
loans = pd.get_dummies(loans, columns=dummies, drop_first=True)


#### StandardScaler 

In [None]:
loans.info()

In [None]:
col = loans.select_dtypes(include=['int64','float64']).columns
len(col)

In [None]:
# Exclude the target from the columns to scale. errors='ignore' keeps the cell
# re-runnable once 'loan_status' has already been removed from `col`.
col = col.drop('loan_status', errors='ignore')
# Work on an independent copy so the scaling below does not silently overwrite
# the values in `loans`.
loans_ml_df = loans.copy()

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# NOTE(review): the scaler is fitted on the full dataset; the train/test split
# happens later in the notebook, so test-set statistics influence the scaling.
loans_ml_df[col] = sc.fit_transform(loans_ml_df[col])
loans_ml_df.head()

In [None]:

# Feature names: every column of the modelling frame except the target.
x_feature = list(loans_ml_df.columns)
x_feature.remove('loan_status')
# Predictor matrix and target vector used by the feature-selection steps below.
x_val = loans_ml_df[x_feature]
y_val = loans_ml_df['loan_status']
# Display the number of candidate features.
len(x_feature) 

In [None]:
loans_ml_df.isnull().values.any()

### feature selection

#### wrapper approach

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Base estimator used by RFE, with scikit-learn's default settings.
model = LogisticRegression()

# Recursive feature elimination: remove one feature per iteration (step=1)
# until 20 features remain.
rfe = RFE(model, n_features_to_select=20,step=1) 
rfe = rfe.fit(x_val, y_val)

# support_: boolean mask of the retained features; ranking_: rank per feature
# (selected features have rank 1).
print(rfe.support_)
print(rfe.ranking_) 

In [None]:
col_filter = x_val.columns[rfe.support_]
col_filter 

#### filter approach

In [None]:
colormap = plt.cm.viridis
heat_fig, heat_ax = plt.subplots(figsize=(20,20))
heat_ax.set_title('Pearson Correlation of Features', y=1.05, size=15)

# Pairwise correlations between the RFE-selected features.
corr_selected = loans_ml_df[col_filter].corr()
sns.heatmap(corr_selected, linewidths=0.1, vmax=1.0, square=True, cmap=colormap, linecolor='white', annot=True, ax=heat_ax)

In [None]:
# Hand-picked columns to remove from the RFE selection
# (presumably chosen from the correlation heatmap above — confirm).
# NOTE: Index.drop raises KeyError if any of these labels is not in col_filter,
# which depends on which features RFE selected.
drop_col = ['total_acc', 'verification_status_Verified','application_type_JOINT','int_rate']
col_new = col_filter.drop(drop_col) 

In [None]:
len(col_new)

In [None]:
colormap = plt.cm.viridis
heat_fig, heat_ax = plt.subplots(figsize=(12,12))
heat_ax.set_title('Pearson Correlation of Features', y=1.05, size=15)

# Pairwise correlations between the features kept after the manual drop.
corr_reduced = loans_ml_df[col_new].corr()
sns.heatmap(corr_reduced, linewidths=0.1, vmax=1.0, square=True, cmap=colormap, linecolor='white', annot=True, ax=heat_ax)


#### embedded approach

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a small random forest on the currently selected features to obtain
# impurity-based feature importances.
names = loans_ml_df[col_new].columns
clf = RandomForestClassifier(n_estimators=10, random_state=123)
clf.fit(x_val[col_new], y_val)

# Print one (feature name, importance) pair per line.
for feature in zip(names, clf.feature_importances_):
    print(feature)

In [None]:
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (12,6)

importances = clf.feature_importances_
feat_names = names
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]
fig = plt.figure(figsize=(20,6))
# The importances come from a RandomForestClassifier (the old title said "RandomTree").
plt.title("Feature importances by RandomForestClassifier")
plt.bar(range(len(indices)), importances[indices], color='lightblue',  align="center")
plt.step(range(len(indices)), np.cumsum(importances[indices]), where='mid', label='Cumulative')
plt.xticks(range(len(indices)), feat_names[indices], rotation='vertical',fontsize=14)
plt.xlim([-1, len(indices)])
# Show the legend so the 'Cumulative' label on the step line is actually rendered.
plt.legend()
plt.show()

In [None]:
# Alternative, more aggressive drop list kept for reference:
# drop_col = ['application_type_INDIVIDUAL', 'purpose_educational','purpose_renewable_energy','purpose_wedding','purpose_house','purpose_medical','purpose_small_business']
drop_col = ['application_type_INDIVIDUAL', 'purpose_educational','purpose_renewable_energy']
# col_new is reduced in place, so a plain drop would raise KeyError when the
# cell is executed a second time; errors='ignore' makes the cell idempotent.
col_new = col_new.drop(drop_col, errors='ignore')

In [None]:
print(col_new)

#### Train-Test-Validation Split

In [None]:

# Full feature matrix (all columns except the target) and target vector.
# Use the `columns=` keyword: the positional axis argument (`drop(label, 1)`)
# is rejected by pandas 2.x.
X = loans_ml_df.drop(columns='loan_status')
y = loans_ml_df["loan_status"]

# Class balance of the target. Note that "pos" counts label 0 and "neg" counts
# label 1 here, matching the order in which they are printed.
n_sample = y.shape[0]
n_pos_sample = y[y == 0].shape[0]
n_neg_sample = y[y == 1].shape[0]
print('Total:{}; 0 :{:.2%}; 1 :{:.2%}'.format(n_sample,
                                                   n_pos_sample / n_sample,
                                                   n_neg_sample / n_sample))
print('shape', X.shape[1])

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 split of the full feature set with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Print size, class shares and feature count: training part first, then test part.
for features, target in ((X_train, y_train), (X_test, y_test)):
    n_sample = target.shape[0]
    n_pos_sample = target[target == 0].shape[0]
    n_neg_sample = target[target == 1].shape[0]
    print('Total:{}; 0 :{:.2%}; 1 :{:.2%}'.format(n_sample,
                                                       n_pos_sample / n_sample,
                                                       n_neg_sample / n_sample))
    print('shape', features.shape[1])

In [None]:

# Feature matrix restricted to the selected columns, plus the target.
X_new = loans_ml_df[col_new]
y_new = loans_ml_df["loan_status"]

# Class balance and number of selected features.
n_sample = len(y_new)
n_pos_sample = int((y_new == 0).sum())
n_neg_sample = int((y_new == 1).sum())
summary_line = 'Total:{}; 0 :{:.2%}; 1 :{:.2%}'.format(
    n_sample, n_pos_sample / n_sample, n_neg_sample / n_sample)
print(summary_line)
print('shape', X_new.shape[1])

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 split of the selected-feature matrix with the same seed as the full split.
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(
    X_new, y_new, test_size=0.3, random_state=0)

# Print size, class shares and feature count: training part first, then test part.
for features, target in ((X_train_new, y_train_new), (X_test_new, y_test_new)):
    n_sample = target.shape[0]
    n_pos_sample = target[target == 0].shape[0]
    n_neg_sample = target[target == 1].shape[0]
    print('Total:{}; 0 :{:.2%}; 1 :{:.2%}'.format(n_sample,
                                                       n_pos_sample / n_sample,
                                                       n_neg_sample / n_sample))
    print('shape', features.shape[1])

#### 
### Model Building + Validation and Evaluation of Results

#### Logistic Regression

In [None]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Candidate values for the regularisation parameter C.
param_grid = {'C': [0.01, 0.1, 1, 10, 100, 1000]}

# 10-fold stratified CV with shuffling and a fixed seed.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
model = LogisticRegression(class_weight='balanced')

grid_search = GridSearchCV(model, param_grid, cv=kfold)
grid_search.fit(X_train_new, y_train_new)


In [None]:
# Tabulate the per-candidate cross-validation results of the grid search.
results = pd.DataFrame(grid_search.cv_results_) 
# Row position of the highest mean test score (not referenced again in the visible cells).
best = np.argmax(results.mean_test_score.values)
print(results)

In [None]:
print("Best parameters: {}".format(grid_search.best_params_))
print("Best cross-validation score: {:.5f}".format(grid_search.best_score_))


In [None]:
# Logistic regression with balanced class weights, trained on the selected features.
# NOTE(review): C=0.01 is hard-coded — presumably the value reported by
# grid_search.best_params_; confirm it still matches after re-running the search.
mode_lr_new = LogisticRegression(C=0.01,class_weight='balanced')
mode_lr_new.fit(X_train_new,y_train_new)
# Hard class predictions on the held-out test split.
y_pred_new = mode_lr_new.predict(X_test_new)
print("Test set accuracy score: {:.5f}".format(accuracy_score(y_test_new, y_pred_new)))

In [None]:
print(classification_report(y_test_new, y_pred_new))

In [None]:
# ROC AUC needs continuous scores: hard 0/1 predictions collapse the ROC curve
# to a single operating point. Use the predicted probability of class 1.
y_score_new = mode_lr_new.predict_proba(X_test_new)[:, 1]
roc_auc1 = roc_auc_score(y_test_new, y_score_new)
print("Area under the ROC curve : %f" % roc_auc1)

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only; the test split is left untouched.
sm = SMOTE(random_state=42)
X_train_sm, y_train_sm = sm.fit_resample(X_train_new, y_train_new)

print('apres SMOTE ')
# Class balance of the resampled training target.
n_sample = len(y_train_sm)
n_pos_sample = int((y_train_sm == 0).sum())
n_neg_sample = int((y_train_sm == 1).sum())
balance_line = 'Total: {}; 0 : {:.2%}; 1 : {:.2%}'.format(
    n_sample, n_pos_sample / n_sample, n_neg_sample / n_sample)
print(balance_line)

In [None]:
# Independent copies for the random-forest experiments:
# SMOTE-resampled training data and the original test split.
X_train_rf, y_train_rf = X_train_sm.copy(), y_train_sm.copy()
X_test_rf, y_test_rf = X_test_new.copy(), y_test_new.copy()

In [None]:
# Logistic regression (no class weighting) trained on the SMOTE-resampled data;
# fit() returns the estimator itself.
mode_lr_sm = LogisticRegression(C=0.01).fit(X_train_sm, y_train_sm)
y_pred_sm = mode_lr_sm.predict(X_test_new)
test_accuracy = accuracy_score(y_test_new, y_pred_sm)
print("Test set accuracy score: {:.5f}".format(test_accuracy))

In [None]:
print(classification_report(y_test_new, y_pred_sm))

In [None]:
from sklearn.metrics import roc_auc_score
# ROC AUC needs continuous scores: hard 0/1 predictions collapse the ROC curve
# to a single operating point. Use the predicted probability of class 1.
y_score_sm = mode_lr_sm.predict_proba(X_test_new)[:, 1]
roc_auc2 = roc_auc_score(y_test_new, y_score_sm)
print("Area under the ROC curve : %f" % roc_auc2)

#### Random Forest

In [None]:
from sklearn.metrics import roc_auc_score

# Baseline random forest with default hyper-parameters.
rf = RandomForestClassifier()
rf.fit(X_train_rf, y_train_rf)
# Accuracy on the held-out test split.
score1 = rf.score(X_test_rf, y_test_rf)
print(score1)
# ROC AUC from the predicted probability of class 1. Score against y_test_rf,
# the labels that belong to X_test_rf, rather than y_test from a different
# split cell.
proba = rf.predict_proba(X_test_rf)
score2 = roc_auc_score(y_test_rf, proba[:,1])
print(score2)

In [None]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validated accuracy of the baseline forest on the training data.
# NOTE(review): X_train_rf is the SMOTE-resampled set, so synthetic samples can
# land in validation folds — these scores are likely optimistic; confirm.
score3 = cross_val_score(rf,X_train_rf,y_train_rf,scoring='accuracy',cv = 5)
# Per-fold scores, then their mean.
print(score3)
print(score3.mean())

In [None]:
predictions = rf.predict(X_test_rf)
print(rf.score(X_test_rf,y_test_rf))
print(classification_report(y_test_rf, predictions))

In [None]:
# Step 1 of the sequential search: number of trees (50, 100, ..., 250), scored by ROC AUC.
num_estimator = {'n_estimators': range(50, 300, 50)}
gs1 = GridSearchCV(
    estimator=rf,
    param_grid=num_estimator,
    scoring='roc_auc',
    cv=5,
)
gs1.fit(X_train_rf, y_train_rf)

best_model, best_auc = gs1.best_estimator_, gs1.best_score_
print(best_model)
print(best_auc)

In [None]:
# Step 2: tree depth (3..9) with the number of trees fixed at 250, scored by ROC AUC.
maxdepth = {'max_depth': range(3, 10, 1)}
gs2 = GridSearchCV(
    estimator=RandomForestClassifier(n_estimators=250),
    param_grid=maxdepth,
    scoring='roc_auc',
    cv=3,
)
gs2.fit(X_train_rf, y_train_rf)

best_model, best_auc = gs2.best_estimator_, gs2.best_score_
print(best_model)
print(best_auc)

In [None]:
# Step 3: min_samples_split (2, 4, ..., 12) with depth 9 and 250 trees fixed, scored by ROC AUC.
minsamples = {'min_samples_split': range(2, 14, 2)}
gs3 = GridSearchCV(
    estimator=RandomForestClassifier(max_depth=9, n_estimators=250),
    param_grid=minsamples,
    scoring='roc_auc',
    cv=3,
)
gs3.fit(X_train_rf, y_train_rf)

best_model, best_auc = gs3.best_estimator_, gs3.best_score_
print(best_model)
print(best_auc)

In [None]:
# Random forest with hard-coded hyper-parameters
# (presumably the winners of the grid searches above — confirm after re-running them).
best_rfc = RandomForestClassifier(max_depth=9, min_samples_split=12, n_estimators=250)
best_rfc.fit(X_train_rf,y_train_rf)
# Accuracy on the held-out test split, followed by per-class precision/recall/F1.
print(best_rfc.score(X_test_rf,y_test_rf))
y_pred_rf = best_rfc.predict(X_test_rf)
print(classification_report(y_test_rf, y_pred_rf))

In [None]:
from bayes_opt import BayesianOptimization
def RF_evaluate(n_estimators, min_samples_split, max_features, max_depth):
    """Objective for the Bayesian search: mean 5-fold F1 of a random forest.

    The optimiser passes floats, so the integer-valued hyper-parameters are
    truncated with int() and max_features is capped at 0.999. The forest is
    evaluated on the notebook-level X_train_rf / y_train_rf.
    """
    forest = RandomForestClassifier(
        n_estimators=int(n_estimators),
        min_samples_split=int(min_samples_split),
        max_features=min(max_features, 0.999),
        max_depth=int(max_depth),
        random_state=90,
        n_jobs=-1,
    )
    fold_scores = cross_val_score(forest, X_train_rf, y_train_rf, scoring='f1', cv=5)
    return fold_scores.mean()

# Search bounds (lower, upper) for each hyper-parameter passed to RF_evaluate.
pbounds = {'n_estimators': (50, 250), 'min_samples_split': (2, 25),'max_features': (0.1, 0.999),'max_depth': (5, 12)}

RF_bo = BayesianOptimization(f=RF_evaluate, pbounds=pbounds, verbose=2,random_state=1,)

# 5 initial random evaluations followed by 10 optimisation steps.
# NOTE(review): passing `acq` to maximize() may not be accepted by newer
# bayes_opt releases — confirm against the installed version.
RF_bo.maximize(init_points=5,n_iter=10,acq='ei')
# Best target value found and the parameters that produced it.
print(RF_bo.max)

In [None]:
# Random forest with hard-coded hyper-parameters
# (presumably taken from RF_bo.max above — confirm after re-running the search).
# This rebinds best_rfc and y_pred_rf from the earlier grid-search model.
best_rfc = RandomForestClassifier(max_depth=10, max_features=0.93, min_samples_split=23, n_estimators=211)
best_rfc.fit(X_train_rf,y_train_rf)
# Accuracy on the held-out test split, followed by per-class precision/recall/F1.
print(best_rfc.score(X_test_rf,y_test_rf))
y_pred_rf = best_rfc.predict(X_test_rf)
print(classification_report(y_test_rf, y_pred_rf))