In [1]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC # SVM for classification
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.metrics import f1_score, precision_score,recall_score, confusion_matrix # Evaluation metrics
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from xgboost import XGBClassifier # XGBoost classifier
import warnings

In [21]:
df = pd.read_csv('data/loan_data.csv') # Read the data 

In [3]:
# Drop installment column as it is highly correlated with loan amount 
df.drop('installment',axis=1,inplace=True) 

In [20]:
df.head() # Check the first 5 rows

Unnamed: 0,loan_amnt,term,int_rate,grade,sub_grade,emp_length,home_ownership,annual_inc,verification_status,loan_status,...,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,application_type,mort_acc,pub_rec_bankruptcies,zip_code
0,10000.0,36,11.44,6,27,10,RENT,117000.0,Not Verified,0,...,16.0,0,36369.0,41.8,25.0,0,INDIVIDUAL,0,0,22690
1,8000.0,36,11.99,6,26,4,MORTGAGE,65000.0,Not Verified,0,...,17.0,0,20131.0,53.3,27.0,1,INDIVIDUAL,1,0,5113
2,15600.0,36,10.49,6,28,0,RENT,43057.0,Source Verified,0,...,13.0,0,11987.0,92.2,26.0,1,INDIVIDUAL,0,0,5113
3,7200.0,36,6.49,7,34,6,RENT,54000.0,Not Verified,0,...,6.0,0,5472.0,21.5,13.0,1,INDIVIDUAL,0,0,813
4,24375.0,60,17.27,5,21,9,MORTGAGE,55000.0,Verified,1,...,13.0,0,24584.0,69.8,43.0,1,INDIVIDUAL,1,0,11650


In [5]:
df.dropna(inplace=True) # Drop the rows with missing values 

In [22]:
df.info() # Check the data types of the columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 396030 entries, 0 to 396029
Data columns (total 27 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   loan_amnt             396030 non-null  float64
 1   term                  396030 non-null  object 
 2   int_rate              396030 non-null  float64
 3   installment           396030 non-null  float64
 4   grade                 396030 non-null  object 
 5   sub_grade             396030 non-null  object 
 6   emp_title             373103 non-null  object 
 7   emp_length            377729 non-null  object 
 8   home_ownership        396030 non-null  object 
 9   annual_inc            396030 non-null  float64
 10  verification_status   396030 non-null  object 
 11  issue_d               396030 non-null  object 
 12  loan_status           396030 non-null  object 
 13  purpose               396030 non-null  object 
 14  title                 394274 non-null  object 
 15  

In [7]:
# Mapping of target variable -


df.loc[(df.home_ownership == 'ANY') | (df.home_ownership == 'NONE'), 'home_ownership'] = 'OTHER'
print(df.home_ownership.value_counts())

df['issue_d'] = pd.to_datetime(df['issue_d'])
df['earliest_cr_line'] = pd.to_datetime(df['earliest_cr_line'])

def pub_rec(number):
    if number == 0.0:
        return 0
    else:
        return 1

def mort_acc(number):
    if number == 0.0:
        return 0
    elif number >= 1.0:
        return 1
    else:
        return number

def pub_rec_bankruptcies(number):
    if number == 0.0:
        return 0
    elif number >= 1.0:
        return 1
    else:
        return number


df['pub_rec'] = df.pub_rec.apply(pub_rec)
df['mort_acc'] = df.mort_acc.apply(mort_acc)
df['pub_rec_bankruptcies'] = df.pub_rec_bankruptcies.apply(pub_rec_bankruptcies) 

home_ownership
MORTGAGE    170831
RENT        133932
OWN          31045
OTHER           59
Name: count, dtype: int64


  df['issue_d'] = pd.to_datetime(df['issue_d'])
  df['earliest_cr_line'] = pd.to_datetime(df['earliest_cr_line'])


In [8]:
print(df.loan_status.value_counts())


print(df.info())

loan_status
Fully Paid     269555
Charged Off     66312
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Index: 335867 entries, 0 to 396028
Data columns (total 26 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   loan_amnt             335867 non-null  float64       
 1   term                  335867 non-null  object        
 2   int_rate              335867 non-null  float64       
 3   grade                 335867 non-null  object        
 4   sub_grade             335867 non-null  object        
 5   emp_title             335867 non-null  object        
 6   emp_length            335867 non-null  object        
 7   home_ownership        335867 non-null  object        
 8   annual_inc            335867 non-null  float64       
 9   verification_status   335867 non-null  object        
 10  issue_d               335867 non-null  datetime64[ns]
 11  loan_status           335867 non-null  o

In [9]:
term_values = {' 36 months': 36, ' 60 months': 60}
df['term'] = df.term.map(term_values)

list_status = {'w': 0, 'f': 1}
df['initial_list_status'] = df.initial_list_status.map(list_status)

# Let's fetch ZIP from address and then drop the remaining details -
df['zip_code'] = df.address.apply(lambda x: x[-5:])

# Dropping some variables which IMO we can let go for now -
df.drop(columns=['issue_d', 'emp_title', 'title',
                   'address', 'earliest_cr_line'],
                   axis=1, inplace=True) 

emp_len_to_int = {'10+ years': 10, '4 years': 4, '< 1 year': 0, '6 years': 6, '9 years': 9, '2 years': 2, 
                  '3 years': 3, '8 years': 8, '7 years': 7, '5 years': 5, '1 year': 1, 'nan': np.nan} 


grade_to_int = {'A':7, 'B':6, 'C':5, 'D':4, 'E':3, 'F':2, 'G':1}

sub_grade_to_int = {'A1':35, 'A2':34, 'A3':33, 'A4':32, 'A5':31, 
                    'B1':30, 'B2':29, 'B3':28, 'B4':27, 'B5':26, 
                    'C1':25, 'C2':24, 'C3':23, 'C4':22, 'C5':21, 
                    'D1':20, 'D2':19, 'D3':18, 'D4':17, 'D5':16, 
                    'E1':15, 'E2':14, 'E3':13, 'E4':12, 'E5':11, 
                    'F1':10, 'F2':9, 'F3':8, 'F4':7, 'F5':6, 
                    'G1':5, 'G2':4, 'G3':3, 'G4':2, 'G5':1}

df['emp_length'] = df['emp_length'].apply(lambda x: emp_len_to_int[str(x).split('-')[0]])  

df['grade'] = df['grade'].apply(lambda x: grade_to_int[str(x).split('-')[0]])
df['sub_grade'] = df['sub_grade'].apply(lambda x: sub_grade_to_int[str(x).split('-')[0]])

In [10]:
# loan_status_values = {'Fully Paid': 0, 'Charged Off': 1} 
# df['loan_status'] = df.loan_status.map(loan_status_values)  
# df['loan_status'] = df['loan_status'].map({'Fully Paid': 0, 'Charged Off': 1})
df['loan_status'] = df['loan_status'].apply(lambda x: 0 if x == 'Fully Paid' else 1)
df['loan_status'] = df['loan_status'].astype('int64') 

In [11]:
df.shape

(335867, 22)

In [12]:
numerical_data = df.select_dtypes(include='number')
num_cols = numerical_data.columns
len(num_cols)
for col in num_cols:
    mean = df[col].mean()
    std = df[col].std()

    upper_limit = mean+3*std
    lower_limit = mean-3*std

    df = df[(df[col]<upper_limit) & (df[col]>lower_limit)]

print(f"After removing outliers: {df.shape}")

After removing outliers: (319771, 22)


In [13]:
num_features = df.select_dtypes(exclude="object").columns
cat_features = df.select_dtypes(include="object").columns 

In [14]:
df.shape

(319771, 22)

In [15]:
num_features

Index(['loan_amnt', 'term', 'int_rate', 'grade', 'sub_grade', 'emp_length',
       'annual_inc', 'loan_status', 'dti', 'open_acc', 'pub_rec', 'revol_bal',
       'revol_util', 'total_acc', 'initial_list_status', 'mort_acc',
       'pub_rec_bankruptcies'],
      dtype='object')

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 319771 entries, 0 to 396028
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   loan_amnt             319771 non-null  float64
 1   term                  319771 non-null  int64  
 2   int_rate              319771 non-null  float64
 3   grade                 319771 non-null  int64  
 4   sub_grade             319771 non-null  int64  
 5   emp_length            319771 non-null  int64  
 6   home_ownership        319771 non-null  object 
 7   annual_inc            319771 non-null  float64
 8   verification_status   319771 non-null  object 
 9   loan_status           319771 non-null  int64  
 10  purpose               319771 non-null  object 
 11  dti                   319771 non-null  float64
 12  open_acc              319771 non-null  float64
 13  pub_rec               319771 non-null  int64  
 14  revol_bal             319771 non-null  float64
 15  revol

In [140]:
def remove_outlier(df, col):
    '''
    This function removes the outliers from the column passed in the argument
    '''
    q1 = df[col].quantile(0.25) # 25th quantile
    q3 = df[col].quantile(0.75) # 75th quantile
    iqr = q3-q1 # Interquartile range
    fence_low  = q1-1.5*iqr # Inner fence
    fence_high = q3+1.5*iqr # Outer fence
    df_out = df.loc[(df[col] > fence_low) & (df[col] < fence_high)] # Data within inner and outer fence
    return df_out # Return the dataframe with outliers removed 

# remove outliers from all the columns using the above function and for loop
for col in num_features:
    df = remove_outlier(df, col) 

In [17]:
df.info() # Check the data types of the columns 

<class 'pandas.core.frame.DataFrame'>
Index: 319771 entries, 0 to 396028
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   loan_amnt             319771 non-null  float64
 1   term                  319771 non-null  int64  
 2   int_rate              319771 non-null  float64
 3   grade                 319771 non-null  int64  
 4   sub_grade             319771 non-null  int64  
 5   emp_length            319771 non-null  int64  
 6   home_ownership        319771 non-null  object 
 7   annual_inc            319771 non-null  float64
 8   verification_status   319771 non-null  object 
 9   loan_status           319771 non-null  int64  
 10  purpose               319771 non-null  object 
 11  dti                   319771 non-null  float64
 12  open_acc              319771 non-null  float64
 13  pub_rec               319771 non-null  int64  
 14  revol_bal             319771 non-null  float64
 15  revol

In [19]:
print(df.home_ownership.value_counts())
print("*"*50)
print(df.verification_status.value_counts())
print("*"*50)
print(df.purpose.value_counts()) 
print("*"*50)
print(df.application_type.value_counts())
print("*"*50)
print(df.zip_code.value_counts()) 
print("finished")

home_ownership
MORTGAGE    160550
RENT        129950
OWN          29213
OTHER           58
Name: count, dtype: int64
**************************************************
verification_status
Source Verified    109788
Not Verified       105112
Verified           104871
Name: count, dtype: int64
**************************************************
purpose
debt_consolidation    195023
credit_card            69386
home_improvement       18248
other                  15309
major_purchase          6065
small_business          3087
medical                 3071
car                     2976
moving                  2063
vacation                1890
house                   1537
wedding                  902
renewable_energy         214
Name: count, dtype: int64
**************************************************
application_type
INDIVIDUAL    319427
JOINT            265
DIRECT_PAY        79
Name: count, dtype: int64
**************************************************
zip_code
70466    45993
30723    45720

In [154]:
X = df.drop('loan_status', axis=1)
y = df['loan_status']

In [155]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape 

((255816, 21), (63955, 21))

In [156]:
def evaluate_model(true, predicted) :
    '''Write a code to evaluate the model using the following metrics: F1 score, Accuracy, Precision, Recall, and Confusion Matrix.'''
    from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix
    print('F1 score: ', f1_score(true, predicted))
    print('Accuracy: ', accuracy_score(true, predicted))
    print('Precision: ', precision_score(true, predicted))
    print('Recall: ', recall_score(true, predicted))
    print('Confusion Matrix: ', confusion_matrix(true, predicted))
    return None 

In [157]:
# Create Column Transformer with 3 types of transformers
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
         ("StandardScaler", numeric_transformer, num_features),        
    ]
)

In [158]:
X_train = preprocessor.fit_transform(X_train) # Fit and transform the training data
X_test = preprocessor.transform(X_test) # Transform the test data 

In [82]:
X_train.shape, X_test.shape 

((268693, 21), (67174, 21))

In [159]:
def evaluate_model(true, predicted) :
    '''Write a code to evaluate the model using the following metrics: F1 score, Accuracy, Precision, Recall, and Confusion Matrix.'''
    from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix
    f1_score_1 =  f1_score(true, predicted)
    accuracy_score_1 = accuracy_score(true, predicted)
    precision_score_1 = precision_score(true, predicted)
    recall_score_1 = recall_score(true, predicted)
    # confusion_matrix_1 = confusion_matrix(true, predicted)
    return f1_score_1, accuracy_score_1, precision_score_1, recall_score_1  

In [None]:
X_train, X_test, y_train, y_test

In [None]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest Regressor": RandomForestClassifier(),
    "XGBRegressor": XGBClassifier(), 
    "AdaBoost Regressor": AdaBoostClassifier()
}

In [160]:
models = {
    "Logistic Regression": LogisticRegression(class_weight='balanced'),
    "Decision Tree Classifier": DecisionTreeClassifier(class_weight='balanced'),
    "Random Forest Classifier": RandomForestClassifier(class_weight='balanced'),
    "XGBRegressor": XGBClassifier(scale_pos_weight=sum(y_train==0)/sum(y_train==1)), 
    "AdaBoost Regressor": AdaBoostClassifier()
}
model_list = []
f1_score_list = []
accuracy_score_list =[]
precision_score_list = []
recall_score_list = [] 
 

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model 

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset
    f1_score_train , accuracy_score_train, precision_score_train, recall_score_train  = evaluate_model(y_train, y_train_pred)

    f1_score_test , accuracy_score_test, precision_score_test, recall_score_test = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- F_1 Score: {:.4f}".format(f1_score_train))
    print("- Accuracy Score: {:.4f}".format(accuracy_score_train))
    print("- Precision Score: {:.4f}".format(precision_score_train))
    print("- Recall Score: {:.4f}".format(recall_score_train))
    # print("- Confusion Matrix: {:.4f}".format(confusion_matrix_train))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- F_1 Score: {:.4f}".format(f1_score_test))
    print("- Accuracy Score: {:.4f}".format(accuracy_score_test))
    print("- Precision Score: {:.4f}".format(precision_score_test))
    print("- Recall Score: {:.4f}".format(recall_score_test))
    # print("- Confusion Matrix: {:.4f}".format(confusion_matrix_test))
    f1_score_list.append(f1_score_test)
    accuracy_score_list.append(accuracy_score_test)
    precision_score_list.append(precision_score_test)
    recall_score_list.append(recall_score_list)

    
    print('='*35)
    print('\n')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression
Model performance for Training set
- F_1 Score: 0.6225
- Accuracy Score: 0.8115
- Precision Score: 0.5126
- Recall Score: 0.7925
----------------------------------
Model performance for Test set
- F_1 Score: 0.6245
- Accuracy Score: 0.8122
- Precision Score: 0.5129
- Recall Score: 0.7983


Decision Tree Classifier
Model performance for Training set
- F_1 Score: 1.0000
- Accuracy Score: 1.0000
- Precision Score: 1.0000
- Recall Score: 1.0000
----------------------------------
Model performance for Test set
- F_1 Score: 0.5834
- Accuracy Score: 0.8357
- Precision Score: 0.5787
- Recall Score: 0.5883


Random Forest Classifier
Model performance for Training set
- F_1 Score: 1.0000
- Accuracy Score: 1.0000
- Precision Score: 1.0000
- Recall Score: 0.9999
----------------------------------
Model performance for Test set
- F_1 Score: 0.6242
- Accuracy Score: 0.8901
- Precision Score: 0.9433
- Recall Score: 0.4665


XGBRegressor
Model performance for Training set
- F_1 Sco