In [16]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, make_scorer, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE

In [17]:
os.chdir(r'C:\Users\ganga\OneDrive\Desktop\Hero vired case studies\Prediction of Loan Interest Rate')
print(os.listdir())

['cleaned_data.xlsx', 'Dataset (5).csv', 'encoded_data.xlsx', 'final_project_18.pptx', 'High_risk_data.xlsx', 'Loan Interest Category Prediction using Decision Tree.docx', 'Modified_data.xlsx', 'safe_customers_data.xlsx']


In [18]:
data = pd.read_csv('Dataset (5).csv')
data.head()

Unnamed: 0,Loan_ID,Loan_Amount_Requested,Length_Employed,Home_Owner,Annual_Income,Income_Verified,Purpose_Of_Loan,Debt_To_Income,Inquiries_Last_6Mo,Months_Since_Deliquency,Number_Open_Accounts,Total_Accounts,Gender,Interest_Rate
0,10000001,7000,< 1 year,Rent,68000.0,not verified,car,18.37,0,,9,14,Female,1
1,10000002,30000,4 years,Mortgage,,VERIFIED - income,debt_consolidation,14.93,0,17.0,12,24,Female,3
2,10000003,24725,7 years,Mortgage,75566.4,VERIFIED - income source,debt_consolidation,15.88,0,,12,16,Male,3
3,10000004,16000,< 1 year,,56160.0,VERIFIED - income source,debt_consolidation,14.34,3,,16,22,Male,3
4,10000005,17000,8 years,Own,96000.0,VERIFIED - income source,debt_consolidation,22.17,1,,19,30,Female,1


In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 164309 entries, 0 to 164308
Data columns (total 14 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Loan_ID                  164309 non-null  int64  
 1   Loan_Amount_Requested    164309 non-null  object 
 2   Length_Employed          156938 non-null  object 
 3   Home_Owner               138960 non-null  object 
 4   Annual_Income            139207 non-null  float64
 5   Income_Verified          164309 non-null  object 
 6   Purpose_Of_Loan          164309 non-null  object 
 7   Debt_To_Income           164309 non-null  float64
 8   Inquiries_Last_6Mo       164309 non-null  int64  
 9   Months_Since_Deliquency  75930 non-null   float64
 10  Number_Open_Accounts     164309 non-null  int64  
 11  Total_Accounts           164309 non-null  int64  
 12  Gender                   164309 non-null  object 
 13  Interest_Rate            164309 non-null  int64  
dtypes: f

In [20]:
data['Loan_Amount_Requested'] = data['Loan_Amount_Requested'].str.replace(r'\D', '', regex=True)
data['Loan_Amount_Requested'] = pd.to_numeric(data['Loan_Amount_Requested'], downcast='integer')

In [21]:
data.isnull().sum()

Loan_ID                        0
Loan_Amount_Requested          0
Length_Employed             7371
Home_Owner                 25349
Annual_Income              25102
Income_Verified                0
Purpose_Of_Loan                0
Debt_To_Income                 0
Inquiries_Last_6Mo             0
Months_Since_Deliquency    88379
Number_Open_Accounts           0
Total_Accounts                 0
Gender                         0
Interest_Rate                  0
dtype: int64

In [22]:
for i in data.select_dtypes(include='number').columns:
    data[i] = data[i].fillna(data[i].mean())

In [23]:
for i in data.select_dtypes(include='O').columns:
    data[i] = data[i].fillna(data[i].mode()[0])

In [24]:
data.isnull().sum()

Loan_ID                    0
Loan_Amount_Requested      0
Length_Employed            0
Home_Owner                 0
Annual_Income              0
Income_Verified            0
Purpose_Of_Loan            0
Debt_To_Income             0
Inquiries_Last_6Mo         0
Months_Since_Deliquency    0
Number_Open_Accounts       0
Total_Accounts             0
Gender                     0
Interest_Rate              0
dtype: int64

In [25]:
data['Home_Owner'].unique()

array(['Rent', 'Mortgage', 'Own', 'Other', 'None'], dtype=object)

In [26]:
data['Home_Owner'] = np.where(data['Home_Owner']=='Other', 'None', data['Home_Owner'])

In [27]:
data.select_dtypes(include='number').columns

Index(['Loan_ID', 'Loan_Amount_Requested', 'Annual_Income', 'Debt_To_Income',
       'Inquiries_Last_6Mo', 'Months_Since_Deliquency', 'Number_Open_Accounts',
       'Total_Accounts', 'Interest_Rate'],
      dtype='object')

In [28]:
data.select_dtypes(include='O').columns

Index(['Length_Employed', 'Home_Owner', 'Income_Verified', 'Purpose_Of_Loan',
       'Gender'],
      dtype='object')

In [29]:
data.Gender.dtype

dtype('O')

In [30]:
data.head()

Unnamed: 0,Loan_ID,Loan_Amount_Requested,Length_Employed,Home_Owner,Annual_Income,Income_Verified,Purpose_Of_Loan,Debt_To_Income,Inquiries_Last_6Mo,Months_Since_Deliquency,Number_Open_Accounts,Total_Accounts,Gender,Interest_Rate
0,10000001,7000,< 1 year,Rent,68000.0,not verified,car,18.37,0,34.229356,9,14,Female,1
1,10000002,30000,4 years,Mortgage,73331.159434,VERIFIED - income,debt_consolidation,14.93,0,17.0,12,24,Female,3
2,10000003,24725,7 years,Mortgage,75566.4,VERIFIED - income source,debt_consolidation,15.88,0,34.229356,12,16,Male,3
3,10000004,16000,< 1 year,Mortgage,56160.0,VERIFIED - income source,debt_consolidation,14.34,3,34.229356,16,22,Male,3
4,10000005,17000,8 years,Own,96000.0,VERIFIED - income source,debt_consolidation,22.17,1,34.229356,19,30,Female,1


In [15]:
data.drop(labels=['Length_Employed', 'Home_Owner', 'Income_Verified', 'Purpose_Of_Loan','Gender'], axis=1, inplace=True)

In [16]:
final_data = data.copy()

In [17]:
final_data = final_data.loc[:20000, :]

In [18]:
final_data.shape

(20001, 9)

In [19]:
final_data.Interest_Rate.value_counts()

2    8501
3    7369
1    4131
Name: Interest_Rate, dtype: int64

In [20]:
X = final_data.drop('Interest_Rate', axis=1)
y = final_data['Interest_Rate']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
print(X_train.shape, X_test.shape)

(16000, 8) (4001, 8)


#### Decision TreeC lassifier

In [23]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

DecisionTreeClassifier()

In [24]:
def compare_model_train_test(model):
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    train_clf_report = classification_report(y_train, y_train_pred)
    test_clf_report = classification_report(y_test, y_test_pred)
    print('Training accuracy is :', train_accuracy, '\n', train_clf_report, '\n',
         'Testing accuracy is :', round(test_accuracy,2), '\n', test_clf_report, '\n',)

In [25]:
compare_model_train_test(clf)

Training accuracy is : 1.0 
               precision    recall  f1-score   support

           1       1.00      1.00      1.00      3296
           2       1.00      1.00      1.00      6818
           3       1.00      1.00      1.00      5886

    accuracy                           1.00     16000
   macro avg       1.00      1.00      1.00     16000
weighted avg       1.00      1.00      1.00     16000
 
 Testing accuracy is : 0.41 
               precision    recall  f1-score   support

           1       0.28      0.30      0.29       835
           2       0.45      0.45      0.45      1683
           3       0.45      0.44      0.44      1483

    accuracy                           0.41      4001
   macro avg       0.39      0.40      0.39      4001
weighted avg       0.42      0.41      0.41      4001
 



#### Logistic Regression

In [26]:
from sklearn.linear_model import LogisticRegression
reg = LogisticRegression()
reg.fit(X_train, y_train)

LogisticRegression()

In [27]:
compare_model_train_test(reg)

Training accuracy is : 0.4625 
               precision    recall  f1-score   support

           1       0.29      0.01      0.02      3296
           2       0.45      0.76      0.57      6818
           3       0.49      0.37      0.42      5886

    accuracy                           0.46     16000
   macro avg       0.41      0.38      0.34     16000
weighted avg       0.43      0.46      0.40     16000
 
 Testing accuracy is : 0.46 
               precision    recall  f1-score   support

           1       0.22      0.01      0.01       835
           2       0.45      0.76      0.56      1683
           3       0.50      0.37      0.43      1483

    accuracy                           0.46      4001
   macro avg       0.39      0.38      0.33      4001
weighted avg       0.42      0.46      0.40      4001
 



#### Random Forest Classifier

In [28]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

RandomForestClassifier()

In [29]:
compare_model_train_test(clf)

Training accuracy is : 1.0 
               precision    recall  f1-score   support

           1       1.00      1.00      1.00      3296
           2       1.00      1.00      1.00      6818
           3       1.00      1.00      1.00      5886

    accuracy                           1.00     16000
   macro avg       1.00      1.00      1.00     16000
weighted avg       1.00      1.00      1.00     16000
 
 Testing accuracy is : 0.48 
               precision    recall  f1-score   support

           1       0.40      0.20      0.26       835
           2       0.46      0.59      0.51      1683
           3       0.53      0.51      0.52      1483

    accuracy                           0.48      4001
   macro avg       0.46      0.43      0.43      4001
weighted avg       0.47      0.48      0.46      4001
 



In [30]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

KNeighborsClassifier()

In [31]:
compare_model_train_test(knn)

Training accuracy is : 0.5993125 
               precision    recall  f1-score   support

           1       0.49      0.52      0.50      3296
           2       0.60      0.71      0.65      6818
           3       0.69      0.51      0.59      5886

    accuracy                           0.60     16000
   macro avg       0.59      0.58      0.58     16000
weighted avg       0.61      0.60      0.60     16000
 
 Testing accuracy is : 0.39 
               precision    recall  f1-score   support

           1       0.23      0.25      0.24       835
           2       0.44      0.51      0.47      1683
           3       0.44      0.32      0.37      1483

    accuracy                           0.39      4001
   macro avg       0.37      0.36      0.36      4001
weighted avg       0.39      0.39      0.39      4001
 



In [34]:
num_data = final_data.copy()

In [35]:
num_data.head()

Unnamed: 0,Loan_ID,Loan_Amount_Requested,Annual_Income,Debt_To_Income,Inquiries_Last_6Mo,Months_Since_Deliquency,Number_Open_Accounts,Total_Accounts,Interest_Rate
0,10000001,7000,68000.0,18.37,0,34.229356,9,14,1
1,10000002,30000,73331.159434,14.93,0,17.0,12,24,3
2,10000003,24725,75566.4,15.88,0,34.229356,12,16,3
3,10000004,16000,56160.0,14.34,3,34.229356,16,22,3
4,10000005,17000,96000.0,22.17,1,34.229356,19,30,1


In [36]:
final_data.head()

Unnamed: 0,Loan_ID,Loan_Amount_Requested,Annual_Income,Debt_To_Income,Inquiries_Last_6Mo,Months_Since_Deliquency,Number_Open_Accounts,Total_Accounts,Interest_Rate
0,10000001,7000,68000.0,18.37,0,34.229356,9,14,1
1,10000002,30000,73331.159434,14.93,0,17.0,12,24,3
2,10000003,24725,75566.4,15.88,0,34.229356,12,16,3
3,10000004,16000,56160.0,14.34,3,34.229356,16,22,3
4,10000005,17000,96000.0,22.17,1,34.229356,19,30,1


In [44]:
sample_data = final_data.loc[:5000, ['Loan_Amount_Requested', 'Debt_To_Income','Months_Since_Deliquency', 'Interest_Rate']]

In [45]:
sample_data.head()

Unnamed: 0,Loan_Amount_Requested,Debt_To_Income,Months_Since_Deliquency,Interest_Rate
0,7000,18.37,34.229356,1
1,30000,14.93,17.0,3
2,24725,15.88,34.229356,3
3,16000,14.34,34.229356,3
4,17000,22.17,34.229356,1


In [46]:
sample_data['Interest_Rate'].value_counts()

2    2132
3    1821
1    1048
Name: Interest_Rate, dtype: int64

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [55]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

DecisionTreeClassifier()

In [56]:
compare_model_train_test(clf)

Training accuracy is : 1.0 
               precision    recall  f1-score   support

           1       1.00      1.00      1.00      3296
           2       1.00      1.00      1.00      6818
           3       1.00      1.00      1.00      5886

    accuracy                           1.00     16000
   macro avg       1.00      1.00      1.00     16000
weighted avg       1.00      1.00      1.00     16000
 
 Testing accuracy is : 0.41 
               precision    recall  f1-score   support

           1       0.29      0.31      0.30       835
           2       0.45      0.45      0.45      1683
           3       0.45      0.44      0.45      1483

    accuracy                           0.41      4001
   macro avg       0.40      0.40      0.40      4001
weighted avg       0.42      0.41      0.42      4001
 



In [79]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler()
x_resample, y_resmple = ros.fit_resample(X_train, y_train)

In [80]:
from collections import Counter
Counter(y_resmple)

Counter({1: 6825, 2: 6825, 3: 6825})

In [73]:
X_train, X_test, y_train, y_test = train_test_split(x_resample, y_resmple, test_size=0.2, random_state=42)

In [68]:
x_resample.shape

(25503, 8)

In [81]:
clf.fit(x_resample, y_resmple)
pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, pred)
accuracy

0.5932170162713194