In [236]:
import sklearn
import matplotlib.pyplot as plt
import numpy as np
from sklearn import *
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay, f1_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint
import pickle
import joblib

acc_df link: https://www.kaggle.com/code/wordsforthewise/eda-with-python/notebook

In [237]:
cols = ['grade','loan_amnt','int_rate','home_ownership','annual_inc','purpose', 'dti', 'tot_cur_bal']
acc_df = pd.read_csv("trunc_accepted.csv", usecols=cols)

In [238]:
len(acc_df)
# acc_df = acc_df[]
for i in cols: 
    #acc_df = acc_df[acc_df[i] != np.nan]
    #print(f"unique values of \'{i}\' are {acc_df[i].unique()}")
    print(i)

acc_df.head()

grade
loan_amnt
int_rate
home_ownership
annual_inc
purpose
dti
tot_cur_bal


Unnamed: 0,loan_amnt,int_rate,grade,home_ownership,annual_inc,purpose,dti,tot_cur_bal
0,3600.0,13.99,C,MORTGAGE,55000.0,debt_consolidation,5.91,144904.0
1,24700.0,11.99,C,MORTGAGE,65000.0,small_business,16.06,204396.0
2,20000.0,10.78,B,MORTGAGE,63000.0,home_improvement,10.78,189699.0
3,35000.0,14.85,C,MORTGAGE,110000.0,debt_consolidation,17.06,301500.0
4,10400.0,22.45,F,MORTGAGE,104433.0,major_purchase,25.37,331730.0


### Label-encoding DataFrame

In [239]:
acc_df['grade'] = acc_df['grade'].map({'A': 0, 'B' : 1, 'C' : 2, 'D' : 3, 'E' : 4, 'F' : 5, 'G' : 5})

purpose_map = {'debt_consolidation' : 0, 'small_business' : 1, 'home_improvement' : 2, 'major_purchase' : 3,
 'credit_card' : 4, 'other' : 5, 'house' : 6, 'vacation' : 7, 'car' : 8, 'medical' : 9, 'moving' : 10,
 'renewable_energy' : 11, 'wedding' : 12, 'educational' : 13}
acc_df['purpose'] = acc_df['purpose'].map(purpose_map)

home_ownership_map = {'MORTGAGE' : 0, 'RENT' : 1, 'OWN' : 2, 'ANY' : 3, 'NONE' : 4, 'OTHER' : 5}
acc_df['home_ownership'] = acc_df['home_ownership'].map(home_ownership_map)

print(f"Possible values of \'home_ownership\' are {acc_df['home_ownership'].unique()}")
print(f"Possible values of purpose are {acc_df['purpose'].unique()}")
print(f"Possible values of \'Grade\' are: {acc_df['grade'].unique()}")

Possible values of 'home_ownership' are [ 0.  1.  2.  3. nan  4.  5.]
Possible values of purpose are [ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. nan]
Possible values of 'Grade' are: [ 2.  1.  5.  0.  4.  3. nan]


# IMPUTING NaNs in DataFrame

In [240]:
print(f"Possible values of \'home_ownership\' are {acc_df['home_ownership'].unique()}")
print(f"Possible values of purpose are {acc_df['home_ownership'].unique()}")
print(f"Possible values of \'Grade\' are: {acc_df['grade'].unique()}")
print(acc_df.dtypes)
print(f"{acc_df.columns}")
curr_cols = acc_df.columns
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
acc_df = pd.DataFrame(imp.fit_transform(acc_df), columns = curr_cols)
print(acc_df.dtypes)
print(f"Possible values of \'home_ownership\' are {acc_df['home_ownership'].unique()}")
print(f"Possible values of purpose are {acc_df['purpose'].unique()}")
print(f"Possible values of \'Grade\' are: {acc_df['grade'].unique()}")

Possible values of 'home_ownership' are [ 0.  1.  2.  3. nan  4.  5.]
Possible values of purpose are [ 0.  1.  2.  3. nan  4.  5.]
Possible values of 'Grade' are: [ 2.  1.  5.  0.  4.  3. nan]
loan_amnt         float64
int_rate          float64
grade             float64
home_ownership    float64
annual_inc        float64
purpose           float64
dti               float64
tot_cur_bal       float64
dtype: object
Index(['loan_amnt', 'int_rate', 'grade', 'home_ownership', 'annual_inc',
       'purpose', 'dti', 'tot_cur_bal'],
      dtype='object')
loan_amnt         float64
int_rate          float64
grade             float64
home_ownership    float64
annual_inc        float64
purpose           float64
dti               float64
tot_cur_bal       float64
dtype: object
Possible values of 'home_ownership' are [0. 1. 2. 3. 4. 5.]
Possible values of purpose are [ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13.]
Possible values of 'Grade' are: [2. 1. 5. 0. 4. 3.]


## Pre-processing data for random forests

In [241]:
X = acc_df.drop('grade', axis=1)
y = acc_df['grade']

print(X.columns)
print(y.mean())
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size = 0.2)

Index(['loan_amnt', 'int_rate', 'home_ownership', 'annual_inc', 'purpose',
       'dti', 'tot_cur_bal'],
      dtype='object')
1.6584961921103234


### Fitting and Evaluating the Model

In [242]:
rf = RandomForestClassifier(n_jobs = -1)
rf.fit(X_train, y_train)

## prediction and accuracy!

In [243]:
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
# print(f"Accuracy: {f1_score(y_test, y_pred)}")

Accuracy: 0.9438803382130796


In [244]:
#with open('model_pkl', 'wb') as f:
 #   pickle.dump(rf, f)

In [245]:
joblib.dump(rf, 'rf_jlib')

['rf_jlib']

In [246]:
acc_df['home_ownership'].unique()
home_ownership_map = {'MORTGAGE' : 0, 'RENT' : 1, 'OWN' : 2, 'ANY' : 3, 'NONE' : 4, 'OTHER' : 5}

In [247]:
print(acc_df['purpose'].unique())
purpose_map = {'debt_consolidation' : 0, 'small_business' : 1, 'home_improvement' : 2, 'major_purchase' : 3,
 'credit_card' : 4, 'other' : 5, 'house' : 6, 'vacation' : 7, 'car' : 8, 'medical' : 9, 'moving' : 10,
 'renewable_energy' : 11, 'wedding' : 12, 'educational' : 13}

[ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13.]


In [248]:
print(X.columns)

Index(['loan_amnt', 'int_rate', 'home_ownership', 'annual_inc', 'purpose',
       'dti', 'tot_cur_bal'],
      dtype='object')
