In [16]:
import string
import time

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [17]:
# Read the training data; the Loan_ID column is used as the row index.
df = pd.read_csv(
    "train.csv",
    index_col="Loan_ID",
)

In [18]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0_level_0,Loan_Amount_Requested,Length_Employed,Home_Owner,Annual_Income,Income_Verified,Purpose_Of_Loan,Debt_To_Income,Inquiries_Last_6Mo,Months_Since_Deliquency,Number_Open_Accounts,Total_Accounts,Gender,Interest_Rate
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
10000001,7000,< 1 year,Rent,68000.0,not verified,car,18.37,0,,9,14,Female,1
10000002,30000,4 years,Mortgage,,VERIFIED - income,debt_consolidation,14.93,0,17.0,12,24,Female,3
10000003,24725,7 years,Mortgage,75566.4,VERIFIED - income source,debt_consolidation,15.88,0,,12,16,Male,3
10000004,16000,< 1 year,,56160.0,VERIFIED - income source,debt_consolidation,14.34,3,,16,22,Male,3
10000005,17000,8 years,Own,96000.0,VERIFIED - income source,debt_consolidation,22.17,1,,19,30,Female,1


In [19]:
# Count the missing values in each column.
df.isna().sum(axis=0)

Loan_Amount_Requested          0
Length_Employed             7371
Home_Owner                 25349
Annual_Income              25102
Income_Verified                0
Purpose_Of_Loan                0
Debt_To_Income                 0
Inquiries_Last_6Mo             0
Months_Since_Deliquency    88379
Number_Open_Accounts           0
Total_Accounts                 0
Gender                         0
Interest_Rate                  0
dtype: int64

#### Handling missing data

In [20]:
def cleaning_data(df):
    """Parse the text-encoded numeric columns and fill in missing values.

    Steps, in order:

    1. ``Loan_Amount_Requested``: remove ``","`` separators and cast to float.
    2. ``Length_Employed``: keep the first run of digits and cast to float
       (``"10+ years"`` -> 10.0, ``"< 1 year"`` -> 1.0).
    3. Drop the rows whose ``Length_Employed`` is still missing.
    4. ``Annual_Income``: fill gaps with the mean income of the rows that share
       the same ``Length_Employed``; if a whole group has no income at all,
       fall back to the overall mean.
    5. ``Home_Owner``: fill gaps with the literal category ``"NA"``.
    6. ``Months_Since_Deliquency``: fill gaps with the sentinel ``-1``.

    The input frame is not modified; a cleaned copy is returned. Columns that
    are already numeric are accepted as-is, so calling the function again on
    its own output is safe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the five columns named above.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy of ``df`` without the rows lacking ``Length_Employed``.
    """
    cleaned = df.copy()

    amount = cleaned["Loan_Amount_Requested"]
    if not pd.api.types.is_numeric_dtype(amount):
        # Only text columns carry the "," separator; numeric input has no .str accessor.
        amount = amount.str.replace(",", "", regex=False)
    cleaned["Loan_Amount_Requested"] = amount.astype(float)

    tenure = cleaned["Length_Employed"]
    if not pd.api.types.is_numeric_dtype(tenure):
        # The first digit run is the number of years; the surrounding "<", "+"
        # and "year(s)" text is ignored, so "< 1 year" and "1 year" both give 1.
        tenure = tenure.str.extract(r"(\d+)", expand=False)
    cleaned["Length_Employed"] = tenure.astype(float)

    # Filter with a boolean mask rather than dropping by index label, so that
    # duplicated index labels cannot take valid rows out with the missing ones.
    cleaned = cleaned.loc[cleaned["Length_Employed"].notna()].copy()

    income = cleaned["Annual_Income"]
    group_mean = cleaned.groupby("Length_Employed")["Annual_Income"].transform("mean")
    # A group made only of missing incomes has a NaN mean; use the overall mean there.
    cleaned["Annual_Income"] = income.fillna(group_mean).fillna(income.mean())

    cleaned["Home_Owner"] = cleaned["Home_Owner"].fillna("NA")
    cleaned["Months_Since_Deliquency"] = cleaned["Months_Since_Deliquency"].fillna(-1)
    return cleaned

In [21]:
# Rebind df to the result of the cleaning step.
df = cleaning_data(df=df)

In [22]:
def normalization(df):
    """Min-max scale the numeric feature columns of ``df``.

    Each listed column is scaled on its own with ``MinMaxScaler`` (refit per
    column) and written back into ``df``; the same frame is returned.
    """
    numeric_columns = (
        "Loan_Amount_Requested",
        "Length_Employed",
        "Annual_Income",
        "Debt_To_Income",
        "Inquiries_Last_6Mo",
        "Months_Since_Deliquency",
        "Number_Open_Accounts",
        "Total_Accounts",
    )
    scaler = MinMaxScaler()
    for column in numeric_columns:
        # Double brackets: the scaler expects a 2-D, single-column input.
        df[column] = scaler.fit_transform(df[[column]])
    return df

In [23]:
# Scale the numeric columns; normalization returns the frame it was given.
df = normalization(df=df)

In [24]:
def label_encode(df):
    """Replace the categorical columns of ``df`` with integer codes.

    A single ``LabelEncoder`` is refit on each listed column in turn and the
    encoded values are written back into ``df``; the same frame is returned.
    """
    from sklearn import preprocessing

    categorical_columns = ("Income_Verified", "Home_Owner", "Purpose_Of_Loan", "Gender")
    encoder = preprocessing.LabelEncoder()
    for column in categorical_columns:
        df[column] = encoder.fit_transform(df[column])
    return df

In [25]:
# Encode the categorical columns; label_encode returns the frame it was given.
df = label_encode(df=df)

In [26]:
# Candidate hyperparameter values for the random-forest classifier; the
# randomized search samples combinations from these lists.

# Number of trees in the forest.
n_estimators = list(range(200, 4000, 10))

# Number of features to consider at every split.
# - None must be the Python object: the string "None" is not a valid value.
# - 'auto' is left out: for classifiers it was an alias of 'sqrt' and newer
#   scikit-learn releases no longer accept it.
max_features = ["sqrt", "log2", None]

# Maximum number of levels in each tree; None places no limit on the depth.
max_depth = list(range(100, 500, 10)) + [None]

# Create the random grid
random_grid = {
    "n_estimators": n_estimators,
    "max_features": max_features,
    "max_depth": max_depth,
}

In [27]:
# Feature matrix: every column except the Interest_Rate target.
X_features = df.drop(columns=["Interest_Rate"])

In [28]:
# Display the feature matrix (df without the Interest_Rate column).
X_features

Unnamed: 0_level_0,Loan_Amount_Requested,Length_Employed,Home_Owner,Annual_Income,Income_Verified,Purpose_Of_Loan,Debt_To_Income,Inquiries_Last_6Mo,Months_Since_Deliquency,Number_Open_Accounts,Total_Accounts,Gender
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
10000001,0.188406,0.000000,5,0.008538,2,0,0.459365,0.000,0.000000,0.118421,0.077922,0
10000002,0.855072,0.333333,0,0.008958,0,2,0.373343,0.000,0.105263,0.157895,0.142857,0
10000003,0.702174,0.666667,0,0.009547,1,2,0.397099,0.000,0.000000,0.157895,0.090909,1
10000004,0.449275,0.000000,1,0.006958,1,2,0.358590,0.375,0.000000,0.210526,0.129870,1
10000005,0.478261,0.777778,4,0.012273,1,2,0.554389,0.125,0.000000,0.250000,0.181818,0
...,...,...,...,...,...,...,...,...,...,...,...,...
10164305,0.586232,1.000000,5,0.010207,1,1,0.339085,0.000,0.000000,0.171053,0.090909,1
10164306,0.159420,0.777778,4,0.009424,1,11,0.228057,0.000,0.216374,0.171053,0.181818,1
10164307,0.572464,1.000000,0,0.010207,0,1,0.454364,0.125,0.000000,0.223684,0.181818,1
10164308,0.275362,0.333333,5,0.008958,2,6,0.468867,0.125,0.286550,0.210526,0.389610,0


In [29]:
# Target vector: the Interest_Rate column.
y = df.loc[:, "Interest_Rate"]

In [30]:
# Display the target series.
y

Loan_ID
10000001    1
10000002    3
10000003    3
10000004    3
10000005    1
           ..
10164305    2
10164306    2
10164307    1
10164308    3
10164309    2
Name: Interest_Rate, Length: 156938, dtype: int64

In [31]:
X_train, X_test, y, y_test = train_test_split(X_features, y, test_size=0.2, random_state=25)

In [32]:
# Using the random grid to search for best hyperparameters in the random forest classifier
rf = RandomForestClassifier()
# Random search of parameters, using default 5 fold cross validation, 
# search across different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y)

ValueError: Found input variables with inconsistent numbers of samples: [125550, 156938]

In [None]:
def evaluate_RF(model, test_features, test_labels, n_est, depth):
    """Score a fitted classifier on held-out data and print a short report.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    test_features : features to predict on.
    test_labels : true labels matching ``test_features``.
    n_est, depth : accepted for call compatibility; not used here.

    Returns
    -------
    list
        ``[precision, recall, fscore, accuracy, tn, fp, fn, tp]``. Precision,
        recall and F-score are support-weighted averages. For a two-class
        problem ``tn, fp, fn, tp`` are scalars; with any other number of
        classes they are per-class arrays of one-vs-rest counts, ordered like
        the rows of the confusion matrix.
    """
    predictions = model.predict(test_features)
    precision, recall, fscore, _support = score(test_labels, predictions, average='weighted')
    # Reuse the predictions instead of running a second prediction pass.
    accuracy = accuracy_score(test_labels, predictions)

    # Rows of the matrix are true classes, columns are predicted classes.
    matrix = confusion_matrix(test_labels, predictions)
    if matrix.shape == (2, 2):
        tn, fp, fn, tp = matrix.ravel()
    else:
        # Unpacking ravel() into four names only works for a 2x2 matrix;
        # otherwise derive one-vs-rest counts for every class.
        tp = np.diag(matrix)
        fp = matrix.sum(axis=0) - tp
        fn = matrix.sum(axis=1) - tp
        tn = matrix.sum() - (tp + fp + fn)

    print("------------ Random Forest Classifier Performance Metrics ------------")
    # Format as a percentage with one decimal; round(x, 3) * 100 can print
    # floating-point artefacts such as 57.300000000000004.
    print("F-score: {:.1f} ".format(fscore * 100))
    print("Precision: {:.1f} ".format(precision * 100))
    print("Recall: {:.1f} ".format(recall * 100))
    print("Accuracy: {:.1f} ".format(accuracy * 100))
    return [precision, recall, fscore, accuracy, tn, fp, fn, tp]

In [None]:
# Time the model training with a monotonic clock, which is the appropriate
# choice for measuring an elapsed interval.
# NOTE(review): `rforestModel_train` is not defined in any cell visible in this
# notebook — confirm where it comes from (or whether the fitted search object
# was meant to be used here) before running this cell.
rf_start_time = time.perf_counter()
rf_Model = rforestModel_train()
rf_end_time = time.perf_counter()

In [None]:
# Evaluate the trained model on the held-out split.
rfMetrics = evaluate_RF(
    model=rf_Model,
    test_features=X_test,
    test_labels=y_test,
    n_est=n_estimators,
    depth=max_depth,
)