In [154]:
import pandas as pd
import string
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

In [155]:
# Load the training data; the Loan_ID column becomes the row index.
df = pd.read_csv('train.csv', index_col="Loan_ID")

In [156]:
# Preview the first five rows of the raw frame.
df.head()

Unnamed: 0_level_0,Loan_Amount_Requested,Length_Employed,Home_Owner,Annual_Income,Income_Verified,Purpose_Of_Loan,Debt_To_Income,Inquiries_Last_6Mo,Months_Since_Deliquency,Number_Open_Accounts,Total_Accounts,Gender,Interest_Rate
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
10000001,7000,< 1 year,Rent,68000.0,not verified,car,18.37,0,,9,14,Female,1
10000002,30000,4 years,Mortgage,,VERIFIED - income,debt_consolidation,14.93,0,17.0,12,24,Female,3
10000003,24725,7 years,Mortgage,75566.4,VERIFIED - income source,debt_consolidation,15.88,0,,12,16,Male,3
10000004,16000,< 1 year,,56160.0,VERIFIED - income source,debt_consolidation,14.34,3,,16,22,Male,3
10000005,17000,8 years,Own,96000.0,VERIFIED - income source,debt_consolidation,22.17,1,,19,30,Female,1


In [157]:
# Count the missing values in every column.
df.isna().sum()

Loan_Amount_Requested          0
Length_Employed             7371
Home_Owner                 25349
Annual_Income              25102
Income_Verified                0
Purpose_Of_Loan                0
Debt_To_Income                 0
Inquiries_Last_6Mo             0
Months_Since_Deliquency    88379
Number_Open_Accounts           0
Total_Accounts                 0
Gender                         0
Interest_Rate                  0
dtype: int64

#### Handling missing data

In [158]:
def cleaning_data(df):
    """Clean the raw loan frame in place and return it.

    Steps, in order:

    1. ``Loan_Amount_Requested``: remove thousands separators and cast to float.
    2. ``Length_Employed``: keep the first run of digits as a float
       ("< 1 year" -> 1.0, "10+ years" -> 10.0), then drop every row for
       which no number could be extracted (this includes missing values).
    3. ``Annual_Income``: fill missing values with the mean income of the rows
       sharing the same ``Length_Employed``; if a whole group has no known
       income, fall back to the overall mean so no NaN is left behind.
    4. ``Home_Owner``: missing values become the literal string "NA".
    5. ``Months_Since_Deliquency``: missing values become -1.

    The function can be called again on an already-cleaned frame without
    raising or changing the data.

    NOTE(review): "< 1 year" and "1 year" both map to 1.0 - confirm that
    merging these two categories is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the five columns listed above. It is modified in place
        (columns are overwritten and rows are dropped).

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # Only use the .str accessor while the column is still text; on a re-run
    # the column is already numeric and .str would raise.
    amount = df["Loan_Amount_Requested"]
    if not pd.api.types.is_numeric_dtype(amount):
        amount = amount.str.replace(",", "", regex=False)
    df["Loan_Amount_Requested"] = amount.astype(float)

    # Extracting the digits directly makes a separate strip of "<", ">" and
    # "+" unnecessary (and avoids relying on str.replace's regex default).
    tenure = df["Length_Employed"]
    if not pd.api.types.is_numeric_dtype(tenure):
        tenure = tenure.str.extract(r"(\d+)", expand=False)
    df["Length_Employed"] = tenure.astype(float)

    # Rows without a usable employment length are removed from the caller's frame.
    unknown_tenure = df["Length_Employed"].isna()
    df.drop(index=df.index[unknown_tenure], inplace=True)

    income = df["Annual_Income"]
    mean_by_tenure = income.groupby(df["Length_Employed"]).transform("mean")
    # Second fillna covers groups whose incomes are all missing.
    df["Annual_Income"] = income.fillna(mean_by_tenure).fillna(income.mean())

    df["Home_Owner"] = df["Home_Owner"].fillna("NA")
    df["Months_Since_Deliquency"] = df["Months_Since_Deliquency"].fillna(-1)
    return df

In [159]:
# cleaning_data modifies the frame in place and returns it; bind the result
# explicitly so the data flow is visible, and preview a few rows only.
df = cleaning_data(df)
df.head()

Unnamed: 0_level_0,Loan_Amount_Requested,Length_Employed,Home_Owner,Annual_Income,Income_Verified,Purpose_Of_Loan,Debt_To_Income,Inquiries_Last_6Mo,Months_Since_Deliquency,Number_Open_Accounts,Total_Accounts,Gender,Interest_Rate
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
10000001,7000.0,1.0,Rent,68000.000000,not verified,car,18.37,0,-1.0,9,14,Female,1
10000002,30000.0,4.0,Mortgage,71150.557520,VERIFIED - income,debt_consolidation,14.93,0,17.0,12,24,Female,3
10000003,24725.0,7.0,Mortgage,75566.400000,VERIFIED - income source,debt_consolidation,15.88,0,-1.0,12,16,Male,3
10000004,16000.0,1.0,,56160.000000,VERIFIED - income source,debt_consolidation,14.34,3,-1.0,16,22,Male,3
10000005,17000.0,8.0,Own,96000.000000,VERIFIED - income source,debt_consolidation,22.17,1,-1.0,19,30,Female,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10164305,20725.0,10.0,Rent,80509.442067,VERIFIED - income source,credit_card,13.56,0,-1.0,13,16,Male,2
10164306,6000.0,8.0,Own,74645.174631,VERIFIED - income source,small_business,9.12,0,36.0,13,30,Male,2
10164307,20250.0,10.0,Mortgage,80509.442067,VERIFIED - income,credit_card,18.17,1,-1.0,17,30,Male,1
10164308,10000.0,4.0,Rent,71150.557520,not verified,major_purchase,18.75,1,48.0,16,62,Female,3


In [160]:
# NOTE: the conversion to pandas "category" dtype below is disabled; these four
# columns are integer-encoded with LabelEncoder in a later cell instead.
# df["Home_Owner"] = df["Home_Owner"].astype("category")
# df["Income_Verified"] = df["Income_Verified"].astype("category")
# df["Purpose_Of_Loan"] = df["Purpose_Of_Loan"].astype("category")
# df["Gender"] = df["Gender"].astype("category")

In [161]:
# Rescale the numeric features with MinMaxScaler (default range [0, 1]).
# The scaler works column by column, so a single fit over all eight columns
# produces the same values as eight separate fits, and it leaves `scaler`
# fitted on every column so the identical transform can be applied to other
# data later (e.g. a test set) via scaler.transform(other[NUMERIC_COLUMNS]).
NUMERIC_COLUMNS = [
    "Loan_Amount_Requested",
    "Length_Employed",
    "Annual_Income",
    "Debt_To_Income",
    "Inquiries_Last_6Mo",
    "Months_Since_Deliquency",
    "Number_Open_Accounts",
    "Total_Accounts",
]
scaler = MinMaxScaler()
df[NUMERIC_COLUMNS] = scaler.fit_transform(df[NUMERIC_COLUMNS])

In [162]:
# Display the frame after min-max scaling of the numeric columns.
df

Unnamed: 0_level_0,Loan_Amount_Requested,Length_Employed,Home_Owner,Annual_Income,Income_Verified,Purpose_Of_Loan,Debt_To_Income,Inquiries_Last_6Mo,Months_Since_Deliquency,Number_Open_Accounts,Total_Accounts,Gender,Interest_Rate
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
10000001,0.188406,0.000000,Rent,0.008538,not verified,car,0.459365,0.000,0.000000,0.118421,0.077922,Female,1
10000002,0.855072,0.333333,Mortgage,0.008958,VERIFIED - income,debt_consolidation,0.373343,0.000,0.105263,0.157895,0.142857,Female,3
10000003,0.702174,0.666667,Mortgage,0.009547,VERIFIED - income source,debt_consolidation,0.397099,0.000,0.000000,0.157895,0.090909,Male,3
10000004,0.449275,0.000000,,0.006958,VERIFIED - income source,debt_consolidation,0.358590,0.375,0.000000,0.210526,0.129870,Male,3
10000005,0.478261,0.777778,Own,0.012273,VERIFIED - income source,debt_consolidation,0.554389,0.125,0.000000,0.250000,0.181818,Female,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10164305,0.586232,1.000000,Rent,0.010207,VERIFIED - income source,credit_card,0.339085,0.000,0.000000,0.171053,0.090909,Male,2
10164306,0.159420,0.777778,Own,0.009424,VERIFIED - income source,small_business,0.228057,0.000,0.216374,0.171053,0.181818,Male,2
10164307,0.572464,1.000000,Mortgage,0.010207,VERIFIED - income,credit_card,0.454364,0.125,0.000000,0.223684,0.181818,Male,1
10164308,0.275362,0.333333,Rent,0.008958,not verified,major_purchase,0.468867,0.125,0.286550,0.210526,0.389610,Female,3


In [163]:
from sklearn import preprocessing

# Replace each categorical column with integer codes.
# A separate fitted encoder is kept per column in `label_encoders`, so the
# category <-> code mapping of every column stays available (classes_,
# inverse_transform, or transform on new data). Refitting one shared encoder
# would keep only the mapping of the last column.
CATEGORICAL_COLUMNS = ["Income_Verified", "Home_Owner", "Purpose_Of_Loan", "Gender"]
label_encoders = {}
for column in CATEGORICAL_COLUMNS:
    # `le` ends up as the encoder fitted on the last column ("Gender").
    le = preprocessing.LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

In [164]:
# Candidate hyper-parameter values for the random forest classifier; the
# randomized search samples combinations from these lists.

# Number of trees in the forest.
n_estimators = list(range(200, 4000, 10))
# Number of features to consider at every split. None (the Python object, not
# the string "None") means all features. 'auto' is omitted: for classifiers it
# was an alias of 'sqrt' and newer scikit-learn releases reject it.
max_features = ['sqrt', None]
# Maximum number of levels in a tree; None leaves the depth unlimited.
max_depth = list(range(100, 500, 10))
max_depth.append(None)

# Create the random grid
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
}

In [165]:
# Feature matrix: every column except the Interest_Rate target.
X_features = df.drop(columns="Interest_Rate")

In [166]:
# Display the feature matrix (all columns of df except Interest_Rate).
X_features

Unnamed: 0_level_0,Loan_Amount_Requested,Length_Employed,Home_Owner,Annual_Income,Income_Verified,Purpose_Of_Loan,Debt_To_Income,Inquiries_Last_6Mo,Months_Since_Deliquency,Number_Open_Accounts,Total_Accounts,Gender
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
10000001,0.188406,0.000000,5,0.008538,2,0,0.459365,0.000,0.000000,0.118421,0.077922,0
10000002,0.855072,0.333333,0,0.008958,0,2,0.373343,0.000,0.105263,0.157895,0.142857,0
10000003,0.702174,0.666667,0,0.009547,1,2,0.397099,0.000,0.000000,0.157895,0.090909,1
10000004,0.449275,0.000000,1,0.006958,1,2,0.358590,0.375,0.000000,0.210526,0.129870,1
10000005,0.478261,0.777778,4,0.012273,1,2,0.554389,0.125,0.000000,0.250000,0.181818,0
...,...,...,...,...,...,...,...,...,...,...,...,...
10164305,0.586232,1.000000,5,0.010207,1,1,0.339085,0.000,0.000000,0.171053,0.090909,1
10164306,0.159420,0.777778,4,0.009424,1,11,0.228057,0.000,0.216374,0.171053,0.181818,1
10164307,0.572464,1.000000,0,0.010207,0,1,0.454364,0.125,0.000000,0.223684,0.181818,1
10164308,0.275362,0.333333,5,0.008958,2,6,0.468867,0.125,0.286550,0.210526,0.389610,0


In [167]:
# Target series: the Interest_Rate column, used as the class label when fitting.
y = df["Interest_Rate"]

In [168]:
# Display the target series.
y

Loan_ID
10000001    1
10000002    3
10000003    3
10000004    3
10000005    1
           ..
10164305    2
10164306    2
10164307    1
10164308    3
10164309    2
Name: Interest_Rate, Length: 156938, dtype: int64

In [None]:
# Use the random grid to search for the best hyper-parameters of the random
# forest classifier.
SEED = 42  # fixed so the sampled candidates and the fitted forests are reproducible
rf = RandomForestClassifier(random_state=SEED)
# Random search of parameters: sample 10 combinations from random_grid, score
# each with 5-fold cross validation, and use all available cores.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=10,
    cv=5,
    n_jobs=-1,
    random_state=SEED,
)
# Fit the random search model
rf_random.fit(X_features, y)