In [1]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's "darkgrid" style to the figures drawn in this notebook.
sns.set(style="darkgrid")

# Single source of truth for every random_state used in this notebook.
SEED = 42

# NOTE: PYTHONHASHSEED is read when an interpreter starts, so setting it here
# only reaches processes spawned from this kernel, not the kernel itself.
os.environ["PYTHONHASHSEED"] = str(SEED)

# Seed the two global RNGs (stdlib and NumPy legacy) that libraries may draw from.
random.seed(SEED)
np.random.seed(SEED)

In [2]:
# Show which raw files are available before loading anything.
DATA_DIR = "./data"
os.listdir(DATA_DIR)

['sample_submission_HSqiq1Q.csv', 'test_fjtUOL8.csv', 'train_fNxu4vz.csv']

In [3]:
# Load the labelled training set and the unlabelled test set.
train_csv = "./data/train_fNxu4vz.csv"
test_csv = "./data/test_fjtUOL8.csv"

train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

In [4]:
# Peek at the first five test rows to see the raw column formats.
test.head(n=5)

Unnamed: 0,Loan_ID,Loan_Amount_Requested,Length_Employed,Home_Owner,Annual_Income,Income_Verified,Purpose_Of_Loan,Debt_To_Income,Inquiries_Last_6Mo,Months_Since_Deliquency,Number_Open_Accounts,Total_Accounts,Gender
0,10164310,27500,10+ years,Mortgage,129000.0,VERIFIED - income,debt_consolidation,12.87,0,68.0,10,37,Male
1,10164311,26000,10+ years,,110000.0,not verified,credit_card,11.37,0,,6,23,Male
2,10164312,6075,< 1 year,Rent,75000.0,VERIFIED - income,debt_consolidation,6.83,2,,5,20,Male
3,10164313,12000,10+ years,Mortgage,73000.0,VERIFIED - income source,debt_consolidation,7.76,0,,6,8,Male
4,10164314,35000,< 1 year,Mortgage,156000.0,not verified,debt_consolidation,9.62,0,26.0,9,21,Male


In [5]:
# --- Cleaning and imputation -------------------------------------------------
# Every imputation value is learned from `train` only and then applied to both
# frames, so nothing about the test distribution leaks into preprocessing.
# The whole cell is safe to re-run: each step is a no-op on already-clean data.


def _parse_amount(amounts):
    """Convert amounts written like "27,500" into floats.

    Vectorised replacement for a row-wise ``float(x.replace(',', ''))``.
    Going through ``astype(str)`` means values that are already numeric
    (e.g. when this cell is executed a second time) or missing (NaN) are
    handled instead of raising ``AttributeError``.
    """
    return amounts.astype(str).str.replace(",", "", regex=False).astype(float)


# Statistics used for imputation (computed before any filling takes place).
income_median = train["Annual_Income"].median()
homeowner_most_freq = train["Home_Owner"].value_counts().index[0]
length_employed_most_freq = train["Length_Employed"].value_counts().index[0]

fill_values = {
    # NOTE(review): 0 is also what "delinquent this month" would look like;
    # presumably a missing value means "no recorded delinquency" - consider a
    # sentinel or an indicator column instead. TODO confirm.
    "Months_Since_Deliquency": 0,
    "Annual_Income": income_median,
    "Home_Owner": homeowner_most_freq,
    "Length_Employed": length_employed_most_freq,
}

# Apply the identical transformations to both frames.
for frame in (train, test):
    frame["Loan_Amount_Requested"] = _parse_amount(frame["Loan_Amount_Requested"])
    for column, fill_value in fill_values.items():
        frame[column] = frame[column].fillna(fill_value)

# The target is a class label, not a quantity.
train["Interest_Rate"] = train["Interest_Rate"].astype("category")

In [6]:
# Separate the target from the predictors; Loan_ID is an identifier, not a feature.
y_train = train["Interest_Rate"]
X_train = train.drop(columns=["Loan_ID", "Interest_Rate"])

In [7]:
# One-hot encode the categorical predictors. The training frame defines the
# feature space (one baseline level per variable is dropped).
X_train = pd.get_dummies(X_train, drop_first=True)

# Encode the test frame with *all* of its levels and then project it onto the
# training columns. Encoding both frames independently with drop_first=True can
# yield a different number, order or baseline of columns whenever a category
# level is present in only one of them; models fed with `.values` would then
# silently read misaligned features.
#   - a training column absent from test is added and filled with 0
#   - a test-only level (and train's dropped baseline) is discarded
X_test = pd.get_dummies(test.drop(["Loan_ID"], axis=1))
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

## Logistic Regression

In [8]:
from sklearn.linear_model import LogisticRegression

# Baseline model: logistic regression with default hyper-parameters.
# NOTE(review): the features are not scaled and max_iter is left at its
# default - check whether the solver reports a ConvergenceWarning here.
lr = LogisticRegression(random_state=SEED).fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Make submission file: one predicted Interest_Rate per Loan_ID.
submission = pd.DataFrame({"Interest_Rate": y_pred}, index=test.Loan_ID)
submission.to_csv("submission_lr.csv")

submission.head()

## Decision Tree Classifier

In [16]:
from sklearn.tree import DecisionTreeClassifier

# A single decision tree with default settings, seeded for reproducibility.
dtc = DecisionTreeClassifier(random_state=SEED)
dtc = dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)

# Make submission file: predictions as a named column, indexed by Loan_ID.
submission = pd.Series(y_pred, index=test.Loan_ID, name="Interest_Rate").to_frame()
submission.to_csv("submission_dtr.csv")

submission.head()

Unnamed: 0_level_0,Interest_Rate
Loan_ID,Unnamed: 1_level_1
10164310,3
10164311,1
10164312,3
10164313,3
10164314,2


## Random Forest Classifier

In [26]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters, seeded for reproducibility.
rfc = RandomForestClassifier(random_state=SEED)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

# Make submission file: predicted class per loan, keyed by Loan_ID.
submission = pd.DataFrame(y_pred, index=test.Loan_ID, columns=["Interest_Rate"])
submission.to_csv("submission_rfc.csv")

submission.head()

Unnamed: 0_level_0,Interest_Rate
Loan_ID,Unnamed: 1_level_1
10164310,2
10164311,1
10164312,2
10164313,2
10164314,2


## XGBoost Classifier

In [27]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

# Recent XGBoost releases require class labels to be 0..n_classes-1 and raise
# on any other encoding, so the labels are encoded explicitly for training and
# the predictions are mapped back to the original Interest_Rate values.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

xg_clf = xgb.XGBClassifier(random_state=SEED)

# `.values` drops the column names, so this relies on X_train and X_test
# having the same columns in the same order.
xg_clf.fit(X_train.values, y_train_encoded)

y_pred = label_encoder.inverse_transform(xg_clf.predict(X_test.values))

# Make submission file: one predicted Interest_Rate per Loan_ID.
submission = pd.DataFrame(index=test.Loan_ID, data=y_pred)
submission.rename(columns={0: "Interest_Rate"}, inplace=True)
submission.to_csv("submission_xgb.csv")

submission.head()

Unnamed: 0_level_0,Interest_Rate
Loan_ID,Unnamed: 1_level_1
10164310,3
10164311,2
10164312,3
10164313,2
10164314,2
