In [1]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="darkgrid")

# Single seed shared by every source of randomness used in this notebook.
SEED = 42

# Export the seed through the environment and seed both global generators.
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects child processes — confirm.
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)

In [2]:
# Show which files are present in the ./data directory.
os.listdir("./data")

['sample_submission_HSqiq1Q.csv', 'test_fjtUOL8.csv', 'train_fNxu4vz.csv']

In [3]:
# Read the training and test CSV files into DataFrames.
train = pd.read_csv("./data/train_fNxu4vz.csv")
test = pd.read_csv("./data/test_fjtUOL8.csv")

In [4]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,Loan_ID,Loan_Amount_Requested,Length_Employed,Home_Owner,Annual_Income,Income_Verified,Purpose_Of_Loan,Debt_To_Income,Inquiries_Last_6Mo,Months_Since_Deliquency,Number_Open_Accounts,Total_Accounts,Gender
0,10164310,27500,10+ years,Mortgage,129000.0,VERIFIED - income,debt_consolidation,12.87,0,68.0,10,37,Male
1,10164311,26000,10+ years,,110000.0,not verified,credit_card,11.37,0,,6,23,Male
2,10164312,6075,< 1 year,Rent,75000.0,VERIFIED - income,debt_consolidation,6.83,2,,5,20,Male
3,10164313,12000,10+ years,Mortgage,73000.0,VERIFIED - income source,debt_consolidation,7.76,0,,6,8,Male
4,10164314,35000,< 1 year,Mortgage,156000.0,not verified,debt_consolidation,9.62,0,26.0,9,21,Male


## Data Preprocessing

In [5]:
def preprocess_data(train, test):
    """Clean the raw train/test frames and return cleaned copies.

    Steps applied to both frames:
      * ``Loan_Amount_Requested`` is converted to float (thousands separators
        such as ``"27,500"`` are stripped; already-numeric values pass through).
      * Missing ``Months_Since_Deliquency`` is filled with 0, on the assumption
        that a missing value means no delinquency was recorded.
      * Missing ``Annual_Income`` is filled with the *training* median.
      * Missing ``Home_Owner`` / ``Length_Employed`` are filled with the most
        frequent *training* value.

    Additionally the training target ``Interest_Rate`` is cast to ``category``.

    The inputs are not modified: the function works on copies, so the cell can
    be re-run safely and the raw frames stay available to the caller.

    Parameters
    ----------
    train : pandas.DataFrame
        Training data; must contain the columns above plus ``Interest_Rate``.
    test : pandas.DataFrame
        Test data; must contain the columns above.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The cleaned ``(train, test)`` frames.
    """
    # Work on copies so the caller's frames are never mutated.
    train = train.copy()
    test = test.copy()

    def _to_amount(value):
        # Only strings can carry thousands separators; numbers (for example on
        # a second run over already-cleaned data) are converted directly.
        if isinstance(value, str):
            value = value.replace(',', '')
        return float(value)

    # Make loan amount into float
    train["Loan_Amount_Requested"] = train["Loan_Amount_Requested"].apply(_to_amount)
    test["Loan_Amount_Requested"] = test["Loan_Amount_Requested"].apply(_to_amount)

    # Fill missing delinquency values with 0 [assuming no delinquency was committed]
    train["Months_Since_Deliquency"] = train["Months_Since_Deliquency"].fillna(0)
    test["Months_Since_Deliquency"] = test["Months_Since_Deliquency"].fillna(0)

    # Replace missing income values with the median of the training data.
    # The statistic is computed on train only so no test information leaks in.
    income_median = train["Annual_Income"].median()
    train["Annual_Income"] = train["Annual_Income"].fillna(income_median)
    test["Annual_Income"] = test["Annual_Income"].fillna(income_median)

    # Replace missing values in home owner with the most frequent training value
    homeowner_most_freq = train["Home_Owner"].value_counts().index[0]
    train["Home_Owner"] = train["Home_Owner"].fillna(homeowner_most_freq)
    test["Home_Owner"] = test["Home_Owner"].fillna(homeowner_most_freq)

    # Replace missing values in length employed with the most frequent training value
    length_employed_most_freq = train["Length_Employed"].value_counts().index[0]
    train["Length_Employed"] = train["Length_Employed"].fillna(length_employed_most_freq)
    test["Length_Employed"] = test["Length_Employed"].fillna(length_employed_most_freq)

    # Change target column to categorical
    train["Interest_Rate"] = train["Interest_Rate"].astype("category")

    return train, test

# Run the cleaning step; `train` and `test` are rebound to the returned frames.
train, test = preprocess_data(train, test)

## Feature Engineering

In [6]:
def make_length_employed_coarse(val):
    """Map a ``Length_Employed`` label onto one of four experience buckets.

    Returns "fresher", "less_experienced", "more_experienced" or
    "most_experienced"; any value not listed below yields ``None``.
    """
    buckets = (
        ("fresher", ('< 1 year',)),
        ("less_experienced", ('1 year', '2 years', '3 years', '4 years')),
        ("more_experienced", ('5 years', '6 years', '7 years', '8 years', '9 years')),
        ("most_experienced", ('10+ years',)),
    )
    for label, members in buckets:
        if val in members:
            return label
    # Unrecognised labels fall through without a bucket.
    return None

def make_home_owner_coarse(val):
    """Fold the 'Other' and 'None' home-ownership labels into 'Mortgage'.

    Note that 'None' here is the literal string, not Python's ``None``.
    Every other value is returned unchanged.
    """
    return "Mortgage" if val in ('Other', 'None') else val
    
def make_loan_purpose_coarse(val):
    """Collapse ``Purpose_Of_Loan`` into a smaller set of categories.

    'debt_consolidation', 'credit_card' and 'other' are kept as they are;
    the remaining known purposes become "personal" or "business".
    Values that appear in none of the lists yield ``None``.
    """
    if val in ('debt_consolidation', 'credit_card', 'other'):
        return val

    grouped = {
        "personal": ('car', 'medical', 'house', 'moving', 'wedding',
                     'educational', 'vacation', 'home_improvement'),
        "business": ('major_purchase', 'small_business', 'renewable_energy'),
    }
    for label, members in grouped.items():
        if val in members:
            return label
    # Unknown purposes fall through without a category.
    return None
    

def engineer_features(train, test):
    """Add coarse-grained versions of three categorical columns to both frames.

    For each of ``Length_Employed``, ``Home_Owner`` and ``Purpose_Of_Loan`` a
    new ``<name>_coarse`` column is written into ``train`` and ``test`` (the
    frames are modified in place) and the same two frames are returned.
    """
    coarseners = (
        ("Length_Employed", "Length_Employed_coarse", make_length_employed_coarse),
        ("Home_Owner", "Home_Owner_coarse", make_home_owner_coarse),
        ("Purpose_Of_Loan", "Purpose_Of_Loan_coarse", make_loan_purpose_coarse),
    )

    for source_col, target_col, coarsen in coarseners:
        for frame in (train, test):
            frame[target_col] = frame[source_col].apply(coarsen)

    return train, test

# Add the coarse categorical columns; `train` and `test` are rebound to the returned frames.
train, test = engineer_features(train, test)

In [7]:
def prepare_for_training(train, test):
    """Build the model matrices from the engineered train/test frames.

    ``Loan_ID`` (and, for train, the target ``Interest_Rate``) is dropped and
    the remaining categorical columns are one-hot encoded.

    The training frame defines the feature layout: it is encoded with
    ``drop_first=True``, and the test frame is then projected onto exactly
    those columns. Encoding the two frames independently would produce
    different (or differently ordered) columns whenever a category level is
    present in only one of them, which silently misaligns features once the
    frames are turned into plain arrays.

    Parameters
    ----------
    train : pandas.DataFrame
        Training data containing ``Loan_ID`` and ``Interest_Rate``.
    test : pandas.DataFrame
        Test data containing ``Loan_ID``.

    Returns
    -------
    X_train : numpy.ndarray
        Float feature matrix for the training rows.
    y_train : pandas.Series
        The ``Interest_Rate`` column of ``train``.
    X_test : numpy.ndarray
        Float feature matrix for the test rows, with the same columns (and
        column order) as ``X_train``.
    """
    y_train = train["Interest_Rate"]

    train_features = pd.get_dummies(
        train.drop(["Loan_ID", "Interest_Rate"], axis=1), drop_first=True
    )

    # Encode every level present in the test frame, then keep only the columns
    # the training matrix has. Levels missing from test become all-zero
    # columns; levels unseen in train (and the dropped baseline level) are
    # discarded, which encodes them as the baseline.
    test_features = pd.get_dummies(test.drop(["Loan_ID"], axis=1)).reindex(
        columns=train_features.columns, fill_value=0
    )

    # Force a numeric dtype so boolean dummy columns never yield an object array.
    X_train = train_features.astype(float).values
    X_test = test_features.astype(float).values

    return X_train, y_train, X_test

# Build the feature matrices and the target used by the models below.
X_train, y_train, X_test = prepare_for_training(train, test)

In [9]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training matrix only, then apply that same fitted
# transformation to both matrices so train and test are scaled consistently.
sc = StandardScaler().fit(X_train)

X_train, X_test = sc.transform(X_train), sc.transform(X_test)

## Random Forest Classifier

In [10]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest with default hyper-parameters, seeded for reproducibility.
rfc = RandomForestClassifier(random_state=SEED)
rfc.fit(X_train, y_train)

y_pred = rfc.predict(X_test)

# Make submission file.
# Create the output directory first: to_csv raises if it does not exist.
os.makedirs("./submissions", exist_ok=True)
submission = pd.DataFrame(data=y_pred, index=test.Loan_ID, columns=["Interest_Rate"])
submission.to_csv("./submissions/submission_rfc.csv")

submission.head()

Unnamed: 0_level_0,Interest_Rate
Loan_ID,Unnamed: 1_level_1
10164310,2
10164311,1
10164312,3
10164313,3
10164314,3


## XGBoost Classifier

In [8]:
import xgboost as xgb

# Baseline XGBoost classifier with default hyper-parameters, seeded for reproducibility.
xg_clf = xgb.XGBClassifier(random_state=SEED)

xg_clf.fit(X_train, y_train)

y_pred = xg_clf.predict(X_test)

# Make submission file.
# Create the output directory first: to_csv raises if it does not exist.
os.makedirs("./submissions", exist_ok=True)
submission = pd.DataFrame(data=y_pred, index=test.Loan_ID, columns=["Interest_Rate"])
submission.to_csv("./submissions/submission_xgb.csv")

submission.head()

Unnamed: 0_level_0,Interest_Rate
Loan_ID,Unnamed: 1_level_1
10164310,2
10164311,2
10164312,3
10164313,2
10164314,2


In [9]:
from sklearn.model_selection import GridSearchCV

In [10]:

# Hyper-parameter grid: 4 * 5 * 3 * 4 = 240 candidate combinations.
xgb_param_grid = {"max_depth": [5,10,20,30],
              "learning_rate": [0.01,0.1,0.5,0.9,1.],
              "gamma":[0.1,0.5,1.0],
              "n_estimators":[50,100,150,200]
              }

# Parallelism is left to GridSearchCV (n_jobs=-1); each XGBoost fit is kept
# single-threaded so the two levels do not compete for the same cores.
# eval_metric is the multi-class log loss, matching the 3-class softmax objective.
xgb_grid = GridSearchCV(estimator = xgb.XGBClassifier(objective="multi:softmax", num_class=3, 
                                                            n_jobs=1, eval_metric="mlogloss",
                                                            silent=1,random_state=SEED), 
                           param_grid = xgb_param_grid, 
                           scoring = "f1_weighted",
                           cv = 5, n_jobs = -1, verbose=2)

In [None]:
%%time
# Run the grid search over every candidate (time is reported by the cell magic).
xgb_grid.fit(X_train, y_train)

# Keep the best estimator found by the search for prediction below.
best_xgb = xgb_grid.best_estimator_

Fitting 5 folds for each of 240 candidates, totalling 1200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


In [None]:
# Report the best cross-validated score (scoring="f1_weighted") and the parameters that achieved it.
print("Best Score: ", xgb_grid.best_score_)
print("Best Params: ", xgb_grid.best_params_)

In [None]:
y_pred_best = best_xgb.predict(X_test)

# Make submission file.
# Written under its own name so it does not overwrite the baseline XGBoost
# submission ("submission_xgb.csv") produced earlier in the notebook.
# Create the output directory first: to_csv raises if it does not exist.
os.makedirs("./submissions", exist_ok=True)
submission = pd.DataFrame(data=y_pred_best, index=test.Loan_ID, columns=["Interest_Rate"])
submission.to_csv("./submissions/submission_xgb_tuned.csv")

submission.head()