In [1]:
####################
# Start of File
# Load Almost All Modules
####################
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV as gscv
from sklearn.metrics import jaccard_score as jcs
from sklearn.metrics import f1_score as f1s
from sklearn.metrics import log_loss as lls

%matplotlib inline

In [2]:
# READ in `loan_test.csv`
df = pd.read_csv('loan_train.csv')

# PROCESS data into a better format
df['due_date'] = pd.to_datetime(df['due_date'])
df['effective_date'] = pd.to_datetime(df['effective_date'])
df['dayofweek'] = df['effective_date'].dt.dayofweek
df['weekend'] = df['dayofweek'].apply(lambda x: 1 if (x>3)  else 0)
df['Gender'].replace(to_replace=['male','female'], value=[0,1],inplace=True)

# FORMAT data
Feature = df[['Principal','terms','age','Gender','weekend']]
Feature = pd.concat([Feature,pd.get_dummies(df['education'])], axis=1)

# REGRESSION TESTING - Train/Test split
# ['Principal', 'terms', 'age', 'Gender', 'weekend', 'Bechalor', 'High School or Below', 'Master or Above', 'college']
#Feature.drop(['Principal'], axis = 1,inplace=True)
#Feature.drop(['terms'], axis = 1,inplace=True)
#Feature.drop(['age'], axis = 1,inplace=True)
#Feature.drop(['Gender'], axis = 1,inplace=True)
#Feature.drop(['weekend'], axis = 1,inplace=True)
#Feature.drop(['High School or Below'], axis = 1,inplace=True)
#Feature.drop(['Bechalor'], axis = 1,inplace=True)
Feature.drop(['Master or Above'], axis = 1,inplace=True)
#Feature.drop(['college'], axis = 1,inplace=True)
X = Feature
y = df['loan_status'].values
X = preprocessing.StandardScaler().fit(X).transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# K Nearest Neighbor(KNN)


In [3]:
from sklearn.neighbors import KNeighborsClassifier as knnc

knn_param = {'n_neighbors':[2, 3, 4, 5, 6, 7, 8, 9, 10]}
knn = gscv(knnc(), knn_param, cv=7, scoring='accuracy', refit=True)
knn.fit(X_train, y_train)
print(knn.best_params_)
print(knn.best_score_)
 
  


{'n_neighbors': 9}
0.7461538461538462


In [4]:
# knn.fit(X_test, y_test)
# print(knn.best_params_)
# print(knn.best_score_)

# Decision Tree


In [5]:
from sklearn.tree import DecisionTreeClassifier as dtc

tree_param = { 'criterion':['entropy', 'gini'], 'max_depth':range(3,16,1), 'min_samples_leaf':range(1,8,1)}
tree = gscv(dtc(), tree_param, cv=7, scoring='accuracy', refit=True)
tree.fit(X_train, y_train)
print(tree.best_params_)
print(tree.best_score_)
    

{'criterion': 'entropy', 'max_depth': 4, 'min_samples_leaf': 1}
0.7357142857142857


In [6]:
# tree.fit(X_test, y_test)
# print(tree.best_params_)
# print(tree.best_score_)

# Support Vector Machine


In [7]:
from sklearn.svm import SVC

svm_param = {'C':[0.1, 1, 10, 100], 'gamma':[1, 0.1, 0.01, 0.001, 0.0001], 'kernel':['linear', 'rbf', 'sigmoid']}
svm = gscv(SVC(), svm_param, cv=7, scoring='accuracy', refit=True)
svm.fit(X_train, y_train)
print(svm.best_params_)
print(svm.best_score_)

{'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}
0.7573260073260073


In [8]:
# svm.fit(X_test, y_test)
# print(svm.best_params_)
# print(svm.best_score_)

# Logistic Regression


In [9]:
from sklearn.linear_model import LogisticRegression as lrc

lrg_param = {'C':np.linspace(start=0.001, stop=26, num=100), "solver":['newton-cg', 'lbfgs', 'liblinear']}
lrg = gscv(lrc(), lrg_param, cv=7, scoring='accuracy', refit=True)
lrg.fit(X_train, y_train)
print(lrg.best_params_)
print(lrg.best_score_)

{'C': 0.2636161616161616, 'solver': 'newton-cg'}
0.7540293040293039


In [10]:
# lrg.fit(X_test, y_test)
# print(lrg.best_params_)
# print(lrg.best_score_)

# Model Evaluation using Test set


In [11]:
# Load test file
test_df = pd.read_csv('loan_test.csv')

In [12]:
test_df['due_date'] = pd.to_datetime(test_df['due_date'])
test_df['effective_date'] = pd.to_datetime(test_df['effective_date'])
test_df['dayofweek'] = test_df['effective_date'].dt.dayofweek
test_df['weekend'] = test_df['dayofweek'].apply(lambda x: 1 if (x>3)  else 0)
test_df['Gender'].replace(to_replace=['male', 'female'], value=[0,1],inplace=True)

# set features and paramters similar to testing parameters.
Feature1 = test_df[['Principal', 'terms', 'age', 'Gender', 'weekend']]
Feature1 = pd.concat([Feature1, pd.get_dummies(test_df['education'])], axis=1)

# this feature dropped from testing.
Feature1.drop(['Master or Above'], axis = 1, inplace=True)
X1 = Feature1
y1 = test_df['loan_status'].values
X1 = preprocessing.StandardScaler().fit(X1).transform(X1)


In [13]:
# KNN 
yhatKNN = knn.predict(X1)
KNNJaccard = jcs(y1, yhatKNN, pos_label='PAIDOFF')
KNNF1 = f1s(y1, yhatKNN, average='weighted')
print("KNN Jaccard Score: %.2f" % KNNJaccard)
print("KNN F1-score: %.2f" % KNNF1 )

# Decision Tree
yhatDEC = tree.predict(X1)
DTJaccard = jcs(y1, yhatDEC, pos_label='PAIDOFF')
DTF1 = f1s(y1, yhatDEC, average='weighted')
print("DTree Jaccard Score: %.2f" % DTJaccard)
print("DTree F1-score: %.2f" % DTF1 )


# Support Vector Machine
yhatSVM = svm.predict(X1)
SVMJaccard = jcs(y1, yhatSVM, pos_label='PAIDOFF')
SVMF1 = f1s(y1, yhatSVM, average='weighted')
print("SVM Jaccard score: %.2f" % SVMJaccard)
print("SVM F1-score: %.2f" % SVMF1)

# Logistic Regression
yhatLOG = lrg.predict(X1)
yhatLOGproba = lrg.predict_proba(X1)
LogRJaccard = jcs(y1, yhatLOG, pos_label='PAIDOFF')
LogRF1 = f1s(y1, yhatLOG, average='weighted')
Logloss = lls(y1, yhatLOGproba)
print("LOG Jaccard score: %.4f" % LogRJaccard)
print("LOG F1-score: %.4f" % LogRF1)
print("LOG: %.2f" % Logloss)


KNN Jaccard Score: 0.67
KNN F1-score: 0.65
DTree Jaccard Score: 0.72
DTree F1-score: 0.76
SVM Jaccard score: 0.76
SVM F1-score: 0.76
LOG Jaccard score: 0.7451
LOG F1-score: 0.7144
LOG: 0.47


# Report

#### Final result, with a train test split of 20% test, and 'masters and above' droped

| Algorithm          | Jaccard | F1-score | LogLoss |
| ------------------ | ------- | -------- | ------- |
| KNN                | 0.67    | 0.65     | NA      |
| Decision Tree      | 0.72    | 0.76     | NA      |
| SVM                | 0.76    | 0.76     | NA      |
| LogisticRegression | 0.75    | 0.71     | 0.47    |





