In [1]:
#Import libraries
import os
import pandas as pd
import numpy as np
import sys
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

import eli5
from eli5.sklearn import PermutationImportance

%matplotlib inline

In [2]:
#Load data
df_train = pd.read_csv('../processed data/df_train.csv')

In [3]:
#D_* = Delinquency variables
#S_* = Spend variables
#P_* = Payment variables
#B_* = Balance variables
#R_* = Risk variables

DATA PREPROCESSING PART

In [4]:
df_train = df_train.drop(columns = ['S_2'])
df_train = pd.get_dummies(df_train, columns = ['D_63', 'D_64'])

In [5]:
columns_name = df_train.columns.tolist()
for i in ['D_63_CL',
 'D_63_CO',
 'D_63_CR',
 'D_63_XL',
 'D_63_XM',
 'D_63_XZ',
 'D_64_-1',
 'D_64_0',
 'D_64_O',
 'D_64_R',
 'D_64_U',
 'target']:
    columns_name.remove(i)

In [6]:
#apply the StandardScaler to the numeric columns which are not dummy
ct = ColumnTransformer([
        ('somename', StandardScaler(), columns_name)
    ], remainder='passthrough')
scaled_df = ct.fit_transform(df_train)
all_columns = df_train.columns
scaled_df = pd.DataFrame(scaled_df, columns=all_columns)

In [7]:
y = df_train.target
X = df_train.drop(columns = ['target'])
X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

MODELING PART: The metrics we use to evaluate performance is f1 score and confusion matrix. Assume that our goal is to predict as precisely as possible in both default and non-default case (i.e., we do not favor true positive nor true negative), we choose the following methods for our model:

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, f1_score

Method 1: Logistic Regression

In [9]:
lr = LogisticRegression()
cross_val_score(lr, X_train, y_train, scoring="f1", cv = 10).mean()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.7384672352995632

Method 2: Gaussian Naive Bayes

In [10]:
gnb = GaussianNB()
cross_val_score(gnb, X_train, y_train, scoring="f1", cv = 10).mean()

0.6968165337948162

Method 3: LDA

In [11]:
lda = LinearDiscriminantAnalysis()
cross_val_score(lda, X_train, y_train, scoring="f1", cv = 10).mean()

0.7358437451324035

Method 4: Random forest

In [12]:
rf = RandomForestClassifier()
cross_val_score(rf, X_train, y_train, cv=10).mean()


0.9214687500000001

Method 5: Boosting

In [13]:
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier()
cross_val_score(gb, X_train, y_train, cv = 10).mean()

0.7684923076923077

It seems that random forest performs the best, so we choose this model to tune. The most important hyperparameter to tune is number of features. 

In [None]:
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(RandomForestClassifier(random_state=0),
                           {
                              'max_features':np.arange(0.1,1.0,0.1)
                            },cv=3, scoring="f1",verbose=1,n_jobs=-1
                           )
grid_search.fit(X_train,y_train)

In [15]:
grid_search.best_params_

{'max_features': 0.2}

We now apply the model to test data set. 

In [25]:
model = RandomForestClassifier(max_features = 0.2)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
f1_score(y_test, y_pred)

0.8672869735553379

In [26]:
confusion_matrix(y_test, y_pred)

array([[11374,   509],
       [  575,  3542]])