### Import modules

In [1]:
# numpy, pandas, scipy, math, matplotlib
import numpy as np
import pandas as pd
import scipy 
from math import sqrt
import matplotlib.pyplot as plt


### Import dataset
* The dataset we upload here has been pre-processed from the previous EDA section.
* We use df.info() to make sure that all the data are in the numerical form. 

In [None]:
rawData = pd.read_csv('new_credit.csv')
rawData.head()

In [3]:
rawData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 24 columns):
LIMIT_BAL                     30000 non-null int64
SEX                           30000 non-null int64
EDUCATION                     30000 non-null int64
MARRIAGE                      30000 non-null int64
AGE                           30000 non-null int64
PAY_0                         30000 non-null int64
PAY_2                         30000 non-null int64
PAY_3                         30000 non-null int64
PAY_4                         30000 non-null int64
PAY_5                         30000 non-null int64
PAY_6                         30000 non-null int64
BILL_AMT1                     30000 non-null int64
BILL_AMT2                     30000 non-null int64
BILL_AMT3                     30000 non-null int64
BILL_AMT4                     30000 non-null int64
BILL_AMT5                     30000 non-null int64
BILL_AMT6                     30000 non-null int64
PAY_AMT1                

### Build training and testing sets
* We selected all the features besides "default payment next month" as independent variables. 
* We selected "default payment next month" as dependent variable, the one we need to predict. 
* train_test_split() function is used to randomly split training and testing datasets.

In [None]:
# Specify features
features = rawData.iloc[:,0:23]
print('Summary of features')
features.head()

In [43]:
# Specify dependent variable
depVar = rawData['default payment next month']

In [12]:
from sklearn.model_selection import train_test_split

# Use cross validation to establish train and test sets
X_train, X_test, y_train, y_test = train_test_split(features, depVar, test_size=0.25)

In [13]:
# use the shape function to double check that the split was made as needed:
X_train.shape, X_test.shape

((22500, 23), (7500, 23))

### Build the models
+ First we use the datasets we created by train_test_split() function to build the models.
+ We will use four different classification classifiers. 
  1. Logistic Regression
  2. Random Forest
  3. Gradient Boosting
  4. Supportive Vector Machine
  

#### Logistic Regression

In [16]:
from sklearn.linear_model import LogisticRegression

In [17]:
modelLR = LogisticRegression()

In [19]:
modelLR.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [20]:
modelLR.score(X_train, y_train)

0.7791111111111111

#### Random Forest

In [21]:
from sklearn.ensemble import RandomForestClassifier

In [28]:
modelRF = RandomForestClassifier(n_estimators=100)

In [29]:
modelRF.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [30]:
modelRF.score(X_train, y_train)

0.9968

#### Gradient Boosting

In [31]:
from sklearn.ensemble import GradientBoostingClassifier

In [105]:
GB = GradientBoostingClassifier()

In [106]:
modelGB = GB.fit(X_train, y_train)

In [107]:
modelGB.score(X_train, y_train)

0.8268888888888889

#### SVM

In [81]:
from sklearn.svm import SVC

In [82]:
modelSVM = SVC()

In [83]:
modelSVM.fit(X_train, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [84]:
modelSVM.score(X_train, y_train)

0.9904444444444445

### Use cross validation
* Random Forest and SVM models using train_test_split are overfitting. 
* We are going to try Cross Validation to avoid overfitting. 

In [49]:
from sklearn.model_selection import cross_val_score

In [58]:
cross_val_score(LogisticRegression(random_state=0), X_train, y_train, cv=3)



array([0.7789628 , 0.77893333, 0.77917056])

In [72]:
cross_val_score(RandomForestClassifier(n_estimators=25, random_state=0), X_train, y_train, cv=3)

array([0.81509132, 0.81026667, 0.80570743])

In [91]:
GB_cv = GradientBoostingClassifier(n_estimators=25, random_state=0)
scores = cross_val_score(GB_cv, X_train, y_train, cv=3)
scores

array([0.82442341, 0.82426667, 0.81490865])

In [85]:
cross_val_score(SVC(random_state=0), X_train, y_train, cv=3)



array([0.78189575, 0.7792    , 0.78090412])

### Make prediction and evaluate 
* From last step, Gradient Boosting model has the best result, this is what we will use to make the predictions.  
* We will make the predictions in two ways.
  1. Use cross_val_predict() function.
  2. Use train_test_split method gradient boosting model. 
* We will use confusion matrix and classification report to evaluate the predictions. 

#### Use cross_val_predict to make the predictions. 

In [98]:
from sklearn.model_selection import cross_val_predict
modelGB_cv = GB_cv.fit(X_train, y_train)
predictions = cross_val_predict(modelGB_cv, X_test, y_test, cv=3)

In [99]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, predictions)

array([[5524,  309],
       [1044,  623]])

The accuracy is 0.8196. 

In [101]:
from sklearn.metrics import classification_report
report = classification_report(y_test, predictions)
report

'              precision    recall  f1-score   support\n\n           0       0.84      0.95      0.89      5833\n           1       0.67      0.37      0.48      1667\n\n   micro avg       0.82      0.82      0.82      7500\n   macro avg       0.75      0.66      0.69      7500\nweighted avg       0.80      0.82      0.80      7500\n'

#### Use train_test_split method gradient boosting model to make the predictions. 

In [109]:
predictions_GB = modelGB.predict(X_test)
confusion_matrix(y_test, predictions_GB)

array([[5516,  317],
       [1024,  643]])

The accuracy is 0.8212.

In [110]:
classification_report(y_test, predictions_GB)

'              precision    recall  f1-score   support\n\n           0       0.84      0.95      0.89      5833\n           1       0.67      0.39      0.49      1667\n\n   micro avg       0.82      0.82      0.82      7500\n   macro avg       0.76      0.67      0.69      7500\nweighted avg       0.80      0.82      0.80      7500\n'

### We chose Gradient Boosting model using train_test_split() function as our final model based on the highest accuracy.