In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Data Loading

In [None]:
# Read the credit-card CSV into a DataFrame and preview five random rows.
CREDIT_CSV = '../input/default-of-credit-card-clients-dataset/UCI_Credit_Card.csv'
df = pd.read_csv(CREDIT_CSV)
df.sample(n=5)

## Exploratory Data Analysis

In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Summary statistics for the six payment-delay columns (PAY_0, PAY_2..PAY_6).
pay_delay_cols = ['PAY_0'] + ['PAY_{}'.format(month) for month in range(2, 7)]
df[pay_delay_cols].describe()

All six columns contain an undocumented value of -2. If 1, 2, 3, etc. are the months of delay, 0 should be labeled 'pay duly' and every negative value should be treated as 0. We will get to that later.

In [None]:
# Summary statistics for the six bill-statement columns (BILL_AMT1..BILL_AMT6).
bill_amount_cols = ['BILL_AMT{}'.format(month) for month in range(1, 7)]
df[bill_amount_cols].describe()

What is the reason for the negative bill amounts? This needs to be investigated.

In [None]:
# Summary statistics for the six previous-payment columns (PAY_AMT1..PAY_AMT6).
pay_amount_cols = ['PAY_AMT{}'.format(month) for month in range(1, 7)]
df[pay_amount_cols].describe()

In [None]:
# Summary statistics for the LIMIT_BAL column on its own.
df.LIMIT_BAL.describe()

The range of LIMIT_BAL is very broad and needs to be investigated.

In [None]:
# Shorten the target's name and rename PAY_0 so the payment-delay columns
# are numbered PAY_1..PAY_6.
column_renames = {
    'default.payment.next.month': 'def_pay',
    'PAY_0': 'PAY_1',
}
df = df.rename(columns=column_renames)
df.head()

## Target Distribution

This dataset employed a binary variable, default payment (Yes = 1, No = 0), as the response variable.


In [None]:
# Overall default rate: number of defaulting rows divided by the total row count.
default_count = df['def_pay'].sum()
row_count = len(df['def_pay'])
default_count / row_count

In [None]:
# Histogram of the binary target to visualise how its two values are distributed.
plt.hist(df['def_pay'])

In [None]:
# Absolute number of rows for each target value.
df.def_pay.value_counts()

# Finding Features Which are Highly Correlated with Target

In [None]:
# Pairwise correlation matrix of all columns.
cor = df.corr()
# Magnitude of each column's correlation with the target.
cor_target = cor["def_pay"].abs()
# Keep only the columns whose absolute correlation exceeds 0.05.
relevant_features = cor_target.loc[cor_target > 0.05]
relevant_features

# Finding Columns with NULL Values

As the cell below shows, there are no null values in the dataset.

In [None]:
# Number of missing values in each column.
df.isnull().sum(axis=0)

## Decision Tree Classifier with default Dataset

In [None]:
# Separate the target (y) from the features (X).
# `df` is deliberately left untouched: the previous version reassigned
# `df = df.drop(...)`, so re-running this cell raised a KeyError because
# 'def_pay' was already gone. Later cells only select named columns from `df`,
# so keeping the target column there is harmless.
y = df['def_pay'].copy()
# drop() returns a new frame, so X is independent of df.
# NOTE(review): X keeps every non-target column; if the CSV carries a row
# identifier column it becomes a model feature too — confirm that is intended.
X = df.drop(columns=['def_pay'])
X.head()

In [None]:
#importing libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import train_test_split

In [None]:
# create training and testing vars (80% train / 20% test).
# The split is seeded so the accuracy reported below is the same on every
# Restart & Run All instead of changing with each random shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=14)

In [None]:
# Baseline: a depth-limited, seeded decision tree fitted on the training split.
classifier = DecisionTreeClassifier(max_depth=10, random_state=14).fit(X_train, y_train)
# Label the held-out rows ...
predictions = classifier.predict(X_test)
# ... and report the fraction that was labelled correctly.
accuracy_score(y_test, predictions)

## Decision Tree Classifier with Selected Features

In [None]:
# Selected features: the credit limit, the six payment-delay columns and the
# six previous-payment amounts, in that order.
sel_feature = (
    ['LIMIT_BAL']
    + ['PAY_{}'.format(month) for month in range(1, 7)]
    + ['PAY_AMT{}'.format(month) for month in range(1, 7)]
)
X = df[sel_feature].copy()
X.head()

In [None]:
# Peek at the target series that was separated from the features earlier.
y.head()

In [None]:
# create training and testing vars (80% train / 20% test) for the selected features.
# Seeded so the scores quoted in the surrounding text are reproducible; note
# this split is NOT stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=14)

In [None]:
def decisiontree(X_train,y_train,X_test,y_test):
    """Fit a depth-10 decision tree and evaluate it on a held-out split.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to ``fit``.
    X_test, y_test   : held-out features and their true labels.

    Returns
    -------
    tuple ``(accuracy, predictions, prob)`` where
      * ``accuracy``    is ``accuracy_score`` of the hard predictions,
      * ``predictions`` are the hard class labels for ``X_test``,
      * ``prob``        is the predicted probability of the second class in
        ``classifier.classes_`` for each test row, suitable as ``y_score``
        for ``roc_auc_score`` / ``roc_curve``.
    """
    classifier = DecisionTreeClassifier(max_depth=10, random_state=14)
    # training the classifier
    classifier.fit(X_train, y_train)
    # hard class labels for the test rows
    predictions = classifier.predict(X_test)
    # Continuous scores for ROC analysis. The previous version called
    # predict() here as well, so AUC was computed from 0/1 labels (a single
    # operating point) rather than from ranked probabilities.
    # Assumes a binary target with both classes present in y_train, so that
    # predict_proba has two columns and column 1 is the positive class.
    prob = classifier.predict_proba(X_test)[:, 1]
    return accuracy_score(y_true = y_test, y_pred = predictions), predictions, prob

In [None]:
# Fit and evaluate the tree on the selected-feature split. `predictions` and
# `prob` are kept because the performance report and AUC cells below read them.
score,predictions, prob = decisiontree(X_train,y_train,X_test,y_test)
print("Decision Tree Prediction Score : ", score )

## Analyzing Classification Performance

We can see that the accuracy score does not accurately reflect our classification performance. The confusion matrix and classification report show poor precision, recall, and F1 score for class 1 compared to class 0.

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

def classification_performance(y_true,y_pred):
    """Print the confusion matrix followed by per-class recall, precision and F1.

    Each metric is computed with ``average=None`` so one value per class is
    shown. Nothing is returned; the report is written to stdout.
    """
    print("Confusion Matrix\n")
    print(confusion_matrix(y_true, y_pred))

    # (heading, scorer) pairs, printed in this order
    per_class_metrics = (
        ("\nRecall Score\n", recall_score),
        ("\nPrecision Score\n", precision_score),
        ("\n F1 Score\n", f1_score),
    )
    for heading, scorer in per_class_metrics:
        print(heading)
        print(scorer(y_true, y_pred, average=None))
    

In [None]:
# Confusion matrix and per-class recall / precision / F1 for the unstratified split.
print("Classification Performance \n")
classification_performance(y_test,predictions)

In [None]:
from sklearn.metrics import classification_report
# sklearn's text summary (precision, recall, f1-score, support) for the same predictions.
print(classification_report(y_test, predictions))

In the confusion matrix and classification report above, we can see poor scores for the minority class (1). We need to take steps to improve the scores for the minority class.

In [None]:
# ROC AUC on the test rows of the unstratified split, using the `prob` values
# returned by decisiontree() for that split.
dectree_auc = roc_auc_score(y_test, prob)

print("Decision Tree AUC for Imbalanced Dataset : ",dectree_auc)

## Addressing Class Imbalance with Stratification

In [None]:
# create training and testing vars; stratify=y keeps the class proportions of
# the target the same in the train and test parts.
# Seeded so the comparison with the unstratified run is reproducible.
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, stratify=y, test_size=0.2, random_state=14)

In [None]:
# Fit and evaluate the same tree on the stratified split; predictions1 / prob1
# feed the report and AUC cells that follow.
score,predictions1,prob1 = decisiontree(X_train1,y_train1,X_test1,y_test1)

print("Decision Tree Prediction Score : ", score )
print("Classification Performance \n")

classification_performance(y_test1,predictions1)

In [None]:
# Text summary of precision, recall, f1-score and support for the stratified split.
print(classification_report(y_test1, predictions1))

Here, we can see some improvement in precision, recall, and f1-score for class 1; however, the results are still disappointing despite the use of stratification. We need to look for other strategies to resolve this issue.

In [None]:
# ROC AUC for the stratified split. This result was previously stored in
# `dectree_sm_auc` and printed as "SM", which made it look like the SMOTE
# experiment; the SMOTE cell assigns `dectree_sm_auc` itself before using it.
dectree_strat_auc = roc_auc_score(y_test1, prob1)

print("Decision Tree AUC for Stratified Split : ", dectree_strat_auc)

## Addressing Class Imbalance with SMOTE

In [None]:
from collections import Counter
from imblearn.over_sampling import SMOTE

# Over-sample the minority class with synthetic rows; seeded so the resampled
# data (and every score derived from it) is reproducible.
# NOTE(review): X and y are the full dataset here, so synthetic rows are created
# before the train/test split in the next cell and can be derived from rows that
# end up in the test set. Consider resampling only the training part.
smote = SMOTE(random_state=14)
# fit_resample is the supported imbalanced-learn API; the old fit_sample alias
# was deprecated and is absent from newer releases.
X_sm, y_sm = smote.fit_resample(X, y)
# summarize the new class distribution
counter = Counter(y_sm)
print(counter)

In [None]:
# create training and testing vars from the SMOTE-resampled data, stratified
# on the resampled target. Seeded so the reported scores are reproducible.
X_train_sm, X_test_sm, y_train_sm, y_test_sm = train_test_split(X_sm, y_sm, stratify=y_sm, test_size=0.2, random_state=14)

In [None]:
# Fit and evaluate the tree on the SMOTE-resampled split.
# NOTE: `prob1` is reused here, so it replaces the scores that the
# stratified-split run stored under the same name.
score,predictions_sm, prob1 = decisiontree(X_train_sm,y_train_sm,X_test_sm,y_test_sm)

print("Decision Tree Prediction Score : ", score )
print("Classification Performance \n")

classification_performance(y_test_sm,predictions_sm)

In [None]:
# Text summary of precision, recall, f1-score and support for the SMOTE split.
print(classification_report(y_test_sm, predictions_sm))

After using SMOTE to balance the number of samples for the minority class, we can see that classification performance for class 1 has improved on all relevant measures (precision, recall, and f1-score). However, the f1-score is now around 0.73, down from 0.82.

In [None]:
# ROC AUC for the SMOTE experiment; `prob1` holds the scores assigned by the
# SMOTE run of decisiontree() above.
dectree_sm_auc = roc_auc_score(y_test_sm, prob1)

print("Decision Tree AUC for Smoote approach : ",dectree_sm_auc)

AUC also improved from 0.66 to 0.72 after using SMOTE.

## Combining SMOTE with random Under-Sampling of the majority class

In [None]:
from imblearn.pipeline import Pipeline
# Imported here as well so this cell does not depend on an earlier cell having
# brought SMOTE into scope.
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# define pipeline: SMOTE over-sampling followed by random under-sampling.
# Both samplers are seeded so the resampled dataset is reproducible.
# NOTE(review): SMOTE already raises the minority/majority ratio to 0.8, so an
# under-sampler targeting the same 0.8 ratio appears to have little or nothing
# left to remove — confirm the intended ratios (e.g. a lower SMOTE ratio).
over = SMOTE(sampling_strategy=0.8, random_state=14)
under = RandomUnderSampler(sampling_strategy=0.8, random_state=14)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
# transform the dataset
X_sm2, y_sm2 = pipeline.fit_resample(X, y)


In [None]:
# create training and testing vars from the SMOTE + under-sampled data,
# stratified on the resampled target. Seeded so the scores are reproducible.
X_train_sm2, X_test_sm2, y_train_sm2, y_test_sm2 = train_test_split(X_sm2, y_sm2, stratify=y_sm2, test_size=0.2, random_state=14)

In [None]:
# Fit and evaluate the tree on the SMOTE + under-sampling split; predictions_sm2
# and prob2 are read by the report and ROC/AUC cells below.
score,predictions_sm2,prob2 = decisiontree(X_train_sm2,y_train_sm2,X_test_sm2,y_test_sm2)

print("Decision Tree Prediction Score : ", score )
print("Classification Performance \n")

classification_performance(y_test_sm2,predictions_sm2)

In [None]:
# Text summary of precision, recall, f1-score and support for the SMOTE + under-sampling split.
print(classification_report(y_test_sm2, predictions_sm2))

## ROC Curve and AUC Score

In [None]:

# AUC for the three experiments, side by side.
# The tree is not refitted here: the SMOTE + under-sampling cell above already
# produced prob2 from the same data with the same seeded classifier, so a
# second decisiontree() call only repeated identical work.
dectree_sm2_auc = roc_auc_score(y_test_sm2, prob2)
dectree_sm_auc = roc_auc_score(y_test_sm, prob1)
dectree_auc = roc_auc_score(y_test, prob)

print("Decision Tree AUC for Imbalanced Dataset : ",dectree_auc)
print("Decision Tree AUC for Complexing Smote Approach : ",dectree_sm2_auc)
print("Decision Tree AUC for Simple Smote Approach : ",dectree_sm_auc)

In [None]:
from matplotlib import pyplot

# Compute the ROC curve of each experiment first ...
im_fpr, im_tpr, _ = roc_curve(y_test, prob)
ns_fpr, ns_tpr, _ = roc_curve(y_test_sm2, prob2)
sm_fpr, sm_tpr, _ = roc_curve(y_test_sm, prob1)

# ... then draw them on the same axes, in the original order.
roc_lines = (
    (im_fpr, im_tpr, '--', 'Decision Tree with Imbalanced Dataset'),
    (ns_fpr, ns_tpr, '--', 'Decision Tree with SMOTE and UnderSampling'),
    (sm_fpr, sm_tpr, 'solid', 'Decision Tree with SMOTE'),
)
for fpr, tpr, line_style, legend_label in roc_lines:
    pyplot.plot(fpr, tpr, linestyle=line_style, label=legend_label)

# Label the axes, add the legend and render the figure.
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend()
pyplot.show()

## AUC using Cross-Validation

In [None]:

from sklearn.model_selection import cross_val_score

# 3-fold cross-validated ROC AUC of a depth-5 tree on X, y.
classifier = DecisionTreeClassifier(random_state=14, max_depth=5)
cv_scores = cross_val_score(estimator=classifier, X=X, y=y, scoring='roc_auc', cv=3)
print(cv_scores)

In [None]:
from sklearn.model_selection import cross_val_score

# Same 3-fold cross-validated ROC AUC, this time for a depth-10 tree.
classifier = DecisionTreeClassifier(random_state=14, max_depth=10)
cv_scores = cross_val_score(estimator=classifier, X=X, y=y, scoring='roc_auc', cv=3)
print(cv_scores)

## Using GridSearch Cross Validation for Tuning Parameters to Improve Classification Performance

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive 3-fold search over leaf-count, split-size and depth settings,
# scored by ROC AUC.
params = {
    'max_leaf_nodes': list(range(2, 100)),
    'min_samples_split': [2, 3, 4],
    'max_depth': [5, 10, 15],
}
base_tree = DecisionTreeClassifier(random_state=42)
grid_search_cv = GridSearchCV(
    estimator=base_tree,
    param_grid=params,
    scoring='roc_auc',
    cv=3,
    verbose=1,
)
grid_search_cv.fit(X, y)
print(grid_search_cv)
# best mean cross-validated score found by the search
print(grid_search_cv.best_score_)

In [None]:
# Best mean cross-validated score and the corresponding fitted estimator.
# NOTE: the second line prints best_estimator_ (the refitted tree) under the
# "Best Parameters" label; grid_search_cv.best_params_ would give the plain dict.
print("Best Score :  ",grid_search_cv.best_score_)
print("Best Parameters : ",grid_search_cv.best_estimator_)


## Using RandomSearch Cross Validation for Tuning Parameters to Improve Classification Performance

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Randomised 3-fold search over the same kind of tree settings, scored by ROC AUC.
# Unlike the grid search, only a random sample of the combinations is tried
# (n_iter is left at its library default).
params = {'max_leaf_nodes': list(range(2, 100)), 'min_samples_split': [2, 3, 4],'max_depth': [15,25,35]}
# random_state seeds the sampling of candidate settings; without it a different
# subset is evaluated on every run and the reported best score is not reproducible.
random_search_cv = RandomizedSearchCV(DecisionTreeClassifier(random_state=42), params, verbose=1, cv=3, scoring='roc_auc', random_state=42)
random_search_cv.fit(X, y)
print(random_search_cv)
# summarize the results of the random search
print(random_search_cv.best_score_)

In [None]:
# Best mean cross-validated score and the corresponding fitted estimator.
# NOTE: the second line prints best_estimator_ (the refitted tree) under the
# "Best Parameters" label; random_search_cv.best_params_ would give the plain dict.
print("Best Score :  ",random_search_cv.best_score_)
print("Best Parameters : ",random_search_cv.best_estimator_)


## References

https://campus.datacamp.com/courses/supervised-learning-with-scikit-learn/fine-tuning-your-model?ex=7
