In [None]:
!git clone https://github.com/rhfo3218/LG_ML_tutorial.git
import os
os.chdir('/content/LG_ML_tutorial/4. Imbalance_data_classification')

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix

from utils import SMOTE, RUS, decision_plot

In [2]:
bank = pd.read_csv('Personal Loan.csv')
input_idx = [1,2,3,5,6,7,8,10,11,12,13]
target_idx = 9

X = np.array(bank.iloc[:, input_idx])
y = np.array(bank.iloc[:, target_idx])

X = X[y < 2,:]
y = y[y < 2]

## Stratified Random Partitioning


In [3]:
sss = StratifiedShuffleSplit(n_splits=1,train_size=0.7)

for train_index, test_index in sss.split(X,y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

## model training (original)

In [4]:
model = DecisionTreeClassifier()
model.fit(X_train,y_train)

pred_y = model.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_true=y_test,y_pred=pred_y).ravel()
tpr = tp/(tp+fn)
fpr = fp/(tn+fp)
auc = (1+tpr-fpr)/2

print("TPR: {:.3f}, FPR: {:.3f}, AUC: {:.3f}".format(tpr,fpr,auc))

TPR: 0.909, FPR: 0.019, AUC: 0.945


## model training (SMOTE)

In [5]:
c = Counter(y_train)
ir = np.int(np.floor(c[0]/c[1]))

smt = SMOTE(X_train,y_train,K=5,dup_size=ir)
smt_X = np.vstack((X,smt))
smt_y = np.concatenate((y,np.ones(smt.shape[0])))

model = DecisionTreeClassifier()
model.fit(smt_X,smt_y)

pred_y = model.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_true=y_test,y_pred=pred_y).ravel()
tpr = tp/(tp+fn)
fpr = fp/(tn+fp)
auc = (1+tpr-fpr)/2

print("TPR: {:.3f}, FPR: {:.3f}, AUC: {:.3f}".format(tpr,fpr,auc))

TPR: 1.000, FPR: 0.000, AUC: 1.000


## model training (RUS)

In [6]:
rus_X,rus_y = RUS(X,y,p=0.5)

model = DecisionTreeClassifier()
model.fit(rus_X,rus_y)

pred_y = model.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_true=y_test,y_pred=pred_y).ravel()
tpr = tp/(tp+fn)
fpr = fp/(tn+fp)
auc = (1+tpr-fpr)/2

print("TPR: {:.3f}, FPR: {:.3f}, AUC: {:.3f}".format(tpr,fpr,auc))

TPR: 1.000, FPR: 0.019, AUC: 0.990


## model training (Cost sensitive)

In [7]:
model = DecisionTreeClassifier(class_weight="balanced")
model.fit(X_train,y_train)

pred_y = model.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_true=y_test,y_pred=pred_y).ravel()
tpr = tp/(tp+fn)
fpr = fp/(tn+fp)
auc = (1+tpr-fpr)/2

print("TPR: {:.3f}, FPR: {:.3f}, AUC: {:.3f}".format(tpr,fpr,auc))

TPR: 0.883, FPR: 0.006, AUC: 0.939
