In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
#import lightgbm as lgb

from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from imblearn.ensemble import BalancedBaggingClassifier

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, recall_score, roc_auc_score, precision_score

import warnings
warnings.filterwarnings('ignore')

In [78]:
def eval_metrics(actual, predicted, predicted_probability):
    print("Confusion Matrix")
    print(pd.DataFrame(confusion_matrix(actual, predicted)))
    print("")
    print("For Class 1")
    print("f1 Score :", f1_score(actual, predicted))
    print("Precision Score :",precision_score(actual, predicted))
    print("Recall Score :",recall_score(actual, predicted))
    print("")
    print("For Class 0")
    print("f1 Score :", f1_score(1-np.array(actual), 1-np.array(predicted)))
    print("Precision Score :",precision_score(1-np.array(actual), 1-np.array(predicted)))
    print("Recall Score :",recall_score(1-np.array(actual), 1-np.array(predicted)))
    print("")
    print("AUC :",  roc_auc_score(actual, predicted_probability))

In [2]:
df = pd.read_csv('SampleData.csv')
print(df.shape)
df.head()

(284807, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
df.Class.value_counts()

Class
0    284315
1       492
Name: Time, dtype: int64

## Basic Model 

In [57]:
Xtrain, Xtest, ytrain, ytest = train_test_split(df.drop(columns = 'Class'), df.Class, test_size=0.20, random_state=27)
print("Train Shape", Xtrain.shape)
print("Train Class Distribution")
print(ytrain.value_counts())
print("")
print("Test Shape", Xtest.shape)
print("Test Class Distribution")
print(ytest.value_counts())


Train Shape (227845, 30)
Train Class Distribution
0    227457
1       388
Name: Class, dtype: int64

Test Shape (56962, 30)
Test Class Distribution
0    56858
1      104
Name: Class, dtype: int64


In [49]:
# Lets Predict All 0 (Most Frequent)
pred_0 = [0]*len(ytest)

# Accuracy
print('Test score: ', accuracy_score(ytest, pred_0))
eval_metrics(ytest, pred_0)

Test score:  0.9981742214107651
Confusion Matrix
       0  1
0  56858  0
1    104  0

For Class 1
f1 Score : 0.0
Precision Score : 0.0
Recall Score : 0.0

For Class 0
f1 Score : 0.9990862765770515
Precision Score : 0.9981742214107651
Recall Score : 1.0

AUC : 0.5


# Logistic Regression

In [6]:
lrclf = LogisticRegression().fit(Xtrain, ytrain)
pred_test_lr = lrclf.predict(Xtest)
# Checking accuracy
accuracy_score(ytest, pred_test_lr)

0.9991046662687406

In [7]:
pd.DataFrame(pred_test_lr)[0].value_counts()

0    56885
1       77
Name: 0, dtype: int64

### Now We will use different Evalution Metrics
- Confusion Matrix (Precision, Recall)
- F1 Score
- Precision Recall
- AUROC

In [58]:
eval_metrics(ytest, pred_test_lr)

Confusion Matrix
       0   1
0  56846  12
1     39  65

For Class 1
f1 Score : 0.7182320441988951
Precision Score : 0.8441558441558441
Recall Score : 0.625

For Class 0
f1 Score : 0.9995516207590797
Precision Score : 0.9993144062582403
Recall Score : 0.9997889479053079

AUC : 0.812394473952654


## Findings
- Low F1 Score
- Many Missclassifications (low recall score)

# Resampling Techniques 

## Undersampling Majority Class 

In [13]:
X = pd.concat([Xtrain, ytrain], axis=1)
zero = X[X.Class == 0]
one = X[X.Class == 1]

In [14]:
# Undersampling Zeros
zero_under = resample(zero, replace = False, n_samples = int(round(len(one)*1.25,0)), random_state = 111)
# Final Undersampled Data
undersampled = pd.concat([zero_under, one]).reset_index(drop = True)
undersampled.shape

(873, 31)

In [15]:
us_lrclf = LogisticRegression().fit(undersampled.drop(columns = 'Class'), undersampled.Class)
pred_test_lr_us = us_lrclf.predict(Xtest)
# Checking accuracy
accuracy_score(ytest, pred_test_lr_us)

0.9804957691092308

In [59]:
eval_metrics(ytest, pred_test_lr_us)

Confusion Matrix
       0     1
0  55762  1096
1     15    89

For Class 1
f1 Score : 0.13809154383242822
Precision Score : 0.0751054852320675
Recall Score : 0.8557692307692307

For Class 0
f1 Score : 0.9901362809073557
Precision Score : 0.9997310719472184
Recall Score : 0.9807239086847936

AUC : 0.9182465697270122


## Oversampling Minority Class 

In [17]:
# Over Sampling Class 1
one_over = resample(one, replace = True, n_samples = int(round(len(zero)*0.9,0)), random_state = 111)
# Oversampled Data
oversampled = pd.concat([zero, one_over]).reset_index(drop = True)
oversampled.shape

(432168, 31)

In [60]:
os_lrclf = LogisticRegression().fit(oversampled.drop(columns = 'Class'), oversampled.Class)
pred_test_lr_os = os_lrclf.predict(Xtest)
# Checking accuracy
accuracy_score(ytest, pred_test_lr_os)

0.9854639935395527

In [61]:
eval_metrics(ytest, pred_test_lr_os)

Confusion Matrix
       0    1
0  56044  814
1     14   90

For Class 1
f1 Score : 0.17857142857142858
Precision Score : 0.09955752212389381
Recall Score : 0.8653846153846154

For Class 0
f1 Score : 0.9926671153778029
Precision Score : 0.9997502586606729
Recall Score : 0.9856836329100567

AUC : 0.925534124147336


## Generate Synthetic Samples

In [20]:
sm = SMOTE(random_state=27, ratio=1.0)
Xtrain_sm, ytrain_sm = sm.fit_sample(Xtrain, ytrain)

In [23]:
print("Normal Data Shape", Xtrain.shape)
print("Data After Synthetic Data", Xtrain_sm.shape)

Normal Data Shape (227845, 30)
Data After Synthetic Data (454914, 30)


In [24]:
sm_lrclf = LogisticRegression().fit(Xtrain_sm, ytrain_sm)
pred_test_lr_sm = sm_lrclf.predict(Xtest)

In [25]:
print("Accuracy : ", accuracy_score(ytest, pred_test_lr_sm))
eval_metrics(ytest, pred_test_lr_sm)

Accuracy :  0.9834977704434535
f1 Score : 0.1622103386809269
Confusion Matrix
       0    1
0  55931  927
1     13   91
Recall Score : 0.875
AUC : 0.9293481128425198


## Cluster Centroids UnderSampling

In [30]:
from imblearn.under_sampling import ClusterCentroids

cc = ClusterCentroids(ratio={0: 10})
Xtrain_cc, ytrain_cc = cc.fit_sample(Xtrain, ytrain)

In [31]:
cc_lrclf = LogisticRegression().fit(Xtrain_cc, ytrain_cc)
pred_test_lr_cc = cc_lrclf.predict(Xtest)

In [32]:
print("Accuracy : ", accuracy_score(ytest, pred_test_lr_cc))
eval_metrics(ytest, pred_test_lr_cc)

Accuracy :  0.12436361082827148
f1 Score : 0.004152857085812403
Confusion Matrix
      0      1
0  6980  49878
1     0    104
Recall Score : 1.0
AUC : 0.5613809842062683


# Change in algorithms 

## Balanced Bagging Classifier

In [26]:
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier

#Create an object of the classifier.
bbclf = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                sampling_strategy='auto',
                                replacement=False,
                                random_state=0)

#Train the classifier.
bbclf.fit(Xtrain, ytrain)
pred_test_bb = bbclf.predict(Xtest)

In [62]:
print("Accuracy : ", accuracy_score(ytest, pred_test_bb))
eval_metrics(ytest, pred_test_bb)

Accuracy :  0.9803553246023665
Confusion Matrix
       0     1
0  55752  1106
1     13    91

For Class 1
f1 Score : 0.13989239046887011
Precision Score : 0.07602339181286549
Recall Score : 0.875

For Class 0
f1 Score : 0.9900641964785167
Precision Score : 0.9997668788666726
Recall Score : 0.980548031939217

AUC : 0.9277740159696085


## Random Forest 

In [28]:
# Random Forest
rfclf = RandomForestClassifier(n_estimators=10).fit(Xtrain, ytrain)
# predict on test set
pred_test_rf = rfclf.predict(Xtest)

In [63]:
print("Accuracy : ", accuracy_score(ytest, pred_test_rf))
eval_metrics(ytest, pred_test_rf)

Accuracy :  0.9995786664794073
Confusion Matrix
       0   1
0  56856   2
1     22  82

For Class 1
f1 Score : 0.8723404255319148
Precision Score : 0.9761904761904762
Recall Score : 0.7884615384615384

For Class 0
f1 Score : 0.9997889850179362
Precision Score : 0.9996132072154436
Recall Score : 0.9999648246508847

AUC : 0.8942131815562114


## XGBoost 

In [82]:
import xgboost as xgb
params1 = {
    'booster':'gbtree',
    'objective': 'binary:logistic',
    'eval_metric' : 'auc',
    'max_depth':5,
    'subsample':0.8,
    'colsample_bytree': 0.7,
    'eta':0.03,
    'silent':0,
    'seed': 2009
}

params2 = {
    'booster':'gbtree',
    'objective': 'binary:logistic',
    'eval_metric' : 'auc',
    'max_depth':5,
    'subsample':0.8,
    'colsample_bytree': 0.7,
    'eta':0.03,
    'silent':0,
    'scale_pos_weight' : 1000,
    'seed': 2009
}
Xdev, Xval, ydev, yval = train_test_split(Xtrain, ytrain, test_size=0.20, random_state=27)
xgdev = xgb.DMatrix(Xdev, label=ydev)
xgval = xgb.DMatrix(Xval, label=yval)
watchlist = [(xgdev, 'train'), (xgval, 'valid')]
xgtest = xgb.DMatrix(Xtest)
evals={}
xgbclf1 = xgb.train(params1, xgdev, 10000, watchlist, early_stopping_rounds = 10, verbose_eval = 100)
pred_test_xgb1 = xgbclf1.predict(xgtest)
pred_test_xgb1_class = np.where(pred_test_xgb1 >= 0.5, 1, 0)
xgbclf2 = xgb.train(params2, xgdev, 10000, watchlist, early_stopping_rounds = 10, verbose_eval = 100)
pred_test_xgb2 = xgbclf2.predict(xgtest)
pred_test_xgb2_class = np.where(pred_test_xgb2 >= 0.5, 1, 0)

[0]	train-auc:0.926082	valid-auc:0.903362
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 10 rounds.
Stopping. Best iteration:
[3]	train-auc:0.926078	valid-auc:0.921461

[0]	train-auc:0.970125	valid-auc:0.956507
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 10 rounds.
Stopping. Best iteration:
[12]	train-auc:0.999055	valid-auc:0.974017



In [83]:
print("Accuracy : ", accuracy_score(ytest, pred_test_xgb1_class))
eval_metrics(ytest, pred_test_xgb1_class, pred_test_xgb1)

Accuracy :  0.9995435553526912
Confusion Matrix
       0   1
0  56853   5
1     21  83

For Class 1
f1 Score : 0.8645833333333335
Precision Score : 0.9431818181818182
Recall Score : 0.7980769230769231

For Class 0
f1 Score : 0.9997713923961594
Precision Score : 0.9996307627386856
Recall Score : 0.9999120616272117

AUC : 0.9131857163730427


In [84]:
print("Accuracy : ", accuracy_score(ytest, pred_test_xgb2_class))
eval_metrics(ytest, pred_test_xgb2_class, pred_test_xgb2)

Accuracy :  0.990045995575998
Confusion Matrix
       0    1
0  56306  552
1     15   89

For Class 1
f1 Score : 0.2389261744966443
Precision Score : 0.13884555382215288
Recall Score : 0.8557692307692307

For Class 0
f1 Score : 0.994990236704689
Precision Score : 0.9997336695016069
Recall Score : 0.9902916036441661

AUC : 0.9694557899977543


## LightGBM 

In [88]:
## Lets look into parameters of Lightgbm
import lightgbm as lgb
lgb_params1 = {
    "objective": "binary",
    "max_bin": 50,
    "metric": "auc",
    "boosting": "gbdt",
    "num_leaves": 5,
    "max_depth": -1,
    "min_child_weight": 50,
    "learning_rate": 0.05,
    "bagging_fraction": 0.5,
    "feature_fraction": 0.2,
    "bagging_frequency": 5,
    "bagging_seed": 2019,
    "verbosity": 500,
    "min_data_in_leaf": 750,
    "min_data_in_bin": 50,
    "random_seed": 2019
    }

lgb_params2 = {
    "objective": "binary",
    "max_bin": 50,
    "metric": "auc",
    "boosting": "gbdt",
    "num_leaves": 5,
    "max_depth": -1,
    "min_child_weight": 50,
    "learning_rate": 0.05,
    #"bagging_fraction": 0.5,
    "feature_fraction": 0.2,
    "bagging_frequency": 5,
    "bagging_seed": 2019,
    "verbosity": 500,
    "min_data_in_leaf": 750,
    "min_data_in_bin": 50,
    'pos_bagging_fraction':0.8,
    'neg_bagging_fraction':0.2,
    "random_seed": 2019
    }
lgdev = lgb.Dataset(Xdev, label=ydev)
lgval = lgb.Dataset(Xval, label=yval)
lgtest = lgb.Dataset(Xtest)
evals_result = {}

lgbclf1 = lgb.train(lgb_params1, lgdev, 10000, valid_sets=[lgval], early_stopping_rounds=10, verbose_eval=None, evals_result=evals_result)
lgbclf2 = lgb.train(lgb_params2, lgdev, 10000, valid_sets=[lgval], early_stopping_rounds=10, verbose_eval=None, evals_result=evals_result)

pred_test_lgb1 = lgbclf1.predict(Xtest)
pred_test_lgb1_class = np.where(pred_test_lgb1 >= 0.017, 1, 0)

pred_test_lgb2 = lgbclf2.predict(Xtest)
pred_test_lgb2_class = np.where(pred_test_lgb2 >= 0.017, 1, 0)




In [98]:
pred_test_lgb1_class = np.where(pred_test_lgb1 >= 0.017, 1, 0)
pred_test_lgb2_class = np.where(pred_test_lgb2 >= 0.017, 1, 0)

In [99]:
print("Accuracy : ", accuracy_score(ytest, pred_test_lgb1_class))
eval_metrics(ytest, pred_test_lgb1_class, pred_test_lgb1)

Accuracy :  0.9992802219023208
Confusion Matrix
       0   1
0  56841  17
1     24  80

For Class 1
f1 Score : 0.7960199004975124
Precision Score : 0.8247422680412371
Recall Score : 0.7692307692307693

For Class 0
f1 Score : 0.9996394748643634
Precision Score : 0.9995779477710367
Recall Score : 0.9997010095325196

AUC : 0.970577173363061


In [100]:
print("Accuracy : ", accuracy_score(ytest, pred_test_lgb2_class))
eval_metrics(ytest, pred_test_lgb2_class, pred_test_lgb2)

Accuracy :  0.9992802219023208
Confusion Matrix
       0   1
0  56841  17
1     24  80

For Class 1
f1 Score : 0.7960199004975124
Precision Score : 0.8247422680412371
Recall Score : 0.7692307692307693

For Class 0
f1 Score : 0.9996394748643634
Precision Score : 0.9995779477710367
Recall Score : 0.9997010095325196

AUC : 0.970577173363061
