In [24]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [25]:
dataset = pd.read_csv('creditcard.csv')
X = dataset.drop(['Time','Class'], axis=1)
y = dataset['Class']

# SMOTE and RandomUnderSampler

In [26]:
print('Original dataset shape %s' % Counter(y))

over = SMOTE(sampling_strategy=0.1)
under = RandomUnderSampler(sampling_strategy=0.5)

X_res, y_res = over.fit_resample(X, y)
print('oversampled dataset shape %s' % Counter(y_res))

X_res, y_res = under.fit_resample(X_res, y_res)
print('undersampled dataset shape %s' % Counter(y_res))
X_res

Original dataset shape Counter({0: 284315, 1: 492})
oversampled dataset shape Counter({0: 284315, 1: 28431})
undersampled dataset shape Counter({0: 56862, 1: 28431})


Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,-0.288644,0.474731,1.080665,-1.119274,0.295908,0.056141,0.579372,0.162401,0.170653,-0.928885,...,-0.087638,0.032338,0.055886,0.230444,0.726429,-1.025912,-0.912560,0.224794,0.238287,43.000000
1,-0.947856,0.495617,0.339040,-1.819031,-0.224803,-0.042607,-0.210098,0.709135,-1.170484,0.405571,...,0.025093,0.360126,1.110914,-0.037896,-0.263549,-0.365538,-0.279933,0.405696,0.209707,5.000000
2,1.159657,0.134954,0.585037,0.506047,-0.362031,-0.321897,-0.134882,0.084110,-0.186849,0.067560,...,-0.139834,-0.168162,-0.504580,0.182394,0.210676,0.077123,0.095190,-0.018086,0.006852,1.780000
3,-2.492758,-2.588163,1.671693,-2.439837,-0.258032,-0.920751,0.450961,0.136687,1.738587,-2.673990,...,1.175338,0.590345,0.793448,0.792941,0.079805,0.852970,-0.707992,-0.056386,0.130218,404.090000
4,-1.695565,1.026970,1.124062,-0.215105,0.366588,0.538813,0.747538,0.072287,0.509853,0.241162,...,0.013833,-0.385732,-0.486837,-0.252577,-0.904552,0.501181,0.337408,-0.387132,-0.371266,49.180000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85288,-3.654374,3.363035,-7.526656,7.780785,-0.451066,-1.977493,-3.162340,-0.266446,-1.571107,-2.485314,...,-0.181229,0.541733,0.715010,0.386696,-0.617501,-0.432380,0.360900,-2.653777,0.421509,1.000000
85289,-1.408700,1.438140,-1.008564,1.647141,-0.794834,-0.551009,-1.865879,0.564645,-1.258706,-2.396785,...,0.101045,0.608281,0.568741,-0.196049,-0.324177,0.405478,-0.275739,-0.181050,-0.047365,19.536391
85290,-8.994161,6.155081,-11.533119,8.078497,-8.502171,-2.530864,-11.094113,5.914387,-6.647917,-11.320075,...,-0.094387,2.101042,-0.005590,-0.000011,0.294520,0.088598,0.223640,0.202704,-0.288090,116.451243
85291,-1.489900,2.179493,-1.313661,1.466664,-0.545221,-1.225320,-1.691404,-0.514755,-1.320276,-2.907863,...,0.245606,1.223454,0.003716,-0.171000,0.122291,0.001801,-0.507772,0.327377,-0.096929,1.000000


In [27]:
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.25)

model = DecisionTreeClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99     14274
           1       0.98      0.99      0.98      7050

    accuracy                           0.99     21324
   macro avg       0.99      0.99      0.99     21324
weighted avg       0.99      0.99      0.99     21324

