In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
df = pd.read_csv('creditcard.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [4]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [5]:
# normalize the amount column
df['normAmount'] = StandardScaler().fit_transform(np.array(df['Amount']).reshape(-1,1))

# drop Time and Amount (not relevant for prediction purpose)
df = df.drop(['Time', 'Amount'], axis =1)

df['Class'].value_counts()
# there are 492 fraud transactions

0    284315
1       492
Name: Class, dtype: int64

In [6]:
x = df.drop(['Class'], axis=1)
y = df['Class']

In [7]:
xtr, xts, ytr, yts = train_test_split(x,y, test_size=.2, random_state=0)

print('Number transactions xtrain : ', xtr.shape)
print('Number transactions ytrain : ', ytr.shape)
print('Number transactions xtest : ', xts.shape)
print('Number transactions ytest : ', yts.shape)

Number transactions xtrain :  (227845, 29)
Number transactions ytrain :  (227845,)
Number transactions xtest :  (56962, 29)
Number transactions ytest :  (56962,)


In [8]:
# LogisticRegression
LRG = LogisticRegression().fit(xtr,ytr.ravel())
yp_LRG = LRG.predict(xts)
print(classification_report(yts, yp_LRG))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56861
           1       0.88      0.63      0.74       101

    accuracy                           1.00     56962
   macro avg       0.94      0.82      0.87     56962
weighted avg       1.00      1.00      1.00     56962



In [9]:
# Decision Tree
DTC = DecisionTreeClassifier().fit(xtr,ytr.ravel())
yp_DTC = DTC.predict(xts)
print(classification_report(yts, yp_DTC))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56861
           1       0.74      0.77      0.76       101

    accuracy                           1.00     56962
   macro avg       0.87      0.89      0.88     56962
weighted avg       1.00      1.00      1.00     56962



In [10]:
# # Random Forest
# RFC = RandomForestClassifier().fit(xtr,ytr.ravel())
# yp_RFC = RFC.predict(xts)
# print(classification_report(yts, yp_RFC))

KeyboardInterrupt: 

In [None]:
# # KNN 
# KNN = KNeighborsClassifier().fit(xtr,ytr.ravel())
# yp_KNN = KNN.predict(xts)
# print(classification_report(yts, yp_KNN))

### SMOTE

In [10]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = 2)
xtr_res, ytr_res = sm.fit_sample(xtr, ytr.ravel())

In [11]:
print('Before OverSampling, counts of label 1 : {}'.format(sum(ytr == 1)))
print('Before OverSampling, counts of label 0 : {} \n'.format(sum(ytr == 0)))

print('Before OverSampling, shape of xtrain : {}'.format(xtr.shape))
print('Before OverSampling, shape of ytrain : {} \n'.format(xtr.shape))

print('After OverSampling, counts of label 1 : {}'.format(sum(ytr_res == 1)))
print('After OverSampling, counts of label 0 : {} \n'.format(sum(ytr_res == 0)))

print('After OverSampling, shape of xtrain : {}'.format(xtr_res.shape))
print('After OverSampling, shape of ytrain : {} \n'.format(xtr_res.shape))

Before OverSampling, counts of label 1 : 391
Before OverSampling, counts of label 0 : 227454 

Before OverSampling, shape of xtrain : (227845, 29)
Before OverSampling, shape of ytrain : (227845, 29) 

After OverSampling, counts of label 1 : 227454
After OverSampling, counts of label 0 : 227454 

After OverSampling, shape of xtrain : (454908, 29)
After OverSampling, shape of ytrain : (454908, 29) 



In [12]:
# LogisticRegression
LRG1 = LogisticRegression().fit(xtr_res, ytr_res.ravel())
yp_LRG1 = LRG.predict(xts)

print(classification_report(yts, yp_LRG1))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56861
           1       0.88      0.63      0.74       101

    accuracy                           1.00     56962
   macro avg       0.94      0.82      0.87     56962
weighted avg       1.00      1.00      1.00     56962



In [14]:
# Decision Tree
DTC1 = DecisionTreeClassifier().fit(xtr_res,ytr_res.ravel())
yp_DTC1 = DTC1.predict(xts)

print(classification_report(yts, yp_DTC1))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56861
           1       0.44      0.81      0.57       101

    accuracy                           1.00     56962
   macro avg       0.72      0.91      0.79     56962
weighted avg       1.00      1.00      1.00     56962



### Near Miss Alghorithm

In [24]:
from imblearn.under_sampling import NearMiss
nr = NearMiss()
xtr_miss, ytr_miss = nr.fit_sample(xtr, ytr.ravel())

In [25]:
print('Before OverSampling, counts of label 1 : {}'.format(sum(ytr == 1)))
print('Before OverSampling, counts of label 0 : {} \n'.format(sum(ytr == 0)))

print('Before OverSampling, shape of xtrain : {}'.format(xtr.shape))
print('Before OverSampling, shape of ytrain : {} \n'.format(xtr.shape))

print('After OverSampling, counts of label 1 : {}'.format(sum(ytr_miss == 1)))
print('After OverSampling, counts of label 0 : {} \n'.format(sum(ytr_miss == 0)))

print('After OverSampling, shape of xtrain : {}'.format(xtr_miss.shape))
print('After OverSampling, shape of ytrain : {} \n'.format(xtr_miss.shape))

Before OverSampling, counts of label 1 : 391
Before OverSampling, counts of label 0 : 227454 

Before OverSampling, shape of xtrain : (227845, 29)
Before OverSampling, shape of ytrain : (227845, 29) 

After OverSampling, counts of label 1 : 391
After OverSampling, counts of label 0 : 391 

After OverSampling, shape of xtrain : (782, 29)
After OverSampling, shape of ytrain : (782, 29) 



In [26]:
lr2 = LogisticRegression().fit(xtr_miss, ytr_miss.ravel())
pred2 = lr2.predict(xts)

print(classification_report(yts, pred2))

              precision    recall  f1-score   support

           0       1.00      0.62      0.76     56861
           1       0.00      0.95      0.01       101

    accuracy                           0.62     56962
   macro avg       0.50      0.78      0.39     56962
weighted avg       1.00      0.62      0.76     56962

