In [6]:
from time import perf_counter, time  # `time` stays bound for any cell that already calls it

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import RobustScaler, StandardScaler

from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import accuracy_score,precision_score,recall_score,classification_report,confusion_matrix,roc_curve,auc

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

In [4]:
# Load the transactions file into a DataFrame and take a first look at it.
df = pd.read_csv('creditcard.csv')

# Only a cell's final expression is echoed by the notebook, so the shape and
# the row preview are printed explicitly; as bare mid-cell expressions they
# were evaluated and silently discarded.
print(df.shape)
print(df.head(5))

# Column dtypes and non-null counts (info() prints by itself), followed by the
# number of missing values per column as the cell's displayed result.
df.info()
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
Time      284807 non-null float64
V1        284807 non-null float64
V2        284807 non-null float64
V3        284807 non-null float64
V4        284807 non-null float64
V5        284807 non-null float64
V6        284807 non-null float64
V7        284807 non-null float64
V8        284807 non-null float64
V9        284807 non-null float64
V10       284807 non-null float64
V11       284807 non-null float64
V12       284807 non-null float64
V13       284807 non-null float64
V14       284807 non-null float64
V15       284807 non-null float64
V16       284807 non-null float64
V17       284807 non-null float64
V18       284807 non-null float64
V19       284807 non-null float64
V20       284807 non-null float64
V21       284807 non-null float64
V22       284807 non-null float64
V23       284807 non-null float64
V24       284807 non-null float64
V25       284807 non-null float64
V26  

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [5]:
# Number of rows carrying each label in the `Class` column; the result is the
# cell's last expression, so the notebook displays it.
class_counts = df['Class'].value_counts()
class_counts

0    284315
1       492
Name: Class, dtype: int64

In [8]:
# Rescale the `Time` and `Amount` columns in place with a RobustScaler.
# The same scaler object is re-fitted for each column, so after this cell `rb`
# holds the statistics of the last column processed (`Amount`).
# NOTE(review): the scaler is fitted on the whole frame, before the train/test
# split made in a later cell, so test rows contribute to the scaling statistics.
rb = RobustScaler()
for column in ('Time', 'Amount'):
    # fit_transform expects 2-D input, hence the reshape to a single column.
    column_values = df[column].values.reshape(-1, 1)
    df[column] = rb.fit_transform(column_values)

# Preview the frame with the rescaled columns.
df.head(5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-0.994983,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,1.783274,0
1,-0.994983,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,-0.269825,0
2,-0.994972,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,4.983721,0
3,-0.994972,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,1.418291,0
4,-0.99496,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0.670579,0


In [9]:
# Separate the feature columns from the `Class` target, then hold out 20% of
# the rows as a test set. The fixed random_state makes the split repeatable.
# NOTE(review): the split is not stratified; given how few rows have Class == 1,
# passing stratify=Y would keep the class ratio equal in both parts. It is left
# unchanged here because it would alter every downstream result.
X = df.drop(columns=['Class'])
Y = df['Class']
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=78,
)

In [10]:
# Balance the training split with SMOTE. Only x_train / y_train are resampled;
# x_test / y_test are not touched by this cell.
# The sampler is seeded so the synthetic rows are the same on every run.
smote = SMOTE(random_state=42)

# Class counts before resampling (Class == 1, then Class == 0).
print((y_train == 1).sum())
print((y_train == 0).sum())

# fit_resample is the supported sampler API; fit_sample was its deprecated
# alias and no longer exists in current imbalanced-learn releases.
x_train, y_train = smote.fit_resample(x_train, y_train)

# Class counts after resampling; both classes should now be the same size.
print((y_train == 1).sum())
print((y_train == 0).sum())

383
227462
227462
227462


In [11]:
# Build an inspection frame holding the resampled features plus their labels.
# pd.DataFrame(...) does not copy DataFrame/ndarray input by default, so an
# explicit copy is taken before the label column is inserted: the `Class`
# column must never end up in x_train, which the classifier is fitted on.
x_df = pd.DataFrame(x_train, copy=True)
x_df['Class'] = y_train
print(x_df.head(5))

          0         1         2         3         4         5         6  \
0 -0.980991 -1.478848  2.650762 -1.128494  0.143886  0.787187 -0.434307   
1 -0.370575 -0.961646  0.720420  1.555834  1.372691  0.365506  0.033306   
2 -0.165345  1.138068  0.168176 -0.011986  0.838160  0.037683  0.191980   
3  0.590432 -1.299693  1.115263  0.106270 -0.050075  2.207099  4.590913   
4  0.862604 -0.175200 -1.062354  2.901493 -0.737400 -1.764376  2.282972   

          7         8         9  ...        21        22        23        24  \
0  0.998478 -0.166596  0.873590  ... -0.172135  0.291094 -0.077622 -0.996453   
1  0.618918 -0.034659 -0.370459  ... -0.004809  0.242853 -0.029480  0.109876   
2 -0.244405  0.190853  0.300872  ... -0.119256 -0.086043 -0.122040 -0.347818   
3 -0.369212  1.603642 -0.146152  ... -0.498101 -1.430112 -0.098914  0.631922   
4 -1.925457  0.650142 -1.289010  ...  0.001642  0.907272 -0.318481  0.323129   

         25        26        27        28        29  Class  
0 -0.06

In [16]:
# Fit a random forest on the training data and report how long the fit took.
# The fixed random_state makes the fitted forest repeatable for the same input.
classifier = RandomForestClassifier(random_state = 42)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time() follows the system clock, which can be adjusted mid-run.
fit_start = perf_counter()
classifier.fit(x_train, y_train)
print(perf_counter() - fit_start)  # fit duration in seconds

# Predicted labels for the held-out test features, used by the evaluation cells.
predictions = classifier.predict(x_test)



49.396711587905884


In [17]:
# Tabulate the true test labels against the predicted labels
# (rows: true class, columns: predicted class).
cm = confusion_matrix(y_true=y_test, y_pred=predictions)
print(cm)

[[56844     9]
 [   18    91]]


In [18]:
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef

# Score the test-set predictions. One value is printed per line, in this
# order: precision, recall, F1, Matthews correlation coefficient.
for metric in (precision_score, recall_score, f1_score, matthews_corrcoef):
    print(metric(y_test, predictions))

0.91
0.8348623853211009
0.8708133971291867
0.8713876774667013
