In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Pandas options
pd.set_option('display.max_colwidth', 1000, 'display.max_rows', None, 'display.max_columns', None)

# Plotting options
%matplotlib inline
mpl.style.use('ggplot')
sns.set(style='whitegrid')

In [2]:
# Load the transaction table from a CSV file located in the working directory.
DATASET_CSV = 'european creditcard dataset.csv'
transactions = pd.read_csv(DATASET_CSV)

In [3]:
# (rows, columns) of the loaded table.
transactions.shape

(284807, 31)

In [4]:
# Per-column dtype and non-null count summary.
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [5]:
# True if at least one cell anywhere in the table is missing:
# reduce per column first, then across the per-column flags.
has_missing = transactions.isna().any().any()
has_missing

False

In [6]:
# Number of rows per value of the `Class` column (the response used below).
transactions['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

In [7]:
# Same breakdown expressed as fractions of all rows, to show the class imbalance.
transactions['Class'].value_counts(normalize=True)

0    0.998273
1    0.001727
Name: Class, dtype: float64

In [8]:
# Features: every column except the response.
X = transactions.drop(columns='Class')
# Response: the `Class` column on its own.
y = transactions['Class']

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Hold out 30% of the rows for testing. `stratify=y` is passed so both splits
# are drawn with respect to the class labels; `random_state` pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
    stratify=y,
)

In [11]:
# (rows, features) of the training split.
X_train.shape

(199364, 30)

In [12]:
# (rows, features) of the held-out test split.
X_test.shape

(85443, 30)

In [13]:
# Report the shape of each of the four objects returned by train_test_split.
split_reports = [
    ("Number transactions X_train dataset: ", X_train),
    ("Number transactions y_train dataset: ", y_train),
    ("Number transactions X_test dataset: ", X_test),
    ("Number transactions y_test dataset: ", y_test),
]
for label, split in split_reports:
    print(label, split.shape)

Number transactions X_train dataset:  (199364, 30)
Number transactions y_train dataset:  (199364,)
Number transactions X_test dataset:  (85443, 30)
Number transactions y_test dataset:  (85443,)


In [14]:
from sklearn.feature_selection import SelectPercentile

In [15]:
# Univariate feature selector that keeps the top 74 percent of features.
# No score function is passed, so the library default is used.
# NOTE(review): 74 is a magic number — record why it was chosen.
select = SelectPercentile(percentile = 74)

In [16]:
# Fit the selector on the training split only; the test split is only
# transformed (never fitted) in a later cell.
select.fit(X_train,y_train)

In [17]:
# Training features reduced to the columns the selector kept.
# NOTE(review): in the cells visible here, X_train_selected is only printed;
# resampling and model fitting both use the unselected X_train.
X_train_selected=select.transform(X_train)

In [18]:
# Apply the same (training-fitted) column selection to the test split.
# NOTE(review): X_test_selected is not referenced again in the cells visible here.
X_test_selected=select.transform(X_test)

In [19]:
# Shape of the training features before feature selection.
before_selection_msg = 'X_train.shape is :{}'
print(before_selection_msg.format(X_train.shape))

X_train.shape is :(199364, 30)


In [20]:
# Shape of the training features after feature selection.
after_selection_msg = 'X_train_selected.shape is :{}'
print(after_selection_msg.format(X_train_selected.shape))

X_train_selected.shape is :(199364, 22)


In [28]:
from imblearn.over_sampling import SMOTE

In [29]:
# SMOTE oversampler with a fixed seed so the resampling is repeatable.
sm = SMOTE(random_state = 0)

In [31]:
# Oversample the training split only (the test split is left untouched).
# `Series.ravel()` is deprecated in pandas 2.2; `to_numpy()` yields the same
# 1-D ndarray of labels without the deprecation warning.
# NOTE(review): this resamples the unselected X_train, not X_train_selected.
X_train_res, y_train_res = sm.fit_resample(X_train, y_train.to_numpy())

In [32]:
# NOTE(review): this cell re-runs the same resampling as the preceding cell and
# rebinds the same names; it looks redundant and is a candidate for deletion.
# `Series.ravel()` is deprecated in pandas 2.2; `to_numpy()` yields the same
# 1-D ndarray of labels without the deprecation warning.
X_train_res, y_train_res = sm.fit_resample(X_train, y_train.to_numpy())

In [33]:
# Shape of the oversampled training features.
resampled_X_msg = 'After OverSampling, the shape of train_X: {}'
print(resampled_X_msg.format(X_train_res.shape))

After OverSampling, the shape of train_X: (398040, 30)


In [34]:
# Shape of the oversampled training labels (the template ends with a blank line).
resampled_y_msg = 'After OverSampling, the shape of train_y: {} \n'
print(resampled_y_msg.format(y_train_res.shape))

After OverSampling, the shape of train_y: (398040,) 



In [37]:
# Count label-1 rows after oversampling. np.count_nonzero does the count in
# NumPy instead of iterating the boolean mask element by element with the
# builtin sum(); the printed text is unchanged.
n_label_1 = np.count_nonzero(y_train_res == 1)
print("After OverSampling, counts of label '1': {}".format(n_label_1))

After OverSampling, counts of label '1': 199020


In [38]:
# Count label-0 rows after oversampling. np.count_nonzero does the count in
# NumPy instead of iterating the boolean mask element by element with the
# builtin sum(); the printed text is unchanged.
n_label_0 = np.count_nonzero(y_train_res == 0)
print("After OverSampling, counts of label '0': {}".format(n_label_0))

After OverSampling, counts of label '0': 199020


In [39]:
from sklearn.preprocessing import StandardScaler

In [40]:
# Scaler with default settings; it is fitted in the next cell.
stdscaler=StandardScaler()

In [41]:
# Fit the scaler on X and rebind X to the scaled result (an array, per the
# output displayed by the next cell).
# NOTE(review): X holds every row (train and test), so the scaler statistics
# include test data. The train/test split was made before this cell, and the
# model below is fitted on X_train, so this scaled X does not appear to feed
# the model in the cells visible here. Confirm whether the intent was to fit on
# X_train and transform X_train / X_test separately.
X=stdscaler.fit_transform(X)

In [42]:
# Display the scaled feature matrix (the recorded repr elides the middle rows/columns).
X

array([[-1.99658302, -0.69424232, -0.04407492, ...,  0.33089162,
        -0.06378115,  0.24496426],
       [-1.99658302,  0.60849633,  0.16117592, ..., -0.02225568,
         0.04460752, -0.34247454],
       [-1.99656197, -0.69350046, -0.81157783, ..., -0.13713686,
        -0.18102083,  1.16068593],
       ...,
       [ 1.6419735 ,  0.98002374, -0.18243372, ...,  0.01103672,
        -0.0804672 , -0.0818393 ],
       [ 1.6419735 , -0.12275539,  0.32125034, ...,  0.26960398,
         0.31668678, -0.31324853],
       [ 1.64205773, -0.27233093, -0.11489898, ..., -0.00598394,
         0.04134999,  0.51435531]])

In [43]:
from xgboost import XGBClassifier

In [44]:
# XGBoost classifier with all hyperparameters left at the library defaults.
# NOTE(review): no seed is passed here — confirm results are reproducible across runs.
model = XGBClassifier()

In [45]:
# Train on the original training split.
# `Series.ravel()` is deprecated in pandas 2.2; `to_numpy()` passes the same
# 1-D ndarray of labels without the deprecation warning.
# NOTE(review): this fits on the raw X_train / y_train. The feature-selected
# (X_train_selected), oversampled (X_train_res, y_train_res) and scaled (X)
# data prepared in earlier cells are not used here — confirm that is intended.
model.fit(X_train, y_train.to_numpy())

In [46]:
# Predicted class labels for the held-out test split.
predictions=model.predict(X_test)

In [47]:
from sklearn.metrics import confusion_matrix, classification_report

In [48]:
# Print the classification report (per-class precision, recall, f1-score and
# support) for the test-split predictions.
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.93      0.83      0.88       148

    accuracy                           1.00     85443
   macro avg       0.97      0.92      0.94     85443
weighted avg       1.00      1.00      1.00     85443



In [49]:
from sklearn.metrics import accuracy_score

In [50]:
# Overall fraction of test rows classified correctly.
# NOTE: about 99.8% of rows carry label 0 (see the value counts near the top of
# this notebook), so accuracy alone says little about how well label 1 is found.
accuracy_score(y_test, predictions)

0.999602073897218

In [51]:
from sklearn.metrics import matthews_corrcoef

In [52]:
# Matthews correlation coefficient between true and predicted test labels.
MCC=matthews_corrcoef(y_test,predictions)

In [53]:
# Report the MCC. The message previously started with a stray space and ran the
# value straight into the word "is"; both are corrected here.
print("Matthews correlation coefficient is {}".format(MCC))

 Matthews correlation coefficient is0.8798151092388432
