In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report



In [2]:
# Load the dataset
data = pd.read_csv("creditcard.csv")

In [3]:
data.shape

(284807, 31)

In [4]:
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [5]:
data.columns.to_list()

['Time',
 'V1',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16',
 'V17',
 'V18',
 'V19',
 'V20',
 'V21',
 'V22',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V28',
 'Amount',
 'Class']

In [6]:
data[['Amount','Time','Class']].head(5)

Unnamed: 0,Amount,Time,Class
0,149.62,0.0,0
1,2.69,0.0,0
2,378.66,1.0,0
3,123.5,1.0,0
4,69.99,2.0,0


In [7]:
# Data preprocessing
# Standardize the 'Amount' column
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))

In [8]:
data[['Amount','Time','Class']].head(5)

Unnamed: 0,Amount,Time,Class
0,0.244964,0.0,0
1,-0.342475,0.0,0
2,1.160686,1.0,0
3,0.140534,1.0,0
4,-0.073403,2.0,0


In [9]:
# Split the data into features and labels
X = data.drop(['Time', 'Class'], axis=1)
y = data['Class']

In [10]:
X.head(5)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0.244964
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,-0.342475
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,1.160686
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0.140534
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,-0.073403


In [11]:
y.value_counts()

0    284315
1       492
Name: Class, dtype: int64

In [12]:
data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [13]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# describes info about train and test set
print("Number transactions X_train dataset: ", X_train.shape)
print("Number transactions y_train dataset: ", y_train.shape)
print("Number transactions X_test dataset: ", X_test.shape)
print("Number transactions y_test dataset: ", y_test.shape)

Number transactions X_train dataset:  (227845, 29)
Number transactions y_train dataset:  (227845,)
Number transactions X_test dataset:  (56962, 29)
Number transactions y_test dataset:  (56962,)


SMOTE (Synthetic Minority Oversampling Technique) – Oversampling:
SMOTE (synthetic minority oversampling technique) is one of the most commonly used oversampling methods to solve the imbalance problem. It aims to balance class distribution by randomly increasing minority class examples by replicating them. SMOTE synthesises new minority instances between existing minority instances. It generates the virtual training records by linear interpolation for the minority class. These synthetic training records are generated by randomly selecting one or more of the k-nearest neighbors for each example in the minority class. After the oversampling process, the data is reconstructed and several classification models can be applied for the processed data.

In [15]:
print('Before OverSampling, the shape of train_X: {}'.format(X_train.shape))
print('Before OverSampling, the shape of train_y: {} \n'.format(y_train.shape))
  
print("Before OverSampling, counts of label '1': {}".format(sum(y_train == 1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train == 0)))

Before OverSampling, counts of label '1': 394
Before OverSampling, counts of label '0': 227451 



In [20]:
# import SMOTE module from imblearn library
# pip install imblearn (if you don't have imblearn in your system)
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = 2)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train.ravel())

In [21]:
print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))
  
print("After OverSampling, counts of label '1': {}".format(sum(y_train_res == 1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_res == 0)))


After OverSampling, the shape of train_X: (454902, 29)
After OverSampling, the shape of train_y: (454902,) 

After OverSampling, counts of label '1': 227451
After OverSampling, counts of label '0': 227451


In [23]:
# Initialize PyCaret setup
from pycaret.classification import *
exp1 = setup(data, target='Class', fix_imbalance=True, imputation_type='iterative', session_id=123,)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Class
2,Target type,Binary
3,Original data shape,"(284807, 31)"
4,Transformed data shape,"(483483, 31)"
5,Transformed train set shape,"(398040, 31)"
6,Transformed test set shape,"(85443, 31)"
7,Numeric features,30
8,Preprocess,True
9,Imputation type,iterative


In [24]:
# Compare models and select the best
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9996,0.9739,0.8433,0.907,0.8734,0.8732,0.8741,12.669
rf,Random Forest Classifier,0.9995,0.9697,0.8431,0.8806,0.8607,0.8604,0.861,49.4
lightgbm,Light Gradient Boosting Machine,0.9992,0.9695,0.8492,0.739,0.7864,0.786,0.7898,4.085
dt,Decision Tree Classifier,0.9983,0.8991,0.7997,0.5028,0.6169,0.616,0.633,6.148
dummy,Dummy Classifier,0.9983,0.5,0.0,0.0,0.0,0.0,0.0,0.358
gbc,Gradient Boosting Classifier,0.9956,0.9803,0.8839,0.268,0.4098,0.4082,0.4841,110.902
ada,Ada Boost Classifier,0.9919,0.9786,0.8927,0.1636,0.2763,0.2742,0.3799,21.237
nb,Naive Bayes,0.9917,0.9711,0.7767,0.1451,0.2444,0.2422,0.3334,0.485
ridge,Ridge Classifier,0.9888,0.0,0.8171,0.1146,0.2008,0.1984,0.3033,0.436
lda,Linear Discriminant Analysis,0.9888,0.9732,0.8171,0.1146,0.2009,0.1984,0.3033,1.619


Processing:   0%|          | 0/61 [00:00<?, ?it/s]

In [25]:
best_model

In [29]:
ET = create_model('et')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9993,0.9799,0.8,0.8235,0.8116,0.8113,0.8114
1,0.9995,0.9518,0.8,0.9333,0.8615,0.8613,0.8639
2,0.9995,0.9811,0.8,0.9032,0.8485,0.8482,0.8498
3,0.9996,0.9818,0.8857,0.9118,0.8986,0.8984,0.8985
4,0.9997,0.9483,0.8824,0.9375,0.9091,0.9089,0.9094
5,0.9995,0.9821,0.7941,0.9,0.8438,0.8435,0.8452
6,0.9996,0.9812,0.8824,0.9091,0.8955,0.8953,0.8954
7,0.9996,0.9507,0.8824,0.8824,0.8824,0.8822,0.8822
8,0.9995,0.9828,0.7941,0.9,0.8438,0.8435,0.8452
9,0.9998,0.9996,0.9118,0.9688,0.9394,0.9393,0.9397


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Tune the best model
tuned_model, tuner = tune_model(ET, return_tuner=True)

Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [None]:
type(tuned_model), type(tuner)

In [None]:
print(tuner)

In [None]:
# Make predictions on the test set
y_pred = predict_model(tuned_model, data=X_test)

In [None]:
# Evaluate the model
conf_matrix = confusion_matrix(y_test, y_pred['Label'])
accuracy = accuracy_score(y_test, y_pred['Label'])
class_report = classification_report(y_test, y_pred['Label'])

In [None]:
print("Confusion Matrix:\n", conf_matrix)
print("Accuracy:", accuracy)
print("Classification Report:\n", class_report)

In [None]:
# Visualize the confusion matrix
def plot_confusion_matrix(cm, labels):
    plt.figure(figsize=(6, 6))
    plt.imshow(cm, interpolation='nearest', cmap=plt.get_cmap('Blues'))
    plt.title('Confusion Matrix')
    plt.colorbar()

    tick_marks = np.arange(len(labels))
    plt.xticks(tick_marks, labels, rotation=45)
    plt.yticks(tick_marks, labels)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

labels = ['Legitimate', 'Fraud']
plot_confusion_matrix(conf_matrix, labels)
plt.show()