# Credit Card Fraud Detection

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
import pickle

In [2]:
# Fetch the credit-card transactions CSV from the project's GitHub repository
# and parse it into a DataFrame (requires network access).
raw_data_url = (
    'https://github.com/sheenmsn/Credit_Card_Fraud_Detection/raw/main/Data/creditcard.csv'
)
dataset = pd.read_csv(filepath_or_buffer=raw_data_url)

In [3]:
# Show the loaded DataFrame. As the last expression of the cell it is rendered
# by the notebook as a truncated table (first and last rows).
dataset

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [4]:
# Flag every row that exactly repeats an earlier row. The mask is computed
# once and reused for both the existence check and the count.
duplicate_mask = dataset.duplicated()
duplicates_exist = duplicate_mask.any()
print('Duplicate rows exist:', duplicates_exist)
print(f'Number of duplicate rows: {duplicate_mask.sum()}')

# Keep the first occurrence of each repeated row. The name is rebound to the
# de-duplicated frame instead of mutating the loaded frame with inplace=True.
dataset = dataset.drop_duplicates()

Duplicate rows exist: True
Number of duplicate rows: 1081


The 1,081 duplicate rows were removed, leaving 283,726 rows.

In [5]:
# Report the (rows, columns) shape of the frame after duplicate removal.
dataset.shape

(283726, 31)

# Predictive models

In [6]:
# Remove the 'Time' column; the notebook treats it as uninformative for the
# models trained below. Preview the first rows to confirm the column is gone.
dataset = dataset.drop(columns=['Time'])
dataset.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [7]:
# Standardise the 'Amount' column with StandardScaler and write the result
# back into the frame.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set rows influence the fitted mean/variance. Consider fitting
# on the training rows only.
amount = dataset['Amount'].to_numpy()
sc = StandardScaler()

# The scaler expects 2-D input, hence the single-column reshape.
amount_2d = amount.reshape(-1, 1)
dataset['Amount'] = sc.fit_transform(amount_2d)
dataset.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0.2442,0
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,-0.342584,0
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,1.1589,0
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0.139886,0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,-0.073813,0


In [8]:
# Separate the label column from the feature columns.
y = dataset['Class']                 # target
x = dataset.drop(columns=['Class'])  # predictors: every remaining column

### Split data into train and test sets

In [9]:
# Shared experiment settings.
RANDOM_STATE = 42  # seed passed to the train/test split, SMOTE and CatBoost
TEST_SIZE = 0.2    # fraction of rows held out as the test set
VALID_SIZE = 0.2   # validation fraction (not referenced by the cells shown in this notebook)

In [10]:
# Hold out TEST_SIZE of the rows as the test set, shuffling with a fixed seed
# so the split is reproducible.
# stratify=y keeps the class proportions identical in the train and test
# partitions. Without it, a random split of a rare-class target (the notebook
# applies SMOTE to the training data for this reason) can leave the test set
# with a noticeably different share of positive rows.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    shuffle=True,
    stratify=y,
)

# SMOTE (Synthetic Minority Over-sampling Technique)

In [11]:
# SMOTE is already imported in the notebook's first (imports) cell, so it is
# not re-imported here.
smote = SMOTE(random_state=RANDOM_STATE)

# Oversample the minority class in the training split only; the test split is
# not passed through SMOTE.
# NOTE(review): the resampled data is later handed to GridSearchCV, so each
# cross-validation fold will contain synthetic samples derived from rows in
# the other folds. Consider wrapping SMOTE and the classifier in an
# imblearn Pipeline so resampling happens inside each fold.
x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)

### Skewness Correction and Transformation with PowerTransformer

In [12]:
# Reduce feature skewness with a Yeo-Johnson power transform; standardize=True
# also rescales the transformed features.
ptf = PowerTransformer(
    copy=True,
    standardize=True,
    method='yeo-johnson',
)

# Learn the transform on the resampled training data, then apply the same
# fitted transform to the test data. Both names are rebound to the transformed
# values, so re-running this cell would transform already-transformed data.
x_train_smote = ptf.fit_transform(x_train_smote)
x_test = ptf.transform(x_test)

## CatBoostClassifier Hyperparameter Tuning

In [13]:
# Hyperparameter candidates explored by the grid search:
# 1 x 2 x 2 = 4 combinations.
cbc_param_grid = dict(
    iterations=[500],
    learning_rate=[0.01, 0.05],
    depth=[6, 8],
)

In [14]:
# Initialize CatBoostClassifier
cbc = CatBoostClassifier(random_seed=RANDOM_STATE,
                         bagging_temperature=0.2,
                         od_type='Iter',
                         od_wait=100,
                         eval_metric='AUC')

In [15]:
# Exhaustive search over cbc_param_grid: 3-fold cross-validation scored by
# ROC AUC, using all available cores (n_jobs=-1) with brief progress output.
cbc_grid_search = GridSearchCV(
    cbc,
    cbc_param_grid,
    cv=3,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1,
)

In [16]:
# Run the grid search on the SMOTE-resampled, power-transformed training data
# (4 candidates x 3 folds = 12 fits).
cbc_grid_search.fit(X=x_train_smote, y=y_train_smote)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
0:	total: 364ms	remaining: 3m 1s
1:	total: 607ms	remaining: 2m 31s
2:	total: 830ms	remaining: 2m 17s
3:	total: 1.04s	remaining: 2m 9s
4:	total: 1.26s	remaining: 2m 4s
5:	total: 1.47s	remaining: 2m
6:	total: 1.67s	remaining: 1m 57s
7:	total: 1.87s	remaining: 1m 55s
8:	total: 2.08s	remaining: 1m 53s
9:	total: 2.28s	remaining: 1m 51s
10:	total: 2.5s	remaining: 1m 51s
11:	total: 2.69s	remaining: 1m 49s
12:	total: 2.9s	remaining: 1m 48s
13:	total: 3.09s	remaining: 1m 47s
14:	total: 3.29s	remaining: 1m 46s
15:	total: 3.47s	remaining: 1m 44s
16:	total: 3.67s	remaining: 1m 44s
17:	total: 3.88s	remaining: 1m 43s
18:	total: 4.09s	remaining: 1m 43s
19:	total: 4.28s	remaining: 1m 42s
20:	total: 4.51s	remaining: 1m 42s
21:	total: 4.71s	remaining: 1m 42s
22:	total: 4.89s	remaining: 1m 41s
23:	total: 5.09s	remaining: 1m 40s
24:	total: 5.28s	remaining: 1m 40s
25:	total: 5.46s	remaining: 1m 39s
26:	total: 5.67s	remaining: 1m 39s
27:	total: 5.8

234:	total: 55.2s	remaining: 1m 2s
235:	total: 55.4s	remaining: 1m 1s
236:	total: 55.6s	remaining: 1m 1s
237:	total: 55.8s	remaining: 1m 1s
238:	total: 56s	remaining: 1m 1s
239:	total: 56.1s	remaining: 1m
240:	total: 56.3s	remaining: 1m
241:	total: 56.5s	remaining: 1m
242:	total: 56.7s	remaining: 59.9s
243:	total: 56.8s	remaining: 59.6s
244:	total: 57s	remaining: 59.3s
245:	total: 57.2s	remaining: 59s
246:	total: 57.4s	remaining: 58.8s
247:	total: 57.5s	remaining: 58.5s
248:	total: 57.7s	remaining: 58.1s
249:	total: 57.8s	remaining: 57.8s
250:	total: 58s	remaining: 57.6s
251:	total: 58.2s	remaining: 57.3s
252:	total: 58.4s	remaining: 57s
253:	total: 58.6s	remaining: 56.7s
254:	total: 58.7s	remaining: 56.4s
255:	total: 58.9s	remaining: 56.1s
256:	total: 59.1s	remaining: 55.9s
257:	total: 59.3s	remaining: 55.6s
258:	total: 59.4s	remaining: 55.3s
259:	total: 59.6s	remaining: 55s
260:	total: 59.8s	remaining: 54.7s
261:	total: 59.9s	remaining: 54.5s
262:	total: 1m	remaining: 54.2s
263:	tota

468:	total: 1m 35s	remaining: 6.29s
469:	total: 1m 35s	remaining: 6.09s
470:	total: 1m 35s	remaining: 5.88s
471:	total: 1m 35s	remaining: 5.67s
472:	total: 1m 35s	remaining: 5.46s
473:	total: 1m 35s	remaining: 5.26s
474:	total: 1m 36s	remaining: 5.05s
475:	total: 1m 36s	remaining: 4.85s
476:	total: 1m 36s	remaining: 4.64s
477:	total: 1m 36s	remaining: 4.44s
478:	total: 1m 36s	remaining: 4.23s
479:	total: 1m 36s	remaining: 4.03s
480:	total: 1m 36s	remaining: 3.82s
481:	total: 1m 36s	remaining: 3.62s
482:	total: 1m 37s	remaining: 3.42s
483:	total: 1m 37s	remaining: 3.21s
484:	total: 1m 37s	remaining: 3.01s
485:	total: 1m 37s	remaining: 2.81s
486:	total: 1m 37s	remaining: 2.6s
487:	total: 1m 37s	remaining: 2.4s
488:	total: 1m 37s	remaining: 2.2s
489:	total: 1m 37s	remaining: 2s
490:	total: 1m 38s	remaining: 1.8s
491:	total: 1m 38s	remaining: 1.6s
492:	total: 1m 38s	remaining: 1.4s
493:	total: 1m 38s	remaining: 1.2s
494:	total: 1m 38s	remaining: 996ms
495:	total: 1m 38s	remaining: 796ms
49

In [17]:
# Retrieve the estimator exposed by the fitted grid search as best_estimator_,
# i.e. the model for the best-scoring parameter combination.
best_cbc = cbc_grid_search.best_estimator_

In [20]:
# Serialise the tuned classifier to 'classifier.pkl' in the working directory.
# NOTE(review): only the classifier is saved here. The fitted StandardScaler
# (`sc`) and PowerTransformer (`ptf`) used to prepare its inputs are not
# persisted by this cell; confirm inference code can reproduce them.
with open('classifier.pkl', 'wb') as model_file:
    pickle.dump(best_cbc, model_file)