#CREDIT CARD FRAUD DETECTION

###IMPORTING THE LIBRARIES

In [456]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

Loading the dataset

In [457]:
data = pd.read_csv("/content/training_data.csv")
labels = pd.read_csv("/content/train_data_classlabels.csv")

Merging data labels into the training data

In [458]:
data['Class'] = labels

Data infromation

In [459]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57116 entries, 0 to 57115
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    57116 non-null  int64  
 1   V1      57116 non-null  float64
 2   V2      57116 non-null  float64
 3   V3      57116 non-null  float64
 4   V4      57116 non-null  float64
 5   V5      57116 non-null  float64
 6   V6      57116 non-null  float64
 7   V7      57116 non-null  float64
 8   V8      57116 non-null  float64
 9   V9      57116 non-null  float64
 10  V10     57116 non-null  float64
 11  V11     57116 non-null  float64
 12  V12     57116 non-null  float64
 13  V13     57116 non-null  float64
 14  V14     57116 non-null  float64
 15  V15     57116 non-null  float64
 16  V16     57116 non-null  float64
 17  V17     57116 non-null  float64
 18  V18     57116 non-null  float64
 19  V19     57116 non-null  float64
 20  V20     57116 non-null  float64
 21  V21     57116 non-null  float64
 22

Checking for missing values

In [460]:
data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [461]:
data.isnull().sum().sum()

0

We have no missing values!

Checking distribution of legit and fraudulent transactions

In [462]:
data['Class'].value_counts()

0.0    56974
1.0      142
Name: Class, dtype: int64

The data is very unbalanced!!

0 => Normal Transactions

1 => Fraudulent Transactions

Separating the data for analysis

In [463]:
legit = data[data.Class == 0]
fraud = data[data.Class ==1]

In [464]:
print(legit.shape)
print(fraud.shape)

(56974, 31)
(142, 31)


Stastical measures

In [465]:
legit.Amount.describe()

count    56974.000000
mean        96.947941
std        270.511706
min          0.000000
25%          7.610000
50%         26.315000
75%         87.935000
max      19656.530000
Name: Amount, dtype: float64

In [466]:
fraud.Amount.describe()

count     142.000000
mean       94.179930
std       227.637806
min         0.000000
25%         1.000000
50%         3.860000
75%        99.990000
max      1809.680000
Name: Amount, dtype: float64

Comparing the values

In [467]:
data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,34996.197195,-0.231008,-0.034844,0.704876,0.156111,-0.257629,0.105005,-0.093744,0.04454,0.02164,...,0.043223,-0.031459,-0.106587,-0.037273,0.006573,0.134248,0.021278,0.001243,0.001023,96.947941
1.0,29734.647887,-6.983068,5.108252,-9.589043,5.54433,-5.168831,-2.231451,-7.328468,3.65638,-3.290294,...,0.425579,0.83012,-0.245639,-0.287868,-0.080482,0.278531,0.123823,0.627499,0.063417,94.17993


The data is highly unbalanced. So we will use the technique of under-sampling to overcome the unbalanced data.

There are 142 instanced of fraud in the data, so we will sample the 142 instances of legit transactions.

In [468]:
legit_sample = legit.sample(n=142)

Concatenating both data frames

In [469]:
newdata = pd.concat([legit_sample,fraud], axis=0)

In [470]:
newdata['Class'].value_counts()

0.0    142
1.0    142
Name: Class, dtype: int64

New data is uniformly distributed!

In [471]:
newdata.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,35073.176056,-0.387248,-0.025383,0.761409,0.269129,-0.195475,0.161447,-0.033701,0.216792,-0.08332,...,0.13442,-0.012314,-0.141399,-0.006972,0.009738,0.10687,-0.045521,-0.0334,0.011543,112.021197
1.0,29734.647887,-6.983068,5.108252,-9.589043,5.54433,-5.168831,-2.231451,-7.328468,3.65638,-3.290294,...,0.425579,0.83012,-0.245639,-0.287868,-0.080482,0.278531,0.123823,0.627499,0.063417,94.17993


Splitting the new data again into features & targets

In [472]:
X = newdata.drop(columns = 'Class', axis = 1)
Y = newdata['Class']

In [473]:
print(X)

        Time         V1         V2         V3         V4         V5        V6  \
38978   8244   1.448056  -0.186388  -0.525712  -1.525213  -0.031220 -0.747490   
51017  27780  -1.378503  -0.354806   2.665691  -1.016387   0.218326  0.001238   
54240   1043   0.896741  -0.630224   0.405024   0.582719  -0.939855 -0.568730   
18087  44031  -0.428961  -1.035180   0.986849  -1.391227  -2.428162  1.020413   
12689  33176   0.908840  -0.560144   0.358584   0.943081   0.041097  1.585233   
...      ...        ...        ...        ...        ...        ...       ...   
54172  53451   0.385108   1.217620  -1.953872   2.087076  -1.144225 -0.576888   
55181  48533   1.243848   0.524526  -0.538884   1.209196   0.479538 -0.197429   
55691  36170  -5.685013   5.776516  -7.064977   5.902715  -4.715564 -1.755633   
56147  21662 -18.018561  10.558600 -24.667741  11.786180 -10.564657 -2.645681   
56464  41233 -10.645800   5.918307 -11.671043   8.807369  -7.975501 -3.586806   

              V7        V8 

In [474]:
print(Y)

38978    0.0
51017    0.0
54240    0.0
18087    0.0
12689    0.0
        ... 
54172    1.0
55181    1.0
55691    1.0
56147    1.0
56464    1.0
Name: Class, Length: 284, dtype: float64


In [475]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [476]:
print(X.shape,X_train.shape, X_test.shape)

(284, 30) (227, 30) (57, 30)


#MODEL TRAINING

LOGISTIC REGRESSION

In [477]:
model = LogisticRegression(max_iter=1000)

Training the logistic regression model with training data

In [478]:
model.fit(X_train,Y_train)

#ACCURACY SCORE

In [479]:
X_prediction = model.predict(X_train)

In [480]:
accuracy = accuracy_score(X_prediction, Y_train)
precision = precision_score(X_prediction, Y_train)
recall = recall_score(X_prediction, Y_train)
f1 = f1_score(X_prediction,Y_train)

#For binary classification tasks, you can use roc_auc_score
roc_auc = roc_auc_score(X_prediction,Y_train)

In [481]:
print(accuracy,f1,roc_auc)

0.9647577092511013 0.9649122807017544 0.9647570253066293


In [482]:
X_pred_test = model.predict(X_test)

In [483]:
accuracy = accuracy_score(X_pred_test, Y_test)
precision = precision_score(X_pred_test, Y_test)
recall = recall_score(X_pred_test, Y_test)
f1 = f1_score(X_pred_test,Y_test)

#For binary classification tasks, you can use roc_auc_score
roc_auc = roc_auc_score(X_pred_test,Y_test)

In [484]:
print(accuracy,f1,roc_auc)

0.9649122807017544 0.9642857142857143 0.9649014778325125
