Dataset can be downloaded here: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.svm import SVC 



from sklearn.metrics import confusion_matrix, accuracy_score, f1_score


Preprocessing

In [2]:
df = pd.read_csv("creditcard.csv")


In [3]:
df.drop(['Time'], axis = 1, inplace = True)
df.drop_duplicates(inplace = True )

In [4]:
print(df.shape)
df.head()

(275663, 30)


Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [5]:
#may have to scale the data here 

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 275663 entries, 0 to 284806
Data columns (total 30 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   V1      275663 non-null  float64
 1   V2      275663 non-null  float64
 2   V3      275663 non-null  float64
 3   V4      275663 non-null  float64
 4   V5      275663 non-null  float64
 5   V6      275663 non-null  float64
 6   V7      275663 non-null  float64
 7   V8      275663 non-null  float64
 8   V9      275663 non-null  float64
 9   V10     275663 non-null  float64
 10  V11     275663 non-null  float64
 11  V12     275663 non-null  float64
 12  V13     275663 non-null  float64
 13  V14     275663 non-null  float64
 14  V15     275663 non-null  float64
 15  V16     275663 non-null  float64
 16  V17     275663 non-null  float64
 17  V18     275663 non-null  float64
 18  V19     275663 non-null  float64
 19  V20     275663 non-null  float64
 20  V21     275663 non-null  float64
 21  V22     27

In [7]:
length = len(df)
fraud = df[df.Class == 1]
normal = df[df.Class == 0]
num_of_normal = len(normal)
num_of_fraud = len(fraud)
print("Total Number of Transactions: " + str(length))
print("Number of Nromal Transactions: " + str(num_of_normal))
print("Number of Fradulent Transactions: " + str(num_of_fraud))



Total Number of Transactions: 275663
Number of Nromal Transactions: 275190
Number of Fradulent Transactions: 473


In [8]:
print("Fraud Transaction Details: ")
fraud.Amount.describe()


Fraud Transaction Details: 


count     473.000000
mean      123.871860
std       260.211041
min         0.000000
25%         1.000000
50%         9.820000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64

In [9]:
print("Normal Transaction Details")
normal.Amount.describe()

Normal Transaction Details


count    275190.000000
mean         90.521154
std         253.198478
min           0.000000
25%           6.390000
50%          23.750000
75%          79.900000
max       25691.160000
Name: Amount, dtype: float64

Accounting for Imbalance of Data

In [10]:
df.groupby('Class').agg('mean').reset_index()

Unnamed: 0,Class,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,0,-0.029792,-0.008288,0.037131,-0.012054,-0.005596,-0.011768,0.017497,-0.007346,-0.00805,...,0.002717,0.001781,0.005689,-0.001779,-0.006696,-0.00489,-0.000327,0.001557,0.000771,90.521154
1,1,-4.49828,3.405965,-6.729599,4.472591,-2.957197,-1.432518,-5.175912,0.953255,-2.522124,...,0.405043,0.46655,0.086639,-0.096464,-0.106643,0.040615,0.050456,0.213774,0.07827,123.87186


In [11]:
normal_sample = normal.sample(n = 473)
final_df = pd.concat([normal_sample, fraud], axis = 0)
final_df.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
197418,2.125583,-1.218413,-1.223323,-0.94467,-0.747042,-0.447428,-0.795058,-0.092597,0.268236,0.773379,...,0.154219,0.296289,0.009041,-1.02121,-0.081355,-0.118337,-0.028324,-0.054418,77.0,0
144467,-1.208132,1.193465,1.079276,-0.024208,-0.092767,-0.508047,0.294579,0.383045,-0.50656,-0.369748,...,-0.143177,-0.424021,0.239769,0.061885,-0.19815,0.08456,-0.080736,0.047373,10.78,0
74562,1.085681,-0.034836,1.406841,1.367242,-0.913172,0.087906,-0.594181,0.177649,0.78773,-0.26164,...,-0.006906,0.244098,0.058323,0.412093,0.31537,-0.400964,0.092722,0.039325,9.99,0
15950,-1.131488,0.309998,0.96253,-1.056051,1.976894,1.656518,0.372685,0.722463,-0.767661,-0.612447,...,-0.368569,-1.161899,0.1156,-2.241664,0.065591,0.100221,0.18335,0.006171,32.9,0
89122,1.477345,-0.387312,-0.623647,-1.017179,-0.084849,-0.477762,-0.1689,-0.181194,-1.301582,0.863743,...,0.1706,0.380699,-0.304712,-0.674766,0.894528,-0.036789,-0.036023,-0.019153,15.0,0


In [12]:
final_df["Class"].value_counts()

0    473
1    473
Name: Class, dtype: int64

In [13]:
final_df.groupby("Class").agg("mean").reset_index()

Unnamed: 0,Class,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,0,0.008793,-0.068911,0.092433,0.081579,0.01118,-0.026609,0.049121,-0.014614,-0.022285,...,-0.027769,-0.022282,0.066444,-0.032995,0.003143,-0.019196,0.022599,-0.008102,0.033825,88.591543
1,1,-4.49828,3.405965,-6.729599,4.472591,-2.957197,-1.432518,-5.175912,0.953255,-2.522124,...,0.405043,0.46655,0.086639,-0.096464,-0.106643,0.040615,0.050456,0.213774,0.07827,123.87186


Training and Testing Split

In [14]:
X = final_df.drop('Class', axis = 1).values
y = final_df["Class"].values 
print(X.shape)
print(y.shape)

(946, 29)
(946,)


In [15]:
xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 1)

Testing Models

Logistic Regression

In [16]:
lr = LogisticRegression()
lr.fit(xTrain, yTrain)
lr_predictions = lr.predict(xTest)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
print("Accuracy Score of Logistic Regression Model is: {}".format(accuracy_score(yTest, lr_predictions)))
print("F1 Score of Logistic Regression Model is: {}".format(f1_score(yTest, lr_predictions)))

Accuracy Score of Logistic Regression Model is: 0.968421052631579
F1 Score of Logistic Regression Model is: 0.967741935483871


In [18]:
confusion_matrix(yTest, lr_predictions, labels = [0, 1])

array([[94,  1],
       [ 5, 90]])

Decision Tree

In [19]:
dt = DecisionTreeClassifier(max_depth = 4, criterion = 'entropy')
dt.fit(xTrain, yTrain)
dt_predictions = dt.predict(xTest)

In [20]:
print("Accuracy Score of Decision Tree Model is: {}".format(accuracy_score(yTest, dt_predictions)))
print("F1 Score of Decision Tree Model is: {}".format(f1_score(yTest, dt_predictions)))

Accuracy Score of Decision Tree Model is: 0.9263157894736842
F1 Score of Decision Tree Model is: 0.9230769230769231


In [21]:
confusion_matrix(yTest, dt_predictions, labels = [0, 1])

array([[92,  3],
       [11, 84]])

KNN

In [22]:
n = 7
KNN = KNeighborsClassifier(n_neighbors = n)
KNN.fit(xTrain, yTrain)
knn_predictions = KNN.predict(xTest)

In [23]:
print("Accuracy Score of KNN Model is: {}".format(accuracy_score(yTest, knn_predictions)))
print("F1 Score of KNN Model is: {}".format(f1_score(yTest, knn_predictions)))

Accuracy Score of KNN Model is: 0.8842105263157894
F1 Score of KNN Model is: 0.8804347826086957


In [24]:
confusion_matrix(yTest, knn_predictions, labels = [0, 1])

array([[87,  8],
       [14, 81]])

SVM

In [25]:
svm = SVC()
svm.fit(xTrain, yTrain)
svm_predictions = svm.predict(xTest)

In [26]:
print("Accuracy Score of SVM Model is: {}".format(accuracy_score(yTest, svm_predictions)))
print("F1 Score of SVM Model is: {}".format(f1_score(yTest, svm_predictions)))

Accuracy Score of SVM Model is: 0.8052631578947368
F1 Score of SVM Model is: 0.7885714285714286


In [27]:
confusion_matrix(yTest, svm_predictions, labels = [0, 1])

array([[84, 11],
       [26, 69]])

Random Forest 

In [28]:
rf = RandomForestClassifier(max_depth = 4)
rf.fit(xTrain, yTrain)
rf_predictions = rf.predict(xTest)

In [29]:
print("Accuracy Score of Random Forest Model is: {}".format(accuracy_score(yTest, rf_predictions)))
print("F1 Score of Random Forest Model is: {}".format(f1_score(yTest, rf_predictions)))

Accuracy Score of Random Forest Model is: 0.9631578947368421
F1 Score of Random Forest Model is: 0.9621621621621621


In [30]:
confusion_matrix(yTest, rf_predictions, labels = [0, 1])

array([[94,  1],
       [ 6, 89]])

Test on Whole Dataset 

In [31]:
total_x = df.drop('Class', axis = 1).values
total_y = df["Class"].values 

lr_pred = lr.predict(total_x)



In [32]:
print("Accuracy Score of Logistic Regression Model is:{}".format(accuracy_score(total_y, lr_pred)))
print("F1 Score of Logistic Regression Model is: {}".format(f1_score(total_y, lr_pred)))

Accuracy Score of Logistic Regression Model is: 0.9669923058226892
F1 Score of Logistic Regression Model is: 0.08672086720867209


In [33]:
confusion_matrix(total_y, lr_pred, labels = [0, 1])

array([[266132,   9058],
       [    41,    432]])