<a href="https://colab.research.google.com/github/sid884/Calculator/blob/main/Credit_Card_Fraud_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing the Dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
# loading the dataset to a Pandas DataFrame
credit_card_data = pd.read_csv('/content/creditcard.csv')

In [5]:
# first 5 rows of the dataset
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [6]:
credit_card_data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
97137,66085,0.998571,-0.046655,0.644378,1.114925,-0.665698,-0.627024,-0.002057,0.01811,-0.052156,...,-0.171808,-0.784227,0.095822,0.455048,0.167887,-0.72906,0.004417,0.037612,89.0,0.0
97138,66085,-1.326193,0.549467,1.220272,1.286509,0.473532,-0.681876,-0.249255,0.444731,-0.768583,...,0.088777,0.029885,-0.123943,-0.092548,-0.159851,-0.360097,0.318036,0.007246,3.6,0.0
97139,66086,1.230983,-0.22452,-0.345196,0.212802,1.586953,3.997378,-1.145351,1.068038,0.584379,...,0.067612,0.229977,-0.119921,1.019614,0.667317,-0.226637,0.071064,0.028365,1.0,0.0
97140,66086,1.241193,0.767604,-0.210715,1.297487,0.152102,-1.162435,0.389686,-0.321743,-0.288129,...,-0.036601,0.032307,-0.136263,0.308814,0.73834,-0.331821,0.040823,0.054137,1.0,0.0
97141,66087,0.310485,-2.576074,1.002015,0.011196,-2.280745,0.465648,-0.860224,0.156411,0.087629,...,,,,,,,,,,


In [7]:
# dataset informations
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97142 entries, 0 to 97141
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    97142 non-null  int64  
 1   V1      97142 non-null  float64
 2   V2      97142 non-null  float64
 3   V3      97142 non-null  float64
 4   V4      97142 non-null  float64
 5   V5      97142 non-null  float64
 6   V6      97142 non-null  float64
 7   V7      97142 non-null  float64
 8   V8      97142 non-null  float64
 9   V9      97142 non-null  float64
 10  V10     97142 non-null  float64
 11  V11     97142 non-null  float64
 12  V12     97142 non-null  float64
 13  V13     97142 non-null  float64
 14  V14     97142 non-null  float64
 15  V15     97142 non-null  float64
 16  V16     97142 non-null  float64
 17  V17     97142 non-null  float64
 18  V18     97142 non-null  float64
 19  V19     97141 non-null  float64
 20  V20     97141 non-null  float64
 21  V21     97141 non-null  float64
 22

In [8]:
# checking the number of missing values in each column
credit_card_data.isnull().sum()

Unnamed: 0,0
Time,0
V1,0
V2,0
V3,0
V4,0
V5,0
V6,0
V7,0
V8,0
V9,0


In [9]:
# distribution of legit transactions & fraudulent transactions
credit_card_data['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0.0,96919
1.0,222


In [10]:
# separating the data for analysis
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

In [11]:
print(legit.shape)
print(fraud.shape)

(96919, 31)
(222, 31)


In [12]:
# statistical measures of the data
legit.Amount.describe()

Unnamed: 0,Amount
count,96919.0
mean,98.31027
std,265.983851
min,0.0
25%,7.58
50%,26.61
75%,89.345
max,19656.53


In [13]:
fraud.Amount.describe()

Unnamed: 0,Amount
count,222.0
mean,114.488243
std,255.373074
min,0.0
25%,1.0
50%,7.805
75%,99.99
max,1809.68


In [14]:
# compare the values for both transactions
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,41730.096658,-0.249732,-0.043534,0.694778,0.151455,-0.269573,0.098435,-0.093862,0.050744,-0.036243,...,0.043675,-0.031999,-0.108137,-0.036618,0.009741,0.131925,0.026549,-0.0007,0.001378,98.31027
1.0,36541.941441,-6.044462,4.134072,-7.932926,4.915738,-4.386432,-1.796113,-6.30049,2.722455,-2.896811,...,0.345305,0.715188,-0.125165,-0.26545,-0.105791,0.205945,0.103589,0.523395,0.037908,114.488243


In [16]:
new_dataset = pd.concat([legit_sample, fraud], axis=0)

In [17]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
21228,31546,1.116461,-0.246055,1.109227,0.620605,-0.661535,0.680203,-0.78685,0.358416,0.637853,...,-0.029275,0.124945,0.002499,-0.262912,0.20613,0.348305,0.031257,0.011221,10.0,0.0
55867,47161,1.222418,-1.135515,0.126621,-0.039243,0.723991,4.750807,-1.895529,1.270656,0.445189,...,-0.53575,-0.774665,-0.041533,0.994451,0.588122,-0.26385,0.127456,0.039159,30.2,0.0
30359,35886,0.813475,-0.399457,0.096493,1.3391,-0.098217,0.394587,0.185858,0.108528,0.033567,...,0.032263,-0.095031,-0.266256,-0.27864,0.613928,-0.328201,0.003201,0.032411,180.0,0.0
93751,64570,-0.696579,-4.670129,-1.199386,-0.316504,-2.371137,-0.204587,0.664737,-0.405138,-1.779145,...,0.489786,-0.657236,-1.038159,0.11017,0.151629,-0.150693,-0.188077,0.200109,1143.92,0.0
72062,54550,1.186198,0.539193,0.702982,2.165813,0.153421,0.408914,-0.125172,0.112662,-0.933307,...,-0.203654,-0.647213,0.030426,-0.530292,0.290011,-0.22725,0.007281,0.0167,3.03,0.0


In [18]:
new_dataset.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
95534,65358,1.193916,-0.571085,0.742522,-0.014588,-0.624561,0.832162,-0.83335,0.272897,1.169425,...,-0.049502,0.207265,-0.265272,-0.679294,0.511812,1.246604,-0.028671,-0.006112,31.91,1.0
95597,65385,-2.923827,1.524837,-3.018758,3.289291,-5.755542,2.218276,-0.509995,-3.569444,-1.016592,...,-0.511657,-0.122724,-4.288639,0.563797,-0.949451,-0.204532,1.510206,-0.324706,1354.25,1.0
96341,65728,1.227614,-0.668974,-0.271785,-0.58944,-0.604795,-0.350285,-0.486365,-0.010809,-0.794944,...,-0.026055,-0.295255,-0.180459,-0.436539,0.494649,-0.283738,-0.001128,0.035075,98.01,1.0
96789,65936,-3.593476,0.781442,-1.822448,0.605761,-1.194656,-0.517195,-1.722523,0.12889,0.014963,...,0.351792,0.391249,-0.252875,-0.498042,0.010172,0.909929,-1.478767,0.722673,101.5,1.0
96994,66037,0.286302,1.399345,-1.682503,3.864377,-1.185373,-0.341732,-2.53938,0.768378,-1.547882,...,0.352456,-0.243678,-0.194079,-0.172201,0.742237,0.12779,0.569731,0.291206,7.53,1.0


In [19]:
new_dataset['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0.0,492
1.0,222


In [20]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,41840.802846,-0.24033,-0.015483,0.681016,0.235575,-0.299826,0.023346,-0.059299,0.053129,-0.110104,...,0.060706,-0.045431,-0.120207,-0.024708,-0.031532,0.144189,0.062239,-0.020359,-0.007047,103.280935
1.0,36541.941441,-6.044462,4.134072,-7.932926,4.915738,-4.386432,-1.796113,-6.30049,2.722455,-2.896811,...,0.345305,0.715188,-0.125165,-0.26545,-0.105791,0.205945,0.103589,0.523395,0.037908,114.488243


In [21]:
X = new_dataset.drop(columns='Class', axis=1)
Y = new_dataset['Class']

In [22]:
print(X)

        Time        V1        V2        V3        V4        V5        V6  \
21228  31546  1.116461 -0.246055  1.109227  0.620605 -0.661535  0.680203   
55867  47161  1.222418 -1.135515  0.126621 -0.039243  0.723991  4.750807   
30359  35886  0.813475 -0.399457  0.096493  1.339100 -0.098217  0.394587   
93751  64570 -0.696579 -4.670129 -1.199386 -0.316504 -2.371137 -0.204587   
72062  54550  1.186198  0.539193  0.702982  2.165813  0.153421  0.408914   
...      ...       ...       ...       ...       ...       ...       ...   
95534  65358  1.193916 -0.571085  0.742522 -0.014588 -0.624561  0.832162   
95597  65385 -2.923827  1.524837 -3.018758  3.289291 -5.755542  2.218276   
96341  65728  1.227614 -0.668974 -0.271785 -0.589440 -0.604795 -0.350285   
96789  65936 -3.593476  0.781442 -1.822448  0.605761 -1.194656 -0.517195   
96994  66037  0.286302  1.399345 -1.682503  3.864377 -1.185373 -0.341732   

             V7        V8        V9  ...       V20       V21       V22  \
21228 -0.7868

In [23]:
print(Y)

21228    0.0
55867    0.0
30359    0.0
93751    0.0
72062    0.0
        ... 
95534    1.0
95597    1.0
96341    1.0
96789    1.0
96994    1.0
Name: Class, Length: 714, dtype: float64


In [24]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [25]:
print(X.shape, X_train.shape, X_test.shape)

(714, 30) (571, 30) (143, 30)


In [26]:
model = LogisticRegression()

In [27]:
# training the Logistic Regression Model with Training Data
model.fit(X_train, Y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Evaluation

Accuracy Score

In [28]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [29]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.9597197898423818


In [30]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [31]:
print('Accuracy score on Test Data : ', test_data_accuracy)

Accuracy score on Test Data :  0.9370629370629371
