Importing the Dependencies

In [17]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [18]:
# loading the dataset to a Pandas DataFrame
credit_card_data = pd.read_csv('/creditcard.csv')

In [19]:
# first 5 rows of the dataset
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [20]:
credit_card_data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
132779,80109,1.310383,0.148796,-0.106156,0.245652,0.376795,0.27666,-0.048826,0.027499,0.151769,...,-0.321307,-0.897167,-0.008168,-1.336089,0.327048,0.211107,-0.015559,0.001677,4.49,0.0
132780,80109,1.187695,0.161777,-0.102921,0.367072,8.2e-05,-0.725995,0.370336,-0.16948,-0.44123,...,-0.329042,-1.104087,0.058738,0.007578,0.288024,0.136426,-0.07515,0.003084,45.95,0.0
132781,80109,1.2104,-0.061808,0.93441,1.149472,-0.586856,0.269742,-0.582677,0.148328,0.963741,...,-0.118127,-0.134792,-0.107835,-0.459923,0.522964,-0.356392,0.07072,0.029614,9.99,0.0
132782,80110,-0.357977,-0.288527,1.506609,-2.484692,-1.031518,-0.910416,-0.259715,0.024064,-2.41357,...,-0.077703,0.151033,-0.00462,0.481458,-0.370021,-0.43937,0.380669,0.203272,15.0,0.0
132783,80110,-1.82405,0.75046,1.120322,1.158644,-0.184192,-0.196253,-0.008336,0.202622,-0.182532,...,-0.156936,-0.392619,-0.116801,-0.058439,0.238796,,,,,


In [21]:
# dataset informations
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132784 entries, 0 to 132783
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    132784 non-null  int64  
 1   V1      132784 non-null  float64
 2   V2      132784 non-null  float64
 3   V3      132784 non-null  float64
 4   V4      132784 non-null  float64
 5   V5      132784 non-null  float64
 6   V6      132784 non-null  float64
 7   V7      132784 non-null  float64
 8   V8      132784 non-null  float64
 9   V9      132784 non-null  float64
 10  V10     132784 non-null  float64
 11  V11     132784 non-null  float64
 12  V12     132784 non-null  float64
 13  V13     132784 non-null  float64
 14  V14     132784 non-null  float64
 15  V15     132784 non-null  float64
 16  V16     132784 non-null  float64
 17  V17     132784 non-null  float64
 18  V18     132784 non-null  float64
 19  V19     132784 non-null  float64
 20  V20     132784 non-null  float64
 21  V21     13

In [22]:
# checking the number of missing values in each column
credit_card_data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64

In [23]:
# distribution of legit transactions & fraudulent transactions
credit_card_data['Class'].value_counts()

0.0    132521
1.0       262
Name: Class, dtype: int64

This Dataset is highly unblanced

0 --> Normal Transaction

1 --> fraudulent transaction

In [26]:
# separating the data for analysis
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

In [27]:
print(legit.shape)
print(fraud.shape)

(132521, 31)
(262, 31)


In [28]:
# statistical measures of the data
legit.Amount.describe()

count    132521.000000
mean         92.152747
std         250.206959
min           0.000000
25%           6.150000
50%          24.350000
75%          82.000000
max       19656.530000
Name: Amount, dtype: float64

In [29]:
fraud.Amount.describe()

count     262.000000
mean      116.235115
std       245.933637
min         0.000000
25%         1.000000
50%        10.685000
75%        99.990000
max      1809.680000
Name: Amount, dtype: float64

In [30]:
# compare the values for both transactions
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,50251.345153,-0.234172,-0.005197,0.692795,0.133152,-0.278166,0.082265,-0.106357,0.059549,-0.080086,...,0.041305,-0.04033,-0.115562,-0.034117,0.012655,0.130888,0.022872,-1.9e-05,0.002266,92.152747
1.0,42119.370229,-5.644679,3.962258,-7.190628,4.521182,-4.002069,-1.48968,-5.96575,1.512608,-2.610845,...,0.240131,1.263063,-0.315132,-0.117179,-0.104567,0.200913,0.05637,0.491164,0.081891,116.235115


Under-Sampling

Build a sample dataset containing similar distribution of normal transactions and Fraudulent Transactions

Number of Fraudulent Transactions --> 492

In [31]:
legit_sample = legit.sample(n=492)

Concatenating two DataFrames

In [32]:
new_dataset = pd.concat([legit_sample, fraud], axis=0)

In [33]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
107263,70361,-0.549179,0.575333,2.110009,-0.189921,0.173191,0.849991,0.045435,0.304159,-0.136092,...,-0.014764,0.122829,-0.33194,-0.769249,0.040107,0.346045,0.096002,0.091456,11.5,0.0
113813,73216,-0.656677,1.206945,1.026225,0.812685,-0.007804,-0.175646,0.227742,0.542883,-1.07973,...,0.290754,0.747568,-0.084986,0.237667,-0.372742,-0.332788,0.063738,0.082275,1.23,0.0
31809,36525,0.994686,-0.169346,-0.351909,0.983237,0.443766,0.458832,0.273602,-0.051538,-0.043587,...,0.106452,0.086611,-0.395891,-1.297837,0.754673,-0.182551,0.008624,0.03246,156.98,0.0
5749,6103,-0.2951,2.607655,-2.742706,1.11294,1.756604,-0.291342,0.711233,0.000777,1.566567,...,-0.448236,-0.403346,-8.6e-05,-1.742925,-0.111752,-0.322949,0.780449,0.282922,8.99,0.0
70800,54024,-2.506095,2.711808,0.01496,-0.238708,-0.80356,0.554755,-2.185303,-4.456167,-1.047871,...,-2.729477,0.273574,0.46167,-0.054243,-0.171112,0.024735,-0.003207,-0.049727,11.99,0.0


In [34]:
new_dataset.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
124115,77182,-1.410852,2.268271,-2.297554,1.871331,0.248957,-1.208799,-1.358648,1.102916,-1.317364,...,0.155381,-0.61488,-0.196126,-0.464376,0.118473,-0.484537,0.373596,0.187657,1.0,1.0
124176,77202,-0.356326,1.435305,-0.813564,1.993117,2.055878,-0.543579,0.487691,0.085449,-0.536352,...,-0.312863,-0.687874,-0.267003,-1.15848,0.27146,-0.155397,0.114328,0.101526,1.0,1.0
125342,77627,-7.13906,2.773082,-6.757845,4.446456,-5.464428,-1.713401,-6.485365,3.409395,-3.053493,...,1.30325,-0.016118,-0.87667,0.38223,-1.054624,-0.614606,-0.766848,0.409424,106.9,1.0
128479,78725,-4.312479,1.886476,-2.338634,-0.475243,-1.185444,-2.112079,-2.122793,0.272565,0.290273,...,0.550541,-0.06787,-1.114692,0.269069,-0.020572,-0.963489,-0.918888,0.001454,60.0,1.0
131272,79540,-0.114361,1.036129,1.984405,3.128243,-0.740344,1.548619,-1.701284,-2.203842,-1.242265,...,-1.032935,1.196428,-0.112857,0.254719,0.696668,0.48237,0.129969,0.223924,0.2,1.0


In [None]:
new_dataset['Class'].value_counts()

1    492
0    492
Name: Class, dtype: int64

In [35]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,50250.111789,-0.304285,0.166305,0.695658,0.115457,-0.353293,0.079374,-0.137187,0.095672,0.03571,...,0.07887,-0.061674,-0.099609,-0.044536,0.009136,0.140059,0.016446,0.027069,0.007822,78.382398
1.0,42119.370229,-5.644679,3.962258,-7.190628,4.521182,-4.002069,-1.48968,-5.96575,1.512608,-2.610845,...,0.240131,1.263063,-0.315132,-0.117179,-0.104567,0.200913,0.05637,0.491164,0.081891,116.235115


Splitting the data into Features & Targets

In [None]:
X = new_dataset.drop(columns='Class', axis=1)
Y = new_dataset['Class']

In [None]:
print(X)

            Time        V1        V2  ...       V27       V28  Amount
203131  134666.0 -1.220220 -1.729458  ...  0.173995 -0.023852  155.00
95383    65279.0 -1.295124  0.157326  ...  0.317321  0.105345   70.00
99706    67246.0 -1.481168  1.226490  ... -0.546577  0.076538   40.14
153895  100541.0 -0.181013  1.395877  ... -0.229857 -0.329608  137.04
249976  154664.0  0.475977 -0.573662  ...  0.058961  0.012816   19.60
...          ...       ...       ...  ...       ...       ...     ...
279863  169142.0 -1.927883  1.125653  ...  0.292680  0.147968  390.00
280143  169347.0  1.378559  1.289381  ...  0.389152  0.186637    0.76
280149  169351.0 -0.676143  1.126366  ...  0.385107  0.194361   77.89
281144  169966.0 -3.113832  0.585864  ...  0.884876 -0.253700  245.00
281674  170348.0  1.991976  0.158476  ...  0.002988 -0.015309   42.53

[984 rows x 30 columns]


In [None]:
print(Y)

203131    0
95383     0
99706     0
153895    0
249976    0
         ..
279863    1
280143    1
280149    1
281144    1
281674    1
Name: Class, Length: 984, dtype: int64


Split the data into Training data & Testing Data

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [None]:
print(X.shape, X_train.shape, X_test.shape)

(984, 30) (787, 30) (197, 30)


Model Training

Logistic Regression

In [None]:
model = LogisticRegression()

In [None]:
# training the Logistic Regression Model with Training Data
model.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

Model Evaluation

Accuracy Score

In [None]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [None]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.9415501905972046


In [None]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [None]:
print('Accuracy score on Test Data : ', test_data_accuracy)

Accuracy score on Test Data :  0.9390862944162437
