Importing the Dependencies

In [127]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model  import LogisticRegression
from sklearn.metrics import accuracy_score


In [128]:
# Loading the Dataset to Pandas DataFrame
credit_card_data = pd.read_csv('/content/creditcard.csv')

In [129]:
# First Five rows of Dataset
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [130]:
# Last Five rows of Dataset
credit_card_data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
204076,135100.0,2.126725,-0.105402,-1.510722,0.165252,0.37654,-0.534798,0.131359,-0.228139,0.648901,...,-0.355017,-0.88868,0.22012,-1.069895,-0.181889,0.266874,-0.070569,-0.071622,1.98,0.0
204077,135100.0,-0.582696,2.109869,0.699058,4.35085,0.524572,0.436313,0.671204,0.087439,-1.980439,...,-0.131965,-0.288737,0.194044,1.115286,-0.979408,-0.089576,-0.227109,0.015287,13.26,0.0
204078,135101.0,1.988614,-0.20724,-0.259236,0.440332,-0.592331,-0.694481,-0.372261,-0.152909,1.108784,...,-0.170658,-0.280954,0.351013,-0.037285,-0.381279,-0.624344,0.042626,-0.028725,7.47,0.0
204079,135102.0,1.862102,-0.124052,-1.989752,0.382609,0.473032,-0.674517,0.298621,-0.282416,0.802053,...,-0.204158,-0.511441,0.077874,0.388335,0.007896,-0.12098,-0.019579,0.006155,108.51,1.0
204080,135102.0,0.245859,0.925437,-0.542877,-0.891049,1.585514,0.074328,1.095022,-0.146891,-0.213699,...,,,,,,,,,,


In [131]:
# Dataset Infomation
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204081 entries, 0 to 204080
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    204081 non-null  float64
 1   V1      204081 non-null  float64
 2   V2      204081 non-null  float64
 3   V3      204081 non-null  float64
 4   V4      204081 non-null  float64
 5   V5      204081 non-null  float64
 6   V6      204081 non-null  float64
 7   V7      204081 non-null  float64
 8   V8      204081 non-null  float64
 9   V9      204081 non-null  float64
 10  V10     204081 non-null  float64
 11  V11     204081 non-null  float64
 12  V12     204080 non-null  float64
 13  V13     204080 non-null  float64
 14  V14     204080 non-null  float64
 15  V15     204080 non-null  float64
 16  V16     204080 non-null  float64
 17  V17     204080 non-null  float64
 18  V18     204080 non-null  float64
 19  V19     204080 non-null  float64
 20  V20     204080 non-null  float64
 21  V21     20

In [132]:
# Checking the number of Missing Values in each Column
credit_card_data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       1
V13       1
V14       1
V15       1
V16       1
V17       1
V18       1
V19       1
V20       1
V21       1
V22       1
V23       1
V24       1
V25       1
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64

In [133]:
credit_card_data.shape

(204081, 31)

In [134]:
# Distribution of legit Transaction & Fraudulent Transaction
credit_card_data['Class'].value_counts()

0.0    203688
1.0       392
Name: Class, dtype: int64

This Dataset is Highly Unbalanced







0 == Normal Transaction


1 == Fraudulent Transaction

In [135]:
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]


In [136]:
print(legit.shape)
print(fraud.shape)

(203688, 31)
(392, 31)


In [137]:
# Statistical measures of the Data
legit.Amount.describe()

count    203688.000000
mean         89.698841
std         248.388033
min           0.000000
25%           5.990000
50%          23.000000
75%          79.190000
max       19656.530000
Name: Amount, dtype: float64

In [138]:
fraud.Amount.describe()

count     392.000000
mean      123.485765
std       257.455297
min         0.000000
25%         1.000000
50%        12.310000
75%       105.915000
max      2125.870000
Name: Amount, dtype: float64

In [139]:
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,71931.358926,-0.085236,-0.006647,0.310657,0.061002,-0.094003,0.045763,-0.032985,0.012967,0.013841,...,0.017287,-0.014904,-0.042991,-0.014317,0.004019,0.057168,0.005541,0.001063,0.001671,89.698841
1.0,62596.510204,-5.586024,4.158763,-7.671289,4.751863,-4.024293,-1.396896,-6.519177,0.663336,-2.750902,...,0.395749,0.777634,-0.01669,-0.047536,-0.065304,0.066588,0.038878,0.181626,0.059742,123.485765


Under Sampling

Build a sample dataset containing similar distribution of Normal transaction and Fraudlent transaction

Number of Fraudulent Transaction == 88

In [140]:
legit_sample = legit.sample(n=88)

concatenating two dataframe

In [141]:
new_dataset = pd.concat([legit_sample, fraud], axis=0)

In [142]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
168295,119156.0,2.35396,-1.275507,-1.721977,-1.815677,-0.542605,-0.497751,-0.799995,-0.305477,-1.537262,...,0.045437,0.552762,-0.009383,0.055954,0.196338,0.084339,-0.018459,-0.056019,35.99,0.0
10981,18898.0,-1.413647,2.62303,-0.590587,3.034058,-0.319054,-0.284531,-0.296967,1.147875,-0.364765,...,0.083967,0.457851,0.250646,0.034032,-0.936005,0.103,0.323074,0.221523,9.87,0.0
96692,65887.0,-1.837129,-0.860606,0.931402,-0.503311,-1.601545,-0.063339,-1.372122,1.010606,-1.067833,...,0.526237,1.208618,-0.385332,0.098418,-0.539129,-0.058342,0.079373,-0.31126,151.0,0.0
121184,76101.0,-0.963647,0.678023,2.075913,-0.706831,0.565468,0.966352,0.496846,0.540341,-0.478735,...,-0.117869,-0.385623,0.08279,-0.683447,-0.267812,0.16878,0.045267,0.050105,19.95,0.0
131926,79792.0,-1.109977,0.770865,1.827618,-0.429157,0.515795,-1.248315,0.761164,-0.209602,-0.009781,...,-0.353584,-0.945411,-0.246011,0.288438,0.183612,-0.004892,-0.035473,-0.100632,1.29,0.0


In [143]:
new_dataset.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
203324,134766.0,-0.079653,3.22201,-3.724201,6.037345,0.583395,-0.691346,-1.799885,-2.627781,-4.001338,...,-0.664694,1.138556,-0.350753,-0.287467,0.808889,0.823962,0.668497,0.59561,1.0,1.0
203328,134769.0,-0.967767,2.098019,-5.222929,6.514573,-4.187674,2.114178,0.948701,-2.448427,-3.203666,...,-0.843268,0.796739,1.314312,-0.352887,-1.770706,0.098133,0.956769,0.162777,925.31,1.0
203700,134928.0,1.204934,3.23807,-6.010324,5.720847,1.5484,-2.321064,-0.78188,0.076619,-2.976249,...,0.098341,-0.845866,-0.031228,0.421146,0.388361,0.056035,0.491828,0.340847,0.0,1.0
204064,135095.0,0.232512,0.938944,-4.64778,3.079844,-1.902655,-1.041408,-1.020407,0.547069,-1.10599,...,0.911373,1.042929,0.999394,0.90126,-0.452093,0.192959,0.180859,-0.029315,345.0,1.0
204079,135102.0,1.862102,-0.124052,-1.989752,0.382609,0.473032,-0.674517,0.298621,-0.282416,0.802053,...,-0.204158,-0.511441,0.077874,0.388335,0.007896,-0.12098,-0.019579,0.006155,108.51,1.0


In [144]:
new_dataset['Class'].value_counts()

1.0    392
0.0     88
Name: Class, dtype: int64

In [145]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,71054.375,-0.002853,-0.014633,0.470928,-0.048589,-0.157074,0.014944,0.074384,0.100971,-0.116311,...,0.025318,-0.001698,-0.041163,-0.086723,-0.008029,0.043771,0.013663,0.036958,0.023807,93.575568
1.0,62596.510204,-5.586024,4.158763,-7.671289,4.751863,-4.024293,-1.396896,-6.519177,0.663336,-2.750902,...,0.395749,0.777634,-0.01669,-0.047536,-0.065304,0.066588,0.038878,0.181626,0.059742,123.485765


Splitting the data into Features and Target

In [146]:
X = new_dataset.drop(columns='Class', axis=1)
Y = new_dataset['Class']

In [147]:
print(X)

            Time        V1        V2        V3        V4        V5        V6  \
168295  119156.0  2.353960 -1.275507 -1.721977 -1.815677 -0.542605 -0.497751   
10981    18898.0 -1.413647  2.623030 -0.590587  3.034058 -0.319054 -0.284531   
96692    65887.0 -1.837129 -0.860606  0.931402 -0.503311 -1.601545 -0.063339   
121184   76101.0 -0.963647  0.678023  2.075913 -0.706831  0.565468  0.966352   
131926   79792.0 -1.109977  0.770865  1.827618 -0.429157  0.515795 -1.248315   
...          ...       ...       ...       ...       ...       ...       ...   
203324  134766.0 -0.079653  3.222010 -3.724201  6.037345  0.583395 -0.691346   
203328  134769.0 -0.967767  2.098019 -5.222929  6.514573 -4.187674  2.114178   
203700  134928.0  1.204934  3.238070 -6.010324  5.720847  1.548400 -2.321064   
204064  135095.0  0.232512  0.938944 -4.647780  3.079844 -1.902655 -1.041408   
204079  135102.0  1.862102 -0.124052 -1.989752  0.382609  0.473032 -0.674517   

              V7        V8        V9  .

In [148]:
print(Y)

168295    0.0
10981     0.0
96692     0.0
121184    0.0
131926    0.0
         ... 
203324    1.0
203328    1.0
203700    1.0
204064    1.0
204079    1.0
Name: Class, Length: 480, dtype: float64


Split the data intto Training data and Testing data

In [149]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2 )

In [150]:
print(X.shape, X_train.shape, X_test.shape)

(480, 30) (384, 30) (96, 30)


Model Training

Logistic Regression

In [151]:
model = LogisticRegression()

In [152]:
#training the LogisticRegression Model
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Evaluation

Accuracy Score

In [153]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [154]:
print('Accuracy Score on Training data:', training_data_accuracy )

Accuracy Score on Training data: 0.953125


In [155]:
# accuray on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [156]:
print('Accuracy Score on Test data:', test_data_accuracy )

Accuracy Score on Test data: 0.8958333333333334
