In [32]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [33]:
# loading the dataset to a Pandas DataFrame
credit_card_data = pd.read_csv("/content/creditcard.csv")

In [34]:
# first 5 rows of the dataset
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [35]:
credit_card_data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
134758,80949,0.120139,2.063931,-1.959708,1.420907,1.014997,-1.218084,0.818542,-0.015111,-0.162497,...,-0.17088,0.119557,0.11909,-0.12353,-0.507578,-0.447859,0.029418,-0.383972,0.89,0.0
134759,80949,1.111266,-0.124477,1.374073,0.881283,-0.962547,0.009063,-0.614334,0.125082,0.730352,...,-0.027973,0.196891,0.085341,0.480376,0.176152,0.321774,0.040823,0.030144,11.5,0.0
134760,80950,-1.192845,1.15494,1.20006,-1.334743,-0.44222,-0.687518,0.130546,0.567359,0.107404,...,-0.052138,-0.161462,0.0238,0.083906,-0.365689,0.758129,0.250869,0.167202,0.92,0.0
134761,80950,1.22837,-0.003381,0.949419,1.1357,-0.535512,0.285022,-0.567674,0.129072,0.902107,...,-0.119032,-0.097548,-0.104393,-0.436793,0.540798,-0.357359,0.075025,0.029051,1.0,0.0
134762,80950,-2.160444,-1.423523,1.890974,0.837779,2.516062,-1.968813,-0.950919,0.092957,-0.667883,...,-0.023305,,,,,,,,,


In [36]:
# dataset informations
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134763 entries, 0 to 134762
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    134763 non-null  int64  
 1   V1      134763 non-null  float64
 2   V2      134763 non-null  float64
 3   V3      134763 non-null  float64
 4   V4      134763 non-null  float64
 5   V5      134763 non-null  float64
 6   V6      134763 non-null  float64
 7   V7      134763 non-null  float64
 8   V8      134763 non-null  float64
 9   V9      134763 non-null  float64
 10  V10     134763 non-null  float64
 11  V11     134763 non-null  float64
 12  V12     134763 non-null  float64
 13  V13     134763 non-null  float64
 14  V14     134763 non-null  float64
 15  V15     134763 non-null  float64
 16  V16     134763 non-null  float64
 17  V17     134763 non-null  float64
 18  V18     134763 non-null  float64
 19  V19     134763 non-null  float64
 20  V20     134763 non-null  float64
 21  V21     13

In [37]:
# checking the number of missing values in each column
credit_card_data.isnull().sum()

Unnamed: 0,0
Time,0
V1,0
V2,0
V3,0
V4,0
V5,0
V6,0
V7,0
V8,0
V9,0


In [38]:
# distribution of legit transactions & fraudulent transactions
credit_card_data['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0.0,134500
1.0,262


In [39]:
# separating the data for analysis
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

In [40]:
print(legit.shape)
print(fraud.shape)

(134500, 31)
(262, 31)


In [41]:
# statistical measures of the data
legit.Amount.describe()

Unnamed: 0,Amount
count,134500.0
mean,91.86632
std,249.299295
min,0.0
25%,6.08
50%,24.15
75%,81.78
max,19656.53


In [42]:
fraud.Amount.describe()

Unnamed: 0,Amount
count,262.0
mean,116.235115
std,245.933637
min,0.0
25%,1.0
50%,10.685
75%,99.99
max,1809.68


In [43]:
# compare the values for both transactions
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,50696.770424,-0.234794,-0.001072,0.691212,0.13294,-0.278003,0.081094,-0.106011,0.060959,-0.082546,...,0.041169,-0.040662,-0.116121,-0.034232,0.012646,0.13065,0.02307,-0.00042,0.002115,91.86632
1.0,42119.370229,-5.644679,3.962258,-7.190628,4.521182,-4.002069,-1.48968,-5.96575,1.512608,-2.610845,...,0.240131,1.263063,-0.315132,-0.117179,-0.104567,0.200913,0.05637,0.491164,0.081891,116.235115


In [44]:
legit_sample = legit.sample(n=492)

In [45]:
new_dataset = pd.concat([legit_sample, fraud], axis=0)

In [46]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
2116,1630,-0.89425,0.599229,1.229748,0.0623,0.294846,-0.83655,0.490236,0.062386,-0.494916,...,-0.23157,-0.572177,0.384992,0.258845,-0.113908,0.06039,0.164126,0.039707,8.93,0.0
121411,76180,-0.542529,1.279636,1.028701,0.867031,-0.217305,-0.242168,0.024148,0.504564,-1.123884,...,-0.118337,-0.497958,-0.022916,-0.039085,-0.490289,0.309466,-0.011012,0.062235,1.29,0.0
78828,57725,-0.745049,-0.218414,2.458401,0.396635,-0.301907,0.462248,-0.102227,0.117808,-1.692489,...,-0.09578,0.158632,-0.053747,-0.037516,0.074095,-0.134845,-0.051733,-0.131413,71.0,0.0
115589,73925,-0.517225,1.348357,0.676354,0.233518,0.988781,-0.829791,1.026212,-0.072073,-0.669311,...,0.091734,0.318267,-0.209859,-0.055573,-0.119768,-0.522978,0.172009,0.18636,1.0,0.0
54443,46452,1.073892,0.076592,0.806747,1.366138,-0.510551,-0.250982,-0.060741,0.013041,0.416176,...,-0.199682,-0.409928,0.082413,0.38857,0.380974,-0.539248,0.053986,0.033499,35.09,0.0


In [47]:
new_dataset.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
124115,77182,-1.410852,2.268271,-2.297554,1.871331,0.248957,-1.208799,-1.358648,1.102916,-1.317364,...,0.155381,-0.61488,-0.196126,-0.464376,0.118473,-0.484537,0.373596,0.187657,1.0,1.0
124176,77202,-0.356326,1.435305,-0.813564,1.993117,2.055878,-0.543579,0.487691,0.085449,-0.536352,...,-0.312863,-0.687874,-0.267003,-1.15848,0.27146,-0.155397,0.114328,0.101526,1.0,1.0
125342,77627,-7.13906,2.773082,-6.757845,4.446456,-5.464428,-1.713401,-6.485365,3.409395,-3.053493,...,1.30325,-0.016118,-0.87667,0.38223,-1.054624,-0.614606,-0.766848,0.409424,106.9,1.0
128479,78725,-4.312479,1.886476,-2.338634,-0.475243,-1.185444,-2.112079,-2.122793,0.272565,0.290273,...,0.550541,-0.06787,-1.114692,0.269069,-0.020572,-0.963489,-0.918888,0.001454,60.0,1.0
131272,79540,-0.114361,1.036129,1.984405,3.128243,-0.740344,1.548619,-1.701284,-2.203842,-1.242265,...,-1.032935,1.196428,-0.112857,0.254719,0.696668,0.48237,0.129969,0.223924,0.2,1.0


In [48]:
new_dataset['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0.0,492
1.0,262


In [49]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,53408.014228,-0.108546,-0.009198,0.70795,0.094155,-0.292462,0.189516,-0.055426,0.113645,-0.110215,...,0.062454,0.004451,-0.090469,-0.056718,-0.003288,0.130078,0.014721,-0.005039,-0.009277,97.325366
1.0,42119.370229,-5.644679,3.962258,-7.190628,4.521182,-4.002069,-1.48968,-5.96575,1.512608,-2.610845,...,0.240131,1.263063,-0.315132,-0.117179,-0.104567,0.200913,0.05637,0.491164,0.081891,116.235115


In [50]:
X = new_dataset.drop(columns='Class', axis=1)
Y = new_dataset['Class']

In [51]:
print(X)

         Time        V1        V2        V3        V4        V5        V6  \
2116     1630 -0.894250  0.599229  1.229748  0.062300  0.294846 -0.836550   
121411  76180 -0.542529  1.279636  1.028701  0.867031 -0.217305 -0.242168   
78828   57725 -0.745049 -0.218414  2.458401  0.396635 -0.301907  0.462248   
115589  73925 -0.517225  1.348357  0.676354  0.233518  0.988781 -0.829791   
54443   46452  1.073892  0.076592  0.806747  1.366138 -0.510551 -0.250982   
...       ...       ...       ...       ...       ...       ...       ...   
124115  77182 -1.410852  2.268271 -2.297554  1.871331  0.248957 -1.208799   
124176  77202 -0.356326  1.435305 -0.813564  1.993117  2.055878 -0.543579   
125342  77627 -7.139060  2.773082 -6.757845  4.446456 -5.464428 -1.713401   
128479  78725 -4.312479  1.886476 -2.338634 -0.475243 -1.185444 -2.112079   
131272  79540 -0.114361  1.036129  1.984405  3.128243 -0.740344  1.548619   

              V7        V8        V9  ...       V20       V21       V22  \


In [52]:
print(Y)

2116      0.0
121411    0.0
78828     0.0
115589    0.0
54443     0.0
         ... 
124115    1.0
124176    1.0
125342    1.0
128479    1.0
131272    1.0
Name: Class, Length: 754, dtype: float64


In [53]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [54]:
print(X.shape, X_train.shape, X_test.shape)

(754, 30) (603, 30) (151, 30)


In [55]:
model = LogisticRegression()

In [56]:
# training the Logistic Regression Model with Training Data
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [57]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [58]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.9585406301824212


In [59]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [60]:
print('Accuracy score on Test Data : ', test_data_accuracy)

Accuracy score on Test Data :  0.9668874172185431
