In [3]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [4]:
#loading datasets to pandas
credit_card_data=pd.read_csv('/content/creditcard.csv.csv')

In [5]:
# display first 5 rows
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [6]:
credit_card_data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
217850,141066.0,-0.259829,0.564607,-0.196756,-0.444065,0.487357,-1.209572,0.607188,-0.044463,0.064676,...,-0.181718,-0.492476,0.092356,-0.087076,-0.912439,0.031397,-0.354536,-0.309259,1.98,0.0
217851,141066.0,2.055863,-0.287366,-0.686354,0.20563,-0.146008,-0.064728,-0.466015,-0.018369,1.293906,...,-0.240923,-0.501231,0.248438,-1.097865,-0.289774,-0.555,0.040091,-0.041624,3.74,0.0
217852,141067.0,0.317595,-3.477513,-1.561692,-0.182863,-1.588414,-0.131552,0.428889,-0.19363,1.226662,...,0.747245,0.305072,-0.641818,-0.278629,-0.665032,0.617244,-0.205433,0.087356,888.3,0.0
217853,141067.0,-1.907794,-2.067306,1.242712,-1.31162,1.699461,-1.345724,-1.273915,-0.310157,-0.02208,...,-0.30014,0.172267,-0.567616,-0.329426,-1.080322,-0.516887,0.541941,-0.207201,42.0,0.0
217854,141069.0,-2.453156,0.48481,1.87405,-2.952909,-0.800719,-0.340965,-3.030323,-4.964019,1.617891,...,,,,,,,,,,


In [7]:
#Dataset Information
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 217855 entries, 0 to 217854
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    217855 non-null  float64
 1   V1      217855 non-null  float64
 2   V2      217855 non-null  float64
 3   V3      217855 non-null  float64
 4   V4      217855 non-null  float64
 5   V5      217855 non-null  float64
 6   V6      217855 non-null  float64
 7   V7      217855 non-null  float64
 8   V8      217855 non-null  float64
 9   V9      217855 non-null  float64
 10  V10     217854 non-null  float64
 11  V11     217854 non-null  float64
 12  V12     217854 non-null  float64
 13  V13     217854 non-null  float64
 14  V14     217854 non-null  float64
 15  V15     217854 non-null  float64
 16  V16     217854 non-null  float64
 17  V17     217854 non-null  float64
 18  V18     217854 non-null  float64
 19  V19     217854 non-null  float64
 20  V20     217854 non-null  float64
 21  V21     21

In [8]:
#checking numbers of missing values in each column
credit_card_data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       1
V11       1
V12       1
V13       1
V14       1
V15       1
V16       1
V17       1
V18       1
V19       1
V20       1
V21       1
V22       1
V23       1
V24       1
V25       1
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64

In [11]:
# distribution of normal transaction (0) and fraud transaction(1)
credit_card_data['Class'].value_counts()

0.0    217451
1.0       403
Name: Class, dtype: int64

In [13]:
#separating data for analysis
normal=credit_card_data[credit_card_data.Class==0]
fraud=credit_card_data[credit_card_data.Class==1]


In [14]:
print(normal.shape)
print(fraud.shape)

(217451, 31)
(403, 31)


In [15]:
#statistical measures of data
normal.Amount.describe()

count    217451.000000
mean         90.384426
std         249.610731
min           0.000000
25%           6.000000
50%          23.280000
75%          79.855000
max       19656.530000
Name: Amount, dtype: float64

In [16]:
fraud.Amount.describe()

count     403.000000
mean      125.873325
std       258.859233
min         0.000000
25%         1.000000
50%        14.460000
75%       106.270000
max      2125.870000
Name: Amount, dtype: float64

In [17]:
#compare values of both transaction
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,76123.843261,-0.066694,-0.0155,0.242639,0.045146,-0.070279,0.039057,-0.021068,0.005875,0.010082,...,0.012654,-0.010018,-0.030581,-0.01186,0.002122,0.045486,0.00371,0.000125,0.001838,90.384426
1.0,64681.270471,-5.487115,4.050452,-7.57606,4.689855,-3.904513,-1.370212,-6.396308,0.661419,-2.714696,...,0.390654,0.782512,0.007675,-0.042564,-0.084868,0.056909,0.041572,0.191745,0.063601,125.873325


In [None]:
# similar distribution of normal and fraud transaction


No.of fraud-->403

In [19]:
normal_sample = normal.sample(n=403)

In [20]:
#Concatenate 2 dataframes

In [21]:
new_datasets=pd.concat([normal_sample,fraud],axis=0)

In [22]:
new_datasets.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
83130,59703.0,0.986616,-1.067303,0.864926,0.151555,-0.809678,1.56392,-1.068451,0.570969,-0.058511,...,-0.334271,-0.16158,0.139195,-0.616889,0.008736,0.633029,0.079176,0.016074,66.95,0.0
138095,82483.0,-1.320091,-0.360154,-0.3882,0.306254,-0.453692,1.541223,1.79866,0.474634,-1.339025,...,-0.111559,-1.401405,1.119522,-1.788666,-0.080361,0.194986,-0.277971,-0.057733,489.95,0.0
15803,27244.0,1.153057,-0.008461,0.543321,0.526325,-0.530719,-0.365179,-0.257492,0.189088,0.094496,...,-0.187908,-0.658358,0.202334,0.159884,0.013186,0.101135,-0.030759,0.003328,4.49,0.0
200835,133601.0,1.840201,-0.111553,-3.224608,0.435429,2.878511,3.254956,-0.058971,0.741232,0.08537,...,-0.167673,-0.628481,0.144301,0.556329,0.108852,-0.761712,0.01953,-0.014005,86.98,0.0
48742,43763.0,1.302199,-1.803525,1.481264,-0.925304,-2.316655,0.762524,-2.021571,0.40038,-0.407903,...,-0.262467,-0.032481,-0.084425,0.031633,0.371557,-0.081121,0.09658,0.024406,52.95,0.0


In [23]:
new_datasets.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
214662,139767.0,0.467992,1.100118,-5.607145,2.204714,-0.578539,-0.1742,-3.454201,1.102823,-1.065016,...,0.983481,0.899876,-0.285103,-1.929717,0.319869,0.170636,0.851798,0.372098,120.54,1.0
214775,139816.0,-0.395582,-0.751792,-1.984666,-0.203459,1.903967,-1.430289,-0.076548,-0.99226,0.756307,...,1.377515,2.151787,0.189225,0.772943,-0.872443,-0.200612,0.356856,0.032113,0.69,1.0
215132,139951.0,-2.921944,-0.228062,-5.877289,2.201884,-1.93544,0.631141,-1.245106,1.511348,-1.899987,...,1.441622,0.895528,1.385511,-2.028024,0.509131,0.172643,0.726781,0.234514,723.21,1.0
215953,140293.0,0.951025,3.252926,-5.039105,4.632411,3.014501,-1.34957,0.98094,-1.819539,-2.099049,...,1.404524,-0.760549,0.358292,-1.185942,-1.286177,0.000365,0.169662,0.108276,0.77,1.0
215984,140308.0,-4.861747,-2.72266,-4.656248,2.502005,-2.008346,0.615422,-3.48568,1.878856,-1.116268,...,1.138876,1.033664,-0.806199,-1.511046,-0.191731,0.080999,1.215152,-0.923142,592.9,1.0


In [24]:
new_datasets['Class'].value_counts()

0.0    403
1.0    403
Name: Class, dtype: int64

In [25]:
new_datasets.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,76214.593052,0.179359,-0.007108,0.211342,0.049068,-0.058993,-0.034409,0.013537,-0.0192,0.022758,...,0.008326,0.020333,-0.048065,-0.039169,-0.015006,0.021197,0.035308,3.2e-05,-0.000136,85.278114
1.0,64681.270471,-5.487115,4.050452,-7.57606,4.689855,-3.904513,-1.370212,-6.396308,0.661419,-2.714696,...,0.390654,0.782512,0.007675,-0.042564,-0.084868,0.056909,0.041572,0.191745,0.063601,125.873325


In [26]:
#splitting data into features and target

In [27]:
X=new_datasets.drop(columns='Class',axis=1)
Y=new_datasets['Class']


In [28]:
print(X)

            Time        V1        V2        V3        V4        V5        V6  \
83130    59703.0  0.986616 -1.067303  0.864926  0.151555 -0.809678  1.563920   
138095   82483.0 -1.320091 -0.360154 -0.388200  0.306254 -0.453692  1.541223   
15803    27244.0  1.153057 -0.008461  0.543321  0.526325 -0.530719 -0.365179   
200835  133601.0  1.840201 -0.111553 -3.224608  0.435429  2.878511  3.254956   
48742    43763.0  1.302199 -1.803525  1.481264 -0.925304 -2.316655  0.762524   
...          ...       ...       ...       ...       ...       ...       ...   
214662  139767.0  0.467992  1.100118 -5.607145  2.204714 -0.578539 -0.174200   
214775  139816.0 -0.395582 -0.751792 -1.984666 -0.203459  1.903967 -1.430289   
215132  139951.0 -2.921944 -0.228062 -5.877289  2.201884 -1.935440  0.631141   
215953  140293.0  0.951025  3.252926 -5.039105  4.632411  3.014501 -1.349570   
215984  140308.0 -4.861747 -2.722660 -4.656248  2.502005 -2.008346  0.615422   

              V7        V8        V9  .

In [29]:
print(Y)

83130     0.0
138095    0.0
15803     0.0
200835    0.0
48742     0.0
         ... 
214662    1.0
214775    1.0
215132    1.0
215953    1.0
215984    1.0
Name: Class, Length: 806, dtype: float64


In [41]:
#Split data into Training & Testing data
X_train,X_test, Y_train, Y_test=train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=2)

In [40]:
print(X.shape,X_train.shape,X_test.shape)

(806, 30) (644, 30) (162, 30)


Model Training


Logistic Regression


In [33]:
model=LogisticRegression()

In [43]:
#training logistic regression model with traning data
model.fit(X_train, Y_train)

Model Evaluation[Accuracy Score]

In [46]:
#Accuracy on training data
X_train_prediction=model.predict(X_train)
training_data_accuracy=accuracy_score(X_train_prediction,Y_train)
print('Accuracy on Training data: ', training_data_accuracy)

Accuracy on Training data:  0.9487577639751553


In [47]:
#Accuracy on test data
X_test_prediction=model.predict(X_test)
testing_data_accuracy=accuracy_score(X_test_prediction,Y_test)
print('Accuracy on Testing data: ', testing_data_accuracy)

Accuracy on Testing data:  0.9259259259259259
