In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
# http://localhost:8889/edit/creditcard.csv
credit_card_data = pd.read_csv('creditcard.csv')

In [4]:
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [5]:
credit_card_data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
130797,79434,-5.466097,2.517599,-2.343553,4.215289,-3.280618,-0.287955,-2.296688,3.464467,-2.399703,...,0.359007,-0.339049,0.336488,0.373767,-0.522598,-0.237532,-2.033451,-0.874593,39.32,0.0
130798,79434,-0.400431,1.081795,1.398292,-0.004797,-0.018701,-0.773994,0.616732,0.061465,-0.666232,...,-0.198177,-0.540952,0.001058,0.483626,-0.218053,0.041827,0.24153,0.090224,4.49,0.0
130799,79435,-0.061008,-0.744399,1.800945,-2.150292,-1.040646,0.448124,-0.344005,-0.007536,-2.311161,...,0.059198,0.617976,-0.030908,-0.313231,-0.141417,-0.14207,-0.096524,-0.202511,79.0,0.0
130800,79435,1.06454,-0.651524,0.504264,-0.180169,-0.255126,1.240949,-0.660998,0.327149,0.820702,...,-0.178904,-0.367111,-0.159203,-1.226536,0.230462,1.026093,-0.029286,0.00401,89.99,0.0
130801,79435,-0.743444,0.539346,2.515842,-1.960928,-0.421662,-0.724772,0.537915,-0.156173,1.29279,...,-0.054806,0.321604,-0.410774,0.417508,,,,,,


In [6]:
# dataset informations
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130802 entries, 0 to 130801
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    130802 non-null  int64  
 1   V1      130802 non-null  float64
 2   V2      130802 non-null  float64
 3   V3      130802 non-null  float64
 4   V4      130802 non-null  float64
 5   V5      130802 non-null  float64
 6   V6      130802 non-null  float64
 7   V7      130802 non-null  float64
 8   V8      130802 non-null  float64
 9   V9      130802 non-null  float64
 10  V10     130802 non-null  float64
 11  V11     130802 non-null  float64
 12  V12     130802 non-null  float64
 13  V13     130802 non-null  float64
 14  V14     130802 non-null  float64
 15  V15     130802 non-null  float64
 16  V16     130802 non-null  float64
 17  V17     130802 non-null  float64
 18  V18     130802 non-null  float64
 19  V19     130802 non-null  float64
 20  V20     130802 non-null  float64
 21  V21     13

In [7]:
#checking the number of missing value in each column
credit_card_data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       1
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64

In [8]:
# distribution of legit trasactions and fraudulent transactions
credit_card_data['Class'].value_counts()

0.0    130540
1.0       261
Name: Class, dtype: int64

In [9]:
# This datset if highly unbalanced 
# 0 --> Normal Transaction  
# 1 --> Fraudulent transaction

In [10]:
#Separating data for analysis
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

In [11]:
print(legit.shape)
print(fraud.shape)

(130540, 31)
(261, 31)


In [12]:
# Statistical measures of the data
legit.Amount.describe()

count    130540.000000
mean         92.443479
std         250.815557
min           0.000000
25%           6.240000
50%          24.560000
75%          82.500000
max       19656.530000
Name: Amount, dtype: float64

In [13]:
fraud.Amount.describe()

count     261.000000
mean      116.679693
std       246.300626
min         0.000000
25%         1.000000
50%        11.380000
75%        99.990000
max      1809.680000
Name: Amount, dtype: float64

In [14]:
# compare the values for both  transactions 
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,49803.748514,-0.234955,-0.006371,0.692647,0.135422,-0.278173,0.083854,-0.106118,0.059408,-0.079864,...,0.041665,-0.040103,-0.115137,-0.034182,0.012281,0.130839,0.023708,-0.000204,0.002147,92.443479
1.0,41975.996169,-5.665868,3.97347,-7.225782,4.526519,-4.014566,-1.501321,-5.982089,1.526847,-2.616088,...,0.238243,1.27186,-0.320923,-0.117196,-0.105943,0.199014,0.054738,0.492548,0.081347,116.679693


In [15]:
#Under-Sampling 
# Build a sample dataset containg similar distribution of normal trasaction and Fradulent Trasactions
# Number of Fradulent Trasactions --> 492

In [17]:
legit_sample = legit.sample(n=492)

In [18]:
# Concatenating 2 Dataframes

In [19]:
new_dataset = pd.concat([legit_sample, fraud], axis=0)
# axis = 0 --> row
# axis = 1 --> column

In [20]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
30452,35931,1.203544,0.091122,0.587003,0.568025,-0.66528,-0.823855,-0.147294,0.020266,0.015686,...,-0.211397,-0.725939,0.169451,0.486144,0.105828,0.069967,-0.042981,0.007292,0.89,0.0
16737,28099,1.202876,0.033211,0.588177,0.362524,-0.608848,-0.677965,-0.103443,-0.050257,0.116552,...,-0.180281,-0.427299,0.074373,0.586817,0.277263,0.41434,-0.045149,0.002924,4.0,0.0
89824,62739,-0.632546,-0.619568,1.467558,-2.458384,-1.051699,-0.524877,0.039404,-0.184701,-2.218055,...,-0.299184,-0.538152,0.232679,-0.180416,-0.310779,-0.572291,0.058764,0.109491,104.65,0.0
94290,64816,-0.701785,0.664195,1.463476,1.267059,0.173381,0.135206,0.188853,0.196207,-0.103084,...,0.036407,0.429133,-0.275888,0.030917,-0.129656,-0.203838,0.348912,0.184191,25.0,0.0
14229,25279,1.270905,-0.69177,0.441762,-0.674287,-0.883968,-0.122326,-0.690639,0.10622,-0.805621,...,0.062893,0.133394,0.00749,0.036242,0.356116,-0.290758,0.016649,0.005117,32.51,0.0


In [21]:
new_dataset.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
124087,77171,1.11856,1.291858,-1.298805,2.135772,0.772204,-1.147291,0.390578,-0.107072,-0.038339,...,-0.346374,-0.663588,-0.102326,0.017911,0.650302,-0.332366,0.105949,0.128124,1.0,1.0
124115,77182,-1.410852,2.268271,-2.297554,1.871331,0.248957,-1.208799,-1.358648,1.102916,-1.317364,...,0.155381,-0.61488,-0.196126,-0.464376,0.118473,-0.484537,0.373596,0.187657,1.0,1.0
124176,77202,-0.356326,1.435305,-0.813564,1.993117,2.055878,-0.543579,0.487691,0.085449,-0.536352,...,-0.312863,-0.687874,-0.267003,-1.15848,0.27146,-0.155397,0.114328,0.101526,1.0,1.0
125342,77627,-7.13906,2.773082,-6.757845,4.446456,-5.464428,-1.713401,-6.485365,3.409395,-3.053493,...,1.30325,-0.016118,-0.87667,0.38223,-1.054624,-0.614606,-0.766848,0.409424,106.9,1.0
128479,78725,-4.312479,1.886476,-2.338634,-0.475243,-1.185444,-2.112079,-2.122793,0.272565,0.290273,...,0.550541,-0.06787,-1.114692,0.269069,-0.020572,-0.963489,-0.918888,0.001454,60.0,1.0


In [22]:
new_dataset['Class'].value_counts()

0.0    492
1.0    261
Name: Class, dtype: int64

In [23]:
new_dataset.groupby('Class').mean()
#Important step --> As we get know if we got a good sample or bad sample

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,50435.760163,-0.348853,0.012842,0.715247,0.141603,-0.253146,0.040009,-0.157998,0.097396,0.020707,...,0.033834,-0.047754,-0.110688,-0.046181,0.015631,0.122983,-0.010869,-0.029283,-0.03591,81.783516
1.0,41975.996169,-5.665868,3.97347,-7.225782,4.526519,-4.014566,-1.501321,-5.982089,1.526847,-2.616088,...,0.238243,1.27186,-0.320923,-0.117196,-0.105943,0.199014,0.054738,0.492548,0.081347,116.679693


In [24]:
#Splitting the data into  features and Targets

In [25]:
X = new_dataset.drop(columns='Class', axis = 1) 
Y = new_dataset['Class']

In [26]:
print(X)

         Time        V1        V2        V3        V4        V5        V6  \
30452   35931  1.203544  0.091122  0.587003  0.568025 -0.665280 -0.823855   
16737   28099  1.202876  0.033211  0.588177  0.362524 -0.608848 -0.677965   
89824   62739 -0.632546 -0.619568  1.467558 -2.458384 -1.051699 -0.524877   
94290   64816 -0.701785  0.664195  1.463476  1.267059  0.173381  0.135206   
14229   25279  1.270905 -0.691770  0.441762 -0.674287 -0.883968 -0.122326   
...       ...       ...       ...       ...       ...       ...       ...   
124087  77171  1.118560  1.291858 -1.298805  2.135772  0.772204 -1.147291   
124115  77182 -1.410852  2.268271 -2.297554  1.871331  0.248957 -1.208799   
124176  77202 -0.356326  1.435305 -0.813564  1.993117  2.055878 -0.543579   
125342  77627 -7.139060  2.773082 -6.757845  4.446456 -5.464428 -1.713401   
128479  78725 -4.312479  1.886476 -2.338634 -0.475243 -1.185444 -2.112079   

              V7        V8        V9  ...       V20       V21       V22  \


In [27]:
print(Y)

30452     0.0
16737     0.0
89824     0.0
94290     0.0
14229     0.0
         ... 
124087    1.0
124115    1.0
124176    1.0
125342    1.0
128479    1.0
Name: Class, Length: 753, dtype: float64


In [28]:
#Split the data into Training data and Testing data

In [29]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)
#Features present in X
#Labels present in Y

In [30]:
print(X.shape, X_train.shape, X_test.shape)

(753, 30) (602, 30) (151, 30)


In [31]:
#Model training --> Logistic Regression for binary classfication problems

In [32]:
model = LogisticRegression()

In [33]:
# Training the Logistic Regression with Training Data
model.fit(X_train, Y_train)

In [34]:
# Model Evaluation
#Accuracy score

In [35]:
# accuracy on training data 
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction , Y_train)

In [37]:
#Accuracy score on Training data:
print('Accuracy on Training data:', training_data_accuracy)

Accuracy on Training data: 0.9651162790697675


In [38]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [39]:
#Accuracy score on Test data:
print('Accuracy on Test data:', test_data_accuracy)

Accuracy on Test data: 0.9271523178807947
