In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [183]:
# Read the credit-card transactions CSV into a DataFrame. low_memory=False
# disables chunked parsing so column dtypes are inferred from the whole file.
# NOTE(review): '/content/...' is an absolute, environment-specific path —
# confirm the file exists there (and was fully uploaded) before running.
csv_path = '/content/creditcard.csv'
df = pd.read_csv(csv_path, low_memory=False)

In [173]:
# Preview the first five rows.
# Label legend: Class = 0 is a legit transaction, Class = 1 is a fraudulent one.
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [184]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(213969, 31)

In [115]:
# Count the missing values in each column (isna is the same check as isnull).
df.isna().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       1
V17       1
V18       1
V19       1
V20       1
V21       1
V22       1
V23       1
V24       1
V25       1
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64

In [116]:
# Drop every row that has at least one missing value. Rebinding `df` instead of
# using inplace=True avoids mutating the frame that earlier cells displayed,
# while downstream cells still see the cleaned data under the same name.
# NOTE(review): the null check reports one missing value in each of V16..Class,
# presumably a single incomplete row — this may indicate a truncated CSV; confirm.
df = df.dropna()

In [117]:
# Re-run the per-column missing-value count to confirm the drop left none behind.
df.isna().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [180]:
# Count the transactions in each class (0 = legit, 1 = fraud) to see how
# balanced the data set is.
df['Class'].value_counts()

Class
0.0    207641
1.0       393
Name: count, dtype: int64

This is a highly imbalanced data set: there are many legit transactions and only a few fraudulent ones. Trained on data like this, an ML model tends to favor the majority class and struggles to learn the patterns that characterize fraudulent transactions.

In [188]:
# Separate the data by label: rows with Class == 0 are legit transactions,
# rows with Class == 1 are fraudulent ones.
class_labels = df['Class']
legit = df.loc[class_labels == 0]
fraud = df.loc[class_labels == 1]

In [189]:
# Summary statistics of the transaction amount for legit transactions.
legit['Amount'].describe()

count    213570.000000
mean         90.156783
std         248.673400
min           0.000000
25%           6.000000
50%          23.165000
75%          79.640000
max       19656.530000
Name: Amount, dtype: float64

In [191]:
# Summary statistics of the transaction amount for fraudulent transactions.
fraud['Amount'].describe()

count     398.000000
mean      123.841307
std       257.528158
min         0.000000
25%         1.000000
50%        13.385000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64

In [193]:
# Compare the per-column means of legit (Class 0) and fraud (Class 1)
# transactions on the full data set.
df.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,74958.173451,-0.071181,-0.013179,0.261446,0.049393,-0.076991,0.041862,-0.02447,0.008461,0.010928,...,0.013739,-0.011748,-0.034001,-0.012882,0.002689,0.048307,0.00397,0.000165,0.00179,90.156783
1.0,63734.71608,-5.539063,4.099702,-7.613034,4.720286,-3.95456,-1.383136,-6.45837,0.665504,-2.735173,...,0.391298,0.776397,-0.002832,-0.045213,-0.071156,0.061446,0.041531,0.185812,0.064843,123.841307


To balance this data set, we will use a technique called **Under Sampling**.

Definition --> Undersampling is a technique for balancing uneven datasets by keeping all of the data in the minority class (the fraudulent transactions in our case) and decreasing the size of the majority class (the legit transactions).

To do this, we will randomly sample as many legit transactions as there are fraudulent transactions. Once the two sets are the same size, we will simply merge them.

In [194]:
# Undersample the majority class: randomly draw (not take the first rows)
# exactly as many legit rows as there are fraud rows. Deriving n from `fraud`
# keeps the two classes the same size if the input data changes, instead of
# relying on a hard-coded count. A fixed random_state makes the draw, and every
# result computed from it, reproducible across kernel restarts.
legit_sample = legit.sample(n=len(fraud), random_state=1)

In [195]:
# Sanity check: the sample should have as many rows as the fraud subset.
legit_sample.shape

(398, 31)

Concatenating the 2 DataFrames

In [196]:
# Stack the sampled legit rows on top of the fraud rows. Concatenating along
# the index means the operation is row-wise; the original index labels are kept.
new_df = pd.concat([legit_sample, fraud], axis='index')

In [197]:
# Dimensions of the balanced frame as (rows, columns).
new_df.shape

(796, 31)

In [198]:
# First rows of the balanced frame; these come from the sampled legit subset,
# which was placed first in the concat.
new_df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
5795,6218.0,-0.118651,-0.113796,2.143746,-0.702922,-1.087093,0.248096,-0.81413,0.237576,0.519827,...,0.30577,1.096381,-0.158215,-0.029327,-0.539411,-0.084274,0.165921,0.14639,21.0,0.0
114143,73353.0,-0.515911,1.087821,0.894219,0.254652,0.703771,-0.534441,0.567108,-0.138955,-0.510294,...,-0.242431,-0.539368,-0.093666,-0.467843,-0.536587,0.143332,0.284218,0.21685,4.49,0.0
185543,126714.0,1.881735,-1.431173,-1.758298,-0.954805,-0.383492,-0.210231,-0.348483,-0.050727,-0.353284,...,0.064165,-0.281488,0.054576,0.163065,-0.129633,-0.361192,-0.08343,-0.042695,182.7,0.0
57478,47925.0,-2.490287,-0.260394,1.853891,1.899267,0.853351,-0.727073,-0.200017,0.655494,-1.106321,...,-0.108057,-0.474958,0.583079,0.480543,0.220761,-0.355668,-0.125249,-0.047782,0.76,0.0
50705,44608.0,-1.237382,-0.766434,2.279785,0.500941,-0.739507,-0.310394,-0.189466,0.25753,-1.632671,...,0.004442,-0.095109,0.259487,0.429688,0.193336,-0.194582,0.035728,0.104604,158.0,0.0


In [199]:
# Last rows of the balanced frame; these come from the fraud subset, which was
# placed second in the concat.
new_df.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
208651,137211.0,0.630579,1.183631,-5.066283,2.179903,-0.703376,-0.103614,-3.49035,1.094734,-0.717418,...,0.621622,0.043807,0.102711,-0.601505,0.127371,-0.163009,0.853792,0.356503,39.45,1.0
212516,138894.0,-1.298443,1.9481,-4.509947,1.305805,-0.019486,-0.509238,-2.643398,1.283545,-2.515356,...,1.178032,1.360989,-0.272013,-0.325948,0.290703,0.841295,0.643094,0.201156,0.01,1.0
212644,138942.0,-2.356348,1.74636,-6.374624,1.772205,-3.439294,1.457811,-0.362577,1.443791,-1.927359,...,0.857942,0.621203,0.964817,-0.619437,-1.732613,0.108361,1.130828,0.415703,727.91,1.0
213092,139107.0,-4.6665,-3.95232,0.206094,5.153525,5.229469,0.93904,-0.635033,-0.704506,-0.234786,...,-0.664263,1.821422,0.113563,-0.759673,-0.502304,0.630639,-0.51388,0.729526,22.47,1.0
213116,139117.0,-3.975939,-1.244939,-3.707414,4.544772,4.050676,-3.407679,-5.063118,1.007042,-3.190158,...,1.059737,-0.037395,0.348707,-0.162929,0.410531,-0.123612,0.877424,0.667568,8.3,1.0


In [201]:
# Confirm that both classes now have the same number of rows.
new_df['Class'].value_counts()

Class
0.0    398
1.0    398
Name: count, dtype: int64

In [200]:
# Compare the per-column means of legit and fraud transactions in the balanced
# data set, to check that the legit sample still resembles the full legit data.
new_df.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,76070.407035,-0.075529,-0.099223,0.176198,0.069078,-0.117049,-0.085163,-0.080301,-0.098861,0.106051,...,0.027217,0.000825,-0.024865,-0.022892,-0.004535,0.092941,-0.008553,-0.034085,-0.00675,100.879347
1.0,63734.71608,-5.539063,4.099702,-7.613034,4.720286,-3.95456,-1.383136,-6.45837,0.665504,-2.735173,...,0.391298,0.776397,-0.002832,-0.045213,-0.071156,0.061446,0.041531,0.185812,0.064843,123.841307


**Data splitting**

In [202]:
# The target is the 'Class' column; the features are every other column.
y = new_df['Class']
X = new_df.drop(columns=['Class'])

In [206]:
# Hold out 20% of the balanced data for testing. stratify=y keeps the class
# proportions the same in both splits; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

In [207]:
# Model training: a logistic regression classifier with default settings.
# NOTE(review): the features are not scaled before fitting (Time and Amount are
# orders of magnitude larger than V1-V28 in the head() output) — check whether
# fit() emits a ConvergenceWarning and consider scaling or raising max_iter.
model = LogisticRegression()

In [208]:
# Fit the classifier on the training split only.
model.fit(X_train, y_train)

In [209]:
# Predict class labels for the held-out test split.
y_pred = model.predict(X_test)

In [210]:
# Fraction of test rows whose predicted label matches the true label.
# Note: this is measured on the balanced, undersampled test split, not on the
# original (highly imbalanced) class distribution.
acc = accuracy_score(y_test, y_pred)

In [212]:
# Report the test-set accuracy computed in the previous cell.
print("Model's Accuracy Score is :", acc)

Model's Accuracy Score is : 0.93125
