importing libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Load the credit-card transactions CSV into a pandas DataFrame.
# The file location lives in a named constant so it can be changed in one place.
# NOTE(review): this is an absolute path with a doubled ".csv.csv" extension —
# confirm the file name and consider a path relative to a configurable data dir.
DATA_PATH = '/content/credit_data.csv.csv'
credit_card_data = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows of the raw frame
credit_card_data.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [None]:
# Preview the last five rows of the raw frame
credit_card_data.tail(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
265355,161850.0,0.041717,0.895154,0.293817,-0.592113,0.452007,-1.101148,1.053337,-0.235284,-0.118808,...,-0.254548,-0.53416,0.067253,0.027309,-0.46942,0.137578,0.249559,0.098485,5.49,0.0
265356,161850.0,-0.578429,0.765023,-0.94213,-0.958078,1.000742,-1.754038,1.759567,-0.474654,-0.102327,...,0.214977,0.85995,-0.140255,0.02474,-0.270721,0.100395,0.520926,0.357215,79.93,0.0
265357,161850.0,-0.360885,1.067309,-0.139529,-0.523392,1.065073,-0.318496,0.84823,0.111054,-0.181236,...,0.078177,0.514656,-0.259133,0.632028,-0.217791,0.531122,0.373266,0.260223,3.79,0.0
265358,161851.0,0.03265,0.901763,0.303674,-0.597897,0.4939,-1.021908,1.050215,-0.22107,-0.149955,...,-0.245706,-0.491968,0.072549,-0.001659,-0.480461,0.14107,0.254167,0.098462,4.99,0.0
265359,161851.0,2.005486,-0.990444,-0.407233,-0.0,,,,,,...,,,,,,,,,,


In [None]:
# Dataset summary: index range, column names, non-null counts and dtypes
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 265360 entries, 0 to 265359
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    265360 non-null  float64
 1   V1      265360 non-null  float64
 2   V2      265360 non-null  float64
 3   V3      265360 non-null  float64
 4   V4      265360 non-null  float64
 5   V5      265359 non-null  float64
 6   V6      265359 non-null  float64
 7   V7      265359 non-null  float64
 8   V8      265359 non-null  float64
 9   V9      265359 non-null  float64
 10  V10     265359 non-null  float64
 11  V11     265359 non-null  float64
 12  V12     265359 non-null  float64
 13  V13     265359 non-null  float64
 14  V14     265359 non-null  float64
 15  V15     265359 non-null  float64
 16  V16     265359 non-null  float64
 17  V17     265359 non-null  float64
 18  V18     265359 non-null  float64
 19  V19     265359 non-null  float64
 20  V20     265359 non-null  float64
 21  V21     26

In [None]:
# Count the missing values per column
credit_card_data.isna().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        1
V6        1
V7        1
V8        1
V9        1
V10       1
V11       1
V12       1
V13       1
V14       1
V15       1
V16       1
V17       1
V18       1
V19       1
V20       1
V21       1
V22       1
V23       1
V24       1
V25       1
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64

In [None]:
# How many transactions fall into each Class label (legit vs. fraudulent)
credit_card_data.Class.value_counts()

In [None]:
# Split the transactions by label for analysis: Class 0 -> legit, Class 1 -> fraud.
# A row whose Class is missing matches neither comparison, so it lands in neither frame.
legit = credit_card_data.loc[credit_card_data['Class'] == 0]
fraud = credit_card_data.loc[credit_card_data['Class'] == 1]

In [None]:
# Show the (rows, columns) shape of each subset, one per line
print(legit.shape, fraud.shape, sep='\n')

(264879, 31)
(480, 31)


In [None]:
# Summary statistics of the Amount column for legit transactions
legit['Amount'].describe()

count    264879.000000
mean         89.735848
std         248.783621
min           0.000000
25%           5.990000
50%          22.760000
75%          79.000000
max       19656.530000
Name: Amount, dtype: float64

In [None]:
# Summary statistics of the Amount column for fraudulent transactions
fraud['Amount'].describe()

count     480.000000
mean      121.408896
std       257.964187
min         0.000000
25%         1.000000
50%         8.590000
75%       105.080000
max      2125.870000
Name: Amount, dtype: float64

In [None]:
# Per-class column means, to compare legit and fraudulent transactions side by side
credit_card_data.groupby(by='Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,89567.073702,-0.006285,-0.022175,0.066019,0.003782,-0.01557,0.011799,0.000955,-0.001071,0.004547,...,0.001305,-0.00151,-0.005269,-0.00279,0.001555,0.011035,0.001352,-0.000839,0.000633,89.735848
1.0,78578.960417,-4.843074,3.715599,-7.127606,4.610425,-3.247162,-1.405621,-5.662963,0.616471,-2.619694,...,0.371746,0.739635,0.001402,-0.051203,-0.109934,0.045371,0.04746,0.162716,0.076705,121.408896


In [None]:
# Under-sample the legit transactions down to the size of the fraud set so the
# two classes are balanced. The sample size is read from `fraud` instead of
# being hard-coded, and the draw is seeded (same seed value as the train/test
# split) so that re-running the notebook selects the same rows.
legit_sample = legit.sample(n=len(fraud), random_state=2)

In [None]:
# Stack the sampled legit rows on top of the fraud rows (row-wise concatenation)
new_dataset = pd.concat((legit_sample, fraud), axis='index')

In [None]:
# First five rows of the combined frame
new_dataset.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
170490,120205.0,-0.380305,0.683687,0.706221,-0.611497,0.911804,-1.226007,1.026518,-0.29958,-0.487089,...,-0.149839,-0.399987,0.020733,0.080582,-0.552822,0.113313,0.099305,0.176632,0.99,0.0
126478,78015.0,-0.68992,0.41036,1.545485,-0.806942,0.074947,-0.075629,0.800501,0.055424,-2.149119,...,-0.611834,-1.516234,0.282992,-0.108572,-0.328459,-1.00672,0.106027,0.119825,71.01,0.0
124574,77345.0,-1.206775,0.739502,1.054572,-0.67245,0.881683,-0.133699,0.427942,0.25203,-0.400189,...,-0.205096,-0.681843,-0.287623,-0.936591,-0.131499,-0.007043,-0.314518,-0.288187,1.29,0.0
190152,128708.0,-0.741133,0.579089,0.029175,-2.43945,0.16889,-0.65779,0.35854,0.381557,-1.689318,...,0.147169,0.244575,-0.129789,0.765421,0.116875,-0.424689,0.186871,0.114679,15.0,0.0
185906,126872.0,-1.142849,-0.032864,0.953088,2.991481,1.523578,0.267848,0.917333,-0.192484,-1.00465,...,-0.60579,-1.350487,0.925407,0.449216,-0.42958,-0.694112,-0.308517,-0.165711,83.23,0.0


In [None]:
# Last five rows of the combined frame
new_dataset.tail(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
262826,160665.0,-0.41734,4.700055,-7.521767,7.671884,0.260821,-2.646693,-2.854432,0.958783,-4.588536,...,0.6222,-0.437708,-0.090358,-0.742802,-0.312361,0.502575,0.82139,0.372379,0.77,1.0
263080,160791.0,2.132386,0.705608,-3.530759,0.514779,1.527175,-1.716268,1.132791,-0.574214,0.128904,...,0.163739,0.70391,-0.245076,0.460049,0.920281,-0.216586,-0.026219,-0.025001,1.0,1.0
263274,160870.0,-0.644278,5.002352,-8.252739,7.756915,-0.216267,-2.751496,-3.358857,1.406268,-4.403852,...,0.587728,-0.605759,0.033746,-0.75617,-0.008172,0.532772,0.66397,0.192067,0.77,1.0
263324,160895.0,-0.84829,2.719882,-6.19907,3.044437,-3.30191,-1.992117,-3.734902,1.520079,-2.548788,...,1.125229,0.805258,0.199119,0.035206,0.012159,0.601658,0.137468,-0.171397,127.14,1.0
263877,161154.0,-3.387601,3.977881,-6.978585,1.657766,-1.1005,-3.599487,-3.686651,1.942252,-3.065089,...,1.043587,0.262189,-0.479224,-0.326638,-0.156939,0.113807,0.354124,0.287592,0.38,1.0


In [None]:
# Class balance of the combined frame
new_dataset.Class.value_counts()

0.0    492
1.0    480
Name: Class, dtype: int64

In [None]:
# Per-class column means of the combined frame
new_dataset.groupby(by='Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,86149.745935,-0.016832,0.014033,0.071342,0.027006,-0.077245,-0.048806,0.0334,0.048106,-0.018088,...,0.049307,0.002046,-0.049286,0.014656,-0.042279,0.021241,0.047663,-0.003031,0.014207,95.689654
1.0,78578.960417,-4.843074,3.715599,-7.127606,4.610425,-3.247162,-1.405621,-5.662963,0.616471,-2.619694,...,0.371746,0.739635,0.001402,-0.051203,-0.109934,0.045371,0.04746,0.162716,0.076705,121.408896


In [None]:
# Separate the target labels (Class) from the feature columns (everything else)
Y = new_dataset['Class']
X = new_dataset.drop(columns=['Class'])

In [None]:
# Plain-text dump of the feature matrix (every column except Class)
print(X)

            Time        V1        V2        V3        V4        V5        V6  \
170490  120205.0 -0.380305  0.683687  0.706221 -0.611497  0.911804 -1.226007   
126478   78015.0 -0.689920  0.410360  1.545485 -0.806942  0.074947 -0.075629   
124574   77345.0 -1.206775  0.739502  1.054572 -0.672450  0.881683 -0.133699   
190152  128708.0 -0.741133  0.579089  0.029175 -2.439450  0.168890 -0.657790   
185906  126872.0 -1.142849 -0.032864  0.953088  2.991481  1.523578  0.267848   
...          ...       ...       ...       ...       ...       ...       ...   
262826  160665.0 -0.417340  4.700055 -7.521767  7.671884  0.260821 -2.646693   
263080  160791.0  2.132386  0.705608 -3.530759  0.514779  1.527175 -1.716268   
263274  160870.0 -0.644278  5.002352 -8.252739  7.756915 -0.216267 -2.751496   
263324  160895.0 -0.848290  2.719882 -6.199070  3.044437 -3.301910 -1.992117   
263877  161154.0 -3.387601  3.977881 -6.978585  1.657766 -1.100500 -3.599487   

              V7        V8        V9  .

In [None]:
# Plain-text dump of the target labels (the Class column)
print(Y)

170490    0.0
126478    0.0
124574    0.0
190152    0.0
185906    0.0
         ... 
262826    1.0
263080    1.0
263274    1.0
263324    1.0
263877    1.0
Name: Class, Length: 972, dtype: float64


In [None]:
# Hold out 20% of the rows as a test set. The split is stratified on Y and uses
# a fixed seed so it is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
    stratify=Y,
)

In [None]:
# Shapes of the full feature matrix, the training split and the test split
print(*(frame.shape for frame in (X, X_train, X_test)))

(972, 30) (777, 30) (195, 30)


In [None]:
# Logistic regression classifier created with the library's default settings
model = LogisticRegression()

In [None]:
# Fit the logistic regression model on the training split
model.fit(X=X_train, y=Y_train)

LogisticRegression()

In [None]:
# accuracy on training data
# Predict on the rows the model was fitted on, then score those predictions.
# accuracy_score expects the true labels first and the predictions second.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [None]:
# Report the accuracy measured on the training split
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.9472329472329473


In [None]:
# accuracy on test data
# Predict on the held-out rows, then score those predictions.
# accuracy_score expects the true labels first and the predictions second.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [None]:
# Report the accuracy measured on the held-out test split
print('Accuracy score on Test Data : ', test_data_accuracy)

Accuracy score on Test Data :  0.9179487179487179


In [None]:
# training the Logistic Regression Model with Training Data
# NOTE(review): this repeats the earlier fit call on the same model object with
# the same training data; it looks redundant — confirm whether this cell is needed.
model.fit(X_train, Y_train)

LogisticRegression()