In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Load the raw transactions table; the CSV is expected next to this notebook.
credit_card_data = pd.read_csv(filepath_or_buffer='creditcard.csv')

In [3]:
# Peek at the first five rows to sanity-check the load.
credit_card_data.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
# Summarise every column: dtype and non-null count.
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [5]:
# Count missing values per column (all zeros means nothing needs imputing).
credit_card_data.isna().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [6]:
# Class balance: how many rows carry each value of the target label.
credit_card_data.Class.value_counts()

0    284315
1       492
Name: Class, dtype: int64

From the output above, there are 284,315 genuine transactions (Class 0) and only 492 fraudulent transactions (Class 1). The dataset is therefore highly imbalanced.

In [7]:
# Partition the rows by target label: Class 0 -> legitimate, Class 1 -> fraud.
legit_transactions = credit_card_data.loc[credit_card_data['Class'] == 0]
fraud_transactions = credit_card_data.loc[credit_card_data['Class'] == 1]

In [8]:
# (rows, columns) of the legitimate-transaction subset.
legit_transactions.shape

(284315, 31)

In [9]:
# (rows, columns) of the fraud-transaction subset.
fraud_transactions.shape

(492, 31)

Because the dataset is highly imbalanced, we use a technique called under-sampling: we randomly keep only as many legitimate transactions as there are fraudulent ones, so that both classes are equally represented.

In [10]:
legit_transaction_sample = legit_transactions.sample(n=492) 
# here, n = 492 means we want 492 random samples out of the original legit_transaction.

In [11]:
# Stack the sampled legitimate rows on top of all fraud rows (row-wise concat
# is the default axis), keeping each row's original index label.
new_transaction_dataset = pd.concat(
    objs=[legit_transaction_sample, fraud_transactions],
)

In [12]:
# First five rows of the balanced dataset (these come from the legit sample).
new_transaction_dataset.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
191570,129324.0,-0.65919,0.300985,0.974375,-2.07407,-0.16551,-0.384472,1.126056,-2.427729,0.669159,...,0.813967,0.801664,-0.115847,0.064696,-0.573011,-0.522845,-2.175519,-1.601961,68.0,0
38415,39387.0,-1.003248,1.266156,1.309297,-1.353087,-0.096595,-1.060435,0.882678,-0.239831,0.520263,...,-0.142043,0.081006,0.011637,0.665971,-0.203335,0.745531,0.359884,0.047133,1.31,0
27098,34385.0,-1.622794,1.162741,0.995701,-1.363309,-0.381152,-1.237467,0.446202,-0.043135,1.018277,...,-0.29519,-0.332916,0.24554,0.399391,-0.003886,0.748958,0.575626,0.235932,3.07,0
189181,128291.0,-0.065335,1.105277,-0.510529,-0.593565,0.774591,-0.855645,1.08211,-0.252031,0.235357,...,0.29174,1.183954,-0.204945,-0.621305,-0.650454,-0.174268,0.618489,0.372499,9.95,0
234352,147939.0,2.155114,-0.680925,-2.751542,-1.247486,1.020226,0.167963,0.139779,-0.095954,-1.221846,...,0.569128,1.772001,-0.238138,-1.537839,0.588385,0.412609,-0.054327,-0.119714,20.0,0


In [13]:
# (rows, columns) of the balanced dataset: sampled legit rows + all fraud rows.
new_transaction_dataset.shape

(984, 31)

In [14]:
# Column dtypes and non-null counts for the balanced dataset.
new_transaction_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 984 entries, 191570 to 281674
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    984 non-null    float64
 1   V1      984 non-null    float64
 2   V2      984 non-null    float64
 3   V3      984 non-null    float64
 4   V4      984 non-null    float64
 5   V5      984 non-null    float64
 6   V6      984 non-null    float64
 7   V7      984 non-null    float64
 8   V8      984 non-null    float64
 9   V9      984 non-null    float64
 10  V10     984 non-null    float64
 11  V11     984 non-null    float64
 12  V12     984 non-null    float64
 13  V13     984 non-null    float64
 14  V14     984 non-null    float64
 15  V15     984 non-null    float64
 16  V16     984 non-null    float64
 17  V17     984 non-null    float64
 18  V18     984 non-null    float64
 19  V19     984 non-null    float64
 20  V20     984 non-null    float64
 21  V21     984 non-null    float64

The new dataset is now balanced: it contains an equal number of legitimate and fraudulent transactions (492 of each, 984 rows in total).

In [15]:
# Feature matrix: every column except the target label.
X = new_transaction_dataset.drop(columns=["Class"])
# Target vector: the Class column (0 = legitimate, 1 = fraud).
y = new_transaction_dataset.Class

In [16]:
# Confirm the Class column is gone from the feature matrix.
X.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
191570,129324.0,-0.65919,0.300985,0.974375,-2.07407,-0.16551,-0.384472,1.126056,-2.427729,0.669159,...,-0.058502,0.813967,0.801664,-0.115847,0.064696,-0.573011,-0.522845,-2.175519,-1.601961,68.0
38415,39387.0,-1.003248,1.266156,1.309297,-1.353087,-0.096595,-1.060435,0.882678,-0.239831,0.520263,...,0.458939,-0.142043,0.081006,0.011637,0.665971,-0.203335,0.745531,0.359884,0.047133,1.31
27098,34385.0,-1.622794,1.162741,0.995701,-1.363309,-0.381152,-1.237467,0.446202,-0.043135,1.018277,...,0.259711,-0.29519,-0.332916,0.24554,0.399391,-0.003886,0.748958,0.575626,0.235932,3.07
189181,128291.0,-0.065335,1.105277,-0.510529,-0.593565,0.774591,-0.855645,1.08211,-0.252031,0.235357,...,0.119405,0.29174,1.183954,-0.204945,-0.621305,-0.650454,-0.174268,0.618489,0.372499,9.95
234352,147939.0,2.155114,-0.680925,-2.751542,-1.247486,1.020226,0.167963,0.139779,-0.095954,-1.221846,...,-0.147628,0.569128,1.772001,-0.238138,-1.537839,0.588385,0.412609,-0.054327,-0.119714,20.0


In [17]:
# Confirm the target shares the same index labels as the feature matrix.
y.head(n=5)

191570    0
38415     0
27098     0
189181    0
234352    0
Name: Class, dtype: int64

In [18]:
# Hold out 20% of the balanced data for evaluation. Stratifying on y keeps the
# fraud / legitimate ratio identical in both partitions, and the fixed seed
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
    stratify=y,
)

In [19]:
# Logistic-regression classifier with scikit-learn's default hyperparameters.
# NOTE(review): the features are fed in unscaled (Time and Amount are orders of
# magnitude larger than the V* columns) — confirm the default solver converges
# without a ConvergenceWarning, or consider scaling / raising max_iter.
model = LogisticRegression()

In [20]:
# Train the classifier on the training partition only.
model.fit(X=X_train, y=y_train)

In [21]:
# Accuracy score on the training data: predict on the rows the model was
# fitted on and compare against their true labels.
X_train_prediction = model.predict(X_train)

print(
    f"Accuracy score of the training dataset: {accuracy_score(y_true=y_train, y_pred=X_train_prediction)}"
)

Accuracy score of the training dataset: 0.9237611181702668


In [22]:
# Accuracy score on the held-out test data: these rows were never seen during
# fitting, so this is the number that reflects generalisation.
X_test_prediction = model.predict(X_test)

print(
    f"Accuracy score of the test dataset: {accuracy_score(y_true=y_test, y_pred=X_test_prediction)}"
)

Accuracy score of the test dataset: 0.9035532994923858


In [23]:
# Human-readable names, indexed by class id (0 -> not fraud, 1 -> fraud).
label_names = ['Not Fraud Transaction', 'Fraud Transaction']

# Replace the original row labels with a 0..n-1 positional index so the true
# labels line up one-to-one with the prediction array.
y_test = y_test.reset_index(drop=True)

# Walk predictions and true labels in lockstep and print them side by side.
for predicted_class, actual_class in zip(X_test_prediction, y_test):
    predicted_label = label_names[int(predicted_class)]
    actual_label = label_names[int(actual_class)]
    print(f"Predicted Result: {predicted_label}. Actual Result: {actual_label}")

Predicted Result: Fraud Transaction. Actual Result: Fraud Transaction
Predicted Result: Not Fraud Transaction. Actual Result: Not Fraud Transaction
Predicted Result: Not Fraud Transaction. Actual Result: Not Fraud Transaction
Predicted Result: Not Fraud Transaction. Actual Result: Not Fraud Transaction
Predicted Result: Fraud Transaction. Actual Result: Fraud Transaction
Predicted Result: Not Fraud Transaction. Actual Result: Not Fraud Transaction
Predicted Result: Fraud Transaction. Actual Result: Fraud Transaction
Predicted Result: Fraud Transaction. Actual Result: Fraud Transaction
Predicted Result: Not Fraud Transaction. Actual Result: Not Fraud Transaction
Predicted Result: Fraud Transaction. Actual Result: Fraud Transaction
Predicted Result: Not Fraud Transaction. Actual Result: Not Fraud Transaction
Predicted Result: Fraud Transaction. Actual Result: Fraud Transaction
Predicted Result: Fraud Transaction. Actual Result: Fraud Transaction
Predicted Result: Not Fraud Transaction. A