<a href="https://colab.research.google.com/github/satyam-trimale/Credit-Card-Fraud-Detection/blob/main/CreditCardFraudDetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [45]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [46]:
# load the dataset to a Pandas DF
credit_card_data = pd.read_csv('/content/creditcard.csv', on_bad_lines='skip')  # Skip lines with errors


In [47]:
# check the no. of missing values in each column
credit_card_data.isnull().sum()

Unnamed: 0,0
Time,0
V1,0
V2,0
V3,0
V4,0
V5,0
V6,0
V7,0
V8,0
V9,0


In [48]:
# distribution of legit transactions & fraudulent transactions
credit_card_data['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,286301
1,492


In [49]:
# separating the data for analysis
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

In [50]:
print(legit.shape)
print(fraud.shape)

(286301, 31)
(492, 31)


In [22]:
# compare the values for both transactions
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94197.781653,0.005543,-0.003799,0.017815,-0.008674,0.005547,0.002787,0.010646,-0.001562,0.00512,...,-0.000207,-0.000999,-0.000171,-0.000444,0.000487,0.000501,-0.000606,0.000137,-8e-05,88.104564
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


In [51]:
legit_sample = legit.sample(n=492)

In [52]:
new_dataset = pd.concat([legit_sample, fraud], axis=0)

In [25]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
107478,69550.0,-3.356786,2.18466,0.206339,0.104892,-0.381969,1.904202,-0.372775,1.080551,0.851977,...,-0.02173,0.217421,0.144842,-1.018211,-0.255423,0.406166,-0.805108,0.105086,40.61,0
272180,163956.0,2.087654,-0.81067,-1.312644,-0.650914,-0.637319,-0.548852,-0.946647,-0.024737,0.133986,...,0.243954,0.670709,0.109497,0.490964,-0.158228,-0.11264,0.017771,-0.001748,49.98,0
145850,85688.0,1.56411,-1.320279,-0.768283,-2.498886,0.69485,3.453992,-1.836682,0.856476,-1.736448,...,-0.116258,-0.056369,-0.032202,0.99697,0.503939,-0.087726,0.055615,0.019746,7.1,0
282945,169854.0,2.073301,0.221304,-1.671572,0.416995,0.475837,-0.879511,0.244396,-0.287318,0.362129,...,-0.352621,-0.868557,0.333532,0.591523,-0.233498,0.166763,-0.057063,-0.028278,1.79,0
57090,46782.0,-0.619922,1.063497,0.882043,-0.05422,-0.215742,-0.635434,0.439203,0.346245,-0.553962,...,-0.109926,-0.560723,0.125747,-0.147636,-0.603718,0.015631,-0.024341,0.094554,35.0,0


In [26]:
new_dataset['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,492
1,492


In [27]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,92663.878049,0.027758,-0.022556,0.041449,0.072548,-0.061644,0.06101,-0.01053,0.042404,-0.008683,...,-0.017685,0.008566,-0.018363,-0.016977,-0.015092,-0.022461,0.026205,-0.032463,0.037751,93.045183
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


In [53]:
X = new_dataset.drop(columns='Class', axis=1)
Y = new_dataset['Class']

In [54]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [55]:
model = LogisticRegression()

In [56]:
# training the Logistic Regression Model with Training Data
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [57]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [58]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.9491740787801779


In [59]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [60]:
print('Accuracy score on Test Data : ', test_data_accuracy)

Accuracy score on Test Data :  0.9187817258883249
