In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Read the transactions CSV (expected in the working directory) into a DataFrame
df = pd.read_csv(filepath_or_buffer="creditcard.csv")

# Preview the first five rows as plain text
print(df.head(n=5))

   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

In [4]:
# Information about the dataset
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column
print(df.describe(percentiles=[0.25, 0.5, 0.75]))

                Time            V1            V2            V3            V4  \
count  284807.000000  2.848070e+05  2.848070e+05  2.848070e+05  2.848070e+05   
mean    94813.859575  1.168375e-15  3.416908e-16 -1.379537e-15  2.074095e-15   
std     47488.145955  1.958696e+00  1.651309e+00  1.516255e+00  1.415869e+00   
min         0.000000 -5.640751e+01 -7.271573e+01 -4.832559e+01 -5.683171e+00   
25%     54201.500000 -9.203734e-01 -5.985499e-01 -8.903648e-01 -8.486401e-01   
50%     84692.000000  1.810880e-02  6.548556e-02  1.798463e-01 -1.984653e-02   
75%    139320.500000  1.315642e+00  8.037239e-01  1.027196e+00  7.433413e-01   
max    172792.000000  2.454930e+00  2.205773e+01  9.382558e+00  1.687534e+01   

                 V5            V6            V7            V8            V9  \
count  2.848070e+05  2.848070e+05  2.848070e+05  2.848070e+05  2.848070e+05   
mean   9.604066e-16  1.487313e-15 -5.556467e-16  1.213481e-16 -2.406331e-15   
std    1.380247e+00  1.332271e+00  1.23709

In [6]:
# How many rows carry each value of the Class label
print(df.Class.value_counts())

0    284315
1       492
Name: Class, dtype: int64


In [7]:
# Partition the rows by label: Class == 1 goes to `fraud`, Class == 0 to `not_fraud`
fraud = df.loc[df["Class"].eq(1)]
not_fraud = df.loc[df["Class"].eq(0)]

In [8]:
# Summary statistics of the Amount column, first for the fraud rows and then
# for the non-fraud rows, separated by blank lines
print(fraud["Amount"].describe())
print('\n')
print(not_fraud["Amount"].describe())

count     492.000000
mean      122.211321
std       256.683288
min         0.000000
25%         1.000000
50%         9.250000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64


count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64


In [9]:
# Sample non-fraud cases to balance the dataset
# - The sample is drawn from the full frame rather than from a previous value
#   of `not_fraud`, so re-running this cell always yields the same result.
# - The sample size follows the number of fraud rows instead of a hard-coded count.
# - random_state pins the draw so the balanced dataset is reproducible.
not_fraud = df[df["Class"] == 0].sample(n=len(fraud), random_state=2)
balanced_dataset = pd.concat([fraud, not_fraud], axis=0)

In [10]:
# Inspect the sampled non-fraud rows.
# Display only: sampling again here would rebind `not_fraud` to a different
# row order than the one already concatenated into `balanced_dataset`.
not_fraud

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
18622,29643.0,1.162824,-0.203606,1.265903,0.175545,-1.079738,-0.263563,-0.664427,0.059707,0.383574,...,0.157032,0.647834,-0.037049,0.685820,0.210982,1.072346,-0.028782,0.011228,10.00,0
55440,46943.0,-1.107651,0.830605,2.220587,2.817472,0.300731,0.680751,0.511332,0.011463,-0.947705,...,-0.310901,-0.488204,-0.038246,-0.028379,0.151279,0.084363,-0.065852,-0.039205,45.50,0
71905,54487.0,1.151394,0.523286,0.524369,2.518719,-0.151648,-0.574984,0.218123,-0.162845,-0.770597,...,0.134839,0.328851,-0.144390,0.409736,0.637163,0.143022,-0.019700,0.022677,28.91,0
229191,145830.0,-0.408411,-0.910457,1.538813,-1.970766,-1.076586,0.529643,0.395901,-0.042968,-0.619417,...,0.462526,1.092317,0.333551,-0.500690,-0.646192,-0.334729,-0.031832,-0.028201,225.99,0
208796,137276.0,-3.444902,-2.794933,0.601194,0.664410,5.360922,-2.082538,-1.733547,-0.086303,-0.378577,...,-0.575403,-1.114785,-0.993569,-0.139975,0.328416,0.507071,0.273506,-0.363062,12.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61460,49848.0,1.046798,0.030249,1.591626,2.948051,-0.867994,0.652750,-0.734797,0.368734,0.558370,...,0.060279,0.529165,-0.048790,0.433496,0.473213,0.225212,0.060268,0.025084,0.00,0
229690,146016.0,1.987829,-0.409274,-2.602372,-1.043196,1.447996,0.775112,0.310382,0.080824,0.253429,...,0.188886,0.755341,-0.031494,-0.821608,0.178317,1.154064,-0.103457,-0.107388,32.67,0
891,672.0,-0.347674,0.532592,-0.081980,0.176925,2.407053,4.030194,0.619624,0.721852,-0.306587,...,-0.118953,-0.130521,-0.058563,1.004381,0.148452,-0.268734,0.222647,-0.016316,94.86,0
100550,67546.0,-1.316337,1.341116,-0.492082,0.117286,2.141580,3.853339,-0.195283,1.276864,-0.317739,...,-0.124801,-0.210933,-0.256855,1.015796,0.376878,-0.276475,-0.585096,-0.417360,15.48,0


In [11]:
# Confirm that both labels are equally represented after balancing
print(balanced_dataset.Class.value_counts())

1    492
0    492
Name: Class, dtype: int64


In [12]:
# Target vector is the Class column; the feature matrix is every other column
y = balanced_dataset["Class"]
X = balanced_dataset.drop(columns="Class")

In [13]:
# Feature Scaling
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [14]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=2)

In [15]:
# Fit a logistic-regression classifier with default settings on the training split
log_model = LogisticRegression()
log_model.fit(X=X_train, y=y_train)

In [16]:
# Score the logistic-regression model on the held-out test split:
# accuracy, per-class precision/recall/F1, and the confusion matrix
y_pred_log = log_model.predict(X_test)
print("Logistic Regression Accuracy: ", accuracy_score(y_test, y_pred_log))
print(
    "Logistic Regression Classification Report:",
    classification_report(y_test, y_pred_log),
    sep="\n",
)
print("Confusion Matrix:", confusion_matrix(y_test, y_pred_log), sep="\n")

Logistic Regression Accuracy:  0.9289340101522843
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.99      0.93        99
           1       0.99      0.87      0.92        98

    accuracy                           0.93       197
   macro avg       0.94      0.93      0.93       197
weighted avg       0.94      0.93      0.93       197

Confusion Matrix:
[[98  1]
 [13 85]]


In [17]:
# Decision Tree
# random_state is fixed so the fitted tree, and therefore the reported
# metrics, are the same on every run.
tree_model = DecisionTreeClassifier(random_state=2)
tree_model.fit(X_train, y_train)

In [18]:
# Score the decision-tree model on the held-out test split:
# accuracy, per-class precision/recall/F1, and the confusion matrix
y_pred_tree = tree_model.predict(X_test)
print("Decision Tree Accuracy: ", accuracy_score(y_test, y_pred_tree))
print(
    "Decision Tree Classification Report:",
    classification_report(y_test, y_pred_tree),
    sep="\n",
)
print("Confusion Matrix:", confusion_matrix(y_test, y_pred_tree), sep="\n")

Decision Tree Accuracy:  0.9289340101522843
Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.93      0.93        99
           1       0.93      0.93      0.93        98

    accuracy                           0.93       197
   macro avg       0.93      0.93      0.93       197
weighted avg       0.93      0.93      0.93       197

Confusion Matrix:
[[92  7]
 [ 7 91]]


In [19]:
# Random Forest
# random_state is fixed so the bootstrap samples and feature subsets, and
# therefore the reported metrics, are the same on every run.
forest_model = RandomForestClassifier(random_state=2)
forest_model.fit(X_train, y_train)

In [20]:
# Score the random-forest model on the held-out test split:
# accuracy, per-class precision/recall/F1, and the confusion matrix
y_pred_forest = forest_model.predict(X_test)
print("Random Forest Accuracy: ", accuracy_score(y_test, y_pred_forest))
print(
    "Random Forest Classification Report:",
    classification_report(y_test, y_pred_forest),
    sep="\n",
)
print("Confusion Matrix:", confusion_matrix(y_test, y_pred_forest), sep="\n")

Random Forest Accuracy:  0.934010152284264
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.88      1.00      0.94        99
           1       1.00      0.87      0.93        98

    accuracy                           0.93       197
   macro avg       0.94      0.93      0.93       197
weighted avg       0.94      0.93      0.93       197

Confusion Matrix:
[[99  0]
 [13 85]]


In [21]:
# Cross-Validation (logistic regression, 5 folds)
# The scaler and the model are wrapped in one pipeline so the scaler is
# re-fitted on the training part of every fold and never sees the held-out
# part. cross_val_score works on clones of the estimator, so `log_model`
# itself is not refitted by this cell.
cv_pipeline = make_pipeline(StandardScaler(), log_model)
cv_scores = cross_val_score(cv_pipeline, X, y, cv=5)
print("Cross-Validation Scores: ", cv_scores)
print("Mean Cross-Validation Score: ", cv_scores.mean())

Cross-Validation Scores:  [0.96446701 0.93401015 0.91370558 0.94923858 0.92857143]
Mean Cross-Validation Score:  0.9379985496736767


In [22]:
# Hyperparameter Tuning for Random Forest
# Exhaustive search over 3 * 4 * 3 * 3 = 108 parameter combinations with
# 5-fold cross-validation on the training split.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
# The search is given a seeded estimator so that the selected parameters and
# the best score are reproducible from run to run. GridSearchCV fits clones
# of this estimator, so `forest_model` is left untouched.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=2),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,  # use all available cores
    verbose=2,
)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [23]:
# Report the winning parameter combination and its mean cross-validated score
best_params, best_score = grid_search.best_params_, grid_search.best_score_
print("Best Parameters: ", best_params)
print("Best Score: ", best_score)

Best Parameters:  {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 100}
Best Score:  0.9440780456341209
