In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score

In [2]:
# Load the dataset
df = pd.read_csv("creditcard.csv")   

# Display the first few rows
print(df.head())

   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

In [3]:
# Information about the dataset
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [4]:
# Statistical summary
print(df.describe())

                Time            V1            V2            V3            V4  \
count  284807.000000  2.848070e+05  2.848070e+05  2.848070e+05  2.848070e+05   
mean    94813.859575  1.168375e-15  3.416908e-16 -1.379537e-15  2.074095e-15   
std     47488.145955  1.958696e+00  1.651309e+00  1.516255e+00  1.415869e+00   
min         0.000000 -5.640751e+01 -7.271573e+01 -4.832559e+01 -5.683171e+00   
25%     54201.500000 -9.203734e-01 -5.985499e-01 -8.903648e-01 -8.486401e-01   
50%     84692.000000  1.810880e-02  6.548556e-02  1.798463e-01 -1.984653e-02   
75%    139320.500000  1.315642e+00  8.037239e-01  1.027196e+00  7.433413e-01   
max    172792.000000  2.454930e+00  2.205773e+01  9.382558e+00  1.687534e+01   

                 V5            V6            V7            V8            V9  \
count  2.848070e+05  2.848070e+05  2.848070e+05  2.848070e+05  2.848070e+05   
mean   9.604066e-16  1.487313e-15 -5.556467e-16  1.213481e-16 -2.406331e-15   
std    1.380247e+00  1.332271e+00  1.23709

In [5]:
# Class distribution
print(df['Class'].value_counts())

0    284315
1       492
Name: Class, dtype: int64


In [6]:
# Separate the fraud and non-fraud cases
fraud = df[df["Class"] == 1]
not_fraud = df[df["Class"] == 0]

In [7]:
# Describe fraud and non-fraud cases
print(fraud.Amount.describe())
print('\n')
print(not_fraud.Amount.describe())

count     492.000000
mean      122.211321
std       256.683288
min         0.000000
25%         1.000000
50%         9.250000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64


count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64


In [8]:
# Sample non-fraud cases to balance the dataset
not_fraud = not_fraud.sample(n=492)
balanced_dataset = pd.concat([fraud, not_fraud], axis=0)

In [9]:
not_fraud=not_fraud.sample(n=492)
not_fraud

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
233132,147452.0,-1.572039,-2.717777,2.193202,0.298922,0.014595,0.566195,-1.712554,0.549371,0.603909,...,0.115071,0.595251,0.684692,0.637317,-0.571178,-0.090682,0.230565,0.280669,187.62,0
101666,67909.0,-0.148951,-1.899986,-0.481143,1.953657,-0.860241,-0.340901,1.117055,-0.265810,0.026717,...,0.422627,-0.169244,-0.673986,0.449604,0.454068,-0.322949,-0.106699,0.131821,688.00,0
17188,28511.0,1.203230,-0.155904,0.415132,-0.107848,-0.791064,-0.968588,-0.178754,-0.036713,0.163216,...,-0.047416,-0.261096,0.056593,0.575459,0.134365,0.932121,-0.099179,-0.003679,22.07,0
173185,121379.0,1.729027,-0.750175,-1.271841,0.419762,-0.373769,-0.941846,0.285176,-0.367833,1.038164,...,0.184076,0.515231,-0.127798,0.063950,0.168067,0.176300,-0.053776,-0.032521,172.14,0
149772,91898.0,-0.257522,1.397632,-0.449291,-0.193755,0.485352,-0.980886,0.611441,0.197320,0.949604,...,-0.398287,-0.883021,0.258075,0.934193,-0.469976,0.050944,0.071732,0.021387,20.48,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137132,82027.0,1.430210,-0.747040,-1.032109,-1.827972,1.395694,3.231363,-1.105565,0.798358,-0.909834,...,-0.139661,-0.703223,0.083502,0.980732,0.388647,-0.466513,0.010500,0.018876,25.00,0
145645,87111.0,1.986810,-0.351825,-0.642933,0.427415,-0.383678,0.224329,-1.086369,0.263560,1.427893,...,0.131678,0.405406,0.193131,-0.082566,-0.481715,0.319753,0.022316,-0.003716,16.14,0
128031,78574.0,-1.252183,-0.115039,1.793129,-0.749173,-0.826122,0.106707,-0.465140,0.908437,0.474787,...,0.061837,-0.050198,0.207716,0.252941,-0.703254,0.736438,-0.057418,0.007903,63.22,0
195599,131140.0,2.034738,-0.364796,-1.464639,0.182070,0.222840,0.003280,-0.198891,0.074451,1.207824,...,0.099921,0.497871,-0.025078,0.183923,0.360827,-0.477338,0.005512,-0.064875,4.50,0


In [10]:
# Check the class distribution in the balanced dataset
print(balanced_dataset["Class"].value_counts())

1    492
0    492
Name: Class, dtype: int64


In [11]:
# Separate features and target variable
X = balanced_dataset.drop("Class", axis='columns')
y = balanced_dataset["Class"]

In [12]:
# Feature Scaling
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [13]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=2)

In [14]:
# Logistic Regression
log_model = LogisticRegression()
log_model.fit(X_train, y_train)

In [15]:
# Predict and evaluate
y_pred_log = log_model.predict(X_test)
print("Logistic Regression Accuracy: ", accuracy_score(y_test, y_pred_log))
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_log))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_log))

Logistic Regression Accuracy:  0.9238578680203046
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.95      0.93        99
           1       0.95      0.90      0.92        98

    accuracy                           0.92       197
   macro avg       0.93      0.92      0.92       197
weighted avg       0.92      0.92      0.92       197

Confusion Matrix:
[[94  5]
 [10 88]]


In [16]:
# Decision Tree
tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, y_train)

In [17]:
# Predict and evaluate
y_pred_tree = tree_model.predict(X_test)
print("Decision Tree Accuracy: ", accuracy_score(y_test, y_pred_tree))
print("Decision Tree Classification Report:")
print(classification_report(y_test, y_pred_tree))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_tree))

Decision Tree Accuracy:  0.8934010152284264
Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.90      0.89        99
           1       0.90      0.89      0.89        98

    accuracy                           0.89       197
   macro avg       0.89      0.89      0.89       197
weighted avg       0.89      0.89      0.89       197

Confusion Matrix:
[[89 10]
 [11 87]]


In [18]:
# Random Forest
forest_model = RandomForestClassifier()
forest_model.fit(X_train, y_train)

In [19]:
# Predict and evaluate
y_pred_forest = forest_model.predict(X_test)
print("Random Forest Accuracy: ", accuracy_score(y_test, y_pred_forest))
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_forest))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_forest))

Random Forest Accuracy:  0.9187817258883249
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.96      0.92        99
           1       0.96      0.88      0.91        98

    accuracy                           0.92       197
   macro avg       0.92      0.92      0.92       197
weighted avg       0.92      0.92      0.92       197

Confusion Matrix:
[[95  4]
 [12 86]]


In [20]:
# Cross-Validation
cv_scores = cross_val_score(log_model, X, y, cv=5)
print("Cross-Validation Scores: ", cv_scores)
print("Mean Cross-Validation Score: ", cv_scores.mean())

Cross-Validation Scores:  [0.95939086 0.93908629 0.93401015 0.95431472 0.91326531]
Mean Cross-Validation Score:  0.9400134673158604


In [21]:
# Hyperparameter Tuning for Random Forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search = GridSearchCV(estimator=forest_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [22]:
# Best parameters and best score
print("Best Parameters: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)

Best Parameters:  {'max_depth': 30, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 50}
Best Score:  0.9491493993388698
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.5s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=50; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   0.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   0.5s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.2s
[CV] END max_depth=None, min_samples_lea

[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.5s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=50; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=50; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   0.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   0.5s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time=   0.4s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=10, n_est

[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   0.5s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   0.3s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.2s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=50; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estim