Importing the dependencies

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning tools
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.ensemble import RandomForestClassifier

# For imbalance handling
from imblearn.over_sampling import SMOTE

# Model saving
import joblib

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
data = pd.read_csv("/content/drive/MyDrive/creditcard.csv")


In [5]:
pd.options.display.max_columns = None

In [6]:
print("Shape:", data.shape)
print("Class distribution:\n", data['Class'].value_counts())

Shape: (284807, 31)
Class distribution:
 Class
0    284315
1       492
Name: count, dtype: int64


**PREPROCESSING**

In [7]:
# Scale Amount
scaler = StandardScaler()
data['Amount'] = scaler.fit_transform(data[['Amount']])

# Drop Time
data = data.drop(['Time'], axis=1)

# Drop duplicates if any
data = data.drop_duplicates()

print("Final Shape:", data.shape)
print("Class distribution after cleaning:\n", data['Class'].value_counts())

Final Shape: (275663, 30)
Class distribution after cleaning:
 Class
0    275190
1       473
Name: count, dtype: int64


**Train/Test Split**

In [8]:
X = data.drop('Class', axis=1)
y = data['Class']

# Stratify ensures fraud cases distributed properly
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

**Handle Imbalance with SMOTE**

In [9]:
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

print("After SMOTE:\n", y_train_res.value_counts())

After SMOTE:
 Class
0    220152
1    220152
Name: count, dtype: int64


**Train Pipeline Model**

In [10]:
# Pipeline ensures scaling + classifier integrated
pipeline = Pipeline([
    ('scaler', StandardScaler()),   # standardize features
    ('rf', RandomForestClassifier(
        n_estimators=200,
        class_weight='balanced',
        random_state=42
    ))
])

# Fit model
pipeline.fit(X_train_res, y_train_res)

joblib.dump(pipeline, "credit_card_model.pkl")
print("✅ Model saved as credit_card_model.pkl")

✅ Model saved as credit_card_model.pkl


**Evaluate**

In [11]:
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Probability scoring (important for fraud detection)
y_proba = pipeline.predict_proba(X_test)[:,1]
print("ROC-AUC Score:", roc_auc_score(y_test, y_proba))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     55038
           1       0.83      0.78      0.80        95

    accuracy                           1.00     55133
   macro avg       0.92      0.89      0.90     55133
weighted avg       1.00      1.00      1.00     55133

Confusion Matrix:
 [[55023    15]
 [   21    74]]
ROC-AUC Score: 0.9626563273986775


**Here we doing a Test With a Fraud Sample**

In [12]:
# Take first fraud transaction row
fraud_sample = data[data['Class']==1].iloc[0].drop('Class')
fraud_array = fraud_sample.values.reshape(1, -1)

print("Fraud sample prediction:", pipeline.predict(fraud_array))
print("Fraud sample prob:", pipeline.predict_proba(fraud_array))

Fraud sample prediction: [1]
Fraud sample prob: [[0.01 0.99]]




**Here we checking Normal Transaction**

In [13]:
# Take first normal transaction row
normal_sample = data[data['Class']==0].iloc[0].drop('Class')
normal_array = normal_sample.values.reshape(1, -1)

print("Normal sample prediction:", pipeline.predict(normal_array))
print("Normal sample prob:", pipeline.predict_proba(normal_array))

Normal sample prediction: [0]
Normal sample prob: [[1. 0.]]


