In [7]:
#Loading and exploring the dataset
import pandas as pd

# Load dataset
df = pd.read_csv("creditcard.csv")

# Check for missing values
print("Missing Values:\n", df.isnull().sum())

# Drop rows with missing values
df.dropna(inplace=True)

# Check class distribution
print("Class Distribution:\n", df['Class'].value_counts())

Missing Values:
 Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        1
V10       1
V11       1
V12       1
V13       1
V14       1
V15       1
V16       1
V17       1
V18       1
V19       1
V20       1
V21       1
V22       1
V23       1
V24       1
V25       1
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64
Class Distribution:
 Class
0.0    89008
1.0      211
Name: count, dtype: int64


In [8]:
#Feature Scaling
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df['Amount'] = scaler.fit_transform(df[['Amount']])

In [9]:
#Feature Engineering
# Extract Hour from Time feature
df['Hour'] = (df['Time'] // 3600) % 24

# Create Transaction Frequency feature
df['Transaction_Frequency'] = df.groupby('Amount')['Amount'].transform('count')

In [10]:
#Handling Class Imbalance
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

X = df.drop(columns=['Class'])
y = df['Class']

# Apply SMOTE if enough fraud samples exist, else use undersampling
if y.value_counts()[1] > 5:
    smote = SMOTE(sampling_strategy='auto', k_neighbors=2, random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)
else:
    undersampler = RandomUnderSampler(sampling_strategy=0.5, random_state=42)
    X_resampled, y_resampled = undersampler.fit_resample(X, y)

In [11]:
#Split data into train and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [12]:
#Train multiple models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)

Training Logistic Regression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training Random Forest...
Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



In [13]:
#Evaluate the models
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

best_model = None
best_score = 0

for name, model in models.items():
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    auc_score = roc_auc_score(y_test, model.predict_proba(X_test)[:,1])

    print(f"{name} Accuracy: {accuracy:.4f}")
    print(f"{name} ROC-AUC Score: {auc_score:.4f}")
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("\n")

    if auc_score > best_score:
        best_score = auc_score
        best_model = model

Logistic Regression Accuracy: 0.9755
Logistic Regression ROC-AUC Score: 0.9955
Confusion Matrix:
 [[17389   337]
 [  536 17342]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.97      0.98      0.98     17726
         1.0       0.98      0.97      0.98     17878

    accuracy                           0.98     35604
   macro avg       0.98      0.98      0.98     35604
weighted avg       0.98      0.98      0.98     35604



Random Forest Accuracy: 0.9998
Random Forest ROC-AUC Score: 1.0000
Confusion Matrix:
 [[17721     5]
 [    2 17876]]
Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     17726
         1.0       1.00      1.00      1.00     17878

    accuracy                           1.00     35604
   macro avg       1.00      1.00      1.00     35604
weighted avg       1.00      1.00      1.00     35604



XGBoost Accuracy: 0.9999
XGBoost ROC-AUC Score: 

In [14]:
#Save the best model
import joblib

joblib.dump(best_model, "credit_fraud_model.pkl")
print(f"Model training complete. The best model has been saved as credit_fraud_model.pkl with ROC-AUC Score: {best_score:.4f}")

Model training complete. The best model has been saved as credit_fraud_model.pkl with ROC-AUC Score: 1.0000


In [16]:
#Download the model
from google.colab import files

# Download the saved model
files.download("credit_fraud_model.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>