In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    roc_auc_score, 
    average_precision_score,
    precision_recall_curve
)

import warnings
# Notebook-wide display setup: suppress every warning category and switch
# seaborn to its white-grid theme for all figures drawn in later cells.
warnings.filterwarnings(action='ignore')
sns.set_style(style='whitegrid')


In [None]:
# Load the two preprocessed datasets that every later cell depends on.
# Paths are relative to the notebook's working directory.
try:
    ecom_df = pd.read_csv('../data/processed/ecommerce_processed.csv')
    cc_df = pd.read_csv('../data/processed/creditcard_processed.csv')
except FileNotFoundError as e:
    print(f"Error loading datasets: {e}")
    print("Please ensure the 'run_preprocessing.py' script or the first notebook has been run successfully.")
    # Re-raise instead of swallowing: without the frames the following cells
    # would either fail with an unrelated NameError or silently reuse stale
    # variables left in the kernel, so stop here with the real cause.
    raise
else:
    # Only report success (and shapes) when both files were actually read.
    print("Processed datasets loaded successfully!")
    print(f"E-commerce data shape: {ecom_df.shape}")
    print(f"Credit card data shape: {cc_df.shape}")

In [None]:
print("--- Preparing E-commerce Data ---")
# Target is the 'class' column; every other column is used as a feature.
y_ecom = ecom_df['class']
X_ecom = ecom_df.drop(columns='class')

# 70/30 split, stratified on the target so both partitions keep the same
# class ratio.
X_ecom_train, X_ecom_test, y_ecom_train, y_ecom_test = train_test_split(
    X_ecom,
    y_ecom,
    test_size=0.3,
    stratify=y_ecom,
    random_state=42,
)
print(f"Original train set class distribution:\n{y_ecom_train.value_counts(normalize=True)}")

# Oversample the training partition only; the test partition is left as-is.
# `smote` is also reused by the credit-card preparation cell.
smote = SMOTE(random_state=42)
X_ecom_train_smote, y_ecom_train_smote = smote.fit_resample(
    X_ecom_train, y_ecom_train
)
print(f"\nSMOTE-balanced train set class distribution:\n{y_ecom_train_smote.value_counts(normalize=True)}")

In [None]:
print("\n--- Preparing Credit Card Data ---")
# Target is the 'Class' column (capitalised in this dataset); the remaining
# columns are the features.
y_cc = cc_df['Class']
X_cc = cc_df.drop(columns='Class')

# 70/30 split, stratified on the target so both partitions keep the same
# class ratio.
X_cc_train, X_cc_test, y_cc_train, y_cc_test = train_test_split(
    X_cc,
    y_cc,
    test_size=0.3,
    stratify=y_cc,
    random_state=42,
)
print(f"Original train set class distribution:\n{y_cc_train.value_counts(normalize=True)}")

# Reuses the `smote` sampler created in the e-commerce preparation cell;
# only the training partition is resampled.
X_cc_train_smote, y_cc_train_smote = smote.fit_resample(
    X_cc_train, y_cc_train
)
print(f"\nSMOTE-balanced train set class distribution:\n{y_cc_train_smote.value_counts(normalize=True)}")

In [None]:
def evaluate_model(model, X_test, y_test, model_name="Model"):
    """
    Report the performance of a fitted binary classifier on an evaluation set.

    Prints a per-class classification report and the average-precision score
    (labelled "AUC-PR" in the output), then draws the confusion matrix as an
    annotated heatmap.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict`` and ``predict_proba``.
    X_test : array-like
        Features passed to both ``predict`` and ``predict_proba``.
    y_test : array-like
        True labels. Assumed to be encoded as 0 (non-fraud) and 1 (fraud),
        matching the display names used below.
    model_name : str, default "Model"
        Name shown in the printed header and the plot title.

    Returns
    -------
    float
        ``average_precision_score(y_test, y_proba)``, where ``y_proba`` is
        column 1 of ``model.predict_proba(X_test)``.
    """
    # Pin the label order explicitly. Without it, scikit-learn infers the
    # labels from the data, so an evaluation set containing a single class
    # would yield a 1x1 matrix that no longer matches the two tick labels and
    # the two target names.
    class_labels = [0, 1]

    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is used as the positive (fraud) class score.
    y_proba = model.predict_proba(X_test)[:, 1]

    print(f"--- Evaluation for: {model_name} ---")
    print("\nClassification Report:")
    print(classification_report(
        y_test,
        y_pred,
        labels=class_labels,
        target_names=['Non-Fraud (0)', 'Fraud (1)'],
    ))

    auc_pr = average_precision_score(y_test, y_proba)
    print(f"Area Under Precision-Recall Curve (AUC-PR): {auc_pr:.4f}")

    cm = confusion_matrix(y_test, y_pred, labels=class_labels)

    # Draw on an explicit Axes rather than the implicit pyplot "current
    # figure", so the plot cannot land on a figure left open by another cell.
    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Predicted 0', 'Predicted 1'],
                yticklabels=['Actual 0', 'Actual 1'],
                ax=ax)
    ax.set_title(f'Confusion Matrix for {model_name}')
    ax.set_ylabel('Actual Class')
    ax.set_xlabel('Predicted Class')
    plt.show()

    return auc_pr

In [None]:
print("--- Training Logistic Regression on E-commerce Data ---")
# Fit on the SMOTE-balanced training partition; `fit` returns the estimator
# itself, so the fitted model can be bound in a single expression.
lr_ecom = LogisticRegression(max_iter=1000, random_state=42).fit(
    X_ecom_train_smote, y_ecom_train_smote
)

# Evaluate on the test partition, which was not resampled.
lr_auc_pr = evaluate_model(
    lr_ecom,
    X_ecom_test,
    y_ecom_test,
    model_name="Logistic Regression (E-commerce)",
)


In [None]:
print("\n--- Training XGBoost on E-commerce Data ---")
# Fit on the SMOTE-balanced training partition; `fit` returns the estimator
# itself, so the fitted model can be bound in a single expression.
# NOTE(review): `use_label_encoder` appears to be deprecated in recent
# xgboost releases - confirm against the installed version before removing.
xgb_ecom = xgb.XGBClassifier(
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42,
).fit(X_ecom_train_smote, y_ecom_train_smote)

# Evaluate on the test partition, which was not resampled.
xgb_auc_pr = evaluate_model(
    xgb_ecom,
    X_ecom_test,
    y_ecom_test,
    model_name="XGBoost (E-commerce)",
)