In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    roc_auc_score, 
    average_precision_score,
    precision_recall_curve
)

import warnings
# Notebook-wide display configuration.
# Install a catch-all "ignore" filter so library warnings do not clutter cell output.
# NOTE(review): this hides every warning category, including deprecation notices.
warnings.simplefilter('ignore')
# Use seaborn's 'whitegrid' theme for all figures drawn after this point.
sns.set_style(style='whitegrid')


In [None]:
# Load the two preprocessed datasets used throughout this notebook.
# The paths are relative to the notebook's working directory.
try:
    ecom_df = pd.read_csv('../data/processed/ecommerce_processed.csv')
    cc_df = pd.read_csv('../data/processed/creditcard_processed.csv')
except FileNotFoundError as e:
    print(f"Error loading datasets: {e}")
    print("Please ensure the 'run_preprocessing.py' script or the first notebook has been run successfully.")
    # Re-raise so execution stops in this cell. Swallowing the error would leave
    # `ecom_df` / `cc_df` undefined (or stale from an earlier run), and the cells
    # below would then fail with a confusing NameError or silently use old data.
    raise
else:
    # Only reached when both files were read successfully.
    print("Processed datasets loaded successfully!")
    print(f"E-commerce data shape: {ecom_df.shape}")
    print(f"Credit card data shape: {cc_df.shape}")

In [None]:
print("--- Preparing E-commerce Data ---")

# Separate the label column ('class') from the remaining feature columns.
y_ecom = ecom_df['class']
X_ecom = ecom_df.drop(columns='class')

# Hold out 30% of the rows for testing; stratify on the label and fix the seed
# so the split is reproducible.
(X_ecom_train,
 X_ecom_test,
 y_ecom_train,
 y_ecom_test) = train_test_split(
    X_ecom,
    y_ecom,
    test_size=0.3,
    stratify=y_ecom,
    random_state=42,
)
print(f"Original train set class distribution:\n{y_ecom_train.value_counts(normalize=True)}")

# Resample the training partition only; the test partition is left untouched.
# `smote` is kept as a notebook-level name because the credit-card cell reuses it.
smote = SMOTE(random_state=42)
X_ecom_train_smote, y_ecom_train_smote = smote.fit_resample(
    X_ecom_train,
    y_ecom_train,
)
print(f"\nSMOTE-balanced train set class distribution:\n{y_ecom_train_smote.value_counts(normalize=True)}")

In [None]:
print("\n--- Preparing Credit Card Data ---")

# Separate the label column ('Class', capitalised in this dataset) from the features.
y_cc = cc_df['Class']
X_cc = cc_df.drop(columns='Class')

# Hold out 30% of the rows for testing; stratify on the label and fix the seed
# so the split is reproducible.
(X_cc_train,
 X_cc_test,
 y_cc_train,
 y_cc_test) = train_test_split(
    X_cc,
    y_cc,
    test_size=0.3,
    stratify=y_cc,
    random_state=42,
)
print(f"Original train set class distribution:\n{y_cc_train.value_counts(normalize=True)}")

# Resample the training partition only, reusing the `smote` sampler created in
# the e-commerce preparation cell (that cell must have been run first).
X_cc_train_smote, y_cc_train_smote = smote.fit_resample(
    X_cc_train,
    y_cc_train,
)
print(f"\nSMOTE-balanced train set class distribution:\n{y_cc_train_smote.value_counts(normalize=True)}")