In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier  # Alternative: import xgboost as xgb or import lightgbm as lgb
from sklearn.metrics import (
    precision_recall_curve, auc, f1_score, confusion_matrix, ConfusionMatrixDisplay
)
from imblearn.over_sampling import SMOTE
import joblib
import warnings
warnings.filterwarnings('ignore')

### Data Preparation
#### Loading Engineered Data

In [3]:
# Load engineered e-commerce data
fraud_df = pd.read_csv('../data/processed/cleaned_fraud_data.csv')

# Load engineered credit card data (assuming scaled Time and Amount from feature-engineering.ipynb)
credit_df = pd.read_csv('../data/processed/cleaned_creditcard.csv')

# Fail fast if a file is empty or lacks the target column that the split cells rely on,
# instead of surfacing a less obvious KeyError further down the notebook.
assert not fraud_df.empty and 'class' in fraud_df.columns, \
    "cleaned_fraud_data.csv must be non-empty and contain a 'class' column"
assert not credit_df.empty and 'Class' in credit_df.columns, \
    "cleaned_creditcard.csv must be non-empty and contain a 'Class' column"

### Splitting Data and Separating Features/Target

#### For Fraud_Data:

In [4]:
# Target is the `class` label; the remaining listed columns are dropped as non-features
y_fraud = fraud_df['class']
X_fraud = fraud_df.drop(
    columns=['class', 'user_id', 'signup_time', 'purchase_time', 'device_id', 'ip_address']
)

# Hold out 20% for testing, stratified on the label so both splits keep the class ratio
X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = train_test_split(
    X_fraud,
    y_fraud,
    stratify=y_fraud,
    test_size=0.2,
    random_state=42,
)
print("Train class distribution:", y_train_fraud.value_counts(normalize=True))
# Expected: roughly 0 -> 0.906, 1 -> 0.094 (the original imbalance is preserved)

Train class distribution: class
0    0.906352
1    0.093648
Name: proportion, dtype: float64


#### For creditcard:

In [5]:
# Target is the `Class` label; every other column is used as a feature
y_credit = credit_df['Class']
X_credit = credit_df.drop(columns='Class')

# Hold out 20% for testing, stratified on the label so both splits keep the class ratio
X_train_credit, X_test_credit, y_train_credit, y_test_credit = train_test_split(
    X_credit,
    y_credit,
    stratify=y_credit,
    test_size=0.2,
    random_state=42,
)
print("Train class distribution:", y_train_credit.value_counts(normalize=True))
# Expected: roughly 0 -> 0.9983, 1 -> 0.0017 (the extreme imbalance is preserved)

Train class distribution: Class
0    0.998335
1    0.001665
Name: proportion, dtype: float64


### Handling Imbalance with SMOTE (on Train Only)

#### Identify categorical columns

In [7]:
# Show the dtype of every fraud training feature; the object-typed columns are the ones
# that need encoding before they can be passed to SMOTE or a model.
X_train_fraud.dtypes

purchase_value     int64
source            object
browser           object
sex               object
age                int64
ip_int             int64
dtype: object

#### Encode categorical features 

In [8]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [9]:
# Group the fraud training columns by dtype: 64-bit numeric vs. text/categorical
num_cols = X_train_fraud.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X_train_fraud.select_dtypes(include=['object', 'category']).columns

In [10]:
# One-hot encode the categorical columns and pass the numeric columns through untouched.
# handle_unknown='ignore' keeps transform from failing on categories not seen during fit.
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('num', 'passthrough', num_cols),
])


#### Transform the training data

In [11]:
# Fit the preprocessor on the training split only and encode that split in one step.
# NOTE(review): this result is reassigned later by the preprocessor_fraud encoding cell.
X_train_fraud_encoded = preprocessor.fit_transform(X_train_fraud)

In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTE

#### FRAUD DATASET

In [17]:
# Fraud-specific column groups, selected by dtype from the training split
num_cols_fraud = X_train_fraud.select_dtypes(include=['int64', 'float64']).columns
cat_cols_fraud = X_train_fraud.select_dtypes(include=['object', 'category']).columns

In [None]:
# Fraud preprocessor: dense one-hot encoding for the categorical columns, numeric columns
# passed through as-is. Unknown categories at transform time are ignored rather than raising.
preprocessor_fraud = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols_fraud),
    ('num', 'passthrough', num_cols_fraud),
])


#### Encode data

In [22]:
# Fit the fraud preprocessor on the training split only and encode that split;
# this encoded matrix is what SMOTE resamples.
X_train_fraud_encoded = preprocessor_fraud.fit_transform(X_train_fraud)

#### Apply SMOTE

In [23]:
# Oversample the minority class on the encoded training split only (the test split is untouched).
# NOTE(review): plain SMOTE interpolates between neighbours, so synthetic rows may carry
# fractional values in the one-hot columns — consider SMOTENC; confirm before modelling.
smote_fraud = SMOTE(random_state=42)
X_train_fraud_res, y_train_fraud_res = smote_fraud.fit_resample(X_train_fraud_encoded, y_train_fraud)

# Report the class shares after resampling
print("Resampled train distribution (Fraud):")
print(y_train_fraud_res.value_counts(normalize=True))


Resampled train distribution (Fraud):
class
0    0.5
1    0.5
Name: proportion, dtype: float64


#### CREDIT DATASET

#### Define columns automatically

In [24]:
# Credit-specific column groups, selected by dtype from the training split
num_cols_credit = X_train_credit.select_dtypes(include=['int64', 'float64']).columns
cat_cols_credit = X_train_credit.select_dtypes(include=['object', 'category']).columns

#### Preprocess (Credit-specific)

In [26]:
# Recompute the credit column groups by dtype so this section does not depend on earlier cells
num_cols_credit = X_train_credit.select_dtypes(include=['int64', 'float64']).columns
cat_cols_credit = X_train_credit.select_dtypes(include=['object', 'category']).columns

In [None]:
# Credit preprocessor: dense one-hot encoding for any categorical columns, numeric columns
# passed through as-is. The passthrough step must reference the credit column list
# (num_cols_credit); using the fraud list would select columns that are absent from the
# credit frame and make fit_transform fail.
preprocessor_credit = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols_credit),
        ('num', 'passthrough', num_cols_credit)
    ]
)

#### Inspect the credit feature columns

In [30]:
# List the credit training columns to confirm which names the credit preprocessor must reference
print(X_train_credit.columns)

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'],
      dtype='object')


#### Build the preprocessor 

In [31]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Derive the credit column groups from the training split by dtype
num_cols_credit = X_train_credit.select_dtypes(include=['int64', 'float64']).columns
cat_cols_credit = X_train_credit.select_dtypes(include=['object', 'category']).columns

print("Credit categorical columns:", cat_cols_credit)
print("Credit numerical columns:", num_cols_credit)

# Credit preprocessor built from the credit column groups: dense one-hot encoding for
# categorical columns, numeric columns passed through unchanged
preprocessor_credit = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols_credit),
    ('num', 'passthrough', num_cols_credit),
])


Credit categorical columns: Index([], dtype='object')
Credit numerical columns: Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'],
      dtype='object')


#### Encode + SMOTE

In [32]:
from imblearn.over_sampling import SMOTE

# Fit the credit preprocessor on the training split only and encode that split
X_train_credit_encoded = preprocessor_credit.fit_transform(X_train_credit)

# Partial oversampling: grow the minority class to 5% of the majority count
# (about 4.8% of all resampled rows) rather than balancing the classes fully
smote_credit = SMOTE(sampling_strategy=0.05, random_state=42)
X_train_credit_res, y_train_credit_res = smote_credit.fit_resample(X_train_credit_encoded, y_train_credit)

# Report the class shares after resampling
print("Resampled train distribution (Credit):")
print(y_train_credit_res.value_counts(normalize=True))


Resampled train distribution (Credit):
Class
0    0.952381
1    0.047619
Name: proportion, dtype: float64
