# Cell 1: Add Project Root to Python Path

In [1]:
# notebooks/model_building.ipynb - Cell 1: Add project root to path

import os
import sys

# Get the current working directory of the notebook (e.g., 'fraud_detection_project/notebooks/')
current_dir = os.getcwd()
# Get the parent directory (which is 'fraud_detection_project/')
project_root = os.path.dirname(current_dir)

# Add the project root to sys.path so Python can find 'src'
if project_root not in sys.path:
    sys.path.append(project_root)
    print(f"Added '{project_root}' to sys.path.")
else:
    print(f"'{project_root}' already in sys.path.")

Added 'c:\Users\skibret\Downloads\KAIM\Week 8\Project\fraud_detection_project' to sys.path.


# Cell 2: Import Statements

In [2]:
# notebooks/model_building.ipynb - Cell 2: Import Statements

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline as SklearnPipeline


# Import functions from our model_training.py script
from src.model_training import (
    load_processed_data,
    split_data,
    handle_imbalance,
    evaluate_model,
    train_logistic_regression, # NEW
    train_decision_tree,       # NEW
    train_random_forest,       # NEW
    train_gradient_boosting   # NEW

)

# Import necessary models from scikit-learn (will add more as we go)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

# For CNN/RNN, we'll likely use TensorFlow/Keras or PyTorch, which will be installed later.
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Dropout, Conv1D, MaxPooling1D, Flatten, LSTM

# Configure plot styles for better visualization
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['font.size'] = 12

# Cell 3: Load Processed Data

In [3]:
# notebooks/model_building.ipynb - Cell 3: Load Processed Data

# Define paths to your processed data, using project_root for absolute paths
ecommerce_filepath = os.path.join(project_root, 'data', 'processed', 'processed_ecommerce_fraud.csv')
creditcard_filepath = os.path.join(project_root, 'data', 'processed', 'processed_creditcard_fraud.csv')

print("--- Loading Processed E-commerce Fraud Data ---")
ecommerce_df = load_processed_data(ecommerce_filepath)

print("\n--- Loading Processed Credit Card Fraud Data ---")
creditcard_df = load_processed_data(creditcard_filepath)

if ecommerce_df is not None:
    print(f"\nE-commerce data loaded. Shape: {ecommerce_df.shape}")
    print("E-commerce Data Head:")
    print(ecommerce_df.head())
else:
    print("\nE-commerce data not loaded.")

if creditcard_df is not None:
    print(f"\nCredit Card data loaded. Shape: {creditcard_df.shape}")
    print("Credit Card Data Head:")
    print(creditcard_df.head())
else:
    print("\nCredit Card data not loaded.")

--- Loading Processed E-commerce Fraud Data ---


2025-07-26 20:19:05,940 - INFO - Successfully loaded processed data from c:\Users\skibret\Downloads\KAIM\Week 8\Project\fraud_detection_project\data\processed\processed_ecommerce_fraud.csv. Shape: (151112, 19)



--- Loading Processed Credit Card Fraud Data ---


2025-07-26 20:19:07,330 - INFO - Successfully loaded processed data from c:\Users\skibret\Downloads\KAIM\Week 8\Project\fraud_detection_project\data\processed\processed_creditcard_fraud.csv. Shape: (283726, 31)



E-commerce data loaded. Shape: (151112, 19)
E-commerce Data Head:
   user_id          signup_time        purchase_time  purchase_value  \
0        2  2015-01-11 03:47:13  2015-02-21 10:03:37              54   
1        4  2015-06-02 16:40:57  2015-09-26 21:32:16              41   
2        8  2015-05-28 07:53:06  2015-08-13 11:53:07              47   
3        9  2015-05-16 15:58:32  2015-05-20 23:06:42              62   
4       12  2015-01-10 06:25:12  2015-03-04 20:56:37              35   

       device_id  source  browser sex  age    ip_address  class  \
0  FGBQNDNBETFJJ     SEO   Chrome   F   25  8.802175e+08      0   
1  MKFUIVOHLJBYN  Direct   Safari   F   38  2.785906e+09      0   
2  SCQGQALXBUQZJ     SEO   Chrome   M   25  3.560567e+08      0   
3  IEZOHXPZBIRTE     SEO  FireFox   M   21  7.591047e+08      0   
4  MSNWCFEHKTIOY     Ads   Safari   M   19  2.985180e+09      0   

   ip_address_int  country  hour_of_day  day_of_week  time_since_signup  \
0             NaN     

# Cell 4: Data Preprocessing for Modeling (One-Hot Encoding, Scaling)

In [4]:
# notebooks/model_building.ipynb - Cell 4: Data Preparation & Column Identification

# --- E-commerce Data Preparation ---
ecommerce_X = None
ecommerce_y = None
ecommerce_numerical_cols_for_preprocessor = []
ecommerce_categorical_cols_for_preprocessor = []

if ecommerce_df is not None:
    print("\n--- Preparing E-commerce Data for Modeling ---")
    # Define features (X) and target (y)
    # Drop the target column 'class' and other non-feature columns
    cols_to_drop_ecommerce = [
        'class', 'ip_address', 'lower_bound_ip_address', 'upper_bound_ip_address', 'ip_address_int',
        'signup_time', 'purchase_time', 'time_diff_prev_transaction'
    ]
    ecommerce_X = ecommerce_df.drop(columns=cols_to_drop_ecommerce, errors='ignore')
    ecommerce_y = ecommerce_df['class']

    # Identify numerical and categorical columns *from this X*
    # These lists will be used by the ColumnTransformer in Cell 5
    ecommerce_numerical_cols_for_preprocessor = ecommerce_X.select_dtypes(include=np.number).columns.tolist()
    ecommerce_categorical_cols_for_preprocessor = ecommerce_X.select_dtypes(include=['object', 'category']).columns.tolist()

    print(f"E-commerce X shape (before split): {ecommerce_X.shape}")
    print(f"E-commerce Numerical Columns for Preprocessor: {ecommerce_numerical_cols_for_preprocessor}")
    print(f"E-commerce Categorical Columns for Preprocessor: {ecommerce_categorical_cols_for_preprocessor}")
else:
    print("E-commerce data not loaded. Skipping preparation.")

# --- Credit Card Data Preparation ---
creditcard_X = None
creditcard_y = None
creditcard_numerical_cols_for_preprocessor = []
creditcard_categorical_cols_for_preprocessor = [] # Should be empty for this dataset

if creditcard_df is not None:
    print("\n--- Preparing Credit Card Data for Modeling ---")
    # Credit card data is mostly numerical (V1-V28, Amount, Time)
    # 'Class' is the target.
    creditcard_X = creditcard_df.drop(columns=['Class'])
    creditcard_y = creditcard_df['Class']

    # Identify numerical and categorical columns *from this X*
    # These lists will be used by the ColumnTransformer in Cell 7
    creditcard_numerical_cols_for_preprocessor = creditcard_X.select_dtypes(include=np.number).columns.tolist()
    creditcard_categorical_cols_for_preprocessor = creditcard_X.select_dtypes(include=['object', 'category']).columns.tolist()

    print(f"Credit Card X shape (before split): {creditcard_X.shape}")
    print(f"Credit Card Numerical Columns for Preprocessor: {creditcard_numerical_cols_for_preprocessor}")
    print(f"Credit Card Categorical Columns for Preprocessor: {creditcard_categorical_cols_for_preprocessor}")
else:
    print("Credit Card data not loaded. Skipping preparation.")


--- Preparing E-commerce Data for Modeling ---
E-commerce X shape (before split): (151112, 13)
E-commerce Numerical Columns for Preprocessor: ['user_id', 'purchase_value', 'age', 'country', 'hour_of_day', 'day_of_week', 'time_since_signup', 'transactions_last_7d', 'purchase_value_last_7d']
E-commerce Categorical Columns for Preprocessor: ['device_id', 'source', 'browser', 'sex']

--- Preparing Credit Card Data for Modeling ---
Credit Card X shape (before split): (283726, 30)
Credit Card Numerical Columns for Preprocessor: ['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']
Credit Card Categorical Columns for Preprocessor: []


# Cell 5: Train-Test Split & Preprocessing (E-commerce)

In [5]:
# notebooks/model_building.ipynb - Cell 5: Train-Test Split & Preprocessing (E-commerce)

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline as SklearnPipeline # Alias for sklearn's Pipeline

ecommerce_X_train_processed = None
ecommerce_X_test_processed = None
ecommerce_y_train = None
ecommerce_y_test = None

if ecommerce_df is not None:
    print("\n--- Splitting E-commerce Data ---")
    # Pass the original dataframe and the target column name to split_data
    # This function returns X_train, X_test, y_train, y_test, and initial column lists
    ecommerce_X_train_raw, ecommerce_X_test_raw, ecommerce_y_train, ecommerce_y_test, \
    initial_ecommerce_categorical_cols, initial_ecommerce_numerical_cols = \
        split_data(ecommerce_df, target_column='class', test_size=0.3, random_state=42)

    # Define additional columns to drop from X_train_raw and X_test_raw
    # These are columns that were in the original dataframe but should not be features
    cols_to_drop_ecommerce_from_X = [
        'ip_address', 'lower_bound_ip_address', 'upper_bound_ip_address', 'ip_address_int',
        'signup_time', 'purchase_time', 'time_diff_prev_transaction'
    ]

    # Drop these columns from the X_train and X_test sets
    ecommerce_X_train = ecommerce_X_train_raw.drop(columns=cols_to_drop_ecommerce_from_X, errors='ignore')
    ecommerce_X_test = ecommerce_X_test_raw.drop(columns=cols_to_drop_ecommerce_from_X, errors='ignore')

    # Re-identify numerical and categorical columns *after* dropping the non-feature columns
    # This ensures the preprocessor uses the correct column lists
    ecommerce_numerical_cols_for_preprocessor = ecommerce_X_train.select_dtypes(include=np.number).columns.tolist()
    ecommerce_categorical_cols_for_preprocessor = ecommerce_X_train.select_dtypes(include=['object', 'category']).columns.tolist()

    print(f"E-commerce Numerical Columns for Preprocessor (after feature selection): {ecommerce_numerical_cols_for_preprocessor}")
    print(f"E-commerce Categorical Columns for Preprocessor (after feature selection): {ecommerce_categorical_cols_for_preprocessor}")

    # Create the preprocessing pipeline using ColumnTransformer
    # This pipeline will handle imputation, scaling, and one-hot encoding
    ecommerce_preprocessor_fitted = ColumnTransformer(
        transformers=[
            ('num', SklearnPipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')), # Impute NaNs in numerical columns
                ('scaler', StandardScaler())                    # Scale numerical features
            ]), ecommerce_numerical_cols_for_preprocessor),
            ('cat', OneHotEncoder(handle_unknown='ignore'), ecommerce_categorical_cols_for_preprocessor) # One-hot encode categorical features
        ],
        remainder='passthrough' # Keep any other columns not specified (e.g., if you added more later)
    ).fit(ecommerce_X_train) # Fit the preprocessor ONLY on the training data to prevent data leakage

    # Transform both training and testing sets using the fitted preprocessor
    ecommerce_X_train_processed = ecommerce_preprocessor_fitted.transform(ecommerce_X_train)
    ecommerce_X_test_processed = ecommerce_preprocessor_fitted.transform(ecommerce_X_test)

    print(f"E-commerce X_train_processed shape: {ecommerce_X_train_processed.shape}")
    print(f"E-commerce X_test_processed shape: {ecommerce_X_test_processed.shape}")
else:
    print("E-commerce data (df) not available for splitting and preprocessing.")

2025-07-26 20:19:07,466 - INFO - Splitting data into training and testing sets (test_size=0.3)...



--- Splitting E-commerce Data ---


2025-07-26 20:19:07,548 - INFO - X_train shape: (105778, 18), y_train shape: (105778,)
2025-07-26 20:19:07,549 - INFO - X_test shape: (45334, 18), y_test shape: (45334,)
2025-07-26 20:19:07,551 - INFO - Training target distribution:
class
0    0.906351
1    0.093649
Name: proportion, dtype: float64
2025-07-26 20:19:07,553 - INFO - Testing target distribution:
class
0    0.906362
1    0.093638
Name: proportion, dtype: float64


E-commerce Numerical Columns for Preprocessor (after feature selection): ['user_id', 'purchase_value', 'age', 'country', 'hour_of_day', 'day_of_week', 'time_since_signup', 'transactions_last_7d', 'purchase_value_last_7d']
E-commerce Categorical Columns for Preprocessor (after feature selection): ['device_id', 'source', 'browser', 'sex']




E-commerce X_train_processed shape: (105778, 97939)
E-commerce X_test_processed shape: (45334, 97939)


# Cell 6: Imbalance Handling (E-commerce)

In [6]:
# notebooks/model_building.ipynb - Cell 6: Imbalance Handling (E-commerce)

ecommerce_X_train_resampled = None
ecommerce_y_train_resampled = None

if ecommerce_X_train_processed is not None and ecommerce_y_train is not None:
    print("\n--- Handling Imbalance for E-commerce Training Data (SMOTE) ---")
    ecommerce_X_train_resampled, ecommerce_y_train_resampled = \
        handle_imbalance(ecommerce_X_train_processed, ecommerce_y_train, strategy='SMOTE', random_state=42)

    print(f"E-commerce Resampled X_train shape: {ecommerce_X_train_resampled.shape}")
    print(f"E-commerce Resampled y_train distribution:\n{ecommerce_y_train_resampled.value_counts(normalize=True)}")
else:
    print("E-commerce processed training data not available for imbalance handling.")

2025-07-26 20:19:08,789 - INFO - Handling class imbalance using strategy: SMOTE...



--- Handling Imbalance for E-commerce Training Data (SMOTE) ---


2025-07-26 20:19:13,371 - INFO - Resampling complete. Original minority count: 9906, majority count: 95872
2025-07-26 20:19:13,372 - INFO - Resampled data shape: (191744, 97939), target shape: (191744,)
2025-07-26 20:19:13,378 - INFO - Resampled target distribution:
class
0    0.5
1    0.5
Name: proportion, dtype: float64


E-commerce Resampled X_train shape: (191744, 97939)
E-commerce Resampled y_train distribution:
class
0    0.5
1    0.5
Name: proportion, dtype: float64


# E-commerce Model Training

In [7]:
# notebooks/model_building.ipynb - NEW Cell 7: E-commerce Model Training and Evaluation

ecommerce_models = {}
ecommerce_results = {}

if ecommerce_X_train_resampled is not None and ecommerce_y_train_resampled is not None:
    print("\n--- Training Models for E-commerce Fraud Detection ---")

    # --- Logistic Regression ---
    lr_model_ecommerce = train_logistic_regression(ecommerce_X_train_resampled, ecommerce_y_train_resampled)
    ecommerce_models['Logistic Regression'] = lr_model_ecommerce
    y_pred_lr_ecommerce = lr_model_ecommerce.predict(ecommerce_X_test_processed)
    y_prob_lr_ecommerce = lr_model_ecommerce.predict_proba(ecommerce_X_test_processed)[:, 1]
    ecommerce_results['Logistic Regression'] = evaluate_model(lr_model_ecommerce, ecommerce_X_test_processed, ecommerce_y_test, y_pred_lr_ecommerce, y_prob_lr_ecommerce)

    # --- Decision Tree ---
    dt_model_ecommerce = train_decision_tree(ecommerce_X_train_resampled, ecommerce_y_train_resampled)
    ecommerce_models['Decision Tree'] = dt_model_ecommerce
    y_pred_dt_ecommerce = dt_model_ecommerce.predict(ecommerce_X_test_processed)
    y_prob_dt_ecommerce = dt_model_ecommerce.predict_proba(ecommerce_X_test_processed)[:, 1]
    ecommerce_results['Decision Tree'] = evaluate_model(dt_model_ecommerce, ecommerce_X_test_processed, ecommerce_y_test, y_pred_dt_ecommerce, y_prob_dt_ecommerce)

    # --- Random Forest ---
    rf_model_ecommerce = train_random_forest(ecommerce_X_train_resampled, ecommerce_y_train_resampled)
    ecommerce_models['Random Forest'] = rf_model_ecommerce
    y_pred_rf_ecommerce = rf_model_ecommerce.predict(ecommerce_X_test_processed)
    y_prob_rf_ecommerce = rf_model_ecommerce.predict_proba(ecommerce_X_test_processed)[:, 1]
    ecommerce_results['Random Forest'] = evaluate_model(rf_model_ecommerce, ecommerce_X_test_processed, ecommerce_y_test, y_pred_rf_ecommerce, y_prob_rf_ecommerce)

    # --- Gradient Boosting ---
    gb_model_ecommerce = train_gradient_boosting(ecommerce_X_train_resampled, ecommerce_y_train_resampled)
    ecommerce_models['Gradient Boosting'] = gb_model_ecommerce
    y_pred_gb_ecommerce = gb_model_ecommerce.predict(ecommerce_X_test_processed)
    y_prob_gb_ecommerce = gb_model_ecommerce.predict_proba(ecommerce_X_test_processed)[:, 1]
    ecommerce_results['Gradient Boosting'] = evaluate_model(gb_model_ecommerce, ecommerce_X_test_processed, ecommerce_y_test, y_pred_gb_ecommerce, y_prob_gb_ecommerce)

   
    print("\n--- E-commerce Model Training and Evaluation Complete ---")
    for model_name, metrics in ecommerce_results.items():
        print(f"\nModel: {model_name}")
        print(f"  AUC-ROC: {metrics['roc_auc']:.4f}")
        print(f"  Precision: {metrics['precision']:.4f}")
        print(f"  Recall: {metrics['recall']:.4f}")
        print(f"  F1-Score: {metrics['f1_score']:.4f}")
else:
    print("E-commerce resampled training data not available for model training.")

2025-07-26 20:19:13,407 - INFO - Training Logistic Regression model...



--- Training Models for E-commerce Fraud Detection ---


2025-07-26 20:19:19,232 - INFO - Logistic Regression training complete.
2025-07-26 20:19:19,263 - INFO - Evaluating model performance...
2025-07-26 20:19:19,374 - INFO - Accuracy: 0.9459
2025-07-26 20:19:19,376 - INFO - Precision: 0.7995
2025-07-26 20:19:19,380 - INFO - Recall: 0.5637
2025-07-26 20:19:19,383 - INFO - F1-Score: 0.6612
2025-07-26 20:19:19,389 - INFO - AUC-ROC: 0.7696
2025-07-26 20:19:19,393 - INFO - 
Confusion Matrix:
[[40489   600]
 [ 1852  2393]]
2025-07-26 20:19:19,395 - INFO - 
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.97     41089
           1       0.80      0.56      0.66      4245

    accuracy                           0.95     45334
   macro avg       0.88      0.77      0.82     45334
weighted avg       0.94      0.95      0.94     45334

2025-07-26 20:19:19,399 - INFO - Training Decision Tree Classifier model...
2025-07-26 20:22:42,268 - INFO - Decision Tree Classifier training compl


--- E-commerce Model Training and Evaluation Complete ---

Model: Logistic Regression
  AUC-ROC: 0.7696
  Precision: 0.7995
  Recall: 0.5637
  F1-Score: 0.6612

Model: Decision Tree
  AUC-ROC: 0.7693
  Precision: 0.9407
  Recall: 0.5420
  F1-Score: 0.6878

Model: Random Forest
  AUC-ROC: 0.7776
  Precision: 0.9674
  Recall: 0.5376
  F1-Score: 0.6911

Model: Gradient Boosting
  AUC-ROC: 0.7659
  Precision: 0.9160
  Recall: 0.5397
  F1-Score: 0.6792


# Cell 7: Train-Test Split & Preprocessing (Credit Card)

In [8]:
# notebooks/model_building.ipynb - Cell 7: Train-Test Split & Preprocessing (Credit Card)

from sklearn.preprocessing import StandardScaler, OneHotEncoder # Re-import if this cell is run independently
from sklearn.impute import SimpleImputer # Re-import if this cell is run independently
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline as SklearnPipeline # Alias for sklearn's Pipeline

creditcard_X_train_processed = None
creditcard_X_test_processed = None
creditcard_y_train = None
creditcard_y_test = None

if creditcard_df is not None:
    print("\n--- Splitting Credit Card Data ---")
    # Pass the original dataframe and the target column name to split_data
    creditcard_X_train_raw, creditcard_X_test_raw, creditcard_y_train, creditcard_y_test, \
    initial_creditcard_categorical_cols, initial_creditcard_numerical_cols = \
        split_data(creditcard_df, target_column='Class', test_size=0.3, random_state=42)

    # For credit card data, typically no extra columns to drop from X_train/X_test
    # as V-features, Time, Amount are all relevant.
    creditcard_X_train = creditcard_X_train_raw.copy()
    creditcard_X_test = creditcard_X_test_raw.copy()

    # Re-identify numerical and categorical columns (should be mostly numerical)
    creditcard_numerical_cols_for_preprocessor = creditcard_X_train.select_dtypes(include=np.number).columns.tolist()
    creditcard_categorical_cols_for_preprocessor = creditcard_X_train.select_dtypes(include=['object', 'category']).columns.tolist()

    print(f"Credit Card Numerical Columns for Preprocessor (after feature selection): {creditcard_numerical_cols_for_preprocessor}")
    print(f"Credit Card Categorical Columns for Preprocessor (after feature selection): {creditcard_categorical_cols_for_preprocessor}")

    # Create the preprocessing pipeline using ColumnTransformer
    # Only numerical features are expected for credit card data, so only a numerical pipeline
    creditcard_preprocessor_fitted = ColumnTransformer(
        transformers=[
            ('num', SklearnPipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')), # Impute NaNs in numerical columns
                ('scaler', StandardScaler())                    # Scale numerical features
            ]), creditcard_numerical_cols_for_preprocessor)
            # No 'cat' transformer needed if no categorical columns are present
        ],
        remainder='passthrough'
    ).fit(creditcard_X_train) # Fit the preprocessor ONLY on the training data

    # Transform both training and testing sets
    creditcard_X_train_processed = creditcard_preprocessor_fitted.transform(creditcard_X_train)
    creditcard_X_test_processed = creditcard_preprocessor_fitted.transform(creditcard_X_test)

    print(f"Credit Card X_train_processed shape: {creditcard_X_train_processed.shape}")
    print(f"Credit Card X_test_processed shape: {creditcard_X_test_processed.shape}")
else:
    print("Credit Card data (df) not available for splitting and preprocessing.")

2025-07-26 20:53:58,783 - INFO - Splitting data into training and testing sets (test_size=0.3)...



--- Splitting Credit Card Data ---


2025-07-26 20:53:59,045 - INFO - X_train shape: (198608, 30), y_train shape: (198608,)
2025-07-26 20:53:59,046 - INFO - X_test shape: (85118, 30), y_test shape: (85118,)
2025-07-26 20:53:59,048 - INFO - Training target distribution:
Class
0    0.998333
1    0.001667
Name: proportion, dtype: float64
2025-07-26 20:53:59,050 - INFO - Testing target distribution:
Class
0    0.998332
1    0.001668
Name: proportion, dtype: float64


Credit Card Numerical Columns for Preprocessor (after feature selection): ['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']
Credit Card Categorical Columns for Preprocessor (after feature selection): []
Credit Card X_train_processed shape: (198608, 30)
Credit Card X_test_processed shape: (85118, 30)


# Cell 8: Imbalance Handling (Credit Card)

In [9]:
# notebooks/model_building.ipynb - Cell 8: Imbalance Handling (Credit Card)

creditcard_X_train_resampled = None
creditcard_y_train_resampled = None

if creditcard_X_train_processed is not None and creditcard_y_train is not None:
    print("\n--- Handling Imbalance for Credit Card Training Data (SMOTE_and_Undersample) ---")
    # For credit card data, SMOTE_and_Undersample might be more effective due to extreme imbalance
    creditcard_X_train_resampled, creditcard_y_train_resampled = \
        handle_imbalance(creditcard_X_train_processed, creditcard_y_train, strategy='SMOTE_and_Undersample', random_state=42)

    print(f"Credit Card Resampled X_train shape: {creditcard_X_train_resampled.shape}")
    print(f"Credit Card Resampled y_train distribution:\n{creditcard_y_train_resampled.value_counts(normalize=True)}")
else:
    print("Credit Card processed training data not available for imbalance handling.")

2025-07-26 20:54:00,146 - INFO - Handling class imbalance using strategy: SMOTE_and_Undersample...



--- Handling Imbalance for Credit Card Training Data (SMOTE_and_Undersample) ---


2025-07-26 20:54:03,264 - INFO - Resampling complete. Original minority count: 331, majority count: 198277
2025-07-26 20:54:03,265 - INFO - Resampled data shape: (59481, 30), target shape: (59481,)
2025-07-26 20:54:03,266 - INFO - Resampled target distribution:
Class
0    0.666667
1    0.333333
Name: proportion, dtype: float64


Credit Card Resampled X_train shape: (59481, 30)
Credit Card Resampled y_train distribution:
Class
0    0.666667
1    0.333333
Name: proportion, dtype: float64


# Credit Card Model Training

In [10]:
# notebooks/model_building.ipynb - NEW Cell 9: Credit Card Model Training and Evaluation

creditcard_models = {}
creditcard_results = {}

if creditcard_X_train_resampled is not None and creditcard_y_train_resampled is not None:
    print("\n--- Training Models for Credit Card Fraud Detection ---")

    # --- Logistic Regression ---
    lr_model_creditcard = train_logistic_regression(creditcard_X_train_resampled, creditcard_y_train_resampled)
    creditcard_models['Logistic Regression'] = lr_model_creditcard
    y_pred_lr_creditcard = lr_model_creditcard.predict(creditcard_X_test_processed)
    y_prob_lr_creditcard = lr_model_creditcard.predict_proba(creditcard_X_test_processed)[:, 1]
    creditcard_results['Logistic Regression'] = evaluate_model(lr_model_creditcard, creditcard_X_test_processed, creditcard_y_test, y_pred_lr_creditcard, y_prob_lr_creditcard)

    # --- Decision Tree ---
    dt_model_creditcard = train_decision_tree(creditcard_X_train_resampled, creditcard_y_train_resampled)
    creditcard_models['Decision Tree'] = dt_model_creditcard
    y_pred_dt_creditcard = dt_model_creditcard.predict(creditcard_X_test_processed)
    y_prob_dt_creditcard = dt_model_creditcard.predict_proba(creditcard_X_test_processed)[:, 1]
    creditcard_results['Decision Tree'] = evaluate_model(dt_model_creditcard, creditcard_X_test_processed, creditcard_y_test, y_pred_dt_creditcard, y_prob_dt_creditcard)

    # --- Random Forest ---
    rf_model_creditcard = train_random_forest(creditcard_X_train_resampled, creditcard_y_train_resampled)
    creditcard_models['Random Forest'] = rf_model_creditcard
    y_pred_rf_creditcard = rf_model_creditcard.predict(creditcard_X_test_processed)
    y_prob_rf_creditcard = rf_model_creditcard.predict_proba(creditcard_X_test_processed)[:, 1]
    creditcard_results['Random Forest'] = evaluate_model(rf_model_creditcard, creditcard_X_test_processed, creditcard_y_test, y_pred_rf_creditcard, y_prob_rf_creditcard)

    # --- Gradient Boosting ---
    gb_model_creditcard = train_gradient_boosting(creditcard_X_train_resampled, creditcard_y_train_resampled)
    creditcard_models['Gradient Boosting'] = gb_model_creditcard
    y_pred_gb_creditcard = gb_model_creditcard.predict(creditcard_X_test_processed)
    y_prob_gb_creditcard = gb_model_creditcard.predict_proba(creditcard_X_test_processed)[:, 1]
    creditcard_results['Gradient Boosting'] = evaluate_model(gb_model_creditcard, creditcard_X_test_processed, creditcard_y_test, y_pred_gb_creditcard, y_prob_gb_creditcard)

   
    print("\n--- Credit Card Model Training and Evaluation Complete ---")
    for model_name, metrics in creditcard_results.items():
        print(f"\nModel: {model_name}")
        print(f"  AUC-ROC: {metrics['roc_auc']:.4f}")
        print(f"  Precision: {metrics['precision']:.4f}")
        print(f"  Recall: {metrics['recall']:.4f}")
        print(f"  F1-Score: {metrics['f1_score']:.4f}")
else:
    print("Credit Card resampled training data not available for model training.")

2025-07-26 20:54:03,276 - INFO - Training Logistic Regression model...



--- Training Models for Credit Card Fraud Detection ---


2025-07-26 20:54:03,965 - INFO - Logistic Regression training complete.
2025-07-26 20:54:03,980 - INFO - Evaluating model performance...
2025-07-26 20:54:04,078 - INFO - Accuracy: 0.9870
2025-07-26 20:54:04,079 - INFO - Precision: 0.1035
2025-07-26 20:54:04,080 - INFO - Recall: 0.8873
2025-07-26 20:54:04,080 - INFO - F1-Score: 0.1854
2025-07-26 20:54:04,081 - INFO - AUC-ROC: 0.9683
2025-07-26 20:54:04,082 - INFO - 
Confusion Matrix:
[[83885  1091]
 [   16   126]]
2025-07-26 20:54:04,082 - INFO - 
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     84976
           1       0.10      0.89      0.19       142

    accuracy                           0.99     85118
   macro avg       0.55      0.94      0.59     85118
weighted avg       1.00      0.99      0.99     85118

2025-07-26 20:54:04,084 - INFO - Training Decision Tree Classifier model...
2025-07-26 20:54:08,158 - INFO - Decision Tree Classifier training compl


--- Credit Card Model Training and Evaluation Complete ---

Model: Logistic Regression
  AUC-ROC: 0.9683
  Precision: 0.1035
  Recall: 0.8873
  F1-Score: 0.1854

Model: Decision Tree
  AUC-ROC: 0.8803
  Precision: 0.1540
  Recall: 0.7676
  F1-Score: 0.2565

Model: Random Forest
  AUC-ROC: 0.9774
  Precision: 0.7651
  Recall: 0.8028
  F1-Score: 0.7835

Model: Gradient Boosting
  AUC-ROC: 0.9670
  Precision: 0.1909
  Recall: 0.8592
  F1-Score: 0.3124
