In [2]:
# Standard imports
import sys
import os
import warnings
import time
warnings.filterwarnings('ignore')

# Add src directory to path for modular imports
sys.path.append('../src')

# Data manipulation and analysis
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-v0_8')

# Machine learning basics
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Custom modules for Task 2
from data_splitter import DataSplitter
from model_builder import ModelBuilder
from model_trainer import ModelTrainer
from model_evaluator import ModelEvaluator

# Data loading (reuse from Task 1)
from data_loader import DataLoader
from utils import setup_logging

# Set up logging
setup_logging('INFO')

# Set random seed for reproducibility
np.random.seed(42)

print("✅ All modules imported successfully!")
print("📁 Working directory:", os.getcwd())


✅ All modules imported successfully!
📁 Working directory: c:\Kifiya\Week8\fraud-detection\notebooks


In [3]:
# Initialize data loader
data_loader = DataLoader(data_dir='../data')

print("🔄 Loading datasets for modeling...")

# Load datasets
try:
    fraud_data = data_loader.load_fraud_data()
    creditcard_data = data_loader.load_creditcard_data()
    
    print(f"✅ Fraud data loaded: {fraud_data.shape}")
    print(f"✅ Credit card data loaded: {creditcard_data.shape}")
    
    # Display basic info
    print(f"\n📊 Dataset Overview:")
    print(f"Fraud data target column: {'class' if 'class' in fraud_data.columns else 'Class'}")
    print(f"Credit card target column: {'Class' if 'Class' in creditcard_data.columns else 'class'}")
    
    # Check class distribution
    fraud_target = 'class' if 'class' in fraud_data.columns else 'Class'
    cc_target = 'Class' if 'Class' in creditcard_data.columns else 'class'
    
    print(f"\n🎯 Class Distributions:")
    print(f"Fraud data - {fraud_target}:")
    print(fraud_data[fraud_target].value_counts())
    print(f"\nCredit card data - {cc_target}:")
    print(creditcard_data[cc_target].value_counts())
    
except Exception as e:
    print(f"❌ Error loading data: {e}")
    print("Make sure the data files are in the '../data' directory")


INFO:data_loader:Loading fraud data from ..\data\Fraud_Data.csv


🔄 Loading datasets for modeling...


INFO:data_loader:Loaded fraud data: 151112 rows, 11 columns
INFO:data_loader:Loading credit card data from ..\data\creditcard.csv
INFO:data_loader:Loaded credit card data: 284807 rows, 31 columns


✅ Fraud data loaded: (151112, 11)
✅ Credit card data loaded: (284807, 31)

📊 Dataset Overview:
Fraud data target column: class
Credit card target column: Class

🎯 Class Distributions:
Fraud data - class:
class
0    136961
1     14151
Name: count, dtype: int64

Credit card data - Class:
Class
0    284315
1       492
Name: count, dtype: int64


In [4]:
# Initialize data splitter
data_splitter = DataSplitter(random_state=42)

print("🔧 Preparing datasets for modeling...")

# Prepare both datasets
datasets = data_splitter.prepare_datasets_for_modeling(
    fraud_df=fraud_data,
    creditcard_df=creditcard_data,
    test_size=0.2
)

# Get dataset information
dataset_info = data_splitter.get_dataset_info(datasets)

print("\n📊 PREPARED DATASETS SUMMARY")
print("=" * 60)

for dataset_name, info in dataset_info.items():
    print(f"\n{dataset_name.upper()} Dataset:")
    print(f"  Training samples: {info['train_samples']:,}")
    print(f"  Test samples: {info['test_samples']:,}")
    print(f"  Features: {info['n_features']}")
    print(f"  Train class distribution: {info['train_class_distribution']}")
    print(f"  Test class distribution: {info['test_class_distribution']}")
    print(f"  Train imbalance ratio: {info['train_imbalance_ratio']:.4f}")
    print(f"  Test imbalance ratio: {info['test_imbalance_ratio']:.4f}")

print("\n✅ Data preparation completed!")


INFO:data_splitter:Preparing fraud detection dataset...
INFO:data_splitter:Prepared fraud data: 151112 samples, 2 features
INFO:data_splitter:Features scaled using StandardScaler
INFO:data_splitter:Train-test split completed:
INFO:data_splitter:  Training set: 120889 samples
INFO:data_splitter:  Test set: 30223 samples
INFO:data_splitter:  Training class distribution: {0: np.int64(109568), 1: np.int64(11321)}
INFO:data_splitter:  Test class distribution: {0: np.int64(27393), 1: np.int64(2830)}
INFO:data_splitter:Preparing credit card dataset...


🔧 Preparing datasets for modeling...


INFO:data_splitter:Prepared credit card data: 284807 samples, 30 features
INFO:data_splitter:Features scaled using StandardScaler
INFO:data_splitter:Train-test split completed:
INFO:data_splitter:  Training set: 227845 samples
INFO:data_splitter:  Test set: 56962 samples
INFO:data_splitter:  Training class distribution: {0: np.int64(227451), 1: np.int64(394)}
INFO:data_splitter:  Test class distribution: {0: np.int64(56864), 1: np.int64(98)}
INFO:data_splitter:Both datasets prepared for modeling



📊 PREPARED DATASETS SUMMARY

FRAUD Dataset:
  Training samples: 120,889
  Test samples: 30,223
  Features: 2
  Train class distribution: {0: np.int64(109568), 1: np.int64(11321)}
  Test class distribution: {0: np.int64(27393), 1: np.int64(2830)}
  Train imbalance ratio: 0.1033
  Test imbalance ratio: 0.1033

CREDITCARD Dataset:
  Training samples: 227,845
  Test samples: 56,962
  Features: 30
  Train class distribution: {0: np.int64(227451), 1: np.int64(394)}
  Test class distribution: {0: np.int64(56864), 1: np.int64(98)}
  Train imbalance ratio: 0.0017
  Test imbalance ratio: 0.0017

✅ Data preparation completed!


In [5]:
# Initialize model builder
model_builder = ModelBuilder(random_state=42)

print("🏗️ BUILDING MODEL SUITE")
print("=" * 50)

# Create model suite with required models
models_to_include = ['logistic_regression', 'random_forest', 'lightgbm']

print("Creating models...")
models = model_builder.create_model_suite(include_models=models_to_include)

print(f"\n✅ Created {len(models)} models:")
for model_name in models.keys():
    print(f"  • {model_name}")

# Get model information
model_info = model_builder.get_model_info()

print(f"\n📋 MODEL CHARACTERISTICS:")
for model_name, info in model_info.items():
    print(f"\n{model_name.upper()}:")
    characteristics = info['suitable_for']
    for key, value in characteristics.items():
        print(f"  {key}: {value}")

print("\n🎯 Model Selection Rationale:")
print("• Logistic Regression: Interpretable baseline model with good performance on imbalanced data")
print("• Random Forest: Robust ensemble method with feature importance")  
print("• LightGBM: High-performance gradient boosting optimized for imbalanced datasets")


INFO:model_builder:Created Logistic Regression with params: {'random_state': 42, 'max_iter': 1000, 'class_weight': 'balanced', 'solver': 'liblinear'}
INFO:model_builder:Created Random Forest with params: {'n_estimators': 100, 'random_state': 42, 'class_weight': 'balanced', 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'n_jobs': -1}
INFO:model_builder:Created model suite with 2 models: ['logistic_regression', 'random_forest']


🏗️ BUILDING MODEL SUITE
Creating models...

✅ Created 2 models:
  • logistic_regression
  • random_forest

📋 MODEL CHARACTERISTICS:

LOGISTIC_REGRESSION:
  interpretability: High
  training_speed: Fast
  prediction_speed: Very Fast
  memory_usage: Low
  handling_imbalance: Good with class_weight
  best_for: Baseline, interpretable models

RANDOM_FOREST:
  interpretability: Medium
  training_speed: Medium
  prediction_speed: Fast
  memory_usage: Medium
  handling_imbalance: Good with class_weight
  best_for: Robust performance, feature importance

🎯 Model Selection Rationale:
• Logistic Regression: Interpretable baseline model with good performance on imbalanced data
• Random Forest: Robust ensemble method with feature importance
• LightGBM: High-performance gradient boosting optimized for imbalanced datasets


In [6]:
# Initialize model trainer
model_trainer = ModelTrainer(random_state=42)

print("🚀 TRAINING MODELS ON BOTH DATASETS")
print("=" * 60)

# Store results for both datasets
training_results = {}

# Train on fraud detection dataset
print("\n🎯 FRAUD DETECTION DATASET")
print("-" * 40)

fraud_data_info = dataset_info['fraud']
X_train_fraud = datasets['fraud']['X_train']
y_train_fraud = datasets['fraud']['y_train']

# Optimize models for fraud dataset imbalance
fraud_models = model_builder.optimize_for_imbalanced_data(
    models.copy(), 
    fraud_data_info['train_imbalance_ratio']
)

# Train and cross-validate fraud models
fraud_results = model_trainer.train_and_evaluate_suite(
    fraud_models, X_train_fraud, y_train_fraud, cv_folds=5
)

training_results['fraud'] = {
    'results': fraud_results,
    'models': fraud_models,
    'trainer': model_trainer
}

print(f"\n✅ Fraud detection model training completed!")

# Create new trainer for credit card dataset
cc_trainer = ModelTrainer(random_state=42)

print("\n💳 CREDIT CARD DATASET")
print("-" * 40)

cc_data_info = dataset_info['creditcard']
X_train_cc = datasets['creditcard']['X_train']
y_train_cc = datasets['creditcard']['y_train']

# Optimize models for credit card dataset imbalance  
cc_models = model_builder.optimize_for_imbalanced_data(
    models.copy(),
    cc_data_info['train_imbalance_ratio']
)

# Train and cross-validate credit card models
cc_results = cc_trainer.train_and_evaluate_suite(
    cc_models, X_train_cc, y_train_cc, cv_folds=5
)

training_results['creditcard'] = {
    'results': cc_results,
    'models': cc_models,
    'trainer': cc_trainer
}

print(f"\n✅ Credit card model training completed!")


INFO:model_builder:Optimized 2 models for imbalanced data
INFO:model_trainer:Training 2 models...
INFO:model_trainer:Training logistic_regression...
INFO:model_trainer:✅ logistic_regression trained successfully in 0.06 seconds
INFO:model_trainer:Training random_forest...


🚀 TRAINING MODELS ON BOTH DATASETS

🎯 FRAUD DETECTION DATASET
----------------------------------------


INFO:model_trainer:✅ random_forest trained successfully in 1.98 seconds
INFO:model_trainer:Training completed:
INFO:model_trainer:  ✅ Successful: ['logistic_regression', 'random_forest']
INFO:model_trainer:Performing 5-fold cross-validation...
INFO:model_trainer:Cross-validating logistic_regression...
INFO:model_trainer:  logistic_regression - AUC-ROC: 0.5024, AUC-PR: 0.0935, F1: 0.1557
INFO:model_trainer:Cross-validating random_forest...
INFO:model_trainer:  random_forest - AUC-ROC: 0.6968, AUC-PR: 0.1465, F1: 0.2762
INFO:model_trainer:Cross-validation completed
INFO:model_builder:Optimized 2 models for imbalanced data
INFO:model_trainer:Training 2 models...
INFO:model_trainer:Training logistic_regression...



✅ Fraud detection model training completed!

💳 CREDIT CARD DATASET
----------------------------------------


INFO:model_trainer:✅ logistic_regression trained successfully in 4.37 seconds
INFO:model_trainer:Training random_forest...
INFO:model_trainer:✅ random_forest trained successfully in 42.21 seconds
INFO:model_trainer:Training completed:
INFO:model_trainer:  ✅ Successful: ['logistic_regression', 'random_forest']
INFO:model_trainer:Performing 5-fold cross-validation...
INFO:model_trainer:Cross-validating logistic_regression...
INFO:model_trainer:  logistic_regression - AUC-ROC: 0.9825, AUC-PR: 0.0575, F1: 0.1179
INFO:model_trainer:Cross-validating random_forest...
INFO:model_trainer:  random_forest - AUC-ROC: 0.9749, AUC-PR: 0.6813, F1: 0.8233
INFO:model_trainer:Cross-validation completed



✅ Credit card model training completed!


In [7]:
# Initialize model evaluator
model_evaluator = ModelEvaluator(figsize=(12, 8))
