In [14]:
# Standard imports
import sys
import os
import warnings
import time
warnings.filterwarnings('ignore')

# Add src directory to path for modular imports
sys.path.append('../src')

# Data manipulation and analysis
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-v0_8')

# Machine learning basics
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Custom modules for Task 2
from data_splitter import DataSplitter
from model_builder import ModelBuilder
from model_trainer import ModelTrainer
from model_evaluator import ModelEvaluator

# Data loading (reuse from Task 1)
from data_loader import DataLoader
from utils import setup_logging

# Set up logging
setup_logging('INFO')

# Set random seed for reproducibility
np.random.seed(42)

print("‚úÖ All modules imported successfully!")
print("üìÅ Working directory:", os.getcwd())


‚úÖ All modules imported successfully!
üìÅ Working directory: c:\Kifiya\Week8\fraud-detection\notebooks


In [15]:
# Initialize data loader
data_loader = DataLoader(data_dir='../data')

print("üîÑ Loading datasets for modeling...")

# Load datasets
try:
    fraud_data = data_loader.load_fraud_data()
    creditcard_data = data_loader.load_creditcard_data()
    
    print(f"‚úÖ Fraud data loaded: {fraud_data.shape}")
    print(f"‚úÖ Credit card data loaded: {creditcard_data.shape}")
    
    # Display basic info
    print(f"\nüìä Dataset Overview:")
    print(f"Fraud data target column: {'class' if 'class' in fraud_data.columns else 'Class'}")
    print(f"Credit card target column: {'Class' if 'Class' in creditcard_data.columns else 'class'}")
    
    # Check class distribution
    fraud_target = 'class' if 'class' in fraud_data.columns else 'Class'
    cc_target = 'Class' if 'Class' in creditcard_data.columns else 'class'
    
    print(f"\nüéØ Class Distributions:")
    print(f"Fraud data - {fraud_target}:")
    print(fraud_data[fraud_target].value_counts())
    print(f"\nCredit card data - {cc_target}:")
    print(creditcard_data[cc_target].value_counts())
    
except Exception as e:
    print(f"‚ùå Error loading data: {e}")
    print("Make sure the data files are in the '../data' directory")


INFO:data_loader:Loading fraud data from ..\data\Fraud_Data.csv


üîÑ Loading datasets for modeling...


INFO:data_loader:Loaded fraud data: 151112 rows, 11 columns
INFO:data_loader:Loading credit card data from ..\data\creditcard.csv
INFO:data_loader:Loaded credit card data: 284807 rows, 31 columns


‚úÖ Fraud data loaded: (151112, 11)
‚úÖ Credit card data loaded: (284807, 31)

üìä Dataset Overview:
Fraud data target column: class
Credit card target column: Class

üéØ Class Distributions:
Fraud data - class:
class
0    136961
1     14151
Name: count, dtype: int64

Credit card data - Class:
Class
0    284315
1       492
Name: count, dtype: int64


In [16]:
# Initialize data splitter
data_splitter = DataSplitter(random_state=42)

print("üîß Preparing datasets for modeling...")

# Prepare both datasets
datasets = data_splitter.prepare_datasets_for_modeling(
    fraud_df=fraud_data,
    creditcard_df=creditcard_data,
    test_size=0.2
)

# Get dataset information
dataset_info = data_splitter.get_dataset_info(datasets)

print("\nüìä PREPARED DATASETS SUMMARY")
print("=" * 60)

for dataset_name, info in dataset_info.items():
    print(f"\n{dataset_name.upper()} Dataset:")
    print(f"  Training samples: {info['train_samples']:,}")
    print(f"  Test samples: {info['test_samples']:,}")
    print(f"  Features: {info['n_features']}")
    print(f"  Train class distribution: {info['train_class_distribution']}")
    print(f"  Test class distribution: {info['test_class_distribution']}")
    print(f"  Train imbalance ratio: {info['train_imbalance_ratio']:.4f}")
    print(f"  Test imbalance ratio: {info['test_imbalance_ratio']:.4f}")

print("\n‚úÖ Data preparation completed!")


INFO:data_splitter:Preparing fraud detection dataset...
INFO:data_splitter:Prepared fraud data: 151112 samples, 2 features
INFO:data_splitter:Features scaled using StandardScaler
INFO:data_splitter:Train-test split completed:
INFO:data_splitter:  Training set: 120889 samples
INFO:data_splitter:  Test set: 30223 samples
INFO:data_splitter:  Training class distribution: {0: np.int64(109568), 1: np.int64(11321)}
INFO:data_splitter:  Test class distribution: {0: np.int64(27393), 1: np.int64(2830)}
INFO:data_splitter:Preparing credit card dataset...


üîß Preparing datasets for modeling...


INFO:data_splitter:Prepared credit card data: 284807 samples, 30 features
INFO:data_splitter:Features scaled using StandardScaler
INFO:data_splitter:Train-test split completed:
INFO:data_splitter:  Training set: 227845 samples
INFO:data_splitter:  Test set: 56962 samples
INFO:data_splitter:  Training class distribution: {0: np.int64(227451), 1: np.int64(394)}
INFO:data_splitter:  Test class distribution: {0: np.int64(56864), 1: np.int64(98)}
INFO:data_splitter:Both datasets prepared for modeling



üìä PREPARED DATASETS SUMMARY

FRAUD Dataset:
  Training samples: 120,889
  Test samples: 30,223
  Features: 2
  Train class distribution: {0: np.int64(109568), 1: np.int64(11321)}
  Test class distribution: {0: np.int64(27393), 1: np.int64(2830)}
  Train imbalance ratio: 0.1033
  Test imbalance ratio: 0.1033

CREDITCARD Dataset:
  Training samples: 227,845
  Test samples: 56,962
  Features: 30
  Train class distribution: {0: np.int64(227451), 1: np.int64(394)}
  Test class distribution: {0: np.int64(56864), 1: np.int64(98)}
  Train imbalance ratio: 0.0017
  Test imbalance ratio: 0.0017

‚úÖ Data preparation completed!


In [17]:
print("üìä EVALUATING MODELS ON TEST SETS")
print("=" * 60)

# Store all evaluation results
evaluation_results = {}

# Evaluate fraud detection models
print("\nüéØ FRAUD DETECTION DATASET - TEST SET EVALUATION")
print("-" * 50)

fraud_trainer = training_results['fraud']['trainer']
X_test_fraud = datasets['fraud']['X_test']
y_test_fraud = datasets['fraud']['y_test']

fraud_eval_results = model_evaluator.evaluate_model_suite(
    fraud_trainer.trained_models, X_test_fraud, y_test_fraud
)

evaluation_results['fraud'] = fraud_eval_results
print(f"‚úÖ Fraud detection evaluation completed!")

# Evaluate credit card models  
print("\nüí≥ CREDIT CARD DATASET - TEST SET EVALUATION")
print("-" * 50)

cc_trainer = training_results['creditcard']['trainer']
X_test_cc = datasets['creditcard']['X_test']
y_test_cc = datasets['creditcard']['y_test']

# Create new evaluator for credit card models
cc_evaluator = ModelEvaluator(figsize=(12, 8))
cc_eval_results = cc_evaluator.evaluate_model_suite(
    cc_trainer.trained_models, X_test_cc, y_test_cc
)

evaluation_results['creditcard'] = cc_eval_results
print(f"‚úÖ Credit card evaluation completed!")

# Display evaluation summaries
print("\nüìà EVALUATION SUMMARY")
print("=" * 60)

for dataset_name, results in evaluation_results.items():
    print(f"\n{dataset_name.upper()} DATASET:")
    print("-" * 30)
    
    for model_name, result in results.items():
        if result['evaluation_successful']:
            metrics = result['metrics']
            print(f"  {model_name}: AUC-PR={metrics['auc_pr']:.4f}, F1={metrics['f1_score']:.4f}")

print("\nüéØ Ready for model explainability analysis!")


üìä EVALUATING MODELS ON TEST SETS

üéØ FRAUD DETECTION DATASET - TEST SET EVALUATION
--------------------------------------------------


NameError: name 'training_results' is not defined

In [None]:
# Plot confusion matrices for best models
print("üìä PLOTTING MODEL PERFORMANCE VISUALIZATIONS")
print("=" * 60)

# Plot confusion matrices for fraud detection
print("\nüéØ FRAUD DETECTION - Confusion Matrices")
model_evaluator.plot_confusion_matrices()

# Plot confusion matrices for credit card  
print("\nüí≥ CREDIT CARD - Confusion Matrices")
cc_evaluator.plot_confusion_matrices()

# Plot ROC curves
print("\nüìà ROC and Precision-Recall Curves")
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))

# Fraud detection curves
model_evaluator.plot_roc_curves(ax=ax1, title="Fraud Detection - ROC Curves")
model_evaluator.plot_precision_recall_curves(ax=ax2, title="Fraud Detection - PR Curves")

# Credit card curves
cc_evaluator.plot_roc_curves(ax=ax3, title="Credit Card - ROC Curves")
cc_evaluator.plot_precision_recall_curves(ax=ax4, title="Credit Card - PR Curves")

plt.tight_layout()
plt.show()

# Get model recommendations
print("\nüèÜ MODEL RECOMMENDATIONS")
print("=" * 60)

fraud_recommendation = model_evaluator.generate_model_recommendation(primary_metric='auc_pr')
cc_recommendation = cc_evaluator.generate_model_recommendation(primary_metric='auc_pr')

print("\nüéØ FRAUD DETECTION DATASET:")
print(f"Best Model: {fraud_recommendation['best_model']}")
print(f"Primary Metric (AUC-PR): {fraud_recommendation['best_score']:.4f}")
print(f"Reasoning: {fraud_recommendation['reasoning']}")

print("\nüí≥ CREDIT CARD DATASET:")
print(f"Best Model: {cc_recommendation['best_model']}")
print(f"Primary Metric (AUC-PR): {cc_recommendation['best_score']:.4f}")
print(f"Reasoning: {cc_recommendation['reasoning']}")

# Store best models for SHAP analysis
best_models = {
    'fraud': {
        'model': fraud_trainer.trained_models[fraud_recommendation['best_model']],
        'model_name': fraud_recommendation['best_model'],
        'X_train': X_train_fraud,
        'X_test': X_test_fraud,
        'y_test': y_test_fraud
    },
    'creditcard': {
        'model': cc_trainer.trained_models[cc_recommendation['best_model']],
        'model_name': cc_recommendation['best_model'],
        'X_train': X_train_cc,
        'X_test': X_test_cc,
        'y_test': y_test_cc
    }
}

print("\n‚úÖ Model evaluation completed! Best models identified for SHAP analysis.")


In [None]:
# Import SHAP explainer module
from model_explainer import ModelExplainer

# Install SHAP if not available
try:
    import shap
    print("‚úÖ SHAP is available")
except ImportError:
    print("Installing SHAP...")
    !pip install shap
    import shap
    print("‚úÖ SHAP installed and imported")

print("\nüîç INITIALIZING SHAP EXPLAINERS")
print("=" * 60)

# Initialize SHAP explainers for best models
explainers = {}

for dataset_name, model_info in best_models.items():
    print(f"\nüìä Setting up SHAP explainer for {dataset_name.upper()} dataset...")
    print(f"   Model: {model_info['model_name']}")
    print(f"   Features: {model_info['X_train'].shape[1]}")
    
    explainer = ModelExplainer(
        model=model_info['model'],
        X_train=model_info['X_train'],
        X_test=model_info['X_test'],
        feature_names=list(model_info['X_train'].columns) if hasattr(model_info['X_train'], 'columns') else None
    )
    
    explainers[dataset_name] = {
        'explainer': explainer,
        'model_name': model_info['model_name'],
        'y_test': model_info['y_test']
    }
    
    print(f"   ‚úÖ SHAP explainer initialized")

print("\nüéØ SHAP explainers ready for both datasets!")


In [None]:
# Calculate SHAP values
print("‚öôÔ∏è CALCULATING SHAP VALUES")
print("=" * 60)

shap_results = {}

for dataset_name, explainer_info in explainers.items():
    print(f"\nüìä Calculating SHAP values for {dataset_name.upper()} dataset...")
    
    explainer = explainer_info['explainer']
    
    # Calculate SHAP values for a sample of test data (for performance)
    sample_size = min(1000, len(explainer.X_test))
    print(f"   Using sample size: {sample_size}")
    
    start_time = time.time()
    shap_values = explainer.calculate_shap_values(sample_size=sample_size, on_test=True)
    calculation_time = time.time() - start_time
    
    print(f"   ‚úÖ SHAP values calculated in {calculation_time:.2f} seconds")
    print(f"   Shape: {shap_values.shape}")
    
    shap_results[dataset_name] = {
        'explainer': explainer,
        'shap_values': shap_values,
        'model_name': explainer_info['model_name'],
        'y_test': explainer_info['y_test']
    }

print("\nüéØ SHAP value calculation completed for all models!")


In [None]:
# Generate SHAP Summary Plots for Global Interpretability
print("üìä GENERATING SHAP SUMMARY PLOTS")
print("=" * 60)

for dataset_name, shap_info in shap_results.items():
    explainer = shap_info['explainer']
    model_name = shap_info['model_name']
    
    print(f"\nüéØ {dataset_name.upper()} Dataset - {model_name.upper()} Model")
    print("-" * 50)
    
    # Summary plot (dot plot) - shows feature importance and feature effects
    print("üìà SHAP Summary Plot (Feature Importance & Effects)")
    explainer.plot_summary(
        plot_type='dot',
        max_display=15,
        title=f'{dataset_name.title()} - {model_name.title()} SHAP Summary'
    )
    
    # Bar plot - shows feature importance only
    print("üìä SHAP Summary Plot (Feature Importance Only)")
    explainer.plot_summary(
        plot_type='bar',
        max_display=15,
        title=f'{dataset_name.title()} - {model_name.title()} Feature Importance'
    )

print("\n‚úÖ Global SHAP analysis completed!")


In [None]:
# Generate local explanations for individual predictions
print("üîç GENERATING LOCAL EXPLANATIONS")
print("=" * 60)

# For each dataset, show explanations for fraud and non-fraud cases
for dataset_name, shap_info in shap_results.items():
    explainer = shap_info['explainer']
    model_name = shap_info['model_name']
    y_test_sample = shap_info['y_test']
    
    print(f"\nüéØ {dataset_name.upper()} Dataset - Local Explanations")
    print("-" * 50)
    
    # Find fraud and non-fraud cases
    if hasattr(y_test_sample, 'iloc'):
        fraud_indices = np.where(y_test_sample.iloc[:len(shap_info['shap_values'])] == 1)[0]
        non_fraud_indices = np.where(y_test_sample.iloc[:len(shap_info['shap_values'])] == 0)[0]
    else:
        fraud_indices = np.where(y_test_sample[:len(shap_info['shap_values'])] == 1)[0]
        non_fraud_indices = np.where(y_test_sample[:len(shap_info['shap_values'])] == 0)[0]
    
    # Show examples if available
    if len(fraud_indices) > 0:
        fraud_idx = fraud_indices[0]
        print(f"üö® FRAUD CASE EXPLANATION (Index: {fraud_idx})")
        
        # Force plot
        explainer.plot_force_plot(fraud_idx, matplotlib=True)
        
        # Waterfall plot
        try:
            explainer.plot_waterfall(fraud_idx)
        except:
            print("   Waterfall plot not available for this model type")
    
    if len(non_fraud_indices) > 0:
        non_fraud_idx = non_fraud_indices[0]
        print(f"‚úÖ NON-FRAUD CASE EXPLANATION (Index: {non_fraud_idx})")
        
        # Force plot
        explainer.plot_force_plot(non_fraud_idx, matplotlib=True)
        
        # Waterfall plot  
        try:
            explainer.plot_waterfall(non_fraud_idx)
        except:
            print("   Waterfall plot not available for this model type")

print("\n‚úÖ Local explanations completed!")


In [None]:
# Analyze feature dependencies and interactions
print("üîó FEATURE DEPENDENCE AND INTERACTION ANALYSIS")
print("=" * 60)

for dataset_name, shap_info in shap_results.items():
    explainer = shap_info['explainer']
    model_name = shap_info['model_name']
    
    print(f"\nüéØ {dataset_name.upper()} Dataset - Feature Dependencies")
    print("-" * 50)
    
    # Get top features for dependence analysis
    feature_importance = explainer.get_feature_importance()
    top_features = feature_importance.head(5)['feature'].tolist()
    
    print(f"üìä Top 5 features for dependence analysis: {top_features}")
    
    # Create dependence plots for top features
    for i, feature in enumerate(top_features[:3]):  # Show top 3 to avoid too many plots
        print(f"\nüìà Dependence plot for: {feature}")
        try:
            explainer.plot_dependence(feature)
        except Exception as e:
            print(f"   Could not create dependence plot for {feature}: {e}")
    
    # Show feature importance table
    print(f"\nüìã Feature Importance Rankings:")
    print(feature_importance.head(10).to_string(index=False))

print("\n‚úÖ Feature dependence analysis completed!")


In [None]:
# Comprehensive fraud driver analysis
print("üïµÔ∏è COMPREHENSIVE FRAUD DRIVER ANALYSIS")
print("=" * 60)

fraud_insights = {}

for dataset_name, shap_info in shap_results.items():
    explainer = shap_info['explainer']
    model_name = shap_info['model_name']
    y_test_sample = shap_info['y_test']
    
    print(f"\nüéØ {dataset_name.upper()} Dataset Analysis")
    print("-" * 50)
    
    # Analyze fraud drivers
    sample_size = len(shap_info['shap_values'])
    if hasattr(y_test_sample, 'iloc'):
        y_sample = y_test_sample.iloc[:sample_size]
    else:
        y_sample = y_test_sample[:sample_size]
    
    analysis = explainer.analyze_fraud_drivers(y_test=y_sample, top_features=10)
    fraud_insights[dataset_name] = analysis
    
    # Display key findings
    print("üîç KEY FINDINGS:")
    print("‚îÄ" * 40)
    
    print("\nüìä TOP FEATURE IMPORTANCE:")
    for idx, row in analysis['top_features'].head(5).iterrows():
        print(f"  {idx+1}. {row['feature']}: {row['importance']:.4f}")
    
    if 'fraud_drivers' in analysis:
        print("\nüö® TOP FRAUD DRIVERS (vs Non-Fraud):")
        for idx, row in analysis['fraud_drivers'].head(5).iterrows():
            direction = "‚Üë" if row['fraud_contribution_diff'] > 0 else "‚Üì"
            print(f"  {idx+1}. {row['feature']}: {direction} {abs(row['fraud_contribution_diff']):.4f}")
    
    print("\nüí° INSIGHTS:")
    for insight in analysis['overall_insights']:
        print(f"  ‚Ä¢ {insight}")
    
    if 'fraud_specific_insights' in analysis:
        for insight in analysis['fraud_specific_insights']:
            print(f"  ‚Ä¢ {insight}")

print("\n‚úÖ Fraud driver analysis completed!")


In [None]:
# Generate comprehensive explanation reports
print("üìù GENERATING COMPREHENSIVE SHAP REPORTS")
print("=" * 60)

reports = {}

for dataset_name, shap_info in shap_results.items():
    explainer = shap_info['explainer']
    model_name = shap_info['model_name']
    y_test_sample = shap_info['y_test']
    
    # Generate report
    sample_size = len(shap_info['shap_values'])
    if hasattr(y_test_sample, 'iloc'):
        y_sample = y_test_sample.iloc[:sample_size]
    else:
        y_sample = y_test_sample[:sample_size]
    
    report = explainer.generate_explanation_report(
        y_test=y_sample,
        dataset_name=f"{dataset_name.title()} ({model_name.title()})"
    )
    
    reports[dataset_name] = report
    
    print(f"\n{report}")
    print("\n" + "="*80)

# Summary of key fraud drivers across datasets
print("\nüéØ CROSS-DATASET FRAUD DRIVER SUMMARY")
print("=" * 60)

print("\nüìä Key Insights from SHAP Analysis:")
print("-" * 40)

if 'fraud' in fraud_insights and 'creditcard' in fraud_insights:
    fraud_analysis = fraud_insights['fraud']
    cc_analysis = fraud_insights['creditcard']
    
    print("\nüîç FRAUD DETECTION DATASET (E-commerce):")
    if 'fraud_drivers' in fraud_analysis:
        top_fraud_driver = fraud_analysis['fraud_drivers'].iloc[0]
        print(f"  ‚Ä¢ Primary fraud driver: {top_fraud_driver['feature']}")
        print(f"  ‚Ä¢ Impact: {top_fraud_driver['fraud_contribution_diff']:.4f}")
    
    print("\nüîç CREDIT CARD DATASET (Bank transactions):")
    if 'fraud_drivers' in cc_analysis:
        top_cc_driver = cc_analysis['fraud_drivers'].iloc[0]
        print(f"  ‚Ä¢ Primary fraud driver: {top_cc_driver['feature']}")
        print(f"  ‚Ä¢ Impact: {top_cc_driver['fraud_contribution_diff']:.4f}")

print("\nüéâ TASK 3 - MODEL EXPLAINABILITY COMPLETED!")
print("=" * 60)
print("‚úÖ SHAP analysis provides comprehensive model interpretability")
print("‚úÖ Global and local explanations generated")
print("‚úÖ Key fraud drivers identified")
print("‚úÖ Feature interactions analyzed")
print("‚úÖ Actionable insights for fraud prevention strategies")


In [None]:
# Initialize model builder
model_builder = ModelBuilder(random_state=42)

print("üèóÔ∏è BUILDING MODEL SUITE")
print("=" * 50)

# Create model suite with required models
models_to_include = ['logistic_regression', 'random_forest', 'lightgbm']

print("Creating models...")
models = model_builder.create_model_suite(include_models=models_to_include)

print(f"\n‚úÖ Created {len(models)} models:")
for model_name in models.keys():
    print(f"  ‚Ä¢ {model_name}")

# Get model information
model_info = model_builder.get_model_info()

print(f"\nüìã MODEL CHARACTERISTICS:")
for model_name, info in model_info.items():
    print(f"\n{model_name.upper()}:")
    characteristics = info['suitable_for']
    for key, value in characteristics.items():
        print(f"  {key}: {value}")

print("\nüéØ Model Selection Rationale:")
print("‚Ä¢ Logistic Regression: Interpretable baseline model with good performance on imbalanced data")
print("‚Ä¢ Random Forest: Robust ensemble method with feature importance")  
print("‚Ä¢ LightGBM: High-performance gradient boosting optimized for imbalanced datasets")


INFO:model_builder:Created Logistic Regression with params: {'random_state': 42, 'max_iter': 1000, 'class_weight': 'balanced', 'solver': 'liblinear'}
INFO:model_builder:Created Random Forest with params: {'n_estimators': 100, 'random_state': 42, 'class_weight': 'balanced', 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'n_jobs': -1}
INFO:model_builder:Created model suite with 2 models: ['logistic_regression', 'random_forest']


üèóÔ∏è BUILDING MODEL SUITE
Creating models...

‚úÖ Created 2 models:
  ‚Ä¢ logistic_regression
  ‚Ä¢ random_forest

üìã MODEL CHARACTERISTICS:

LOGISTIC_REGRESSION:
  interpretability: High
  training_speed: Fast
  prediction_speed: Very Fast
  memory_usage: Low
  handling_imbalance: Good with class_weight
  best_for: Baseline, interpretable models

RANDOM_FOREST:
  interpretability: Medium
  training_speed: Medium
  prediction_speed: Fast
  memory_usage: Medium
  handling_imbalance: Good with class_weight
  best_for: Robust performance, feature importance

üéØ Model Selection Rationale:
‚Ä¢ Logistic Regression: Interpretable baseline model with good performance on imbalanced data
‚Ä¢ Random Forest: Robust ensemble method with feature importance
‚Ä¢ LightGBM: High-performance gradient boosting optimized for imbalanced datasets


In [None]:
# Initialize model trainer
model_trainer = ModelTrainer(random_state=42)

print("üöÄ TRAINING MODELS ON BOTH DATASETS")
print("=" * 60)

# Store results for both datasets
training_results = {}

# Train on fraud detection dataset
print("\nüéØ FRAUD DETECTION DATASET")
print("-" * 40)

fraud_data_info = dataset_info['fraud']
X_train_fraud = datasets['fraud']['X_train']
y_train_fraud = datasets['fraud']['y_train']

# Optimize models for fraud dataset imbalance
fraud_models = model_builder.optimize_for_imbalanced_data(
    models.copy(), 
    fraud_data_info['train_imbalance_ratio']
)

# Train and cross-validate fraud models
fraud_results = model_trainer.train_and_evaluate_suite(
    fraud_models, X_train_fraud, y_train_fraud, cv_folds=5
)

training_results['fraud'] = {
    'results': fraud_results,
    'models': fraud_models,
    'trainer': model_trainer
}

print(f"\n‚úÖ Fraud detection model training completed!")

# Create new trainer for credit card dataset
cc_trainer = ModelTrainer(random_state=42)

print("\nüí≥ CREDIT CARD DATASET")
print("-" * 40)

cc_data_info = dataset_info['creditcard']
X_train_cc = datasets['creditcard']['X_train']
y_train_cc = datasets['creditcard']['y_train']

# Optimize models for credit card dataset imbalance  
cc_models = model_builder.optimize_for_imbalanced_data(
    models.copy(),
    cc_data_info['train_imbalance_ratio']
)

# Train and cross-validate credit card models
cc_results = cc_trainer.train_and_evaluate_suite(
    cc_models, X_train_cc, y_train_cc, cv_folds=5
)

training_results['creditcard'] = {
    'results': cc_results,
    'models': cc_models,
    'trainer': cc_trainer
}

print(f"\n‚úÖ Credit card model training completed!")


INFO:model_builder:Optimized 2 models for imbalanced data
INFO:model_trainer:Training 2 models...
INFO:model_trainer:Training logistic_regression...
INFO:model_trainer:‚úÖ logistic_regression trained successfully in 0.06 seconds
INFO:model_trainer:Training random_forest...


üöÄ TRAINING MODELS ON BOTH DATASETS

üéØ FRAUD DETECTION DATASET
----------------------------------------


INFO:model_trainer:‚úÖ random_forest trained successfully in 1.98 seconds
INFO:model_trainer:Training completed:
INFO:model_trainer:  ‚úÖ Successful: ['logistic_regression', 'random_forest']
INFO:model_trainer:Performing 5-fold cross-validation...
INFO:model_trainer:Cross-validating logistic_regression...
INFO:model_trainer:  logistic_regression - AUC-ROC: 0.5024, AUC-PR: 0.0935, F1: 0.1557
INFO:model_trainer:Cross-validating random_forest...
INFO:model_trainer:  random_forest - AUC-ROC: 0.6968, AUC-PR: 0.1465, F1: 0.2762
INFO:model_trainer:Cross-validation completed
INFO:model_builder:Optimized 2 models for imbalanced data
INFO:model_trainer:Training 2 models...
INFO:model_trainer:Training logistic_regression...



‚úÖ Fraud detection model training completed!

üí≥ CREDIT CARD DATASET
----------------------------------------


INFO:model_trainer:‚úÖ logistic_regression trained successfully in 4.37 seconds
INFO:model_trainer:Training random_forest...
INFO:model_trainer:‚úÖ random_forest trained successfully in 42.21 seconds
INFO:model_trainer:Training completed:
INFO:model_trainer:  ‚úÖ Successful: ['logistic_regression', 'random_forest']
INFO:model_trainer:Performing 5-fold cross-validation...
INFO:model_trainer:Cross-validating logistic_regression...
INFO:model_trainer:  logistic_regression - AUC-ROC: 0.9825, AUC-PR: 0.0575, F1: 0.1179
INFO:model_trainer:Cross-validating random_forest...
INFO:model_trainer:  random_forest - AUC-ROC: 0.9749, AUC-PR: 0.6813, F1: 0.8233
INFO:model_trainer:Cross-validation completed



‚úÖ Credit card model training completed!


In [None]:
# Initialize model evaluator
model_evaluator = ModelEvaluator(figsize=(12, 8))
