In [None]:
# Import necessary libraries
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score
from sklearn.metrics import precision_recall_curve, roc_curve
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

# Gradient-boosting libraries are optional: record whether each import
# succeeded so the corresponding model cell can be skipped when it did not.
try:
    from xgboost import XGBClassifier
except ImportError:
    XGBOOST_AVAILABLE = False
    print("XGBoost not available")
else:
    XGBOOST_AVAILABLE = True

try:
    from lightgbm import LGBMClassifier
except ImportError:
    LIGHTGBM_AVAILABLE = False
    print("LightGBM not available")
else:
    LIGHTGBM_AVAILABLE = True

# Plot defaults for the notebook: seaborn "whitegrid" theme and a (12, 8) default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8)})

In [None]:
# Load the preprocessed features, labels and feature names from disk.
# joblib.load unpickles its input, so these files must come from a trusted source.
# NOTE(review): the "_smote" suffix suggests the arrays were resampled before being
# saved — TODO confirm; if so, any later train/test split contains synthetic samples
# on both sides and the reported test metrics will be optimistic.
X_fraud, y_fraud, fraud_feature_names = (
    joblib.load(artifact_path)
    for artifact_path in (
        '../data/processed/X_fraud_smote.pkl',
        '../data/processed/y_fraud_smote.pkl',
        '../data/processed/fraud_feature_names.pkl',
    )
)

X_credit, y_credit = (
    joblib.load(artifact_path)
    for artifact_path in (
        '../data/processed/X_credit_smote.pkl',
        '../data/processed/y_credit_smote.pkl',
    )
)

# Quick sanity check on what was loaded
print("Fraud data shape:", X_fraud.shape)
print("Credit data shape:", X_credit.shape)

In [None]:
# Function to evaluate models
def evaluate_model(y_true, y_pred, y_pred_proba):
    """Print a classification summary and return the two ranking metrics.

    Prints the scikit-learn classification report and confusion matrix for the
    hard predictions, followed by the average-precision (AUC-PR) and ROC-AUC
    values computed from the predicted scores.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, used for the report and confusion matrix.
        y_pred_proba: Predicted scores, used for AUC-PR and AUC-ROC.

    Returns:
        Tuple ``(auc_pr, auc_roc)``.
    """
    # Label-based diagnostics
    print("Classification Report:")
    report_text = classification_report(y_true, y_pred)
    print(report_text)

    print("\nConfusion Matrix:")
    print(confusion_matrix(y_true, y_pred))

    # Score-based metrics: these take the predicted scores, not the hard labels
    pr_auc = average_precision_score(y_true, y_pred_proba)
    roc_auc = roc_auc_score(y_true, y_pred_proba)

    print(f"\nAUC-PR: {pr_auc:.4f}")
    print(f"AUC-ROC: {roc_auc:.4f}")

    return pr_auc, roc_auc

## Fraud Data Modeling

In [None]:
# Hold out 20% of the fraud data for testing; stratify on the label so both
# parts keep the same class proportions, and fix the seed for repeatability.
X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = train_test_split(
    X_fraud,
    y_fraud,
    stratify=y_fraud,
    test_size=0.2,
    random_state=42,
)

# Report the resulting partition sizes
print("Fraud train shape:", X_train_fraud.shape)
print("Fraud test shape:", X_test_fraud.shape)

In [None]:
# Baseline model: logistic regression fitted on the fraud training split
# (fit() returns the estimator itself, so construction and fitting are chained)
lr_model = LogisticRegression(max_iter=1000, random_state=42).fit(
    X_train_fraud, y_train_fraud
)

# Hard labels for the report; the second predict_proba column is the score
# handed to the AUC metrics
lr_pred = lr_model.predict(X_test_fraud)
lr_pred_proba = lr_model.predict_proba(X_test_fraud)[:, 1]

print("Logistic Regression Results:")
lr_auc_pr, lr_auc_roc = evaluate_model(y_test_fraud, lr_pred, lr_pred_proba)

In [None]:
# Random forest with 100 trees on the fraud training split; n_jobs=-1 uses all cores
# (fit() returns the estimator itself, so construction and fitting are chained)
rf_model = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
).fit(X_train_fraud, y_train_fraud)

# Hard labels for the report; the second predict_proba column is the score
# handed to the AUC metrics
rf_pred = rf_model.predict(X_test_fraud)
rf_pred_proba = rf_model.predict_proba(X_test_fraud)[:, 1]

print("Random Forest Results:")
rf_auc_pr, rf_auc_roc = evaluate_model(y_test_fraud, rf_pred, rf_pred_proba)

In [None]:
# XGBoost on the fraud data — runs only when the optional import succeeded
if XGBOOST_AVAILABLE:
    xgb_model = XGBClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    xgb_model.fit(X_train_fraud, y_train_fraud)

    # Hard labels for the report; second predict_proba column for the AUC metrics
    xgb_pred = xgb_model.predict(X_test_fraud)
    xgb_pred_proba = xgb_model.predict_proba(X_test_fraud)[:, 1]

    print("XGBoost Results:")
    xgb_auc_pr, xgb_auc_roc = evaluate_model(y_test_fraud, xgb_pred, xgb_pred_proba)
else:
    # The model was never evaluated. Use NaN rather than 0 so the comparison
    # table shows a missing value instead of what would read as a real,
    # worst-possible score.
    xgb_auc_pr, xgb_auc_roc = np.nan, np.nan

In [None]:
# LightGBM on the fraud data — runs only when the optional import succeeded
if LIGHTGBM_AVAILABLE:
    lgb_model = LGBMClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    lgb_model.fit(X_train_fraud, y_train_fraud)

    # Hard labels for the report; second predict_proba column for the AUC metrics
    lgb_pred = lgb_model.predict(X_test_fraud)
    lgb_pred_proba = lgb_model.predict_proba(X_test_fraud)[:, 1]

    print("LightGBM Results:")
    lgb_auc_pr, lgb_auc_roc = evaluate_model(y_test_fraud, lgb_pred, lgb_pred_proba)
else:
    # The model was never evaluated. Use NaN rather than 0 so the comparison
    # table shows a missing value instead of what would read as a real,
    # worst-possible score.
    lgb_auc_pr, lgb_auc_roc = np.nan, np.nan

In [None]:
# Collect the fraud test-set metrics of every model into one table.
# The three lists are kept positionally aligned: entry i of each list belongs
# to the same model.
models_fraud = ['Logistic Regression', 'Random Forest', 'XGBoost', 'LightGBM']
auc_pr_scores = [lr_auc_pr, rf_auc_pr, xgb_auc_pr, lgb_auc_pr]
auc_roc_scores = [lr_auc_roc, rf_auc_roc, xgb_auc_roc, lgb_auc_roc]

# Start from the model names, then attach one metric column per score list
comparison_df = pd.DataFrame({'Model': models_fraud}).assign(
    **{
        'AUC-PR': auc_pr_scores,
        'AUC-ROC': auc_roc_scores,
    }
)

print("Fraud Detection Model Comparison:")
print(comparison_df)

## Credit Card Data Modeling

In [None]:
# Hold out 20% of the credit card data for testing; stratify on the label so
# both parts keep the same class proportions, and fix the seed for repeatability.
X_train_credit, X_test_credit, y_train_credit, y_test_credit = train_test_split(
    X_credit,
    y_credit,
    stratify=y_credit,
    test_size=0.2,
    random_state=42,
)

# Report the resulting partition sizes
print("Credit train shape:", X_train_credit.shape)
print("Credit test shape:", X_test_credit.shape)

In [None]:
# Credit card data: same modelling steps as used for the fraud data.
# Logistic regression baseline (fit() returns the estimator, so the calls are chained)
lr_credit = LogisticRegression(max_iter=1000, random_state=42).fit(
    X_train_credit, y_train_credit
)

# Hard labels for the report; second predict_proba column for the AUC metrics
lr_credit_pred = lr_credit.predict(X_test_credit)
lr_credit_pred_proba = lr_credit.predict_proba(X_test_credit)[:, 1]

print("Credit Card - Logistic Regression:")
lr_credit_auc_pr, lr_credit_auc_roc = evaluate_model(y_test_credit, lr_credit_pred, lr_credit_pred_proba)

In [None]:
# Random forest with 100 trees on the credit card training split
# (fit() returns the estimator, so construction and fitting are chained)
rf_credit = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
).fit(X_train_credit, y_train_credit)

# Hard labels for the report; second predict_proba column for the AUC metrics
rf_credit_pred = rf_credit.predict(X_test_credit)
rf_credit_pred_proba = rf_credit.predict_proba(X_test_credit)[:, 1]

print("Credit Card - Random Forest:")
rf_credit_auc_pr, rf_credit_auc_roc = evaluate_model(y_test_credit, rf_credit_pred, rf_credit_pred_proba)

In [None]:
# Persist the fitted Random Forest for each dataset.
# Random Forest is *assumed* to be the best performer here; the choice is not
# derived from the metrics computed in this notebook.
#
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists first (otherwise a fresh checkout fails with
# FileNotFoundError).
Path('../models').mkdir(parents=True, exist_ok=True)

joblib.dump(rf_model, '../models/fraud_rf_model.pkl')
joblib.dump(rf_credit, '../models/credit_rf_model.pkl')

print("Best models saved!")

In [None]:
# 5-fold stratified cross-validation of the fraud Random Forest, scored with
# average precision (AUC-PR). cross_val_score fits clones of the estimator, so
# the already-fitted rf_model is left as it is.
# NOTE(review): this runs on the full X_fraud / y_fraud loaded from the "*_smote"
# files — if those are already oversampled, the folds share synthetic samples
# and the scores will be optimistic; TODO confirm against the preprocessing step.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    rf_model,
    X_fraud,
    y_fraud,
    scoring='average_precision',
    cv=cv,
)

# Per-fold scores followed by their mean and spread
print("Cross-validation AUC-PR scores:", cv_scores)
print("Mean AUC-PR:", np.mean(cv_scores))
print("Std AUC-PR:", np.std(cv_scores))