---
## 1. Import Libraries & Load Data

In [None]:
# Data manipulation
import numpy as np
import pandas as pd

# Machine Learning Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

# Evaluation Metrics
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix, classification_report, roc_curve, auc
)

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Utilities
import joblib
import warnings
warnings.filterwarnings('ignore')

# Display settings
%matplotlib inline
sns.set_style('whitegrid')

print("✅ Libraries imported successfully!")

In [None]:
# TODO: Load preprocessed data
# X_train = pd.read_csv('../data/X_train.csv')
# X_test = pd.read_csv('../data/X_test.csv')
# y_train = pd.read_csv('../data/y_train.csv').values.ravel()
# y_test = pd.read_csv('../data/y_test.csv').values.ravel()

# print(f"Train set: {X_train.shape}")
# print(f"Test set: {X_test.shape}")

---
## 2. Model 1: Logistic Regression

Baseline linear model with L2 regularization.

In [None]:
# TODO: Train Logistic Regression
# print("Training Logistic Regression...")
# lr_model = LogisticRegression(
#     max_iter=1000,
#     random_state=42,
#     solver='lbfgs',
#     C=1.0  # Inverse of regularization strength (smaller C = stronger L2 penalty)
# )

# lr_model.fit(X_train, y_train)
# print("✅ Logistic Regression trained")

In [None]:
# TODO: Predictions and probability estimates
# y_pred_lr = lr_model.predict(X_test)
# y_proba_lr = lr_model.predict_proba(X_test)[:, 1]  # Probability of class 1

# # Compute metrics
# lr_metrics = {
#     'Model': 'Logistic Regression',
#     'Accuracy': accuracy_score(y_test, y_pred_lr),
#     'Precision': precision_score(y_test, y_pred_lr),
#     'Recall': recall_score(y_test, y_pred_lr),
#     'F1-Score': f1_score(y_test, y_pred_lr),
#     'ROC-AUC': roc_auc_score(y_test, y_proba_lr)
# }

# print("\nLogistic Regression Metrics:")
# for key, value in lr_metrics.items():
#     if key != 'Model':
#         print(f"{key}: {value:.4f}")

---
## 3. Model 2: Support Vector Classifier (RBF Kernel)

In [None]:
# TODO: Train SVC with RBF kernel
# print("Training SVC (RBF kernel)...")
# # Note: For large datasets, consider training on a subset or adjusting the parameters
# svc_model = SVC(
#     kernel='rbf',
#     C=1.0,
#     gamma='scale',
#     probability=True,  # Enable probability estimates
#     random_state=42
# )

# svc_model.fit(X_train, y_train)
# print("✅ SVC (RBF) trained")

In [None]:
# TODO: Predictions and metrics
# y_pred_svc = svc_model.predict(X_test)
# y_proba_svc = svc_model.predict_proba(X_test)[:, 1]

# svc_metrics = {
#     'Model': 'SVC (RBF)',
#     'Accuracy': accuracy_score(y_test, y_pred_svc),
#     'Precision': precision_score(y_test, y_pred_svc),
#     'Recall': recall_score(y_test, y_pred_svc),
#     'F1-Score': f1_score(y_test, y_pred_svc),
#     'ROC-AUC': roc_auc_score(y_test, y_proba_svc)
# }

# print("\nSVC (RBF) Metrics:")
# for key, value in svc_metrics.items():
#     if key != 'Model':
#         print(f"{key}: {value:.4f}")

---
## 4. Model 3: Artificial Neural Network (MLP)

In [None]:
# TODO: Train MLP Classifier
# print("Training ANN (MLP)...")
# mlp_model = MLPClassifier(
#     hidden_layer_sizes=(128, 64, 32),  # 3 hidden layers
#     activation='relu',
#     solver='adam',
#     alpha=0.0001,  # L2 regularization
#     batch_size=256,
#     learning_rate='adaptive',  # NOTE: only takes effect with solver='sgd'; ignored by 'adam'
#     learning_rate_init=0.001,
#     max_iter=200,
#     random_state=42,
#     early_stopping=True,
#     validation_fraction=0.1,
#     verbose=False
# )

# mlp_model.fit(X_train, y_train)
# print("✅ ANN (MLP) trained")
# print(f"Iterations: {mlp_model.n_iter_}")

In [None]:
# TODO: Predictions and metrics
# y_pred_mlp = mlp_model.predict(X_test)
# y_proba_mlp = mlp_model.predict_proba(X_test)[:, 1]

# mlp_metrics = {
#     'Model': 'ANN (MLP)',
#     'Accuracy': accuracy_score(y_test, y_pred_mlp),
#     'Precision': precision_score(y_test, y_pred_mlp),
#     'Recall': recall_score(y_test, y_pred_mlp),
#     'F1-Score': f1_score(y_test, y_pred_mlp),
#     'ROC-AUC': roc_auc_score(y_test, y_proba_mlp)
# }

# print("\nANN (MLP) Metrics:")
# for key, value in mlp_metrics.items():
#     if key != 'Model':
#         print(f"{key}: {value:.4f}")

---
## 5. Model 4: PCA + SVC

Dimensionality reduction followed by SVM classification.

In [None]:
# TODO: Create PCA + SVC pipeline
# print("Training PCA + SVC pipeline...")
# pca_svc_pipeline = Pipeline([
#     ('pca', PCA(n_components=0.95, random_state=42)),  # Keep 95% variance
#     ('svc', SVC(kernel='rbf', C=1.0, gamma='scale', probability=True, random_state=42))
# ])

# pca_svc_pipeline.fit(X_train, y_train)
# print("✅ PCA + SVC trained")
# print(f"PCA components: {pca_svc_pipeline.named_steps['pca'].n_components_}")

In [None]:
# TODO: Predictions and metrics
# y_pred_pca_svc = pca_svc_pipeline.predict(X_test)
# y_proba_pca_svc = pca_svc_pipeline.predict_proba(X_test)[:, 1]

# pca_svc_metrics = {
#     'Model': 'PCA + SVC',
#     'Accuracy': accuracy_score(y_test, y_pred_pca_svc),
#     'Precision': precision_score(y_test, y_pred_pca_svc),
#     'Recall': recall_score(y_test, y_pred_pca_svc),
#     'F1-Score': f1_score(y_test, y_pred_pca_svc),
#     'ROC-AUC': roc_auc_score(y_test, y_proba_pca_svc)
# }

# print("\nPCA + SVC Metrics:")
# for key, value in pca_svc_metrics.items():
#     if key != 'Model':
#         print(f"{key}: {value:.4f}")

---
## 6. Parity Plots (Predicted Probability vs Actual Class)

Visualize how well predicted probabilities align with actual classes.

In [None]:
# TODO: Create parity plots for all models
# fig, axes = plt.subplots(2, 2, figsize=(16, 12))
# axes = axes.ravel()

# models_data = [
#     ('Logistic Regression', y_proba_lr),
#     ('SVC (RBF)', y_proba_svc),
#     ('ANN (MLP)', y_proba_mlp),
#     ('PCA + SVC', y_proba_pca_svc)
# ]

# for idx, (model_name, y_proba) in enumerate(models_data):
#     # Scatter of predicted probability vs. actual class (no jitter is applied, so points stack at each class value)
#     axes[idx].scatter(y_test, y_proba, alpha=0.3, s=10)
#     
#     # Add reference line (perfect prediction)
#     axes[idx].plot([0, 1], [0, 1], 'r--', linewidth=2, label='Perfect Prediction')
#     
#     axes[idx].set_xlabel('Actual Class', fontsize=12, fontweight='bold')
#     axes[idx].set_ylabel('Predicted Probability', fontsize=12, fontweight='bold')
#     axes[idx].set_title(f'{model_name} - Parity Plot', fontsize=14, fontweight='bold')
#     axes[idx].legend()
#     axes[idx].grid(True, alpha=0.3)
#     axes[idx].set_xlim([-0.1, 1.1])
#     axes[idx].set_ylim([-0.1, 1.1])

# plt.tight_layout()
# plt.savefig('../outputs/parity_plots.png', dpi=300, bbox_inches='tight')
# plt.show()

---
## 7. Comprehensive Metrics Comparison

In [None]:
# TODO: Compile all metrics into comparison table
# results_df = pd.DataFrame([lr_metrics, svc_metrics, mlp_metrics, pca_svc_metrics])
# results_df = results_df.set_index('Model')
# print("\n" + "="*70)
# print("MODEL PERFORMANCE COMPARISON")
# print("="*70)
# print(results_df.round(4))
# print("="*70)

In [None]:
# TODO: Visualize metrics comparison
# metrics_to_plot = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC']
# results_plot = results_df[metrics_to_plot]

# fig, ax = plt.subplots(figsize=(14, 8))
# results_plot.T.plot(kind='bar', ax=ax, width=0.8)
# ax.set_xlabel('Metrics', fontsize=13, fontweight='bold')
# ax.set_ylabel('Score', fontsize=13, fontweight='bold')
# ax.set_title('Model Performance Comparison', fontsize=16, fontweight='bold')
# ax.legend(title='Models', fontsize=11)
# ax.set_ylim([0.5, 1.0])
# ax.grid(True, axis='y', alpha=0.3)
# plt.xticks(rotation=0)
# plt.tight_layout()
# plt.savefig('../outputs/metrics_comparison.png', dpi=300, bbox_inches='tight')
# plt.show()

In [None]:
# TODO: ROC Curves comparison
# plt.figure(figsize=(10, 8))

# models_proba = [
#     ('Logistic Regression', y_proba_lr),
#     ('SVC (RBF)', y_proba_svc),
#     ('ANN (MLP)', y_proba_mlp),
#     ('PCA + SVC', y_proba_pca_svc)
# ]

# for model_name, y_proba in models_proba:
#     fpr, tpr, _ = roc_curve(y_test, y_proba)
#     roc_auc = auc(fpr, tpr)
#     plt.plot(fpr, tpr, linewidth=2, label=f'{model_name} (AUC = {roc_auc:.4f})')

# plt.plot([0, 1], [0, 1], 'k--', linewidth=2, label='Random Classifier')
# plt.xlabel('False Positive Rate', fontsize=13, fontweight='bold')
# plt.ylabel('True Positive Rate', fontsize=13, fontweight='bold')
# plt.title('ROC Curves - All Models', fontsize=16, fontweight='bold')
# plt.legend(loc='lower right', fontsize=11)
# plt.grid(True, alpha=0.3)
# plt.tight_layout()
# plt.savefig('../outputs/roc_curves.png', dpi=300, bbox_inches='tight')
# plt.show()

---
## 8. Confusion Matrices

In [None]:
# TODO: Plot confusion matrices for all models
# fig, axes = plt.subplots(2, 2, figsize=(14, 12))
# axes = axes.ravel()

# predictions = [
#     ('Logistic Regression', y_pred_lr),
#     ('SVC (RBF)', y_pred_svc),
#     ('ANN (MLP)', y_pred_mlp),
#     ('PCA + SVC', y_pred_pca_svc)
# ]

# for idx, (model_name, y_pred) in enumerate(predictions):
#     cm = confusion_matrix(y_test, y_pred)
#     sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[idx],
#                 xticklabels=['Normal', 'Attack'],
#                 yticklabels=['Normal', 'Attack'])
#     axes[idx].set_title(f'{model_name}', fontsize=14, fontweight='bold')
#     axes[idx].set_xlabel('Predicted', fontsize=12)
#     axes[idx].set_ylabel('Actual', fontsize=12)

# plt.tight_layout()
# plt.savefig('../outputs/confusion_matrices.png', dpi=300, bbox_inches='tight')
# plt.show()

---
## 9. Save Trained Models

In [None]:
# TODO: Save all trained models
# joblib.dump(lr_model, '../models/logistic_regression.pkl')
# joblib.dump(svc_model, '../models/svc_rbf.pkl')
# joblib.dump(mlp_model, '../models/ann_mlp.pkl')
# joblib.dump(pca_svc_pipeline, '../models/pca_svc.pkl')

# print("✅ All models saved to /models directory")

---
## Summary

### Models Trained:
1. ✅ Logistic Regression
2. ✅ SVC (RBF Kernel)
3. ✅ ANN (Multi-layer Perceptron)
4. ✅ PCA + SVC

### Key Findings:
- **Best Model (by ROC-AUC):** [To be filled after training]
- **Best Model (by F1-Score):** [To be filled after training]
- **Fastest Training:** [To be filled after training]

### Next Steps:
- Perform 10-fold cross-validation for robust performance estimates
- Plot learning curves to diagnose bias-variance tradeoff
- Analyze feature importance

---
**Proceed to:** `04_cross_validation_analysis.ipynb`