---
## 1. Import Libraries & Load Data

In [None]:
# Data manipulation
import numpy as np
import pandas as pd

# Machine Learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

# Cross-validation
from sklearn.model_selection import (
    cross_val_score, cross_validate, StratifiedKFold, learning_curve
)

# Metrics
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Utilities
import joblib
from time import time
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
sns.set_style('whitegrid')

print("‚úÖ Libraries imported successfully!")

In [None]:
# TODO: Load preprocessed data
# X_train = pd.read_csv('../data/X_train.csv')
# X_test = pd.read_csv('../data/X_test.csv')
# y_train = pd.read_csv('../data/y_train.csv').values.ravel()
# y_test = pd.read_csv('../data/y_test.csv').values.ravel()

# print(f"Train set: {X_train.shape}")
# print(f"Test set: {X_test.shape}")

---
## 2. 10-Fold Stratified Cross-Validation

Evaluate model performance using k-fold cross-validation for robust estimates.

In [None]:
# TODO: Define models
# models = {
#     'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
#     'SVC (RBF)': SVC(kernel='rbf', C=1.0, gamma='scale', probability=True, random_state=42),
#     'ANN (MLP)': MLPClassifier(
#         hidden_layer_sizes=(128, 64, 32),
#         activation='relu',
#         solver='adam',
#         max_iter=200,
#         random_state=42,
#         early_stopping=True
#     ),
#     'PCA + SVC': Pipeline([
#         ('pca', PCA(n_components=0.95, random_state=42)),
#         ('svc', SVC(kernel='rbf', probability=True, random_state=42))
#     ])
# }

In [None]:
# TODO: Define scoring metrics
# scoring = {
#     'accuracy': 'accuracy',
#     'precision': make_scorer(precision_score),
#     'recall': make_scorer(recall_score),
#     'f1': make_scorer(f1_score),
#     'roc_auc': 'roc_auc'
# }

# # Define cross-validation strategy
# cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

In [None]:
# TODO: Perform cross-validation for all models
# cv_results = {}

# for model_name, model in models.items():
#     print(f"\nPerforming 10-fold CV for {model_name}...")
#     start_time = time()
#     
#     # Perform cross-validation
#     scores = cross_validate(
#         model, X_train, y_train,
#         cv=cv,
#         scoring=scoring,
#         return_train_score=True,
#         n_jobs=-1  # Use all available cores
#     )
#     
#     elapsed_time = time() - start_time
#     
#     # Store results
#     cv_results[model_name] = {
#         'accuracy_mean': scores['test_accuracy'].mean(),
#         'accuracy_std': scores['test_accuracy'].std(),
#         'precision_mean': scores['test_precision'].mean(),
#         'precision_std': scores['test_precision'].std(),
#         'recall_mean': scores['test_recall'].mean(),
#         'recall_std': scores['test_recall'].std(),
#         'f1_mean': scores['test_f1'].mean(),
#         'f1_std': scores['test_f1'].std(),
#         'roc_auc_mean': scores['test_roc_auc'].mean(),
#         'roc_auc_std': scores['test_roc_auc'].std(),
#         'cv_time': elapsed_time
#     }
#     
#     print(f"✅ Completed in {elapsed_time:.2f} seconds")
#     print(f"   Accuracy: {cv_results[model_name]['accuracy_mean']:.4f} (+/- {cv_results[model_name]['accuracy_std']:.4f})")
#     print(f"   F1-Score: {cv_results[model_name]['f1_mean']:.4f} (+/- {cv_results[model_name]['f1_std']:.4f})")
#     print(f"   ROC-AUC:  {cv_results[model_name]['roc_auc_mean']:.4f} (+/- {cv_results[model_name]['roc_auc_std']:.4f})")

---
## 3. Cross-Validation Results Comparison

In [None]:
# TODO: Create summary DataFrame
# cv_summary = pd.DataFrame(cv_results).T
# cv_summary = cv_summary.round(4)

# print("\n" + "="*100)
# print("10-FOLD CROSS-VALIDATION RESULTS")
# print("="*100)
# print(cv_summary)
# print("="*100)

In [None]:
# TODO: Visualize cross-validation results with error bars
# metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
# model_names = list(cv_results.keys())

# fig, axes = plt.subplots(2, 3, figsize=(18, 10))
# axes = axes.ravel()

# for idx, metric in enumerate(metrics):
#     means = [cv_results[model][f'{metric}_mean'] for model in model_names]
#     stds = [cv_results[model][f'{metric}_std'] for model in model_names]
#     
#     axes[idx].bar(range(len(model_names)), means, yerr=stds, 
#                   capsize=5, color=['#3498db', '#e74c3c', '#2ecc71', '#f39c12'], alpha=0.8)
#     axes[idx].set_xticks(range(len(model_names)))
#     axes[idx].set_xticklabels(model_names, rotation=45, ha='right')
#     axes[idx].set_ylabel('Score', fontsize=11, fontweight='bold')
#     axes[idx].set_title(f'{metric.upper().replace("_", "-")}', fontsize=13, fontweight='bold')
#     axes[idx].set_ylim([0.5, 1.0])
#     axes[idx].grid(True, axis='y', alpha=0.3)

# # Training time comparison
# times = [cv_results[model]['cv_time'] for model in model_names]
# axes[5].bar(range(len(model_names)), times, color=['#3498db', '#e74c3c', '#2ecc71', '#f39c12'], alpha=0.8)
# axes[5].set_xticks(range(len(model_names)))
# axes[5].set_xticklabels(model_names, rotation=45, ha='right')
# axes[5].set_ylabel('Time (seconds)', fontsize=11, fontweight='bold')
# axes[5].set_title('Cross-Validation Time', fontsize=13, fontweight='bold')
# axes[5].grid(True, axis='y', alpha=0.3)

# plt.tight_layout()
# plt.savefig('../outputs/cv_results_comparison.png', dpi=300, bbox_inches='tight')
# plt.show()

---
## 4. Learning Curves Analysis

Diagnose bias-variance tradeoff by plotting learning curves.

In [None]:
# TODO: Function to plot learning curve
# def plot_learning_curve(estimator, title, X, y, cv=5, n_jobs=-1, train_sizes=np.linspace(0.1, 1.0, 10)):
#     """
#     Plot learning curve to diagnose bias-variance tradeoff.
#     """
#     train_sizes, train_scores, test_scores = learning_curve(
#         estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes,
#         scoring='accuracy', random_state=42
#     )
#     
#     train_scores_mean = np.mean(train_scores, axis=1)
#     train_scores_std = np.std(train_scores, axis=1)
#     test_scores_mean = np.mean(test_scores, axis=1)
#     test_scores_std = np.std(test_scores, axis=1)
#     
#     plt.figure(figsize=(10, 6))
#     plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
#                      train_scores_mean + train_scores_std, alpha=0.1, color='r')
#     plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
#                      test_scores_mean + test_scores_std, alpha=0.1, color='g')
#     plt.plot(train_sizes, train_scores_mean, 'o-', color='r', linewidth=2, label='Training score')
#     plt.plot(train_sizes, test_scores_mean, 'o-', color='g', linewidth=2, label='Cross-validation score')
#     
#     plt.xlabel('Training Set Size', fontsize=13, fontweight='bold')
#     plt.ylabel('Accuracy Score', fontsize=13, fontweight='bold')
#     plt.title(f'Learning Curve - {title}', fontsize=15, fontweight='bold')
#     plt.legend(loc='best', fontsize=11)
#     plt.grid(True, alpha=0.3)
#     plt.tight_layout()
#     plt.savefig(f'../outputs/learning_curve_{title.replace(" ", "_").lower()}.png', dpi=300, bbox_inches='tight')
#     plt.show()

In [None]:
# TODO: Generate learning curves for all models
# print("Generating learning curves...\n")

# for model_name, model in models.items():
#     print(f"Plotting learning curve for {model_name}...")
#     plot_learning_curve(model, model_name, X_train, y_train, cv=5)
#     print("✅ Completed\n")

---
## 5. Feature Importance Analysis

Identify most important features for model predictions.

In [None]:
# TODO: Feature importance from Logistic Regression coefficients
# print("Analyzing feature importance from Logistic Regression...")

# # Train LR model
# lr_model = LogisticRegression(max_iter=1000, random_state=42)
# lr_model.fit(X_train, y_train)

# # Get feature importances (absolute coefficients)
# feature_importance = pd.DataFrame({
#     'feature': X_train.columns,
#     'importance': np.abs(lr_model.coef_[0])
# }).sort_values('importance', ascending=False)

# print("\nTop 20 Most Important Features:")
# print(feature_importance.head(20))

In [None]:
# TODO: Visualize top features
# top_n = 20
# top_features = feature_importance.head(top_n)

# plt.figure(figsize=(10, 8))
# plt.barh(range(len(top_features)), top_features['importance'], color='steelblue')
# plt.yticks(range(len(top_features)), top_features['feature'])
# plt.xlabel('Absolute Coefficient Value', fontsize=13, fontweight='bold')
# plt.ylabel('Features', fontsize=13, fontweight='bold')
# plt.title(f'Top {top_n} Most Important Features (Logistic Regression)', fontsize=15, fontweight='bold')
# plt.gca().invert_yaxis()
# plt.grid(True, axis='x', alpha=0.3)
# plt.tight_layout()
# plt.savefig('../outputs/feature_importance.png', dpi=300, bbox_inches='tight')
# plt.show()

In [None]:
# TODO: Optional - Permutation importance (more robust but slower)
# from sklearn.inspection import permutation_importance

# print("Computing permutation importance...")
# perm_importance = permutation_importance(
#     lr_model, X_test, y_test, n_repeats=10, random_state=42, n_jobs=-1
# )

# perm_importance_df = pd.DataFrame({
#     'feature': X_train.columns,
#     'importance': perm_importance.importances_mean
# }).sort_values('importance', ascending=False)

# print("\nTop 20 Features by Permutation Importance:")
# print(perm_importance_df.head(20))

---
## 6. Final Recommendations

In [None]:
# TODO: Identify best model based on CV results
# print("\n" + "="*100)
# print("FINAL RECOMMENDATIONS")
# print("="*100)

# # Best by different metrics
# best_accuracy = max(cv_results.items(), key=lambda x: x[1]['accuracy_mean'])
# best_f1 = max(cv_results.items(), key=lambda x: x[1]['f1_mean'])
# best_roc_auc = max(cv_results.items(), key=lambda x: x[1]['roc_auc_mean'])
# fastest = min(cv_results.items(), key=lambda x: x[1]['cv_time'])

# print(f"\n🏆 Best Model by Accuracy: {best_accuracy[0]} ({best_accuracy[1]['accuracy_mean']:.4f})")
# print(f"🏆 Best Model by F1-Score: {best_f1[0]} ({best_f1[1]['f1_mean']:.4f})")
# print(f"🏆 Best Model by ROC-AUC: {best_roc_auc[0]} ({best_roc_auc[1]['roc_auc_mean']:.4f})")
# print(f"⚡ Fastest Model: {fastest[0]} ({fastest[1]['cv_time']:.2f}s)")

# print("\n" + "="*100)

---
## Summary

### Analysis Completed:
1. ✅ 10-fold stratified cross-validation
2. ✅ Cross-validation results comparison
3. ✅ Learning curves for bias-variance diagnosis
4. ✅ Feature importance analysis

### Key Insights:
- **Model Selection:** [Fill in best model and justification]
- **Bias-Variance:** [Summarize learning curve findings]
- **Feature Importance:** [List top 5 most important features]
- **Computational Efficiency:** [Compare training times]

### Production Recommendations:
1. **Recommended Model:** [Best model based on business requirements]
2. **Reasoning:** Balance between performance, speed, and interpretability
3. **Next Steps:** Hyperparameter tuning, ensemble methods, deployment strategy

---
**Project Complete! 🎉**