In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Read the preprocessed splits: scaled feature matrices plus the label files.
X_train = pd.read_csv('data/X_train_scaled.csv')
X_test = pd.read_csv('data/X_test_scaled.csv')

# Each label file is flattened into a 1-D array before being used as y.
y_train = pd.read_csv('data/y_train.csv').to_numpy().ravel()
y_test = pd.read_csv('data/y_test.csv').to_numpy().ravel()

print("=" * 80, "✓ PHASE 5: MODEL BUILDING & TRAINING", "=" * 80, sep="\n")

# Candidate classifiers, keyed by the display name used in the results table.
# SVC is built with probability=True because predict_proba is called on every
# model during evaluation.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'SVM': SVC(probability=True, random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
}

print(f"\n1. Training {len(models)} models...")

# Fit every candidate on the training split. fit() returns the estimator
# itself, so the fitted object can be stored in the same statement.
trained_models = {}
for name, model in models.items():
    trained_models[name] = model.fit(X_train, y_train)
    print(f"   ✓ {name}")

print("\n2. Evaluating on test set...")

# Score each fitted model on the held-out split.
# results maps model name -> {metric name -> score}.
results = {}
for name, model in trained_models.items():
    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is used as the score for ROC-AUC
    # (assumes a binary target with the positive class second — TODO confirm).
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    results[name] = {
        # Metrics computed from hard label predictions.
        **{
            metric: scorer(y_test, y_pred)
            for metric, scorer in (
                ('Accuracy', accuracy_score),
                ('Precision', precision_score),
                ('Recall', recall_score),
                ('F1', f1_score),
            )
        },
        # ROC-AUC is computed from the probability scores instead.
        'ROC-AUC': roc_auc_score(y_test, y_pred_proba),
    }

# One row per model, one column per metric, best F1 first.
comparison_df = (
    pd.DataFrame(results)
    .transpose()
    .sort_values(by='F1', ascending=False)
)

print("\n" + "-" * 80, "MODEL COMPARISON", "-" * 80, comparison_df, sep="\n")

# The table is sorted by F1 descending, so the top row is the winner.
best_model_name = comparison_df.index[0]
best_f1_score = comparison_df.at[best_model_name, 'F1']

print(
    f"\n✓ BEST MODEL: {best_model_name}",
    f"  F1-Score: {best_f1_score:.4f}",
    sep="\n",
)

print("\n✓ PHASE 5 COMPLETE!")


In [None]:
# Candidate classifiers, configured exactly like the set evaluated in the
# first cell (same max_iter, n_estimators, n_neighbors and random_state).
# The retraining cell below stores whatever is defined here in
# `trained_models`, and the saving cell pickles
# `trained_models[best_model_name]`, so any drift in hyperparameters or seeds
# here would make the saved model differ from the one whose metrics were
# reported in the comparison table.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'SVM': SVC(probability=True, random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
}


In [None]:
# 5-fold stratified cross-validation on the training split, with a fixed seed
# so the fold assignment is reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Cross-validate the selected model explicitly, rather than whichever
# estimator a previous loop happened to leave bound to `model`.
# cross_val_score fits clones of the estimator, so the already-fitted object
# in `trained_models` is not modified.
cv_model = trained_models[best_model_name]
cv_scores = cross_val_score(cv_model, X_train, y_train, cv=cv, scoring='accuracy')


In [None]:
# Re-fit every estimator currently in `models` and overwrite the matching
# entry in `trained_models`. fit() returns the estimator itself.
# NOTE(review): after this loop `name` and `model` remain bound to the last
# entry of `models`; later cells that use `model` depend on that.
for name, model in models.items():
    trained_models[name] = model.fit(X_train, y_train)


In [None]:
# Evaluate the selected model by name instead of relying on the loop variable
# `model` left over from an earlier cell (which is simply the last estimator
# iterated, not necessarily the best one).
selected_model = trained_models[best_model_name]
y_pred = selected_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)


In [None]:
# Rebuild the comparison table from `results`: one row per model, one column
# per metric, ordered by F1 with the highest score first.
comparison_df = (
    pd.DataFrame(results)
    .transpose()
    .sort_values(by='F1', ascending=False)
)


In [None]:
# Pair each training column with the fitted Random Forest's importance score.
# Rows follow the column order of X_train; the frame is not sorted.
rf_model = trained_models['Random Forest']
importance = pd.DataFrame(
    dict(
        Feature=X_train.columns,
        Importance=rf_model.feature_importances_,
    )
)


In [None]:
import pickle
import os

# Make sure the output folder is present before writing into it.
os.makedirs('models', exist_ok=True)

# Look up the fitted estimator stored under the winning model's name.
# NOTE(review): `trained_models` is re-populated by the retraining cell above,
# so this is the most recently fitted estimator under that name.
best_model = trained_models[best_model_name]

# Serialize the estimator to disk in binary mode.
with open('models/best_model.pkl', 'wb') as model_file:
    pickle.Pickler(model_file).dump(best_model)

print(f"Best model ({best_model_name}) saved to models/best_model.pkl")


## Phase 5 — Model Building & Training

This document summarizes the model training and evaluation performed in `Notebooks/Phase5.ipynb`, lists the artifacts produced, and provides quick run and troubleshooting instructions.

- **Purpose:** Build, train, and evaluate multiple machine learning models on preprocessed data. Compare performance metrics and identify the best model for deployment.
- **Notebook:** `Notebooks/Phase5.ipynb`

**Produced Artifacts**
- `models/best_model.pkl`: Pickled best-performing model (selected by F1-score).
- Console output: Model comparison table with accuracy, precision, recall, F1, and ROC-AUC scores.

**Main Steps (high level)**
- Load preprocessed training and test data from `data/` directory (CSVs created in Phase 4).
- Initialize 5 classification models:
  - Logistic Regression (max_iter=1000)
  - Random Forest (n_estimators=100)
  - Gradient Boosting
  - SVM (with probability=True for ROC-AUC)
  - K-Nearest Neighbors (n_neighbors=5)
- Train all models on the training set.
- Evaluate each model on the test set, computing:
  - Accuracy, Precision, Recall, F1-score, ROC-AUC
- Rank models by F1-score and identify the best performer.
- Serialize the best model to `models/best_model.pkl` for future inference.

**How to run (PowerShell)**
1. From the project root, execute the notebook headlessly, for example:

```powershell
python -m nbconvert --to notebook --execute "Notebooks\Phase5.ipynb" --output "Notebooks\Phase5_executed.ipynb"
```

2. Or run interactively in VS Code / Jupyter and execute cells in order.

**Notes & Troubleshooting**
- Missing `data/` files: Phase 5 loads preprocessed CSVs created by Phase 4. Ensure Phase 4 has been executed and `data/X_train_scaled.csv`, `data/X_test_scaled.csv`, `data/y_train.csv`, and `data/y_test.csv` exist.
- `NameError: name 'best_model_name' is not defined` (or `'trained_models'`): This occurs if the model-saving cell is run before cell 1, which trains the models and defines `best_model_name`. Run cells in order.
- `FileNotFoundError` when saving to `models/`: The notebook creates the `models/` directory automatically via `os.makedirs('models', exist_ok=True)` before saving. If this fails, check file system permissions.
- Slow training or evaluation: Fitting 5 models (especially Random Forest, Gradient Boosting, and SVM with `probability=True`) may take time on larger datasets. Reduce `n_estimators` or simplify model parameters if needed.

**Model Comparison Output**
The notebook prints a comparison table showing metrics for all models. Example:

```
--------------------------------------------
MODEL COMPARISON
--------------------------------------------
                      Accuracy  Precision  Recall    F1  ROC-AUC
Model Name               ...      ...      ...    ...     ...
```

The model with the highest F1-score is selected as the best model and saved.

**Next steps**
- Verify that `models/best_model.pkl` was created after running Phase 5.
- Load and use the best model for inference on new data (e.g., in a production pipeline).
- Optionally, perform hyperparameter tuning on the best model to improve performance.
- Use the feature importances (from Random Forest) to understand which features drive predictions.

**Quick validation**
After running, confirm the following:
- `models/best_model.pkl` ✓
- Console output shows F1-scores for all 5 models ✓

