In [None]:
import os
import pickle

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import GridSearchCV

# Load the scaled feature matrices and the label files written by Phase 4.
X_train = pd.read_csv('data/X_train_scaled.csv')
X_test = pd.read_csv('data/X_test_scaled.csv')
# .values.ravel() flattens each label frame into a 1-D array.
y_train = pd.read_csv('data/y_train.csv').values.ravel()
y_test = pd.read_csv('data/y_test.csv').values.ravel()

print("="*80)
print("✓ PHASE 6: HYPERPARAMETER TUNING")
print("="*80)

# Hyperparameters to try: 3 * 4 * 3 * 3 = 108 combinations.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Exhaustive grid search with 5-fold cross-validation, ranked by F1.
# NOTE(review): scoring='f1' presumably relies on a binary target — confirm.
print("\n1. Running Grid Search (this takes 5-10 minutes)...")
rf = RandomForestClassifier(random_state=42)
grid = GridSearchCV(rf, param_grid, cv=5, scoring='f1', n_jobs=-1)
grid.fit(X_train, y_train)

print("\n✓ Best parameters:")
for param, value in grid.best_params_.items():
    print(f"   {param}: {value}")
print(f"\nBest CV F1-Score: {grid.best_score_:.4f}")

# Test set evaluation using the estimator refitted with the best parameters.
print("\n2. Evaluating on test set...")
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"   Test Accuracy: {accuracy:.4f}")
print(f"   Test F1-Score: {f1:.4f}")

# Detailed per-class report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Save tuned model. Create the output directory first so a missing 'models/'
# folder cannot discard the result of the long-running search above.
os.makedirs('models', exist_ok=True)
with open('models/best_model_tuned.pkl', 'wb') as f:
    pickle.dump(best_model, f)

print("\n✓ Tuned model saved: models/best_model_tuned.pkl")
print("✓ PHASE 6 COMPLETE!")


In [None]:
# Alternative tuning method using RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
import os
import pickle

# Search space for the randomized search. Compared with the grid-search cell it
# allows larger forests and also toggles 'bootstrap'.
param_dist = dict(
    n_estimators=[100, 200, 300, 400, 500],
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# Same base estimator as the grid search: a seeded random forest.
rf = RandomForestClassifier(random_state=42)

# Sample 20 candidate settings from the space and score each with 5-fold CV on F1.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=20,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)

print(f"RandomizedSearch best params: {random_search.best_params_}")
print(f"RandomizedSearch best CV F1: {random_search.best_score_:.4f}")

# Persist the winning estimator, creating the target directory if it is missing.
best_random_model = random_search.best_estimator_
os.makedirs('models', exist_ok=True)
with open('models/best_model_randomized.pkl', 'wb') as model_file:
    pickle.dump(best_random_model, model_file)
print("Saved tuned randomized model to models/best_model_randomized.pkl")


In [None]:
# More sophisticated Bayesian tuning (optional)
# BayesSearchCV comes from the optional scikit-optimize package. When the import
# fails, the name is bound to None so the rest of the cell can skip the example.
try:
    from skopt import BayesSearchCV
except ImportError:
    # Package is missing: explain how to install it and point at the alternatives.
    BayesSearchCV = None
    print("Optional package 'scikit-optimize' (skopt) is not installed.")
    print("To enable Bayesian search, install it with:")
    print("  pip install scikit-optimize")
    print("Or use RandomizedSearchCV / GridSearchCV (already included above) as alternatives.")
except Exception as exc:
    # Package is present but failed while importing. Stay best-effort, but report
    # the real cause instead of claiming the package is not installed.
    BayesSearchCV = None
    print(f"Optional package 'scikit-optimize' (skopt) could not be imported: {exc!r}")
    print("Use RandomizedSearchCV / GridSearchCV (already included above) as alternatives.")

# If BayesSearchCV is available, you can use it like this (example):
if BayesSearchCV is not None:
    # Example parameter search space (requires skopt)
    from skopt.space import Integer, Real, Categorical
    bayes_space = {
        'n_estimators': Integer(50, 500),
        'max_depth': Integer(5, 50),
        'min_samples_split': Integer(2, 20),
        'min_samples_leaf': Integer(1, 10),
    }
    # Example usage (uncomment to run); it reuses `rf`, `X_train` and `y_train`
    # defined by the earlier cells:
    # bayes_search = BayesSearchCV(rf, bayes_space, n_iter=30, cv=5, scoring='f1', n_jobs=-1, random_state=42)
    # bayes_search.fit(X_train, y_train)
    # print(bayes_search.best_params_)


## Phase 6 — Hyperparameter Tuning

This document summarizes hyperparameter tuning performed in `Notebooks/Phase6.ipynb`, lists the artifacts produced, and provides quick run and troubleshooting instructions.

- **Purpose:** Improve model performance by searching for better hyperparameters using Grid Search, Randomized Search, and (optionally) Bayesian Search.
- **Notebook:** `Notebooks/Phase6.ipynb`

**Produced Artifacts**
- `models/best_model_tuned.pkl`: Best estimator found by `GridSearchCV` (saved after grid search).
- `models/best_model_randomized.pkl`: Best estimator found by `RandomizedSearchCV` (saved after randomized search).
- Both `models/best_model_randomized.pkl` and `models/best_model_tuned.pkl` are pickled estimators that can be loaded for evaluation or deployment.

**Main Steps (high level)**
- Load preprocessed training and test data from Phase 4 (`data/X_train_scaled.csv`, `data/X_test_scaled.csv`, `data/y_train.csv`, `data/y_test.csv`).
- Run `GridSearchCV` on a `RandomForestClassifier` using a predefined `param_grid` and `scoring='f1'` to find a tuned model.
- Evaluate the best grid-search estimator on the test set and print accuracy/F1 and a classification report.
- Optionally run `RandomizedSearchCV` for a larger parameter space (faster than exhaustive grid search) and save the result.
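- An optional Bayesian optimization example (using `scikit-optimize` / `skopt`) is also provided. Its import is wrapped in a try/except, so the notebook still runs when the package is missing; install `scikit-optimize` to enable it. The search call itself is left commented out in the notebook and must be uncommented to run.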

**How to run (PowerShell)**
1. From the project root, execute the notebook headless (example):

```powershell
python -m nbconvert --to notebook --execute "Notebooks\Phase6.ipynb" --output "Notebooks\Phase6_executed.ipynb"
```

2. Or run interactively in VS Code / Jupyter and execute cells in order.

**Notes & Troubleshooting**
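- Missing `data/` files: Phase 6 depends on outputs from Phase 4. Ensure `data/X_train_scaled.csv`, `data/X_test_scaled.csv`, `data/y_train.csv`, and `data/y_test.csv` exist. The notebook reads them through relative paths, so they are resolved against the working directory of the notebook kernel.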
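- Long runtime: The grid used in the notebook has 3 × 4 × 3 × 3 = 108 parameter combinations; with `cv=5` that is 540 model fits (plus a final refit of the best model), so grid search can be slow. Use `RandomizedSearchCV` with `n_iter` set to a reasonable value (e.g., 20) to reduce runtime.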
- `ModuleNotFoundError: No module named 'skopt'`: The notebook includes an optional `BayesSearchCV` example requiring the `scikit-optimize` package. Install it with:

```powershell
pip install scikit-optimize
# or with conda
conda install -c conda-forge scikit-optimize
```

If you don't want to install additional packages, use `GridSearchCV` or `RandomizedSearchCV` (both included in the notebook).

- Memory/CPU limits: Use `n_jobs` carefully (e.g., `n_jobs=-1` uses all CPUs). On constrained machines, set `n_jobs=1` or reduce `n_estimators`.

- Reproducibility: Randomized and Bayesian searches accept `random_state` for reproducible results.

**Example parameters used in the notebook**
- Grid search `param_grid` (example):
  - `n_estimators`: [100, 200, 300]
  - `max_depth`: [10, 20, 30, None]
  - `min_samples_split`: [2, 5, 10]
  - `min_samples_leaf`: [1, 2, 4]

- Randomized search `param_dist` (example):
  - `n_estimators`: [100, 200, 300, 400, 500]
  - `max_depth`: [10, 20, 30, None]
  - `min_samples_split`: [2, 5, 10]
  - `min_samples_leaf`: [1, 2, 4]
  - `bootstrap`: [True, False]

**Next steps**
- Run Phase 6 and verify `models/best_model_tuned.pkl` and/or `models/best_model_randomized.pkl` are created.
- Optionally perform further tuning using Bayesian optimization (`skopt`) or Optuna for more advanced search strategies.

