# Notebook 04 — Model Training & Evaluation

Train a Random Forest classifier on the features extracted in Notebook 03, evaluate it with
5-fold stratified cross-validation and a held-out 20% test split, and report detailed metrics for each dataset.

**Classifier:** Random Forest

In [None]:
import sys, os
# Put ../src (resolved to an absolute path) first on the import path;
# presumably this is where the local `visualization` module imported below lives.
sys.path.insert(0, os.path.abspath('../src'))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import warnings
# NOTE(review): this suppresses every warning for the rest of the kernel session,
# including any emitted by scikit-learn or pandas during training and scoring.
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score,
    f1_score, precision_score, recall_score
)
from sklearn.ensemble import RandomForestClassifier

# Project-local plotting helper (not part of an installed package).
from visualization import plot_confusion_matrix

# Apply seaborn's 'whitegrid' theme to all figures created in this notebook.
sns.set_theme(style='whitegrid')
%matplotlib inline

## 1. Load Extracted Features

In [5]:
# Load XPQRS features (from Notebook 03)
xpqrs_df = pd.read_csv('../results/tables/xpqrs_features.csv')
print(f'XPQRS features: {xpqrs_df.shape}')

# Load PQ Disturbances features
pq_df = pd.read_csv('../results/tables/pq_features.csv')
print(f'PQ Disturbances features: {pq_df.shape}')

# Fail fast here rather than deep inside training: train_and_evaluate() reads the
# 'label' column as the target and treats every other column as a feature.
assert len(xpqrs_df) > 0, 'xpqrs_features.csv contains no rows'
assert len(pq_df) > 0, 'pq_features.csv contains no rows'
assert 'label' in xpqrs_df.columns, "xpqrs_features.csv is missing the 'label' column"
assert 'label' in pq_df.columns, "pq_features.csv is missing the 'label' column"

XPQRS features: (17000, 37)
PQ Disturbances features: (798, 73)


## 2. Training Pipeline

In [None]:
def train_and_evaluate(df, dataset_name, test_size=0.2, n_splits=5,
                       n_estimators=100, random_state=42,
                       model_dir='../results/models'):
    """Train a scaled Random Forest pipeline on one feature table and evaluate it.

    The frame is split into stratified train/test partitions, the pipeline is
    scored with stratified k-fold cross-validation on the training partition,
    then refit on the whole training partition and scored once on the test
    partition. The fitted pipeline is written to disk with joblib.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table. The ``'label'`` column is the target; every other
        column is used as a feature.
    dataset_name : str
        Name used in the printed header and as the prefix of the saved
        model file (``<dataset_name>_random_forest.pkl``).
    test_size : float, default 0.2
        Fraction of rows held out for the final test evaluation.
    n_splits : int, default 5
        Number of stratified cross-validation folds.
    n_estimators : int, default 100
        Number of trees in the forest.
    random_state : int, default 42
        Seed shared by the split, the CV shuffling and the forest.
    model_dir : str, default '../results/models'
        Directory the fitted pipeline is saved into; created if missing.

    Returns
    -------
    results : dict
        Fitted ``'pipeline'``, test-set ``'accuracy'``, ``'f1_macro'``,
        ``'precision_macro'``, ``'recall_macro'``, cross-validation
        ``'cv_accuracy_mean'``/``'cv_accuracy_std'``/``'cv_f1_mean'``/
        ``'cv_f1_std'``, the encoded ``'y_test'`` and ``'y_pred'`` arrays,
        ``'class_names'`` and the ``'model_path'`` that was written.
    le : sklearn.preprocessing.LabelEncoder
        Encoder fitted on ``df['label']``.
    class_names : numpy.ndarray
        ``le.classes_``; position ``i`` is the original label of encoded class ``i``.

    Raises
    ------
    KeyError
        If ``df`` has no ``'label'`` column.
    """
    if 'label' not in df.columns:
        raise KeyError("train_and_evaluate expects a 'label' column in df")

    # Prepare data
    le = LabelEncoder()
    feature_cols = [c for c in df.columns if c != 'label']
    X = df[feature_cols].values
    y = le.fit_transform(df['label'])
    class_names = le.classes_

    # Replace inf/nan with 0.0 so the scaler and the forest never see non-finite values
    X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)

    # Stratified train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    print(f'\n{"="*60}')
    print(f'Dataset: {dataset_name}')
    print(f'Train: {X_train.shape}, Test: {X_test.shape}')
    print(f'Classes: {len(class_names)}')
    print(f'{"="*60}')

    clf = RandomForestClassifier(
        n_estimators=n_estimators, max_depth=None,
        random_state=random_state, n_jobs=-1
    )

    # Scaling lives inside the pipeline so each CV fold fits its own scaler
    # on that fold's training rows only.
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('clf', clf)
    ])

    # Stratified k-fold cross-validation on the training partition only
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    cv_results = cross_validate(
        pipe, X_train, y_train, cv=cv,
        scoring=['accuracy', 'f1_macro'],
        return_train_score=False, n_jobs=-1
    )

    cv_acc = cv_results['test_accuracy']
    cv_f1 = cv_results['test_f1_macro']
    print('\n--- Random Forest ---')
    print(f'  CV Accuracy: {cv_acc.mean():.4f} (+/- {cv_acc.std():.4f})')
    print(f'  CV F1 Macro: {cv_f1.mean():.4f} (+/- {cv_f1.std():.4f})')

    # Train on full training set, evaluate on test set
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    prec = precision_score(y_test, y_pred, average='macro')
    rec = recall_score(y_test, y_pred, average='macro')

    print(f'  Test Accuracy : {acc:.4f}')
    print(f'  Test F1 Macro : {f1:.4f}')
    print(f'  Test Precision: {prec:.4f}')
    print(f'  Test Recall   : {rec:.4f}')

    # Save model; joblib.dump does not create missing parent directories,
    # so make sure the target directory exists first.
    os.makedirs(model_dir, exist_ok=True)
    model_path = os.path.join(model_dir, f'{dataset_name}_random_forest.pkl')
    joblib.dump(pipe, model_path)

    results = {
        'pipeline': pipe,
        'accuracy': acc,
        'f1_macro': f1,
        'precision_macro': prec,
        'recall_macro': rec,
        'cv_accuracy_mean': cv_acc.mean(),
        'cv_accuracy_std': cv_acc.std(),
        'cv_f1_mean': cv_f1.mean(),
        'cv_f1_std': cv_f1.std(),
        'y_test': y_test,
        'y_pred': y_pred,
        'class_names': class_names,
        'model_path': model_path,
    }

    return results, le, class_names

## 3. Train on XPQRS Dataset

In [None]:
%%time
# Fit and score the Random Forest pipeline on the XPQRS feature table
xpqrs_results, xpqrs_le, xpqrs_classes = train_and_evaluate(
    df=xpqrs_df,
    dataset_name='xpqrs',
)

In [None]:
# One-row summary of the XPQRS metrics, saved as CSV and displayed below
res = xpqrs_results
xpqrs_summary = pd.DataFrame({
    'Model': ['Random Forest'],
    'CV Accuracy': [f"{res['cv_accuracy_mean']:.4f} +/- {res['cv_accuracy_std']:.4f}"],
    'Test Accuracy': [f"{res['accuracy']:.4f}"],
    'Test F1 (Macro)': [f"{res['f1_macro']:.4f}"],
    'Test Precision': [f"{res['precision_macro']:.4f}"],
    'Test Recall': [f"{res['recall_macro']:.4f}"],
})
xpqrs_summary.to_csv('../results/tables/xpqrs_model_results.csv', index=False)
xpqrs_summary

In [None]:
# Confusion matrix for the XPQRS test-set predictions
# savefig does not create missing directories, so make sure the target exists.
os.makedirs('../results/figures', exist_ok=True)
fig = plot_confusion_matrix(
    xpqrs_results['y_test'], xpqrs_results['y_pred'], xpqrs_classes,
    title=f'XPQRS — Random Forest (Accuracy: {xpqrs_results["accuracy"]:.4f})',
    figsize=(14, 12)
)
fig.savefig('../results/figures/xpqrs_cm_random_forest.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Detailed per-class report for the XPQRS test-set predictions
print(f'Random Forest (Accuracy: {xpqrs_results["accuracy"]:.4f})\n')
# Pass the full encoded label range explicitly: y_test/y_pred hold LabelEncoder
# codes 0..k-1, and without `labels` the report only covers codes that actually
# occur, which no longer lines up with `target_names` if a rare class is absent.
print(classification_report(
    xpqrs_results['y_test'], xpqrs_results['y_pred'],
    labels=np.arange(len(xpqrs_classes)),
    target_names=xpqrs_classes,
))

## 4. Train on PQ Disturbances Dataset

In [None]:
%%time
# Fit and score the Random Forest pipeline on the PQ Disturbances feature table
pq_results, pq_le, pq_classes = train_and_evaluate(
    df=pq_df,
    dataset_name='pq_disturbances',
)

In [None]:
# One-row summary of the PQ Disturbances metrics, saved as CSV and displayed below
res = pq_results
pq_summary = pd.DataFrame({
    'Model': ['Random Forest'],
    'CV Accuracy': [f"{res['cv_accuracy_mean']:.4f} +/- {res['cv_accuracy_std']:.4f}"],
    'Test Accuracy': [f"{res['accuracy']:.4f}"],
    'Test F1 (Macro)': [f"{res['f1_macro']:.4f}"],
    'Test Precision': [f"{res['precision_macro']:.4f}"],
    'Test Recall': [f"{res['recall_macro']:.4f}"],
})
pq_summary.to_csv('../results/tables/pq_model_results.csv', index=False)
pq_summary

In [None]:
# Confusion matrix for the PQ Disturbances test-set predictions
# savefig does not create missing directories, so make sure the target exists.
os.makedirs('../results/figures', exist_ok=True)
fig = plot_confusion_matrix(
    pq_results['y_test'], pq_results['y_pred'], pq_classes,
    title=f'PQ Disturbances — Random Forest (Accuracy: {pq_results["accuracy"]:.4f})',
    figsize=(12, 10)
)
fig.savefig('../results/figures/pq_cm_random_forest.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Detailed per-class report for the PQ Disturbances test-set predictions
print(f'Random Forest (Accuracy: {pq_results["accuracy"]:.4f})\n')
# Pass the full encoded label range explicitly: y_test/y_pred hold LabelEncoder
# codes 0..k-1, and without `labels` the report only covers codes that actually
# occur, which no longer lines up with `target_names` if a rare class is absent.
print(classification_report(
    pq_results['y_test'], pq_results['y_pred'],
    labels=np.arange(len(pq_classes)),
    target_names=pq_classes,
))

---
**Next:** [05_results_comparison.ipynb](05_results_comparison.ipynb) — Cross-dataset comparison and detailed analysis.