# 03 - Model Training & Evaluation

This notebook loads the engineered datasets, composes classical machine learning pipelines, and compares several candidate models using consistent metrics.

## How to use
1. Confirm that the previous notebooks have generated `train_dataset.csv`, `test_dataset.csv`, and `preprocessor.joblib` in the `artifacts/` folder under this notebook's working directory.
2. Adjust the model list or evaluation settings if needed.
3. Run the notebook to benchmark classical models and persist the best-performing pipeline.

In [None]:
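# Optional: install the modeling dependencies into the kernel's environment.
# Uncomment the line below; `%pip` (unlike `!pip`) targets the running kernel.
# %pip install scikit-learn scipy joblib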

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
import joblib

In [None]:
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Seed and fold count shared by every model and by the cross-validation split.
RANDOM_STATE = 42
CV_FOLDS = 5

# Label column, followed by the columns selected as model inputs.
TARGET_COLUMN = 'business_capability'
FEATURE_COLUMNS = [
    'content_text',
    'path_keywords',
    'extension',
    'path_depth',
    'original_path_depth',
    'path_token_count',
    'content_char_len',
    'content_word_count',
    'file_exists',
]

# All locations are resolved relative to the current working directory.
NOTEBOOK_DIR = Path.cwd()
ARTIFACT_DIR = NOTEBOOK_DIR.joinpath('artifacts')
MODEL_DIR = ARTIFACT_DIR.joinpath('models')

# Inputs expected to exist already (checked below).
TRAIN_DATA_PATH = ARTIFACT_DIR.joinpath('train_dataset.csv')
TEST_DATA_PATH = ARTIFACT_DIR.joinpath('test_dataset.csv')
PREPROCESSOR_PATH = ARTIFACT_DIR.joinpath('preprocessor.joblib')

# Create the parent first: mkdir is called without parents=True.
ARTIFACT_DIR.mkdir(exist_ok=True)
MODEL_DIR.mkdir(exist_ok=True)

# Fail fast, on the first missing input, before any training work starts.
for required_path in (TRAIN_DATA_PATH, TEST_DATA_PATH, PREPROCESSOR_PATH):
    if required_path.exists():
        continue
    raise FileNotFoundError(f'Missing artifact: {required_path}. Run prior notebooks first.')

In [None]:
# Read both engineered splits from the artifact directory.
train_df = pd.read_csv(TRAIN_DATA_PATH)
test_df = pd.read_csv(TEST_DATA_PATH)

# Split each frame into model inputs (X) and the label column (y).
X_train, y_train = train_df[FEATURE_COLUMNS], train_df[TARGET_COLUMN]
X_test, y_test = test_df[FEATURE_COLUMNS], test_df[TARGET_COLUMN]

# joblib.load unpickles the file, so only load artifacts from a trusted source.
base_preprocessor = joblib.load(PREPROCESSOR_PATH)

# Quick sanity check of what was loaded.
print('Loaded artifacts:')
print(f'  Train shape: {X_train.shape}')
print(f'  Test shape: {X_test.shape}')

In [None]:
# Candidate classifiers, keyed by the name used in the results table and in the
# persisted file names. Every estimator is seeded with RANDOM_STATE.
candidate_models = {
    'logistic_regression': LogisticRegression(
        max_iter=1000,
        solver='saga',
        n_jobs=None,
        penalty='l2',
        class_weight='balanced',
        random_state=RANDOM_STATE,
    ),
    'linear_svc': LinearSVC(
        class_weight='balanced',
        max_iter=5000,
        random_state=RANDOM_STATE,
    ),
    'decision_tree': DecisionTreeClassifier(
        max_depth=None,
        min_samples_leaf=2,
        class_weight='balanced',
        random_state=RANDOM_STATE,
    ),
    'random_forest': RandomForestClassifier(
        n_estimators=300,
        random_state=RANDOM_STATE,
        n_jobs=-1,
        class_weight='balanced_subsample',
    ),
    'gradient_boosting': GradientBoostingClassifier(
        random_state=RANDOM_STATE,
    ),
}

# Shuffled, stratified folds; the same splitter is reused for every candidate.
cv = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=RANDOM_STATE)

evaluation_rows = []  # one summary dict per model, turned into results_df below
model_reports = {}    # model name -> classification_report dict on the test set

for model_name, estimator in candidate_models.items():
    # Keep the newline as an escape sequence: a raw line break inside a
    # single-quoted string literal is a SyntaxError.
    print(f'\nTraining model: {model_name}')

    # Each candidate gets its own copy of the loaded preprocessor.
    pipeline = Pipeline([
        ('preprocessor', clone(base_preprocessor)),
        ('classifier', estimator),
    ])

    # Cross-validated scores on the training split only.
    cv_scores = cross_validate(
        pipeline,
        X_train,
        y_train,
        cv=cv,
        scoring=['accuracy', 'f1_macro', 'f1_weighted'],
        n_jobs=-1,
        return_train_score=False,
    )

    # cross_validate fits clones, so fit the pipeline itself on the full
    # training split before scoring it on the test split.
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # zero_division=0 reports 0 instead of warning when a class is never predicted.
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
    model_reports[model_name] = report

    evaluation_rows.append({
        'model': model_name,
        'cv_accuracy_mean': cv_scores['test_accuracy'].mean(),
        'cv_accuracy_std': cv_scores['test_accuracy'].std(),
        'cv_macro_f1_mean': cv_scores['test_f1_macro'].mean(),
        'cv_weighted_f1_mean': cv_scores['test_f1_weighted'].mean(),
        'test_accuracy': accuracy_score(y_test, y_pred),
        'test_macro_f1': report['macro avg']['f1-score'],
        'test_weighted_f1': report['weighted avg']['f1-score'],
    })

    # Persist every fitted pipeline, not only the eventual winner.
    joblib.dump(pipeline, MODEL_DIR / f'{model_name}_pipeline.joblib')

# Best model first.
# NOTE(review): ranking by a test-set metric means the test split also drives
# model selection; consider ranking by 'cv_weighted_f1_mean' if the test scores
# must stay an unbiased estimate.
results_df = pd.DataFrame(evaluation_rows).sort_values(by='test_weighted_f1', ascending=False).reset_index(drop=True)
results_df

In [None]:
# results_df is sorted by test weighted F1 (descending), so row 0 is the winner.
best_model_name = results_df.iloc[0]['model']
print(f'Best model based on weighted F1: {best_model_name}')
# Keep the newline as an escape sequence: a raw line break inside a
# single-quoted string literal is a SyntaxError.
print('\nClassification report (test set):')
print(pd.DataFrame(model_reports[best_model_name]).T)

# Nothing is written here: this is the file the training loop already dumped
# for the winning model.
BEST_MODEL_PATH = MODEL_DIR / f'{best_model_name}_pipeline.joblib'
print(f'Saved best model pipeline to {BEST_MODEL_PATH}')