# 03 - Model Training & Evaluation

This notebook loads the engineered datasets, composes classical machine learning pipelines, and compares candidate models using consistent metrics.

## How to use
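1. Confirm that the previous notebooks have generated `train_dataset.csv` and `test_dataset.csv` in the `artifacts/` directory (both are checked at startup). The `preprocessor.joblib` produced earlier is not loaded here; this notebook builds its own preprocessing step.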
2. Adjust the model list or evaluation settings if needed.
3. Run the notebook to benchmark classical models and persist the best-performing pipeline.

In [None]:
# Optional: install additional modeling dependencies.
# %pip install scikit-learn scipy joblib lightgbm xgboost

In [None]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [None]:
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# All inputs and outputs live under ./artifacts, resolved against the
# directory the kernel was started in.
NOTEBOOK_DIR = Path.cwd()
ARTIFACT_DIR = NOTEBOOK_DIR / "artifacts"
MODEL_DIR = ARTIFACT_DIR / "models"

# Parent first, then child: neither call creates missing parents.
ARTIFACT_DIR.mkdir(exist_ok=True)
MODEL_DIR.mkdir(exist_ok=True)

# Inputs expected from the earlier notebooks, and the comparison table
# written by this one.
TRAIN_DATA_PATH = ARTIFACT_DIR / "train_dataset.csv"
TEST_DATA_PATH = ARTIFACT_DIR / "test_dataset.csv"
MODEL_COMPARISON_PATH = ARTIFACT_DIR / "model_comparison.csv"

# Column roles.
TARGET_COLUMN = "business_capability"
TEXT_FEATURE = "original_path_keywords"
CATEGORICAL_FEATURES = ["extension", "extension_family"]
NUMERIC_FEATURES = ["original_path_depth", "file_size_bytes", "content_word_count"]
FEATURE_COLUMNS = [TEXT_FEATURE, *CATEGORICAL_FEATURES, *NUMERIC_FEATURES]

# Reproducibility / evaluation settings.
RANDOM_STATE = 42
CV_FOLDS = 5

# Fail fast on the first input file that is absent.
required_path = next(
    (candidate for candidate in (TRAIN_DATA_PATH, TEST_DATA_PATH) if not candidate.exists()),
    None,
)
if required_path is not None:
    raise FileNotFoundError(f"Missing artifact: {required_path}. Run prior notebooks first.")

In [None]:
# Every feature column plus the target must be present in both splits.
REQUIRED_COLUMNS = [*FEATURE_COLUMNS, TARGET_COLUMN]

train_df = pd.read_csv(TRAIN_DATA_PATH)
test_df = pd.read_csv(TEST_DATA_PATH)

# For each split, list the required columns the CSV does not provide
# (kept in REQUIRED_COLUMNS order so the error message is stable).
missing_train, missing_test = (
    [name for name in REQUIRED_COLUMNS if name not in frame.columns]
    for frame in (train_df, test_df)
)
if missing_train or missing_test:
    raise ValueError(f"Missing required columns. train missing={missing_train}, test missing={missing_test}")

def prep_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with the feature columns coerced to model-ready dtypes.

    The text feature and the categorical features have missing values replaced
    by the empty string and are cast to ``str``. Numeric features are parsed
    with ``pd.to_numeric``; anything unparseable or missing becomes ``0.0`` and
    the column is cast to ``float``. Other columns are left as they are, and
    the input frame is not modified.
    """
    cleaned = df.copy()

    # Text and categorical columns receive the same string normalisation.
    for name in [TEXT_FEATURE, *CATEGORICAL_FEATURES]:
        cleaned[name] = cleaned[name].fillna("").astype(str)

    # Numeric columns: coerce bad values to NaN, then zero-fill.
    for name in NUMERIC_FEATURES:
        as_number = pd.to_numeric(cleaned[name], errors="coerce")
        cleaned[name] = as_number.fillna(0.0).astype(float)

    return cleaned

# Normalise dtypes in both splits with the same routine.
train_df, test_df = (prep_dataframe(frame) for frame in (train_df, test_df))

# Separate model inputs from the label column.
X_train, y_train = train_df[FEATURE_COLUMNS], train_df[TARGET_COLUMN]
X_test, y_test = test_df[FEATURE_COLUMNS], test_df[TARGET_COLUMN]

# Quick sanity figures on the training features.
text_non_empty = int(X_train[TEXT_FEATURE].str.strip().ne("").sum())
print(f"Non-empty rows for {TEXT_FEATURE}: {text_non_empty}")
print(f"Unique extensions: {X_train['extension'].nunique()}")
print(f"Unique extension families: {X_train['extension_family'].nunique()}")

In [None]:
# TF-IDF over the path keywords: 1- to 3-grams, vocabulary capped at 20,000
# terms, accents folded via unicode normalisation.
text_vectorizer = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1, 3),
    min_df=1,
    strip_accents="unicode",
)

# Feature assembly shared by every candidate model.
preprocessor = ColumnTransformer(
    transformers=[
        # The selector must be the scalar column name, not a one-element list.
        # A list hands the transformer a 2-D frame, and TfidfVectorizer would
        # then iterate over column labels instead of documents, producing a
        # single row that cannot be stacked with the other feature blocks.
        ("path_tfidf", text_vectorizer, TEXT_FEATURE),
        # Categories unseen during fit are encoded as all-zero rows.
        ("categorical", OneHotEncoder(handle_unknown="ignore"), CATEGORICAL_FEATURES),
        # Numeric columns are forwarded unchanged.
        ("numeric", "passthrough", NUMERIC_FEATURES),
    ],
    remainder="drop",
    sparse_threshold=0.3,
)

In [None]:
# Candidate estimators, keyed by the name used in reports and file names.
candidate_models = {
    "lightgbm": LGBMClassifier(
        n_estimators=400,
        learning_rate=0.05,
        num_leaves=64,
        objective="multiclass",
        random_state=RANDOM_STATE,
    ),
    "xgboost": XGBClassifier(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=9,
        subsample=0.9,
        colsample_bytree=0.9,
        objective="multi:softprob",
        eval_metric="mlogloss",
        random_state=RANDOM_STATE,
        tree_method="hist",
    ),
}

cv = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=RANDOM_STATE)

# XGBClassifier expects integer class ids 0..n_classes-1 and rejects raw
# string labels, so every model is trained on the same encoded target.
# When the target already is 0..n_classes-1 the encoding is the identity.
# The fitted encoder is saved next to the pipelines because their
# predictions are encoded ids that must be decoded with it.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
joblib.dump(label_encoder, MODEL_DIR / "label_encoder.joblib")

evaluation_rows = []  # one summary dict per model, turned into a table later
model_reports = {}  # model name -> classification_report dict on the test split

for model_name, estimator in candidate_models.items():
    print(f"\nTraining model: {model_name}")
    pipeline = Pipeline(
        steps=[
            # Fresh copy so each model fits its own vocabulary / encoder.
            ("preprocessor", clone(preprocessor)),
            ("classifier", estimator),
        ]
    )

    # Cross-validated scores on the training split only.
    cv_scores = cross_validate(
        pipeline,
        X_train,
        y_train_encoded,
        cv=cv,
        scoring=["accuracy", "f1_macro", "f1_weighted"],
        n_jobs=-1,
        return_train_score=False,
    )

    # Refit on the full training split, then score the held-out test split.
    pipeline.fit(X_train, y_train_encoded)
    # Decode predictions so they are compared with y_test in the original
    # label space (y_test itself is never passed through the encoder).
    y_pred = label_encoder.inverse_transform(pipeline.predict(X_test))

    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
    model_reports[model_name] = report

    evaluation_rows.append(
        {
            "model": model_name,
            "cv_accuracy_mean": cv_scores["test_accuracy"].mean(),
            "cv_accuracy_std": cv_scores["test_accuracy"].std(),
            "cv_macro_f1_mean": cv_scores["test_f1_macro"].mean(),
            "cv_weighted_f1_mean": cv_scores["test_f1_weighted"].mean(),
            "test_accuracy": accuracy_score(y_test, y_pred),
            "test_macro_f1": report["macro avg"]["f1-score"],
            "test_weighted_f1": report["weighted avg"]["f1-score"],
        }
    )

    # Persist every fitted pipeline; the best one is picked by name later.
    joblib.dump(pipeline, MODEL_DIR / f"{model_name}_pipeline.joblib")

In [None]:
# Rank the candidates by held-out weighted F1 (best first) and save the table.
results_df = pd.DataFrame(evaluation_rows).sort_values(
    by="test_weighted_f1",
    ascending=False,
    ignore_index=True,
)
results_df.to_csv(MODEL_COMPARISON_PATH, index=False)
results_df

In [None]:
# The comparison table is sorted best-first, so the winner is its first row.
best_model_name = results_df["model"].iloc[0]
print(f"Best model based on weighted F1: {best_model_name}")

# Per-class metrics of the winner, one row per class / average.
best_report = pd.DataFrame(model_reports[best_model_name]).transpose()
display(best_report)

# The pipeline file itself was written during the training loop; this only
# reports where it lives.
BEST_MODEL_PATH = MODEL_DIR / f"{best_model_name}_pipeline.joblib"
print(f"Saved best model pipeline to {BEST_MODEL_PATH}")