In [21]:
import os
import sys
import numpy as np
import pandas as pd
from scipy.sparse import load_npz
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
# Set up src imports: add ../src to sys.path so the project-local `logger` module can be imported
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..', 'src')))
from logger import get_logger

# Paths

In [22]:
# Input/output locations, given relative to the notebook's working directory.
DATA_TRAIN_PATH: str = "./../data/train"  # training split (X_<name> / y_<name> files)
DATA_TEST_PATH: str = "./../data/test"  # test split, same file naming as the training split
REPORTS_PATH: str = "./../reports"  # folder that receives the evaluation report

# Initialize logger

In [23]:
# Notebook-wide logger, obtained from the project-local `get_logger` helper
# (the `logger` module found on the ../src path) under the name "train_models".
# NOTE(review): handler/format configuration lives in that helper and is not
# visible from this notebook.
logger = get_logger("train_models")

# Define classification models

In [24]:
# Seed handed to every estimator below that accepts `random_state`.
RANDOM_STATE = 42

# Candidate classifiers keyed by display name. The mapping is iterated with
# `.items()` during training, so insertion order is the training order.
models = dict(
    RandomForest=RandomForestClassifier(n_jobs=-1, random_state=RANDOM_STATE),
    SVM=SVC(kernel="linear", random_state=RANDOM_STATE),
    XGBoost=XGBClassifier(eval_metric="mlogloss", n_jobs=-1, random_state=RANDOM_STATE),
    NaiveBayes=MultinomialNB(),
    MLP=MLPClassifier(max_iter=300, early_stopping=True, random_state=RANDOM_STATE),
    DecisionTree=DecisionTreeClassifier(random_state=RANDOM_STATE),
)

# Define feature sets and their sparsity

In [25]:
# Names of the TF-IDF feature sets; these are flagged as sparse (.npz) matrices.
_TFIDF_FEATURES = (
    "tfidf_word_1gram",
    "tfidf_word_2gram",
    "tfidf_word_3gram",
    "tfidf_char_2gram",
    "tfidf_char_3gram",
)

# Each entry is (feature-set name, is_sparse); order is the evaluation order.
feature_sets = [(tfidf_name, True) for tfidf_name in _TFIDF_FEATURES]
feature_sets.append(("bert", False))  # the only dense (.npy) feature set

# Load training and test data for a given feature set

In [26]:
def load_split_data(name, sparse=True, train_dir=None, test_dir=None):
    """Load the train/test feature matrices and label vectors of one feature set.

    Files are looked up as ``X_{name}.npz`` (sparse) or ``X_{name}.npy``
    (dense) for the features and ``y_{name}.npy`` for the labels, inside the
    train and test directories.

    Parameters
    ----------
    name : str
        Feature-set identifier embedded in the file names.
    sparse : bool, default True
        If True, read the feature matrices with ``scipy.sparse.load_npz``;
        otherwise read them with ``np.load``.
    train_dir, test_dir : str or path-like, optional
        Directories holding the train / test files. When omitted they fall
        back to the module-level ``DATA_TRAIN_PATH`` / ``DATA_TEST_PATH``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    ValueError
        If a feature matrix and its label vector differ in number of rows.
    """
    # Resolve the defaults lazily so explicit directories work without the globals.
    if train_dir is None:
        train_dir = DATA_TRAIN_PATH
    if test_dir is None:
        test_dir = DATA_TEST_PATH

    # Pick the reader and file extension that match the storage format.
    if sparse:
        read_features, extension = load_npz, "npz"
    else:
        read_features, extension = np.load, "npy"

    features_file = f"X_{name}.{extension}"
    labels_file = f"y_{name}.npy"

    X_train = read_features(os.path.join(train_dir, features_file))
    X_test = read_features(os.path.join(test_dir, features_file))
    y_train = np.load(os.path.join(train_dir, labels_file))
    y_test = np.load(os.path.join(test_dir, labels_file))

    # Fail fast on misaligned files instead of after a model has been trained.
    for split, X, y in (("train", X_train, y_train), ("test", X_test, y_test)):
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"Feature set '{name}' ({split}): X has {X.shape[0]} rows "
                f"but y has {y.shape[0]} labels"
            )

    return X_train, X_test, y_train, y_test

# Train and evaluate models

In [27]:
# One metrics row (dict) per evaluated (feature set, model) pair.
results = []

# Options shared by the three class-averaged metrics. `zero_division=0` is
# applied to all of them so precision, recall and F1 treat classes with an
# undefined score consistently (scored as 0, without UndefinedMetricWarning).
weighted_kwargs = {"average": "weighted", "zero_division": 0}

for feature_name, is_sparse in feature_sets:
    logger.info(f"📊 Feature set: {feature_name}")
    X_train, X_test, y_train, y_test = load_split_data(feature_name, sparse=is_sparse)

    for model_name, model in models.items():
        # Skip Naive Bayes on dense embeddings (e.g. BERT)
        # NOTE(review): presumably because MultinomialNB expects non-negative
        # features — confirm against the embedding values.
        if model_name == "NaiveBayes" and not is_sparse:
            logger.warning(f"⏭ Skipping NaiveBayes for dense feature set: {feature_name}")
            continue

        logger.info(f"🔧 Training model: {model_name}")
        # The estimator object from `models` is refitted for every feature set.
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Compute metrics on the held-out test split
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, **weighted_kwargs)
        rec = recall_score(y_test, y_pred, **weighted_kwargs)
        f1 = f1_score(y_test, y_pred, **weighted_kwargs)

        logger.info(f"✅ {model_name} — Acc: {acc:.3f} | F1: {f1:.3f}")

        results.append({
            "Feature": feature_name,
            "Model": model_name,
            "Accuracy": acc,
            "Precision": prec,
            "Recall": rec,
            "F1-Score": f1
        })

2025-04-11 16:03:24 [INFO] 📊 Feature set: tfidf_word_1gram
2025-04-11 16:03:24 [INFO] 🔧 Training model: RandomForest
2025-04-11 16:03:25 [INFO] ✅ RandomForest — Acc: 0.646 | F1: 0.643
2025-04-11 16:03:25 [INFO] 🔧 Training model: SVM
2025-04-11 16:03:34 [INFO] ✅ SVM — Acc: 0.792 | F1: 0.793
2025-04-11 16:03:34 [INFO] 🔧 Training model: XGBoost
2025-04-11 16:04:18 [INFO] ✅ XGBoost — Acc: 0.537 | F1: 0.530
2025-04-11 16:04:18 [INFO] 🔧 Training model: NaiveBayes
2025-04-11 16:04:18 [INFO] ✅ NaiveBayes — Acc: 0.838 | F1: 0.825
2025-04-11 16:04:18 [INFO] 🔧 Training model: MLP
2025-04-11 16:04:26 [INFO] ✅ MLP — Acc: 0.771 | F1: 0.774
2025-04-11 16:04:26 [INFO] 🔧 Training model: DecisionTree
2025-04-11 16:04:28 [INFO] ✅ DecisionTree — Acc: 0.292 | F1: 0.286
2025-04-11 16:04:28 [INFO] 📊 Feature set: tfidf_word_2gram
2025-04-11 16:04:28 [INFO] 🔧 Training model: RandomForest
2025-04-11 16:04:29 [INFO] ✅ RandomForest — Acc: 0.421 | F1: 0.422
2025-04-11 16:04:29 [INFO] 🔧 Training model: SVM
2025-04-

# Save results to CSV for reporting

In [28]:
# Persist the collected metric rows as a CSV report.
os.makedirs(REPORTS_PATH, exist_ok=True)  # create the reports folder if it is missing
report_file = os.path.join(REPORTS_PATH, "model_results.csv")
results_df = pd.DataFrame(results)
results_df.to_csv(report_file, index=False)  # index=False: omit the row-index column
logger.info(f"📄 Saved evaluation report to: {REPORTS_PATH}/model_results.csv")

2025-04-11 16:10:07 [INFO] 📄 Saved evaluation report to: ./../reports/model_results.csv
