In [1]:
import os
import sys
import numpy as np
import pandas as pd
from scipy.sparse import load_npz
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.discriminant_analysis import StandardScaler
from sklearn.model_selection import GridSearchCV
from concurrent.futures import ThreadPoolExecutor, as_completed
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier

# Make ../src (relative to the current working directory) importable for project-local modules
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..', 'src')))
from logger import get_logger

# Paths

In [2]:
# Relative paths: they resolve against the current working directory at run time.
DATA_TRAIN_PATH = "./../data/train"  # read by load_split_data for the training X_/y_ files
DATA_TEST_PATH = "./../data/test"  # read by load_split_data for the test X_/y_ files
REPORTS_PATH = "./../reports"  # directory that receives model_results.csv

# Initialize logger

In [3]:
# Notebook-wide logger; get_logger is imported from the project-local `logger` module under ../src.
logger = get_logger("train_models")

# Define classification models

In [4]:
# Registry of model name -> zero-argument factory. Each factory builds a fresh,
# unfitted estimator on every call, so no fitted state is shared between runs.
# Insertion order is the order in which the models are later submitted.
models = dict(
    RandomForest=lambda: RandomForestClassifier(
        n_estimators=100, class_weight="balanced", random_state=42, n_jobs=-1
    ),
    SVM=lambda: SVC(kernel="linear", class_weight="balanced", random_state=42),
    # NOTE(review): `use_label_encoder` is deprecated in recent XGBoost
    # releases -- confirm against the installed version before relying on it.
    XGBoost=lambda: XGBClassifier(
        n_estimators=100,
        eval_metric="mlogloss",
        use_label_encoder=False,
        random_state=42,
        n_jobs=-1,
    ),
    NaiveBayes=lambda: MultinomialNB(),
    MLP=lambda: MLPClassifier(
        hidden_layer_sizes=(64,),
        learning_rate_init=0.01,
        max_iter=200,
        early_stopping=True,
        random_state=42,
    ),
    DecisionTree=lambda: DecisionTreeClassifier(class_weight="balanced", random_state=42),
)

# Define feature sets and their sparsity

In [5]:
# (feature-set name, is_sparse) pairs. The flag is forwarded to load_split_data,
# which reads sparse sets from .npz files and dense sets from .npy files.
feature_sets = [
    *(
        (tfidf_name, True)
        for tfidf_name in (
            "tfidf_word_1gram",
            "tfidf_word_2gram",
            "tfidf_word_3gram",
            "tfidf_char_2gram",
            "tfidf_char_3gram",
        )
    ),
    ("bert", False),
]

# Load training and test data for a given feature set

In [6]:
def load_split_data(name, sparse=True):
    """Read the train/test features and labels stored for one feature set.

    Args:
        name: Feature-set identifier embedded in the file names
            (``X_{name}.*`` for features, ``y_{name}.npy`` for labels).
        sparse: When truthy, features are read from ``.npz`` files with
            ``load_npz``; otherwise from ``.npy`` files with ``np.load``.

    Returns:
        The 4-tuple ``(X_train, X_test, y_train, y_test)``.
    """
    if not sparse:
        features = (
            np.load(f"{DATA_TRAIN_PATH}/X_{name}.npy"),
            np.load(f"{DATA_TEST_PATH}/X_{name}.npy"),
        )
    else:
        features = (
            load_npz(f"{DATA_TRAIN_PATH}/X_{name}.npz"),
            load_npz(f"{DATA_TEST_PATH}/X_{name}.npz"),
        )

    # Labels are always stored as .npy, whatever the feature format.
    labels = (
        np.load(f"{DATA_TRAIN_PATH}/y_{name}.npy"),
        np.load(f"{DATA_TEST_PATH}/y_{name}.npy"),
    )
    return (*features, *labels)

# Evaluation function

In [7]:
def evaluate_model(feature_name, is_sparse, model_name, model_factory):
    """Train one model on one feature set and score it on the test split.

    Args:
        feature_name: Feature-set identifier passed to ``load_split_data``.
        is_sparse: Whether the feature set is stored in sparse format.
        model_name: Display name of the model. It also selects the special
            cases below: the NaiveBayes skip on dense features, scaling for
            SVM/MLP on dense features, and the grid search for SVM on "bert".
        model_factory: Zero-argument callable returning a fresh, unfitted
            estimator.

    Returns:
        A dict with accuracy, error rate, per-class precision/recall/F1 for
        the first two classes, and weighted F1; or ``None`` when the
        combination is skipped or any step raises (the error is logged).
    """
    logger.info(f"📊 Feature: {feature_name} | 🔧 Model: {model_name}")
    try:
        # Decide on the skip before touching the disk so skipped combinations
        # cost no I/O. (Presumably MultinomialNB is skipped because dense
        # embeddings can be negative -- confirm.)
        if model_name == "NaiveBayes" and not is_sparse:
            logger.warning(f"⏭ Skipping NaiveBayes for dense feature set: {feature_name}")
            return None

        X_train, X_test, y_train, y_test = load_split_data(feature_name, sparse=is_sparse)

        # Standardise dense features for SVM/MLP; the scaler is fitted on the
        # training split only and then applied to the test split.
        if not is_sparse and model_name in ["SVM", "MLP"]:
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

        # Light tuning only for SVM + BERT
        if model_name == "SVM" and feature_name == "bert":
            logger.info(f"🔍 GridSearch for SVM on {feature_name}")
            # The grid has a single candidate, so this is a 2-fold CV score
            # followed by a refit on the full training split.
            param_grid = {"C": [1]}
            # Build the search around the configured factory so the tuned
            # estimator cannot drift from the one registered for this model.
            grid = GridSearchCV(model_factory(), param_grid, cv=2, scoring="f1_weighted", n_jobs=-1)
            grid.fit(X_train, y_train)
            model = grid.best_estimator_
        else:
            model = model_factory()
            model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        # Pin the per-class ordering to the classes seen in training. Without
        # this, a class missing from both y_test and y_pred would shift the
        # positions and report the wrong class under the Neg_* columns.
        class_labels = np.unique(y_train)

        acc = accuracy_score(y_test, y_pred)
        err = 1 - acc
        prec_per_class = precision_score(
            y_test, y_pred, labels=class_labels, average=None, zero_division=0
        )
        recall_per_class = recall_score(
            y_test, y_pred, labels=class_labels, average=None, zero_division=0
        )
        f1_per_class = f1_score(
            y_test, y_pred, labels=class_labels, average=None, zero_division=0
        )
        weighted_f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)

        logger.info(f"✅ {model_name} on {feature_name} → Acc: {acc:.3f} | F1: {weighted_f1:.3f}")

        # Index 0 is reported as the negative class and index 1 as the positive
        # class; the positive metrics fall back to 0 when only one class exists.
        return {
            "Feature": feature_name,
            "Model": model_name,
            "Accuracy": acc,
            "ErrorRate": err,
            "Pos_Precision": prec_per_class[1] if len(prec_per_class) > 1 else 0,
            "Neg_Precision": prec_per_class[0],
            "Pos_Recall": recall_per_class[1] if len(recall_per_class) > 1 else 0,
            "Neg_Recall": recall_per_class[0],
            "Pos_F1": f1_per_class[1] if len(f1_per_class) > 1 else 0,
            "Neg_F1": f1_per_class[0],
            "Weighted_F1": weighted_f1
        }

    except Exception as e:
        # Best-effort: one failing combination must not abort the whole sweep.
        # Same message as before, but logged with the traceback so the failure
        # can be diagnosed (assumes a logging.Logger-compatible logger).
        logger.exception(f"❌ Error in {model_name} on {feature_name}: {e}")
        return None

# Run evaluations in parallel

In [8]:
results = []

logger.info("🚀 Starting parallel model evaluation...")
# NOTE(review): several estimators are configured with n_jobs=-1, so running
# os.cpu_count() of them concurrently may oversubscribe the CPU -- confirm
# this is acceptable on the target machine.
with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
    futures = [
        executor.submit(evaluate_model, fname, is_sparse, mname, mfactory)
        for fname, is_sparse in feature_sets
        for mname, mfactory in models.items()
    ]

    # Collect in submission order, not completion order, so the rows of
    # `results` (and of the saved report) are the same on every run. All tasks
    # are already submitted, so waiting in order costs no extra wall-clock time.
    for future in futures:
        res = future.result()
        # evaluate_model returns None for skipped or failed combinations.
        if res:
            results.append(res)

2025-04-21 14:23:24 [INFO] 🚀 Starting parallel model evaluation...
2025-04-21 14:23:24 [INFO] 📊 Feature: tfidf_word_1gram | 🔧 Model: RandomForest
2025-04-21 14:23:24 [INFO] 📊 Feature: tfidf_word_1gram | 🔧 Model: SVM
2025-04-21 14:23:24 [INFO] 📊 Feature: tfidf_word_1gram | 🔧 Model: XGBoost
2025-04-21 14:23:24 [INFO] 📊 Feature: tfidf_word_1gram | 🔧 Model: NaiveBayes
2025-04-21 14:23:24 [INFO] 📊 Feature: tfidf_word_1gram | 🔧 Model: MLP
2025-04-21 14:23:24 [INFO] 📊 Feature: tfidf_word_1gram | 🔧 Model: DecisionTree
2025-04-21 14:23:24 [INFO] 📊 Feature: tfidf_word_2gram | 🔧 Model: RandomForest
2025-04-21 14:23:24 [INFO] 📊 Feature: tfidf_word_2gram | 🔧 Model: SVM
2025-04-21 14:23:24 [INFO] 📊 Feature: tfidf_word_2gram | 🔧 Model: XGBoost
2025-04-21 14:23:24 [INFO] 📊 Feature: tfidf_word_2gram | 🔧 Model: NaiveBayes
2025-04-21 14:23:24 [INFO] 📊 Feature: tfidf_word_2gram | 🔧 Model: MLP
2025-04-21 14:23:24 [INFO] 📊 Feature: tfidf_word_2gram | 🔧 Model: DecisionTree
2025-04-21 14:23:25 [INFO] ✅ NaiveB

# Save results to CSV for reporting

In [9]:
# Write the collected metric rows to a CSV under REPORTS_PATH, creating the
# directory first if it does not exist yet.
os.makedirs(REPORTS_PATH, exist_ok=True)
report_csv = os.path.join(REPORTS_PATH, "model_results.csv")
results_df = pd.DataFrame(results)
results_df.to_csv(report_csv, index=False)
logger.info(f"📄 Saved detailed report to: {REPORTS_PATH}/model_results.csv")

2025-04-21 14:29:04 [INFO] 📄 Saved detailed report to: ./../reports/model_results.csv
