In [1]:
import pandas as pd
import numpy as np
import re
import random
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense, LSTM, GRU, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
import warnings
warnings.filterwarnings("ignore")
tf.get_logger().setLevel('ERROR')
# ---------------------------
# Step 1. Dataset Paths
# ---------------------------
# Locations of the three CSV splits (relative to the working directory).
train_path, dev_path, test_path = (
    r"topics_train.csv",
    r"topics_dev.csv",
    r"topics_test.csv",
)


print("=== Loading Dataset ===")
# Read the splits in train -> dev -> test order.
train_df, dev_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_path, dev_path, test_path)
)

# Report the (rows, columns) shape of each split.
train_rows, train_cols = train_df.shape
dev_rows, dev_cols = dev_df.shape
test_rows, test_cols = test_df.shape
print(f"Train File: {train_path} -> {train_rows} samples, {train_cols} columns")
print(f"Dev File  : {dev_path} -> {dev_rows} samples, {dev_cols} columns")
print(f"Test File : {test_path} -> {test_rows} samples, {test_cols} columns\n")

# ---------------------------
# 6. Classical classifiers 
# ---------------------------
def build_classifiers(random_state=None):
    """Return the classical classifiers keyed by display name.

    Args:
        random_state: Seed forwarded to each estimator. When ``None`` (the
            default), the module-level ``RANDOM_SEED`` is used. It is looked
            up at call time because ``RANDOM_SEED`` is assigned further down
            the script, after this function is defined; using it directly as
            the default value raised ``NameError`` at definition time.

    Returns:
        dict mapping the classifier name to an unfitted estimator.
    """
    # Local import: at module level SVC is only imported after this definition.
    from sklearn.svm import SVC

    if random_state is None:
        random_state = RANDOM_SEED
    return {
        "SVM-RBF": SVC(kernel="rbf", probability=True, random_state=random_state),
    }

# Hyperparameter search space, keyed by the classifier names used in
# build_classifiers(); each value maps an estimator parameter to its candidates.
param_grids = {
    "SVM-RBF": dict(
        C=[0.1, 1, 10],
        gamma=["scale", "auto", 0.001, 0.0001],
    ),
}
# ---------------------------
# 7. 10-Fold CV
# ---------------------------
import os  # needed for os.makedirs below; it was not imported anywhere above

from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from tensorflow.keras import backend as K

# Seed shared by the fold splitter and the SVM classifier.
RANDOM_SEED = 42
NUM_CLASSES = len(train_df["Label"].unique())   # adjust column name if different

# Stratified splitter: 10 shuffled folds, reproducible through RANDOM_SEED.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_SEED)
# One dict of metrics is appended per completed fold.
fold_results = []

# Logging / output
os.makedirs("outputs", exist_ok=True)
per_fold_csv = "outputs/svm_rbf_10fold_results.csv"

# Stratified k-fold loop. For every fold:
#   1. train each backbone in BACKBONES on the training split,
#   2. extract its features for the training and validation splits,
#   3. concatenate and standardise the features,
#   4. fit an RBF-kernel SVM on them and score it on the validation split.
# NOTE(review): X_pad, y, BACKBONES, MAX_SEQ_LEN, VOCAB_SIZE and EMBEDDING_DIM
# are expected to be defined earlier in the notebook - confirm.
n_folds = skf.get_n_splits()  # avoids hard-coding the fold count in the banner
for fold_idx, (tr_idx, val_idx) in enumerate(skf.split(X_pad, y), start=1):
    print(f"\n========== Fold {fold_idx}/{n_folds} ==========")
    X_tr, X_val = X_pad[tr_idx], X_pad[val_idx]
    y_tr, y_val = y[tr_idx], y[val_idx]
    # Only the training labels are one-hot encoded: the backbones are fitted on
    # them, while the validation labels are consumed as integer ids by the SVM
    # metrics below.
    y_tr_cat = to_categorical(y_tr, num_classes=NUM_CLASSES)

    # --- Deep feature extraction per backbone ---
    feats_tr_list, feats_val_list = [], []
    for name, builder in BACKBONES.items():
        print(f"Training backbone: {name}")
        # Each builder returns the trainable model and its feature extractor.
        model, feat_extractor = builder(MAX_SEQ_LEN, VOCAB_SIZE, EMBEDDING_DIM)
        es = EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True, verbose=0)
        # 10% of the training split is held out to drive early stopping; the
        # fold's own validation split is never seen during backbone training.
        model.fit(
            X_tr, y_tr_cat,
            validation_split=0.1,
            epochs=8,
            batch_size=64,
            callbacks=[es],
            verbose=0
        )
        feats_tr_list.append(feat_extractor.predict(X_tr, batch_size=128, verbose=0))
        feats_val_list.append(feat_extractor.predict(X_val, batch_size=128, verbose=0))

        # Free memory before building the next backbone
        del model, feat_extractor
        K.clear_session()

    # Concatenate features from all backbones
    X_tr_feat = np.concatenate(feats_tr_list, axis=1)
    X_val_feat = np.concatenate(feats_val_list, axis=1)

    # Scale features (important for SVM); statistics come from the training
    # split only and are then applied to the validation split.
    scaler = StandardScaler(with_mean=True, with_std=True)
    X_tr_feat = scaler.fit_transform(X_tr_feat)
    X_val_feat = scaler.transform(X_val_feat)

    # --- SVM-RBF classifier ---
    # probability=True is intentionally not set: only predict() is used, and
    # enabling probability estimates adds an internal cross-validated
    # calibration step that slows fit() without changing the predicted labels.
    clf = SVC(kernel="rbf", C=1.0, gamma="scale", random_state=RANDOM_SEED)
    clf.fit(X_tr_feat, y_tr)
    y_val_pred = clf.predict(X_val_feat)

    # --- Evaluate fold ---
    acc = accuracy_score(y_val, y_val_pred)
    precision = precision_score(y_val, y_val_pred, average="weighted", zero_division=0)
    recall = recall_score(y_val, y_val_pred, average="weighted", zero_division=0)
    f1 = f1_score(y_val, y_val_pred, average="weighted", zero_division=0)

    print(f"Fold {fold_idx} - Acc: {acc:.4f} | Prec: {precision:.4f} | Rec: {recall:.4f} | F1: {f1:.4f}")

    fold_results.append({
        "Fold": fold_idx,
        "Accuracy": acc,
        "Precision": precision,
        "Recall": recall,
        "F1-score": f1
    })

    # Save incremental results so a crash mid-run keeps the finished folds
    pd.DataFrame(fold_results).to_csv(per_fold_csv, index=False)


# ---------------------------
# 8. Summary
# ---------------------------
# Report the metrics collected in `fold_results` by the cross-validation loop.
# The previous version of this section was an indented fragment referring to
# names (acc_curve, end_epoch, model_name, FOLDS, acc_final, ...) that are not
# defined in this script; it now summarises the per-fold records instead.
# Metrics are stored as fractions in [0, 1] and are printed as percentages.
if not fold_results:
    print("No fold results to summarise.")
else:
    metric_columns = ["Accuracy", "Precision", "Recall", "F1-score"]
    model_name = "SVM-RBF"
    completed_folds = len(fold_results)

    for row in fold_results:
        print(f"--- Fold {row['Fold']} Final ---")
        print(f"Accuracy: {row['Accuracy'] * 100:.2f}% | Precision: {row['Precision'] * 100:.2f}% "
              f"| Recall: {row['Recall'] * 100:.2f}% | F1: {row['F1-score'] * 100:.2f}%")

    # Mean of each metric across the completed folds, in percent.
    mean_pct = pd.DataFrame(fold_results)[metric_columns].mean() * 100

    # Final CV Results
    print(f"\n>>> {model_name} Final CV Results ({completed_folds} folds)")
    print(f"Accuracy: {mean_pct['Accuracy']:.2f}")
    print(f"Precision: {mean_pct['Precision']:.2f}")
    print(f"Recall: {mean_pct['Recall']:.2f}")
    print(f"F1: {mean_pct['F1-score']:.2f}")
    print("="*70)


=== Loading Dataset ===
Train File: topics_train.csv -> 6400 samples, 105 columns
Dev File  : topics_dev.csv -> 1600 samples, 105 columns
Test File : topics_test.csv -> 2000 samples, 105 columns



Epoch   1/100 - acc: 67.44% - prec: 70.26% - rec: 69.93% - f1: 70.33%
Epoch   2/100 - acc: 67.00% - prec: 70.53% - rec: 69.86% - f1: 70.55%
Epoch   3/100 - acc: 67.00% - prec: 70.46% - rec: 70.80% - f1: 70.47%
Epoch   4/100 - acc: 67.15% - prec: 70.92% - rec: 70.73% - f1: 71.00%
Epoch   5/100 - acc: 67.33% - prec: 70.92% - rec: 71.51% - f1: 71.35%
Epoch   6/100 - acc: 67.50% - prec: 71.53% - rec: 71.66% - f1: 71.39%
Epoch   7/100 - acc: 67.07% - prec: 71.72% - rec: 71.46% - f1: 72.15%
Epoch   8/100 - acc: 67.40% - prec: 71.84% - rec: 72.32% - f1: 72.07%
Epoch   9/100 - acc: 67.10% - prec: 72.10% - rec: 72.23% - f1: 72.14%
Epoch  10/100 - acc: 67.19% - prec: 72.22% - rec: 72.45% - f1: 72.36%
--- Fold 1 Final ---
Accuracy: 67.19% | Precision: 72.22% | Recall: 72.45% | F1: 72.36%

Epoch  11/100