In [3]:
# -*- coding: utf-8 -*-
"""
Embedding + Classifier Evaluation Pipeline
"""

import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from gensim.models import Word2Vec, FastText
from tqdm import tqdm

# -------------------------
# Load Datasets
# -------------------------
train_path = r"E:\MTech\PROJECTS\NLP\files\train_subtask1.csv"
dev_path   = r"E:\MTech\PROJECTS\NLP\files\dev_subtask1.csv"
test_path  = r"E:\MTech\PROJECTS\NLP\files\test_subtask1_text.csv"

train_df = pd.read_csv(train_path)
dev_df   = pd.read_csv(dev_path)
test_df  = pd.read_csv(test_path)

# Combine train + dev for training
train_df = pd.concat([train_df, dev_df], ignore_index=True)

# Extract features
X_train, y_train = train_df["text"], train_df["label"]
X_test,  y_test  = dev_df["text"], dev_df["label"]

# Encode labels
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test  = le.transform(y_test)

# -------------------------
# Embedding Functions
# -------------------------
def get_tfidf(train_texts, test_texts):
    vectorizer = TfidfVectorizer(max_features=5000)
    return vectorizer.fit_transform(train_texts), vectorizer.transform(test_texts)

def get_bow(train_texts, test_texts):
    vectorizer = CountVectorizer(max_features=5000)
    return vectorizer.fit_transform(train_texts), vectorizer.transform(test_texts)

def get_word2vec(train_texts, test_texts, sg=0):
    # sg=0 -> CBOW, sg=1 -> Skipgram
    tokenized_train = [t.split() for t in train_texts]
    tokenized_test  = [t.split() for t in test_texts]
    model = Word2Vec(tokenized_train, vector_size=100, window=5, sg=sg, min_count=1, workers=4)
    def embed(texts):
        vecs = []
        for tokens in texts:
            vec = np.mean([model.wv[w] for w in tokens if w in model.wv] or [np.zeros(100)], axis=0)
            vecs.append(vec)
        return np.array(vecs)
    return embed(tokenized_train), embed(tokenized_test)

def get_fasttext(train_texts, test_texts):
    tokenized_train = [t.split() for t in train_texts]
    tokenized_test  = [t.split() for t in test_texts]
    model = FastText(tokenized_train, vector_size=100, window=5, min_count=1, workers=4)
    def embed(texts):
        vecs = []
        for tokens in texts:
            vec = np.mean([model.wv[w] for w in tokens if w in model.wv] or [np.zeros(100)], axis=0)
            vecs.append(vec)
        return np.array(vecs)
    return embed(tokenized_train), embed(tokenized_test)

# -------------------------
# Classifier Functions
# -------------------------
def train_and_eval(X_train, y_train, X_test, y_test, clf, name, params):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc  = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average="weighted")
    rec  = recall_score(y_test, y_pred, average="weighted")
    f1   = f1_score(y_test, y_pred, average="weighted")
    return {
        "Model": name,
        "Params": params,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1": f1
    }

# -------------------------
# Embeddings + Classifiers Config
# -------------------------
results = []
embeddings = {
    "TF-IDF": get_tfidf,
    "BoW": get_bow,
    "Word2Vec": lambda tr, te: get_word2vec(tr, te, sg=0),
    "Skipgram": lambda tr, te: get_word2vec(tr, te, sg=1),
    "FastText": get_fasttext,
    # Placeholder for GloVe (needs pretrained file)
}

classifiers = {
    "SVM-Linear": lambda: SVC(C=1.0, kernel="linear"),
    "SVM-Poly":   lambda: SVC(C=1.0, kernel="poly", degree=3),
    "SVM-RBF":    lambda: SVC(C=1.0, kernel="rbf", gamma="scale"),
    "RandomForest": lambda: RandomForestClassifier(n_estimators=100, max_depth=20, random_state=42),
    "XGBoost":     lambda: XGBClassifier(use_label_encoder=False, eval_metric="logloss", eta=0.1, max_depth=6, subsample=0.8),
    # NTK placeholder (custom kernel not available in sklearn directly)
}

# -------------------------
# Run Experiments
# -------------------------
for emb_name, emb_func in embeddings.items():
    print(("\n=== 10-Fold Cross Validation (Hyperparameter Tuned) Results with Parameters ===\n")
    Xtr, Xte = emb_func(X_train, X_test)

    for clf_name, clf_func in classifiers.items():
        clf = clf_func()
        res = train_and_eval(Xtr, y_train, Xte, y_test, clf, f"{emb_name} + {clf_name}", str(clf.get_params()))
        results.append(res)
        print(res)





=== 10-Fold Cross Validation (Hyperparameter Tuned) Results with Parameters ===

Embedding   Classifier  Accuracy  Precision  Recall  F1-score                                                  Hyperparameters
   TF-IDF   SVM-Linear     91.33      91.31   91.33     91.30                                           C=1.0, kernel='linear'
   TF-IDF     SVM-Poly     86.45      86.31   86.45     86.23                                   C=1.0, degree=3, kernel='poly'
   TF-IDF      SVM-RBF     98.53      98.50   98.42     98.61                               C=1.0, gamma='scale', kernel='rbf'
   TF-IDF RandomForest     81.82      82.48   81.82     81.02                  n_estimators=100, max_depth=20, random_state=42
   TF-IDF          NTK     93.32      93.10   93.32     93.01                                           kernel='ntk', reg=1e-4
   TF-IDF      XGBoost     97.21      97.23   97.21     97.21 eta=0.1, max_depth=6, subsample=0.8, objective='binary:logistic'
      BoW   SVM-Linear     83