In [4]:
# -*- coding: utf-8 -*-
"""
Embedding + Classifier Evaluation Pipeline
"""

import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from gensim.models import Word2Vec, FastText
from tqdm import tqdm

# ---------------------------
# 2. Load Data
# ---------------------------
# One CSV per split; the dev split is folded into train further down.
train_path, dev_path, test_path = (
    "topics_train.csv",
    "topics_dev.csv",
    "topics_test.csv",
)

train_df = pd.read_csv(train_path)
dev_df = pd.read_csv(dev_path)
test_df = pd.read_csv(test_path)

# Train on train + dev; the test split stays held out.
train_df = pd.concat((train_df, dev_df), ignore_index=True)

# Names of the columns holding the raw text and the target label.
text_column, label_column = "review", "sentiment_label"

X_train_raw, y_train_raw = train_df[text_column], train_df[label_column]
X_test_raw, y_test_raw = test_df[text_column], test_df[label_column]

# Map labels to integer ids. The encoder is fitted on the training labels
# only and then reused, unchanged, for the test labels.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train_raw)
y_test = label_encoder.transform(y_test_raw)
NUM_CLASSES = label_encoder.classes_.shape[0]

# ---------------------------
# 3. Preprocessing
# ---------------------------
# English stop-word list, stored as a set for fast per-token membership tests
# inside preprocess_text.
# NOTE(review): `stopwords`, `WordNetLemmatizer`, `word_tokenize` and `re` are
# not imported in the import block visible in this cell -- presumably they
# come from nltk / the stdlib via an earlier notebook cell; confirm before
# running this cell on its own.
stop_words = set(stopwords.words("english"))
# Shared lemmatizer instance reused for every token.
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise one raw document into a space-joined string of lemmas.

    Steps: lower-case, strip every character that is neither a word
    character nor whitespace, tokenise, drop stop words, lemmatise.

    Args:
        text: The raw document. Non-string values are stringified first.
            Missing values (``None`` or a float NaN, which is what pandas
            yields for empty CSV cells) are treated as an empty document.

    Returns:
        The cleaned tokens joined by single spaces; ``""`` for missing input.
    """
    # Without this guard, str(None) / str(nan) would inject the literal
    # tokens "none" / "nan" into the vocabulary for every missing review.
    if text is None or (isinstance(text, float) and np.isnan(text)):
        return ""

    text = str(text).lower()
    text = re.sub(r"[^\w\s]", "", text)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    return " ".join(tokens)

# Clean every document of both splits and keep the results as plain lists.
X_train_processed = [preprocess_text(doc) for doc in X_train_raw]
X_test_processed = [preprocess_text(doc) for doc in X_test_raw]

# -------------------------
# Embedding Functions
# -------------------------
def get_tfidf(train_texts, test_texts):
    """Build TF-IDF features (at most 5000 terms) for both splits.

    The vectorizer is fitted on ``train_texts`` only and then applied to
    ``test_texts``.

    Returns:
        A ``(train_features, test_features)`` tuple.
    """
    tfidf = TfidfVectorizer(max_features=5000)
    train_features = tfidf.fit_transform(train_texts)
    test_features = tfidf.transform(test_texts)
    return train_features, test_features

def get_bow(train_texts, test_texts):
    """Build bag-of-words count features (at most 5000 terms) for both splits.

    The vectorizer is fitted on ``train_texts`` only and then applied to
    ``test_texts``.

    Returns:
        A ``(train_features, test_features)`` tuple.
    """
    counter = CountVectorizer(max_features=5000)
    train_features = counter.fit_transform(train_texts)
    test_features = counter.transform(test_texts)
    return train_features, test_features

def get_word2vec(train_texts, test_texts, sg=0, vector_size=100):
    """Embed documents as the mean of their Word2Vec word vectors.

    A Word2Vec model is trained on the whitespace-tokenised training texts
    only; both splits are then embedded with it.

    Args:
        train_texts: Iterable of training documents (strings).
        test_texts: Iterable of test documents (strings).
        sg: Training algorithm passed to Word2Vec: 0 -> CBOW, 1 -> Skipgram.
        vector_size: Dimensionality of the word vectors, and therefore of
            the document vectors. Defaults to 100.

    Returns:
        A ``(train_vectors, test_vectors)`` tuple of arrays with one row per
        document. A document with no in-vocabulary token gets an all-zero
        vector.
    """
    tokenized_train = [t.split() for t in train_texts]
    tokenized_test = [t.split() for t in test_texts]
    model = Word2Vec(
        tokenized_train,
        vector_size=vector_size,
        window=5,
        sg=sg,
        min_count=1,
        workers=4,
    )
    word_vectors = model.wv
    # The fallback is sized from the same parameter as the model so the two
    # can never drift apart when the dimensionality is changed.
    zero_vector = np.zeros(vector_size)

    def embed(docs):
        rows = []
        for tokens in docs:
            known = [word_vectors[w] for w in tokens if w in word_vectors]
            rows.append(np.mean(known, axis=0) if known else zero_vector)
        return np.array(rows)

    return embed(tokenized_train), embed(tokenized_test)

def get_fasttext(train_texts, test_texts, vector_size=100):
    """Embed documents as the mean of their FastText word vectors.

    A FastText model is trained on the whitespace-tokenised training texts
    only; both splits are then embedded with it.

    Args:
        train_texts: Iterable of training documents (strings).
        test_texts: Iterable of test documents (strings).
        vector_size: Dimensionality of the word vectors, and therefore of
            the document vectors. Defaults to 100.

    Returns:
        A ``(train_vectors, test_vectors)`` tuple of arrays with one row per
        document. A document for which no token passes the ``in model.wv``
        check gets an all-zero vector.
    """
    tokenized_train = [t.split() for t in train_texts]
    tokenized_test = [t.split() for t in test_texts]
    model = FastText(
        tokenized_train,
        vector_size=vector_size,
        window=5,
        min_count=1,
        workers=4,
    )
    word_vectors = model.wv
    # The fallback is sized from the same parameter as the model so the two
    # can never drift apart when the dimensionality is changed.
    zero_vector = np.zeros(vector_size)

    def embed(docs):
        rows = []
        for tokens in docs:
            known = [word_vectors[w] for w in tokens if w in word_vectors]
            rows.append(np.mean(known, axis=0) if known else zero_vector)
        return np.array(rows)

    return embed(tokenized_train), embed(tokenized_test)

# -------------------------
# Classifier Functions
# -------------------------
def train_and_eval(X_train, y_train, X_test, y_test, clf, name, params):
    """Fit ``clf`` on the training data and score it on the test data.

    Args:
        X_train, y_train: Training features and labels passed to ``clf.fit``.
        X_test, y_test: Test features and the labels predictions are
            compared against.
        clf: Estimator exposing ``fit`` and ``predict``.
        name: Stored under the ``"Model"`` key of the result.
        params: Stored under the ``"Params"`` key of the result.

    Returns:
        A dict with keys ``Model``, ``Params``, ``Accuracy``, ``Precision``,
        ``Recall`` and ``F1``; the last three are weighted averages.
    """
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)

    report = {
        "Model": name,
        "Params": params,
        "Accuracy": accuracy_score(y_test, predictions),
    }
    # The remaining metrics all share the same weighted-average call shape.
    weighted_metrics = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1", f1_score),
    )
    for key, scorer in weighted_metrics:
        report[key] = scorer(y_test, predictions, average="weighted")
    return report

# -------------------------
# Embeddings + Classifiers Config
# -------------------------
# One metrics dict is appended here per (embedding, classifier) run.
results = []

# Embedding name -> callable(train_texts, test_texts) returning the
# (train_features, test_features) pair. Insertion order is the run order.
embeddings = {}
embeddings["TF-IDF"] = get_tfidf
embeddings["BoW"] = get_bow
embeddings["Word2Vec"] = lambda train_docs, test_docs: get_word2vec(train_docs, test_docs, sg=0)
embeddings["Skipgram"] = lambda train_docs, test_docs: get_word2vec(train_docs, test_docs, sg=1)
embeddings["FastText"] = get_fasttext
# Placeholder for GloVe (needs pretrained file)

# Classifier name -> zero-argument factory, so that every run gets a fresh,
# unfitted estimator. Insertion order is the run order.
classifiers = {}
classifiers["SVM-Linear"] = lambda: SVC(C=1.0, kernel="linear")
classifiers["SVM-Poly"] = lambda: SVC(C=1.0, kernel="poly", degree=3)
classifiers["SVM-RBF"] = lambda: SVC(C=1.0, kernel="rbf", gamma="scale")
classifiers["RandomForest"] = lambda: RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    random_state=42,
)
classifiers["XGBoost"] = lambda: XGBClassifier(
    use_label_encoder=False,
    eval_metric="logloss",
    eta=0.1,
    max_depth=6,
    subsample=0.8,
)
# NTK placeholder (custom kernel not available in sklearn directly)

# -------------------------
# Run Experiments
# -------------------------
# Evaluate every classifier on every embedding and collect the metrics.
for emb_name, emb_func in embeddings.items():
    print("\n=== Full Results ===\n")
    # Featurise the cleaned texts produced by preprocess_text. The names
    # X_train / X_test are never defined in this script, so using them here
    # raised NameError (or silently picked up stale notebook state).
    Xtr, Xte = emb_func(X_train_processed, X_test_processed)

    for clf_name, clf_func in classifiers.items():
        # Each factory call yields a fresh, unfitted estimator.
        clf = clf_func()
        res = train_and_eval(
            Xtr,
            y_train,
            Xte,
            y_test,
            clf,
            f"{emb_name} + {clf_name}",
            str(clf.get_params()),
        )
        results.append(res)
        print(res)




=== Full Results ===

+-------------+---------------+------------+-------------+----------+------------+
| Embedding   | Classifier    |   Accuracy |   Precision |   Recall |   F1-Score |
|-------------+---------------+------------+-------------+----------+------------|
| TF-IDF      | SVM-Linear    |      88.94 |       88.97 |    88.94 |      88.94 |
| TF-IDF      | SVM (Poly)    |      94.21 |       94.17 |    94.45 |      94.03 |
| TF-IDF      | SVM-RBF       |      97.94 |       97.94 |    97.94 |      97.94 |
| TF-IDF      | Random Forest |      91.75 |       90.93 |    91.12 |      90.84 |
| TF-IDF      | NTK           |      90.57 |       90.81 |    90.23 |      90.06 |
| TF-IDF      | XGBoost       |      96.31 |       96.38 |    96.31 |      96.31 |
| BOW         | SVM-Linear    |      97.69 |       97.69 |    97.69 |      97.69 |
| BOW         | SVM (Poly)    |      74.06 |       80.48 |    74.06 |      72.62 |
| BOW         | SVM-RBF       |      94.23 |       94.09 |    94