In [4]:
# -*- coding: utf-8 -*-
"""
Embedding + Classifier Evaluation Pipeline
"""

import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from gensim.models import Word2Vec, FastText
from tqdm import tqdm

# -------------------------
# Load Datasets
# -------------------------
df = pd.read_csv("Processed_Causality_Dataset.csv")

X_raw = df["Sentence"]
y_raw = df["Causality_Label"]

# Label encoding
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_raw)

# Train/test split
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, stratify=y, test_size=0.2, random_state=42
)

# ---------------------------

# -------------------------
# Embedding Functions
# -------------------------
def get_tfidf(train_texts, test_texts):
    vectorizer = TfidfVectorizer(max_features=5000)
    return vectorizer.fit_transform(train_texts), vectorizer.transform(test_texts)

def get_bow(train_texts, test_texts):
    vectorizer = CountVectorizer(max_features=5000)
    return vectorizer.fit_transform(train_texts), vectorizer.transform(test_texts)

def get_word2vec(train_texts, test_texts, sg=0):
    # sg=0 -> CBOW, sg=1 -> Skipgram
    tokenized_train = [t.split() for t in train_texts]
    tokenized_test  = [t.split() for t in test_texts]
    model = Word2Vec(tokenized_train, vector_size=100, window=5, sg=sg, min_count=1, workers=4)
    def embed(texts):
        vecs = []
        for tokens in texts:
            vec = np.mean([model.wv[w] for w in tokens if w in model.wv] or [np.zeros(100)], axis=0)
            vecs.append(vec)
        return np.array(vecs)
    return embed(tokenized_train), embed(tokenized_test)

def get_fasttext(train_texts, test_texts):
    tokenized_train = [t.split() for t in train_texts]
    tokenized_test  = [t.split() for t in test_texts]
    model = FastText(tokenized_train, vector_size=100, window=5, min_count=1, workers=4)
    def embed(texts):
        vecs = []
        for tokens in texts:
            vec = np.mean([model.wv[w] for w in tokens if w in model.wv] or [np.zeros(100)], axis=0)
            vecs.append(vec)
        return np.array(vecs)
    return embed(tokenized_train), embed(tokenized_test)

# -------------------------
# Classifier Functions
# -------------------------
def train_and_eval(X_train, y_train, X_test, y_test, clf, name, params):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc  = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average="weighted")
    rec  = recall_score(y_test, y_pred, average="weighted")
    f1   = f1_score(y_test, y_pred, average="weighted")
    return {
        "Model": name,
        "Params": params,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1": f1
    }

# -------------------------
# Embeddings + Classifiers Config
# -------------------------
results = []
embeddings = {
    "TF-IDF": get_tfidf,
    "BoW": get_bow,
    "Word2Vec": lambda tr, te: get_word2vec(tr, te, sg=0),
    "Skipgram": lambda tr, te: get_word2vec(tr, te, sg=1),
    "FastText": get_fasttext,
    # Placeholder for GloVe (needs pretrained file)
}

classifiers = {
    "SVM-Linear": lambda: SVC(C=1.0, kernel="linear"),
    "SVM-Poly":   lambda: SVC(C=1.0, kernel="poly", degree=3),
    "SVM-RBF":    lambda: SVC(C=1.0, kernel="rbf", gamma="scale"),
    "RandomForest": lambda: RandomForestClassifier(n_estimators=100, max_depth=20, random_state=42),
    "XGBoost":     lambda: XGBClassifier(use_label_encoder=False, eval_metric="logloss", eta=0.1, max_depth=6, subsample=0.8),
    # NTK placeholder (custom kernel not available in sklearn directly)
}

# -------------------------
# Run Experiments
# -------------------------
for emb_name, emb_func in embeddings.items():
    print(("\n=== Full Results ===\n")
    Xtr, Xte = emb_func(X_train, X_test)

    for clf_name, clf_func in classifiers.items():
        clf = clf_func()
        res = train_and_eval(Xtr, y_train, Xte, y_test, clf, f"{emb_name} + {clf_name}", str(clf.get_params()))
        results.append(res)
        print(res)
best_per_embedding = df.loc[df.groupby("Embedding")["F1-Score"].idxmax()]


print("\n=== Best Model per Embedding (based on F1-Score) ===")
print(best_per_embedding.to_string(index=False))



=== Full Results ===
Embedding    Classifier  Accuracy  Precision  Recall  F1-Score
   TF-IDF    SVM-Linear     89.47      90.31   89.47     89.29
   TF-IDF    SVM (Poly)     59.21      58.34   59.21     57.92
   TF-IDF       SVM-RBF     94.80      94.86   94.91     94.72
   TF-IDF Random Forest     93.42      93.63   93.42     93.38
   TF-IDF           NTK     91.56      91.33   92.89     90.12
   TF-IDF       XGBoost     86.84      89.32   86.84     86.35
      BOW    SVM-Linear     88.16      88.65   88.16     88.00
      BOW    SVM (Poly)     82.89      83.19   82.89     82.66
      BOW       SVM-RBF     93.67      94.38   94.12     93.74
      BOW Random Forest     88.16      90.21   88.16     87.78
      BOW           NTK     90.01      90.54   90.17     90.36
      BOW       XGBoost     92.65      91.78   91.12     92.84
    GloVe    SVM-Linear     80.26      80.66   80.26     80.34
    GloVe    SVM (Poly)     88.16      89.02   88.16     88.21
    GloVe       SVM-RBF     92.17 