In [1]:
# -*- coding: utf-8 -*-
"""
Embedding-Level Training with Soft Voting Classifier
Embeddings: BoW, TF-IDF, Word2Vec (CBOW + SkipGram), FastText, GloVe
Classifiers: SVM (Linear), XGBoost, Random Forest, Naive Bayes
Evaluation: 10-Fold Cross Validation
"""

import re
import pandas as pd
import numpy as np
import nltk
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from gensim.models import Word2Vec, FastText, KeyedVectors
from tqdm import tqdm
import os

# ---------------------------
# 1. Load Dataset
# ---------------------------
print("=== Loading Dataset ===")
df_train = pd.read_csv("train_subtask1.csv")
df_dev   = pd.read_csv("dev_subtask1.csv")
# NOTE(review): the captured notebook output names this file
# "test_subtask1_text.csv" -- confirm which path is correct.
df_test = pd.read_csv("test_subtask_text.csv")

# Train and dev are pooled because evaluation is done by cross-validation.
# ignore_index=True gives the pooled Series a fresh 0..n-1 index; without it
# the dev rows reuse index labels already present in the train rows, which
# makes any later label-based lookup or alignment ambiguous.
X_raw = pd.concat([df_train["text"], df_dev["text"]], ignore_index=True)
y_raw = pd.concat([df_train["label"], df_dev["label"]], ignore_index=True)

print(f"Total Samples: {len(X_raw)} | Classes: {len(np.unique(y_raw))}")

# ---------------------------
# 2. Text Preprocessing
# ---------------------------
# Fetch the NLTK data used below: tokenizer models, the English stop-word
# list and WordNet (for the lemmatizer).
# "punkt_tab" is the tokenizer resource that nltk.word_tokenize loads on
# recent NLTK releases; on older releases the id is unknown and
# nltk.download() just reports that and returns False, so requesting both
# keeps the script working across versions.
for resource in ("punkt", "punkt_tab", "stopwords", "wordnet"):
    nltk.download(resource)

stop_words = set(nltk.corpus.stopwords.words("english"))
lemmatizer = nltk.stem.WordNetLemmatizer()

def preprocess(text):
    """Normalize one raw text into a space-joined string of lemmas.

    The input is stringified and lower-cased; URLs (``http...``/``www...``),
    ``@mentions`` and every character that is neither a letter nor
    whitespace are deleted; the remainder is tokenized with NLTK, tokens in
    the module-level ``stop_words`` set are dropped, and the survivors are
    lemmatized with the module-level ``lemmatizer``.
    """
    lowered = str(text).lower()
    cleaned = re.sub(r"http\S+|www\S+|@\w+|[^a-zA-Z\s]", "", lowered)

    kept = []
    for token in nltk.word_tokenize(cleaned):
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)

# Clean every pooled text once; all embedding generators below consume this.
X_processed = X_raw.apply(preprocess)

# ---------------------------
# 3. Embedding Generators
# ---------------------------
def get_bow(X):
    """Return a dense bag-of-words count matrix for the texts in ``X``.

    A default ``CountVectorizer`` is fitted on ``X`` itself and the resulting
    sparse matrix is converted to a dense NumPy array.
    """
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(X)
    return counts.toarray()

def get_tfidf(X):
    """Return a dense TF-IDF matrix for the texts in ``X``.

    A default ``TfidfVectorizer`` is fitted on ``X`` itself and the resulting
    sparse matrix is converted to a dense NumPy array.
    """
    vectorizer = TfidfVectorizer()
    weights = vectorizer.fit_transform(X)
    return weights.toarray()

def get_word2vec(X, size=100, window=5, min_count=2, sg=0):
    """Embed each text as the mean of its Word2Vec word vectors.

    A Word2Vec model is trained on the tokenized texts in ``X`` with the
    given hyper-parameters (``sg`` is passed straight through to gensim).
    Texts with no token in the trained vocabulary get a zero vector of
    length ``size``.

    Returns:
        NumPy array with one row per text.
    """
    tokenized_docs = [nltk.word_tokenize(doc) for doc in X]
    w2v = Word2Vec(
        tokenized_docs,
        vector_size=size,
        window=window,
        min_count=min_count,
        workers=4,
        sg=sg,
    )
    keyed = w2v.wv

    doc_vectors = []
    for doc_tokens in tokenized_docs:
        known = [keyed[tok] for tok in doc_tokens if tok in keyed]
        if not known:
            # No usable token: fall back to a single zero vector.
            known = [np.zeros(size)]
        doc_vectors.append(np.mean(known, axis=0))
    return np.array(doc_vectors)

def get_fasttext(X, size=100, window=5, min_count=2):
    """Embed each text as the mean of its FastText word vectors.

    A FastText model is trained on the tokenized texts in ``X`` with the
    given hyper-parameters. Tokens are used when ``token in model.wv`` is
    true; a text with no such token gets a zero vector of length ``size``.

    Returns:
        NumPy array with one row per text.
    """
    tokenized_docs = [nltk.word_tokenize(doc) for doc in X]
    ft = FastText(
        tokenized_docs,
        vector_size=size,
        window=window,
        min_count=min_count,
        workers=4,
    )
    keyed = ft.wv

    doc_vectors = []
    for doc_tokens in tokenized_docs:
        known = [keyed[tok] for tok in doc_tokens if tok in keyed]
        if not known:
            # No usable token: fall back to a single zero vector.
            known = [np.zeros(size)]
        doc_vectors.append(np.mean(known, axis=0))
    return np.array(doc_vectors)

def get_glove(X, glove_path="glove.6B.100d.txt", size=100):
    """Embed each text as the mean of its pre-trained GloVe word vectors.

    The file at ``glove_path`` is read line by line; each line is split on
    whitespace into a word followed by its float32 coefficients. Texts with
    no token found in the file get a zero vector of length ``size``.

    Returns:
        NumPy array with one row per text.
    """
    glove_vectors = {}
    with open(glove_path, encoding="utf8") as handle:
        for raw_line in tqdm(handle, desc="Reading GloVe"):
            parts = raw_line.split()
            glove_vectors[parts[0]] = np.asarray(parts[1:], dtype="float32")

    tokenized_docs = [nltk.word_tokenize(doc) for doc in X]

    doc_vectors = []
    for doc_tokens in tokenized_docs:
        hits = [glove_vectors[tok] for tok in doc_tokens if tok in glove_vectors]
        if not hits:
            # No token has a GloVe entry: fall back to a single zero vector.
            hits = [np.zeros(size)]
        doc_vectors.append(np.mean(hits, axis=0))
    return np.array(doc_vectors)

# ---------------------------
# 4. Classifiers
# ---------------------------
def build_voting_classifier():
    """Build a fresh, unfitted soft-voting ensemble.

    The ensemble combines a linear-kernel SVM (with probability estimates
    enabled, which soft voting needs), XGBoost, a 100-tree random forest and
    Gaussian Naive Bayes.

    Every estimator that accepts a seed is seeded with 42, matching the
    random forest and the cross-validation splitter, so repeated runs give
    the same ensemble behavior.

    Returns:
        An unfitted ``VotingClassifier`` configured with ``voting="soft"``.
    """
    # probability=True makes SVC fit an internal cross-validated calibration,
    # which is randomized; seed it for reproducibility.
    clf1 = SVC(kernel="linear", probability=True, random_state=42)
    clf2 = XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=42)
    clf3 = RandomForestClassifier(n_estimators=100, random_state=42)
    clf4 = GaussianNB()
    ensemble = VotingClassifier(estimators=[
        ("svm", clf1), ("xgb", clf2), ("rf", clf3), ("nb", clf4)
    ], voting="soft")
    return ensemble

# ---------------------------
# 5. Cross Validation
# ---------------------------
def evaluate_embedding(X_emb, y, name):
    """Cross-validate the soft-voting ensemble on one embedding.

    Runs 10-fold stratified cross-validation (shuffled, seed 42), fitting a
    fresh ensemble from ``build_voting_classifier()`` on each fold and
    printing per-fold and mean metrics.

    Args:
        X_emb: Feature matrix with one row per sample; it is indexed with
            integer index arrays.
        y: pandas Series of labels, positionally aligned with ``X_emb``
            (rows are selected with ``.iloc``).
        name: Embedding name used in the printed output and the result row.

    Returns:
        ``[name, accuracy, precision, recall, f1]`` where each metric is the
        mean over the 10 folds, in percent; precision, recall and F1 are
        macro-averaged.
    """
    print(f"\n=== Training Embedding: {name} + Soft Voting Ensemble ===")
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    accs, precs, recs, f1s = [], [], [], []

    for fold, (train_idx, test_idx) in enumerate(skf.split(X_emb, y), 1):
        X_train, X_test = X_emb[train_idx], X_emb[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        clf = build_voting_classifier()
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        accs.append(accuracy_score(y_test, y_pred) * 100)
        precs.append(precision_score(y_test, y_pred, average="macro") * 100)
        recs.append(recall_score(y_test, y_pred, average="macro") * 100)
        f1s.append(f1_score(y_test, y_pred, average="macro") * 100)

        print(f"Fold {fold:02d} -> Accuracy: {accs[-1]:.2f} | Precision: {precs[-1]:.2f} | "
              f"Recall: {recs[-1]:.2f} | F1: {f1s[-1]:.2f}")

    # Aggregate the folds so the caller can build the summary table.
    mean_acc = float(np.mean(accs))
    mean_prec = float(np.mean(precs))
    mean_rec = float(np.mean(recs))
    mean_f1 = float(np.mean(f1s))
    print(f"10-Fold CV -> Accuracy: {mean_acc:.2f} | Precision: {mean_prec:.2f} | "
          f"Recall: {mean_rec:.2f} | F1: {mean_f1:.2f}")

    return [name, mean_acc, mean_prec, mean_rec, mean_f1]


# ---------------------------
# 6. Run Experiments
# ---------------------------
# NOTE(review): every embedding below is fitted on the full corpus before the
# cross-validation split, so vocabulary / IDF / word-vector statistics from
# each fold's test rows leak into its training features. The scores are
# therefore optimistic; fitting the embedding inside each fold would fix it.
embeddings = {
    "BoW": get_bow(X_processed),
    "TF-IDF": get_tfidf(X_processed),
    "Word2Vec-CBOW": get_word2vec(X_processed, sg=0),
    "Word2Vec-SkipGram": get_word2vec(X_processed, sg=1),
    "FastText": get_fasttext(X_processed),
    "GloVe": get_glove(X_processed, glove_path="glove.6B.100d.txt"),
}

# One [name, acc, prec, rec, f1] row per embedding, consumed by the summary.
embedding_results = []
for name, X_emb in embeddings.items():
    # asarray avoids copying matrices that are already NumPy arrays.
    embedding_results.append(evaluate_embedding(np.asarray(X_emb), y_raw, name))


# ---------------------------
# 7. Summary Table
# ---------------------------
df_embed = pd.DataFrame(embedding_results, columns=[
    "Embedding", "Accuracy", "Precision", "Recall", "F1-Score"
])

print("=== Final Embedding-Level Results (Soft Voting, 10-Fold CV) ===\n")
print(df_embed.to_string(index=False, float_format="{:.2f}".format))


=== Loading Dataset ===
Train File: train_subtask1.csv -> 2925 samples, 6 columns
Dev File  : dev_subtask1.csv   -> 323 samples, 6 columns
Test File : test_subtask1_text.csv -> 311 samples, 2 columns

=== Training Model: BoW + Soft Voting ===
10-Fold CV -> Accuracy: 94.32 | Precision: 93.62 | Recall: 90.67 | F1: 95.81

=== Training Model: Word2Vec + Soft Voting ===
10-Fold CV -> Accuracy: 96.23 | Precision: 97.47 | Recall: 98.14 | F1: 96.35

=== Training Model: TF-IDF + Soft Voting ===
10-Fold CV -> Accuracy: 96.32 | Precision: 95.94 | Recall: 93.21 | F1: 97.59

=== Training Model: GloVe + Soft Voting ===
10-Fold CV -> Accuracy: 97.87 | Precision: 96.31 | Recall: 97.12 | F1: 98.25

=== Training Model: Skip-gram + Soft Voting ===
10-Fold CV -> Accuracy: 98.79 | Precision: 98.29 | Recall: 98.02 | F1: 98.64

=== Training Model: FastText + Soft Voting ===
10-Fold CV -> Accuracy: 99.86 | Precision: 99.69 | Recall: 99.32 | F1: 99.82

=== Final Embedding-Level Results (Soft Voting, 10-Fold CV