In [1]:
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np
import re
import random
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense, LSTM, GRU, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
import warnings
warnings.filterwarnings("ignore")
tf.get_logger().setLevel('ERROR')

# ---------------------------
# Step 1. Dataset Paths
# ---------------------------
# Locations of the three CSV splits, relative to the working directory.
train_path, dev_path, test_path = (
    r"adjectives_train.csv",
    r"adjectives_dev.csv",
    r"adjectives_test.csv",
)

print("=== Loading Dataset ===")
# Read the splits in train / dev / test order.
train_df, dev_df, test_df = map(pd.read_csv, (train_path, dev_path, test_path))

# Report the row and column count of every split.
print(f"Train File: {train_path} -> {train_df.shape[0]} samples, {train_df.shape[1]} columns")
print(f"Dev File  : {dev_path} -> {dev_df.shape[0]} samples, {dev_df.shape[1]} columns")
print(f"Test File : {test_path} -> {test_df.shape[0]} samples, {test_df.shape[1]} columns\n")


# ---------------------------
# 3. Embedding Generators
# ---------------------------
def get_bow(X):
    """Return a dense bag-of-words count matrix for the documents in ``X``.

    A fresh ``CountVectorizer`` is fitted on ``X`` itself, so the vocabulary
    (and therefore the column layout) is specific to this call.

    Args:
        X: Iterable of raw text documents.

    Returns:
        Dense 2-D NumPy array with one row per document and one column per
        vocabulary term.
    """
    # Imported locally because the visible import section never brings this
    # name into scope; without it the call below raises NameError.
    from sklearn.feature_extraction.text import CountVectorizer

    # .toarray() densifies the sparse result so it can be indexed with fold
    # indices and fed to estimators that reject sparse input.
    return CountVectorizer().fit_transform(X).toarray()

def get_tfidf(X):
    """Return a dense TF-IDF matrix for the documents in ``X``.

    A fresh ``TfidfVectorizer`` is fitted on ``X`` itself.

    Args:
        X: Iterable of raw text documents.

    Returns:
        Dense 2-D NumPy array with one row per document and one column per
        vocabulary term.
    """
    # Imported locally because the visible import section never brings this
    # name into scope; without it the call below raises NameError.
    from sklearn.feature_extraction.text import TfidfVectorizer

    # NOTE(review): the vectorizer is fitted on everything passed in; if the
    # caller cross-validates on the result, IDF weights have seen the held-out
    # folds -- confirm this is acceptable for the experiment.
    return TfidfVectorizer().fit_transform(X).toarray()

def get_word2vec(X, size=100, window=5, min_count=2, sg=0):
    """Embed each document as the mean of its Word2Vec word vectors.

    A Word2Vec model is trained on the tokenized documents of ``X`` and every
    document is represented by the average vector of its in-vocabulary tokens.

    Args:
        X: Iterable of raw text documents.
        size: Dimensionality of the word vectors (``vector_size``).
        window: Context window passed to ``Word2Vec``.
        min_count: Minimum token frequency passed to ``Word2Vec``.
        sg: Training algorithm flag forwarded to ``Word2Vec`` as ``sg``.

    Returns:
        NumPy array with one ``size``-dimensional row per document; a document
        with no in-vocabulary token gets an all-zero row.
    """
    # Imported locally: the visible import section does not provide Word2Vec.
    from gensim.models import Word2Vec

    # The file imports `word_tokenize` directly; the bare `nltk` module name is
    # never imported, so `nltk.word_tokenize` raised NameError.
    tokens = [word_tokenize(t) for t in X]
    model = Word2Vec(tokens, vector_size=size, window=window,
                     min_count=min_count, workers=4, sg=sg)

    # Fallback used when none of a document's tokens survived `min_count`.
    zero_fallback = [np.zeros(size)]
    return np.array([
        np.mean([model.wv[w] for w in words if w in model.wv] or zero_fallback, axis=0)
        for words in tokens
    ])

def get_fasttext(X, size=100, window=5, min_count=2):
    """Embed each document as the mean of its FastText word vectors.

    A FastText model is trained on the tokenized documents of ``X`` and every
    document is represented by the average vector of the tokens for which
    ``w in model.wv`` holds.

    Args:
        X: Iterable of raw text documents.
        size: Dimensionality of the word vectors (``vector_size``).
        window: Context window passed to ``FastText``.
        min_count: Minimum token frequency passed to ``FastText``.

    Returns:
        NumPy array with one ``size``-dimensional row per document; a document
        with no usable token gets an all-zero row.
    """
    # Imported locally: the visible import section does not provide FastText.
    from gensim.models import FastText

    # The file imports `word_tokenize` directly; the bare `nltk` module name is
    # never imported, so `nltk.word_tokenize` raised NameError.
    tokens = [word_tokenize(t) for t in X]
    model = FastText(tokens, vector_size=size, window=window,
                     min_count=min_count, workers=4)

    # Fallback used when a document contributes no vector at all.
    zero_fallback = [np.zeros(size)]
    return np.array([
        np.mean([model.wv[w] for w in words if w in model.wv] or zero_fallback, axis=0)
        for words in tokens
    ])

def get_glove(X, glove_path="glove.6B.100d.txt", size=100):
    """Embed each document as the mean of its pre-trained GloVe word vectors.

    Args:
        X: Iterable of raw text documents.
        glove_path: Path to a whitespace-separated GloVe text file
            (``word v1 v2 ...`` per line), read as UTF-8.
        size: Dimensionality used for the zero fallback when the vector file
            yields no entries; otherwise the dimensionality found in the file
            is used so that all rows have the same length.

    Returns:
        NumPy array with one row per document; a document with no token in the
        GloVe vocabulary gets an all-zero row.

    Raises:
        OSError: If ``glove_path`` cannot be opened.
    """
    # tqdm only provides a progress bar; it is not imported in the visible
    # import section, so fall back to plain iteration when it is unavailable.
    try:
        from tqdm import tqdm
    except ImportError:
        def tqdm(iterable, **_kwargs):
            return iterable

    embeddings_index = {}
    with open(glove_path, encoding="utf8") as f:
        for line in tqdm(f, desc="Reading GloVe"):
            values = line.split()
            if not values:
                # A blank line would otherwise raise IndexError on values[0].
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype="float32")

    # Size the zero fallback from the loaded vectors so a `size` that does not
    # match the file cannot produce rows of different lengths.
    if embeddings_index:
        dim = next(iter(embeddings_index.values())).shape[0]
    else:
        dim = size
    zero_fallback = [np.zeros(dim)]

    # The file imports `word_tokenize` directly; the bare `nltk` module name is
    # never imported, so `nltk.word_tokenize` raised NameError.
    tokens = [word_tokenize(t) for t in X]
    return np.array([
        np.mean([embeddings_index[w] for w in words if w in embeddings_index]
                or zero_fallback, axis=0)
        for words in tokens
    ])

# ---------------------------
# 4. Classifiers
# ---------------------------
def build_voting_classifier():
    """Build an unfitted soft-voting ensemble of four classifiers.

    The ensemble combines a linear-kernel SVM (with probability estimates
    enabled, as soft voting needs ``predict_proba``), an XGBoost classifier, a
    100-tree random forest and Gaussian naive Bayes.

    Returns:
        A new, unfitted ``VotingClassifier`` configured with ``voting="soft"``.
    """
    # Imported locally: the visible import section only provides LinearSVC and
    # MultinomialNB, so the SVC / GaussianNB names used here were undefined.
    from sklearn.naive_bayes import GaussianNB
    from sklearn.svm import SVC

    # Seed every stochastic member the same way as the random forest so that
    # repeated runs of the ensemble are comparable.
    clf1 = SVC(kernel="linear", probability=True, random_state=42)
    clf2 = XGBClassifier(use_label_encoder=False, eval_metric="mlogloss",
                         random_state=42)
    clf3 = RandomForestClassifier(n_estimators=100, random_state=42)
    clf4 = GaussianNB()

    return VotingClassifier(
        estimators=[("svm", clf1), ("xgb", clf2), ("rf", clf3), ("nb", clf4)],
        voting="soft",
    )

# ---------------------------
# 5. Cross Validation
# ---------------------------
def evaluate_embedding(X_emb, y, name, n_splits=10):
    """Cross-validate the soft-voting ensemble on one embedding matrix.

    Runs a shuffled stratified K-fold split (seeded with 42), fits a fresh
    ensemble from ``build_voting_classifier()`` on every training fold and
    prints accuracy plus macro-averaged precision, recall and F1 (all in
    percent) per fold, followed by the mean over all folds.

    Args:
        X_emb: 2-D array of features, one row per sample.
        y: Labels aligned positionally with the rows of ``X_emb``; any
            array-like (pandas Series, list, NumPy array) is accepted.
        name: Display name of the embedding, used in the printed output and as
            the first element of the returned row.
        n_splits: Number of cross-validation folds.

    Returns:
        ``[name, accuracy, precision, recall, f1]`` with the fold-averaged
        scores in percent.
    """
    print(f"\n=== Training Embedding: {name} + Soft Voting Ensemble ===")
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Positional indexing on a plain array is equivalent to the `.iloc` access
    # used for pandas input and additionally supports lists / ndarrays.
    labels = np.asarray(y)

    accs, precs, recs, f1s = [], [], [], []

    for fold, (train_idx, test_idx) in enumerate(skf.split(X_emb, labels), 1):
        X_train, X_test = X_emb[train_idx], X_emb[test_idx]
        y_train, y_test = labels[train_idx], labels[test_idx]

        # A new ensemble per fold so no fitted state carries over.
        clf = build_voting_classifier()
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        accs.append(accuracy_score(y_test, y_pred) * 100)
        precs.append(precision_score(y_test, y_pred, average="macro") * 100)
        recs.append(recall_score(y_test, y_pred, average="macro") * 100)
        f1s.append(f1_score(y_test, y_pred, average="macro") * 100)

        print(f"Fold {fold:02d} -> Accuracy: {accs[-1]:.2f} | Precision: {precs[-1]:.2f} | "
              f"Recall: {recs[-1]:.2f} | F1: {f1s[-1]:.2f}")

    # Aggregate the folds; previously the per-fold scores were discarded, so
    # nothing was available for the summary table.
    mean_acc, mean_prec, mean_rec, mean_f1 = (
        float(np.mean(scores)) for scores in (accs, precs, recs, f1s)
    )
    print(f"{n_splits}-Fold CV -> Accuracy: {mean_acc:.2f} | Precision: {mean_prec:.2f} | "
          f"Recall: {mean_rec:.2f} | F1: {mean_f1:.2f}")

    return [name, mean_acc, mean_prec, mean_rec, mean_f1]


# ---------------------------
# 6. Run Experiments
# ---------------------------
# NOTE(review): `X_processed` (the texts) and `y_raw` (the labels) are not
# defined in the visible part of this file -- confirm the preprocessing step
# that creates them runs before this section.
# Every embedding matrix is built up front, keyed by its display name.
embeddings = {
    "BoW": get_bow(X_processed),
    "TF-IDF": get_tfidf(X_processed),
    "Word2Vec-CBOW": get_word2vec(X_processed, sg=0),
    "Word2Vec-SkipGram": get_word2vec(X_processed, sg=1),
    "FastText": get_fasttext(X_processed),
    "GloVe": get_glove(X_processed, glove_path="glove.6B.100d.txt"),
}

# Rows for the summary table; this name is read when the table is built but
# was never created before.
embedding_results = []

for name, X_emb in embeddings.items():
    # np.asarray avoids copying matrices that are already NumPy arrays.
    row = evaluate_embedding(np.asarray(X_emb), y_raw, name)
    # Only keep a row when the evaluation actually reports one.
    if row is not None:
        embedding_results.append(row)


# ---------------------------
# 7. Summary Table
# ---------------------------
# One row per embedding: its display name followed by the four scores.
summary_columns = ["Embedding", "Accuracy", "Precision", "Recall", "F1-Score"]
df_embed = pd.DataFrame(embedding_results, columns=summary_columns)

print("=== Final Embedding-Level Results (Soft Voting, 10-Fold CV) ===\n")
# Render without the integer index so only the result columns are shown.
summary_text = df_embed.to_string(index=False)
print(summary_text)


=== Loading Dataset ===
Train File: adjectives_train.csv -> 6400 samples, 13 columns
Dev File  : adjectives_dev.csv -> 1600 samples, 13 columns
Test File : adjectives_test.csv -> 2000 samples, 13 columns

=== Training Model: GloVe + Soft Voting ===
10-Fold CV -> Accuracy: 94.01 | Precision: 96.12 | Recall: 92.94 | F1: 94.51

=== Training Model: Word2Vec + Soft Voting ===
10-Fold CV -> Accuracy: 95.21 | Precision: 96.22 | Recall: 94.63 | F1: 95.25

=== Training Model: FastText + Soft Voting ===
10-Fold CV -> Accuracy: 96.21 | Precision: 96.14 | Recall: 96.10 | F1: 96.38

=== Training Model: TF-IDF + Soft Voting ===
10-Fold CV -> Accuracy: 96.56 | Precision: 96.48 | Recall: 96.32 | F1: 96.62

=== Training Model: BoW + Soft Voting ===
10-Fold CV -> Accuracy: 97.21 | Precision: 97.14 | Recall: 97.10 | F1: 97.38

=== Training Model: Skip-gram + Soft Voting ===
10-Fold CV -> Accuracy: 97.86 | Precision: 97.82 | Recall: 97.79 | F1: 97.90

=== Final Embedding-Level Results (Soft Voting, 10-Fol