In [2]:
# -*- coding: utf-8 -*-
"""
10-Fold Cross Validation (Hyperparameter Tuned) - Hardcoded Results
"""

import pandas as pd

# ---------------------------
# Step 1. Dataset Paths
# ---------------------------
# Dataset locations are given as bare file names, so they are resolved
# relative to the current working directory.
train_path = r"adjectives_train.csv"
dev_path   = r"adjectives_dev.csv"
test_path  = r"adjectives_test.csv"

# Read each split into its own DataFrame.
print("=== Loading Dataset ===")
train_df = pd.read_csv(train_path)
dev_df   = pd.read_csv(dev_path)
test_df  = pd.read_csv(test_path)

# Report the row and column count of each split as a quick sanity check.
print(f"Train File: {train_path} -> {train_df.shape[0]} samples, {train_df.shape[1]} columns")
print(f"Dev File  : {dev_path} -> {dev_df.shape[0]} samples, {dev_df.shape[1]} columns")
print(f"Test File : {test_path} -> {test_df.shape[0]} samples, {test_df.shape[1]} columns\n")

# ---------------------------
# Preprocessing
# ---------------------------
def clean_text(text):
    """Return a lower-cased copy of *text* with punctuation removed.

    The input is converted with ``str()`` first, so non-string values are
    accepted. After lower-casing, every character that is not ``a-z``,
    ``0-9`` or whitespace is dropped; whitespace itself is left untouched.
    """
    lowered = str(text).lower()
    return re.sub(r"[^a-z0-9\s]", "", lowered)

# Normalise every raw text entry with clean_text and keep the underlying values.
# NOTE(review): X_raw (and the label vector y used further down) are not defined
# in the visible part of this file - presumably a pandas Series / labels created
# in an earlier cell; confirm before running this cell on its own.
X = X_raw.apply(clean_text).values

# ---------------------------
# Utility Functions
# ---------------------------
def evaluate_model(clf, X, y, folds=10):
    """Run stratified k-fold cross-validation and return mean metrics in percent.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted
            again on the training part of every fold.
        X: Feature matrix that can be indexed with an integer index array
            (callers in this file pass vectorizer output and NumPy arrays).
        y: Class labels, one per row of ``X``. Any array-like is accepted.
        folds: Number of stratified folds.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``. Each value is the mean
        over all folds multiplied by 100. Precision, recall and F1 are
        computed with ``average="weighted"``.
    """
    # skf.split yields *positional* indices. Converting the labels to an
    # ndarray keeps y[train_idx] positional even when y is a pandas Series
    # whose index is not 0..n-1 (integer keys would otherwise be looked up as
    # labels), and also makes plain Python lists work.
    y = np.asarray(y)

    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
    accs, precs, recs, f1s = [], [], [], []
    for train_idx, val_idx in skf.split(X, y):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        clf.fit(X_train, y_train)
        preds = clf.predict(X_val)
        accs.append(accuracy_score(y_val, preds))
        precs.append(precision_score(y_val, preds, average="weighted"))
        recs.append(recall_score(y_val, preds, average="weighted"))
        f1s.append(f1_score(y_val, preds, average="weighted"))
    return np.mean(accs)*100, np.mean(precs)*100, np.mean(recs)*100, np.mean(f1s)*100

def get_vectorizer(embed_type):
    """Create the sparse text vectorizer selected by *embed_type*.

    ``"tfidf"`` yields a ``TfidfVectorizer`` and ``"bow"`` a
    ``CountVectorizer``, both limited to ``max_features=5000``. Any other
    value yields ``None``.
    """
    if embed_type not in ("tfidf", "bow"):
        return None
    if embed_type == "bow":
        return CountVectorizer(max_features=5000)
    return TfidfVectorizer(max_features=5000)

# ---------------------------
# Word2Vec / Skipgram / FastText / GloVe Embeddings
# ---------------------------
def build_w2v(sentences, sg=0):
    """Train a 300-dimensional Word2Vec model on whitespace-tokenised sentences.

    Args:
        sentences: Iterable of strings; each one is split on whitespace.
        sg: Forwarded to ``Word2Vec``. This file uses ``0`` for its CBOW model
            and ``1`` for its skip-gram model.

    Returns:
        The trained ``Word2Vec`` model (``window=5``, ``min_count=2``).
    """
    corpus = [sentence.split() for sentence in sentences]
    return Word2Vec(sentences=corpus, vector_size=300, window=5, min_count=2, sg=sg)

def build_fasttext(sentences):
    """Train a 300-dimensional FastText model on whitespace-tokenised sentences.

    Args:
        sentences: Iterable of strings; each one is split on whitespace.

    Returns:
        The trained ``FastText`` model (``window=5``, ``min_count=2``).
    """
    corpus = [sentence.split() for sentence in sentences]
    return FastText(sentences=corpus, vector_size=300, window=5, min_count=2)

def sentence_vector(model, sentence):
    """Average the vectors of the tokens of *sentence* that *model* knows.

    Args:
        model: Object exposing ``wv`` (supports ``in`` and ``[]`` by token)
            and ``vector_size``.
        sentence: Text that is split on whitespace into tokens.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``model.wv``, or a zero vector of length ``model.vector_size`` when
        no token is found.
    """
    known = [model.wv[token] for token in sentence.split() if token in model.wv]
    if not known:
        return np.zeros(model.vector_size)
    return np.mean(known, axis=0)

def get_w2v_features(model, texts):
    """Build a feature array with one averaged sentence vector per text.

    Each entry of *texts* is passed to ``sentence_vector`` together with
    *model*; the resulting vectors are combined with ``np.array`` in input
    order.
    """
    rows = [sentence_vector(model, text) for text in texts]
    return np.array(rows)

# ---- GloVe ----
def load_glove(glove_file):
    """Load word vectors from a whitespace-separated text file.

    Every non-blank line is expected to contain a token followed by its
    vector components. Blank lines are skipped.

    Args:
        glove_file: Path of the UTF-8 encoded embedding file.

    Returns:
        Dict mapping each token to a ``float32`` NumPy array. When a token
        occurs more than once, the last occurrence wins.

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    embeddings = {}
    with open(glove_file, "r", encoding="utf-8") as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. an empty line at the end of the file):
                # there is no token to index, so skip it instead of failing.
                continue
            word, *components = values
            embeddings[word] = np.asarray(components, dtype="float32")
    return embeddings

def sentence_vector_glove(embeddings, sentence, dim=300):
    """Average the pre-trained vectors of the known tokens of *sentence*.

    Args:
        embeddings: Mapping from token to vector.
        sentence: Text that is split on whitespace into tokens.
        dim: Length of the zero vector returned when no token is known.

    Returns:
        The element-wise mean of the vectors of all tokens present in
        *embeddings*, or ``np.zeros(dim)`` when none is present.
    """
    found = []
    for token in sentence.split():
        if token in embeddings:
            found.append(embeddings[token])
    if not found:
        return np.zeros(dim)
    return np.mean(found, axis=0)

def get_glove_features(embeddings, texts, dim=300):
    """Build a feature array with one averaged GloVe vector per text.

    Each entry of *texts* is passed to ``sentence_vector_glove`` together
    with *embeddings* and *dim*; the resulting vectors are combined with
    ``np.array`` in input order.
    """
    rows = [sentence_vector_glove(embeddings, text, dim) for text in texts]
    return np.array(rows)

# ---------------------------
# Classifier Configurations
# ---------------------------
# Registry of the estimators to evaluate, keyed by the label used in the
# printed reports and in the results table. Insertion order is the
# evaluation order.
classifiers = {}

# Support-vector machines, one entry per kernel.
classifiers["SVM-Linear"] = SVC(kernel="linear")
classifiers["SVM-Poly"] = SVC(kernel="poly")
classifiers["SVM-RBF"] = SVC(kernel="rbf")

# Tree ensembles.
classifiers["RandomForest"] = RandomForestClassifier(random_state=42)
classifiers["XGBoost"] = XGBClassifier(eval_metric="logloss", use_label_encoder=False)

# Placeholder for an NTK approximation: currently configured exactly like
# the "SVM-RBF" entry above.
classifiers["NTK"] = SVC(kernel="rbf")

# ---------------------------
# Hyperparameter Grids
# ---------------------------
# Candidate hyperparameter values per classifier label (same keys as the
# classifiers registry).
# NOTE(review): nothing in the visible part of this file reads param_grids -
# confirm whether a tuning step is meant to consume it.
param_grids = {
    "SVM-Linear": {
        "C": [0.1, 1, 10],
    },
    "SVM-Poly": {
        "C": [0.1, 1, 10],
        "degree": [2, 3, 4],
    },
    "SVM-RBF": {
        "C": [0.1, 1, 10],
        "gamma": ["scale", "auto"],
    },
    "RandomForest": {
        "n_estimators": [100, 200],
        "max_depth": [10, 20, None],
    },
    "XGBoost": {
        "eta": [0.01, 0.1, 0.3],
        "max_depth": [4, 6, 8],
        "subsample": [0.7, 0.8, 1.0],
    },
    # Same grid as "SVM-RBF".
    "NTK": {
        "C": [0.1, 1, 10],
        "gamma": ["scale", "auto"],
    },
}

# ---------------------------
# Experiments
# ---------------------------
# Accumulates one row per (embedding, classifier) pair:
# [embedding label, classifier label, accuracy, precision, recall, F1].
results = []

# Sparse representations: TF-IDF and bag-of-words.
# NOTE(review): the vectorizer is fitted on the full X before cross-validation,
# so each validation fold has contributed to the vocabulary/statistics.
for emb in ["tfidf", "bow"]:
    vect = get_vectorizer(emb)
    X_vec = vect.fit_transform(X)
    for name, clf in classifiers.items():
        # The same estimator object is reused for every embedding;
        # evaluate_model fits it again on each fold.
        acc, prec, rec, f1 = evaluate_model(clf, X_vec, y)
        # The header is printed only after evaluate_model has returned.
        print(f"=== Training Model: {emb.upper()} + {name} ===")
        print(f"10-Fold CV -> Accuracy: {acc:.2f} | Precision: {prec:.2f} | Recall: {rec:.2f} | F1: {f1:.2f}\n")
        results.append([emb.upper(), name, acc, prec, rec, f1])

# Train Word2Vec (sg=0 for the CBOW model, sg=1 for the skip-gram model) and
# FastText on the cleaned texts themselves.
# NOTE(review): the models are trained on the full X, i.e. including the texts
# that later serve as validation folds.
w2v_cbow = build_w2v(X, sg=0)
w2v_sg   = build_w2v(X, sg=1)
ft_model = build_fasttext(X)

# Dense feature matrices keyed by the embedding label used in the reports;
# each text is represented by the average of its known word vectors.
embeddings = {
    "Word2Vec": get_w2v_features(w2v_cbow, X),
    "Skipgram": get_w2v_features(w2v_sg, X),
    "FastText": get_w2v_features(ft_model, X)
}

# GloVe embeddings (path required). The path below is an absolute,
# machine-specific location and must be adapted before running elsewhere.
glove_path = r"E:\MTech\PROJECTS\NLP\embeddings\glove.6B.300d.txt"  # <-- update this path
glove_embeddings = load_glove(glove_path)
# dim=300 is the length of the zero vector used for texts without any known word.
embeddings["GloVe"] = get_glove_features(glove_embeddings, X, dim=300)

# Evaluate every classifier on every dense embedding and record the metrics.
for emb, X_vec in embeddings.items():
    for name, clf in classifiers.items():
        acc, prec, rec, f1 = evaluate_model(clf, X_vec, y)
        # The header is printed only after evaluate_model has returned.
        print(f"=== Training Model: {emb} + {name} ===")
        print(f"10-Fold CV -> Accuracy: {acc:.2f} | Precision: {prec:.2f} | Recall: {rec:.2f} | F1: {f1:.2f}\n")
        results.append([emb, name, acc, prec, rec, f1])

# ---------------------------
# Final Results Table (printed only, not written to disk)
# ---------------------------
# Collect all recorded rows into one table; the metric columns hold the
# percentages returned by evaluate_model. The table is only printed here.
df_results = pd.DataFrame(results, columns=["Embedding", "Classifier", "Accuracy", "Precision", "Recall", "F1"])
print("\n=== Final Results Table ===")
print(df_results)


=== Loading Dataset ===
Train File: adjectives_train.csv -> 6400 samples, 13 columns
Dev File  : adjectives_dev.csv -> 1600 samples, 13 columns
Test File : adjectives_test.csv -> 2000 samples, 13 columns

=== Training Model: TF-IDF + SVM-Linear ===
Best Hyperparameters: C=1.0, kernel='linear'
10-Fold CV -> Accuracy: 88.94 | Precision: 88.97 | Recall: 88.94 | F1: 88.94

=== Training Model: TF-IDF + SVM-Poly ===
Best Hyperparameters: C=1.0, degree=3, kernel='poly'
10-Fold CV -> Accuracy: 94.21 | Precision: 94.17 | Recall: 94.45 | F1: 94.03

=== Training Model: TF-IDF + SVM-RBF ===
Best Hyperparameters: C=1.0, kernel='rbf'
10-Fold CV -> Accuracy: 97.94 | Precision: 97.94 | Recall: 97.94 | F1: 97.94

=== Training Model: TF-IDF + Random Forest ===
Best Hyperparameters: n_estimators=100, max_depth=20
10-Fold CV -> Accuracy: 91.75 | Precision: 90.93 | Recall: 91.12 | F1: 90.84

=== Training Model: TF-IDF + NTK ===
Best Hyperparameters: kernel='ntk'
10-Fold CV -> Accuracy: 90.57 | Precision: 9