In [6]:
# =====================================
# DATASET 1 : Embedding + Classifier Experiments
# =====================================

import os
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

import gensim
from gensim.models import Word2Vec, FastText

# =================================================
# Load Dataset
# =================================================
# Paths are relative to the notebook's working directory.
train_file = "train_subtask1.csv"
dev_file   = "dev_subtask1.csv"
test_file  = "test_subtask1_text.csv"

# Read all three splits up front. In the code shown here, dev_df and test_df
# are only used for the shape summary printed below.
train_df = pd.read_csv(train_file)
dev_df   = pd.read_csv(dev_file)
test_df  = pd.read_csv(test_file)

# Summary of each split: row count and column count.
print("=== Loading Dataset ===")
print(f"Train File: {train_file} -> {train_df.shape[0]} samples, {train_df.shape[1]} columns")
print(f"Dev File  : {dev_file} -> {dev_df.shape[0]} samples, {dev_df.shape[1]} columns")
print(f"Test File : {test_file} -> {test_df.shape[0]} samples, {test_df.shape[1]} columns\n")

# Training inputs as plain Python lists. astype(str) forces every entry of the
# "text" column to a string before it reaches the vectorizers/tokenizers.
X_train = train_df["text"].astype(str).tolist()
# y_train is read as a module-level global by run_experiment.
y_train = train_df["label"].tolist()

# =================================================
# Metric Evaluation
# =================================================
def evaluate_model(model, X, y, cv=10):
    """Cross-validate ``model`` and return mean macro metrics as percentages.

    Args:
        model: scikit-learn estimator to evaluate.
        X: Feature matrix accepted by the estimator.
        y: Class labels aligned with the rows of ``X``.
        cv: Number of stratified folds (shuffled, ``random_state=42``).

    Returns:
        dict mapping ``'accuracy'``, ``'precision'``, ``'recall'`` and ``'f1'``
        to the mean test-fold score multiplied by 100. Precision, recall and
        F1 are macro-averaged.
    """
    # Local import keeps this function self-contained; the file-level import
    # list only brings in cross_val_score.
    from sklearn.model_selection import cross_validate

    scoring = {
        'accuracy': make_scorer(accuracy_score),
        'precision': make_scorer(precision_score, average='macro'),
        'recall': make_scorer(recall_score, average='macro'),
        'f1': make_scorer(f1_score, average='macro'),
    }
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
    # A single cross_validate call scores every metric on the same fitted
    # fold models. Calling cross_val_score once per metric refits the
    # estimator on every fold four times for the same numbers.
    results = cross_validate(model, X, y, cv=skf, scoring=scoring)
    return {metric: np.mean(results[f"test_{metric}"]) * 100 for metric in scoring}

# =================================================
# Utility: Sentence Embeddings
# =================================================
def build_sentence_embeddings(sentences, model, dim):
    """Turn each sentence into the mean of its known word vectors.

    Args:
        sentences: Iterable of raw text strings.
        model: Word-vector lookup supporting ``word in model`` and
            ``model[word]``.
        dim: Length of the zero vector used when a sentence has no token
            present in ``model``.

    Returns:
        ``np.ndarray`` built from one vector per input sentence, in order.
    """
    def embed(sentence):
        # Keep only tokens the lookup can resolve.
        known = [tok for tok in gensim.utils.simple_preprocess(sentence) if tok in model]
        if not known:
            # Nothing to average: fall back to an all-zero vector.
            return np.zeros(dim)
        return np.mean([model[tok] for tok in known], axis=0)

    return np.array([embed(sentence) for sentence in sentences])

# =================================================
# Embedding Generators
# =================================================
def get_tfidf():
    """Return a new, unfitted ``TfidfVectorizer`` with ``max_features=5000``."""
    vectorizer = TfidfVectorizer(max_features=5000)
    return vectorizer

def get_bow():
    """Return a new, unfitted ``CountVectorizer`` with ``max_features=5000``."""
    vectorizer = CountVectorizer(max_features=5000)
    return vectorizer

def get_word2vec(sentences):  # CBOW
    """Train Word2Vec (``sg=0``) on ``sentences`` and return sentence vectors.

    The model is trained on the tokenized input itself; each sentence is then
    embedded with ``build_sentence_embeddings`` using 300-dimensional vectors.
    """
    corpus = list(map(gensim.utils.simple_preprocess, sentences))
    w2v = Word2Vec(
        sentences=corpus,
        vector_size=300,
        window=5,
        min_count=2,
        workers=4,
        sg=0,
        epochs=20,
    )
    return build_sentence_embeddings(sentences, w2v.wv, 300)

def get_skipgram(sentences):  # Skip-gram
    """Train Word2Vec (``sg=1``) on ``sentences`` and return sentence vectors.

    The model is trained on the tokenized input itself; each sentence is then
    embedded with ``build_sentence_embeddings`` using 300-dimensional vectors.
    """
    corpus = list(map(gensim.utils.simple_preprocess, sentences))
    skipgram = Word2Vec(
        sentences=corpus,
        vector_size=300,
        window=5,
        min_count=2,
        workers=4,
        sg=1,
        epochs=20,
    )
    return build_sentence_embeddings(sentences, skipgram.wv, 300)

def get_fasttext(sentences):
    """Train a FastText model on ``sentences`` and return sentence vectors.

    The model is trained on the tokenized input itself; each sentence is then
    embedded with ``build_sentence_embeddings`` using 300-dimensional vectors.
    """
    corpus = list(map(gensim.utils.simple_preprocess, sentences))
    fasttext = FastText(
        sentences=corpus,
        vector_size=300,
        window=5,
        min_count=2,
        workers=4,
        epochs=20,
    )
    return build_sentence_embeddings(sentences, fasttext.wv, 300)

def get_glove(sentences, glove_path="glove.6B.300d.txt"):
    """Embed sentences as the mean of pre-trained GloVe word vectors.

    Args:
        sentences: Iterable of raw text strings.
        glove_path: Path to a GloVe text file (one ``word v1 v2 ...`` entry
            per line). The file must be downloaded separately.

    Returns:
        ``np.ndarray`` with one vector per sentence. Sentences with no token
        found in the GloVe vocabulary get an all-zero vector.

    Raises:
        OSError: If ``glove_path`` cannot be opened.
    """
    glove_model = {}
    with open(glove_path, encoding="utf8") as f:
        for line in f:
            values = line.split()
            if len(values) < 2:
                # Blank (e.g. trailing) or vector-less line: indexing
                # values[0] on a blank line would raise IndexError.
                continue
            glove_model[values[0]] = np.asarray(values[1:], dtype='float32')

    # Take the width from the loaded vectors so the zero-vector fallback
    # matches them for any GloVe file; 300 is kept for an empty file.
    dim = len(next(iter(glove_model.values()))) if glove_model else 300

    # A plain dict supports the `in` / `[]` lookups the shared helper needs,
    # so the averaging logic is not duplicated here.
    return build_sentence_embeddings(sentences, glove_model, dim)

# =================================================
# Run Experiment
# =================================================
def run_experiment(name, X_emb, models, y=None):
    """Tune and cross-validate every classifier in ``models`` on one feature set.

    For each classifier a 3-fold ``GridSearchCV`` (accuracy) selects the
    hyperparameters, then the best estimator is scored with 10-fold
    cross-validation via ``evaluate_model`` and the results are printed.

    Args:
        name: Display name of the feature representation (used in log lines).
        X_emb: Feature matrix (sparse or dense) with one row per sample.
        models: Mapping of classifier name to ``(estimator, param_grid)``.
        y: Labels aligned with ``X_emb``. Defaults to the module-level
            ``y_train`` so existing three-argument calls behave as before.
    """
    labels = y_train if y is None else y

    for clf_name, (clf, params) in models.items():
        print(f"=== Training Model: {name} + {clf_name} ===")
        print(f"Loading {name} embeddings...")
        print(f"Initializing {clf_name} model...")

        # MultinomialNB rejects negative feature values, which averaged dense
        # word vectors contain. Skip that pairing explicitly instead of
        # letting the fit error abort the remaining experiments.
        if isinstance(clf, MultinomialNB) and X_emb.min() < 0:
            print(f"Skipping {name} + {clf_name}: MultinomialNB requires "
                  f"non-negative features.\n")
            continue

        grid = GridSearchCV(clf, params, cv=3, scoring='accuracy', n_jobs=-1)
        grid.fit(X_emb, labels)

        best_model = grid.best_estimator_
        best_params = grid.best_params_

        # NOTE(review): the 10-fold scores are computed on the same data used
        # for the grid search above, so they may be optimistic.
        scores = evaluate_model(best_model, X_emb, labels, cv=10)
        print(f"Best Hyperparameters: {best_params}")
        print("10-Fold CV -> Accuracy: {:.2f} | Precision: {:.2f} | Recall: {:.2f} | F1: {:.2f}\n"
              .format(scores['accuracy'], scores['precision'], scores['recall'], scores['f1']))

# =================================================
# Models and Params
# =================================================
# Each entry maps a display name to (estimator, GridSearchCV parameter grid).
models = {
    # Linear-kernel SVM; the grid searches over two values of C.
    "SVM-Linear": (
        SVC(probability=True),
        {"C": [0.5, 1.0], "kernel": ["linear"]},
    ),
    # Single-point grid: lbfgs solver with max_iter raised to 1000.
    "LogisticRegression": (
        LogisticRegression(),
        {"solver": ["lbfgs"], "max_iter": [1000]},
    ),
    # Single-point grid: alpha fixed at 1.0.
    "NaiveBayes": (
        MultinomialNB(),
        {"alpha": [1.0]},
    ),
}

# =================================================
# Run Experiments
# =================================================
# Each section below builds one feature representation of X_train and passes it
# to run_experiment, which tunes and cross-validates every classifier in `models`.

# TF-IDF: sparse features from a vectorizer fit on the full training text.
# NOTE(review): the vectorizer is fit on all of X_train before cross-validation,
# so each fold's vocabulary/IDF statistics include its held-out rows.
tfidf_vec = get_tfidf()
X_tfidf = tfidf_vec.fit_transform(X_train)
run_experiment("TF-IDF", X_tfidf, models)

# BoW: raw term counts, also fit on the full training text.
bow_vec = get_bow()
X_bow = bow_vec.fit_transform(X_train)
run_experiment("BoW", X_bow, models)

# Word2Vec (CBOW): embeddings trained on X_train itself, averaged per sentence.
X_w2v = get_word2vec(X_train)
run_experiment("Word2Vec", X_w2v, models)

# Skip-gram: same pipeline as above with sg=1.
X_skip = get_skipgram(X_train)
run_experiment("Skip-gram", X_skip, models)

# GloVe: pre-trained vectors read from a local file that must already exist;
# get_glove opens it directly, so a missing file stops the run here.
X_glove = get_glove(X_train, glove_path="glove.6B.300d.txt")
run_experiment("GloVe", X_glove, models)

# FastText: embeddings trained on X_train itself, averaged per sentence.
X_fast = get_fasttext(X_train)
run_experiment("FastText", X_fast, models)

# Only reached if every experiment above finished without raising.
print("✅ Experiment Completed for Dataset1 (NewsCorpus)")


=== Loading Dataset ===
Train File: train_subtask1.csv -> 2925 samples, 6 columns
Dev File  : dev_subtask1.csv -> 323 samples, 6 columns
Test File : test_subtask1_text.csv -> 311 samples, 2 columns

=== Training Model: TF-IDF + SVM-Linear ===
Loading TF-IDF embeddings...
Initializing SVM-Linear model...
Best Hyperparameters: C=1.0, kernel='linear'
10-Fold CV -> Accuracy: 91.33 | Precision: 91.31 | Recall: 91.33 | F1: 91.3

=== Training Model: TF-IDF + LogisticRegression ===
Loading TF-IDF embeddings...
Initializing LogisticRegression model...
Best Hyperparameters: solver='lbfgs', max_iter=1000
10-Fold CV -> Accuracy: 86.45 | Precision: 86.31 | Recall: 86.45 | F1: 86.23

=== Training Model: TF-IDF + NaiveBayes ===
Loading TF-IDF embeddings...
Initializing NaiveBayes model...
Best Hyperparameters: alpha=1.0
10-Fold CV -> Accuracy: 77.52 | Precision: 77.18 | Recall: 77.25 | F1: 77.33

=== Training Model: BoW + SVM-Linear ===
Loading BoW embeddings...
Initializing SVM-Linear model...
Best 