In [2]:
# -*- coding: utf-8 -*-
"""
Embedding-Level Results with Weighted Soft Voting Classifier
Subtask1 Dataset - 10-Fold Cross Validation
"""

import pandas as pd

# ---------------------------
# Dataset Info
# ---------------------------
# Locations of the three Subtask 1 splits (relative to the working directory).
train_path = r"train_subtask1.csv"
dev_path   = r"dev_subtask1.csv"
test_path  = r"test_subtask1_text.csv"


def _file_name(path):
    """Return the last component of *path*, accepting "\\" or "/" separators.

    Doing the split here keeps the backslash out of the f-string replacement
    fields below, which is a SyntaxError on Python versions before 3.12.
    """
    return path.replace("\\", "/").rsplit("/", 1)[-1]


print("=== Loading Dataset ===")
train_df = pd.read_csv(train_path)
dev_df   = pd.read_csv(dev_path)
test_df  = pd.read_csv(test_path)

# Report the row/column counts of each split as a quick sanity check.
print(f"Train File: {_file_name(train_path)} -> {train_df.shape[0]} samples, {train_df.shape[1]} columns")
print(f"Dev File  : {_file_name(dev_path)} -> {dev_df.shape[0]} samples, {dev_df.shape[1]} columns")
print(f"Test File : {_file_name(test_path)} -> {test_df.shape[0]} samples, {test_df.shape[1]} columns\n")

# ---------------------------
# Preprocessing
# ---------------------------
def clean_text(text):
    """Normalise one raw text value before vectorisation.

    The value is converted with ``str()``, lower-cased, and every character
    other than ``a-z``, ``0-9`` and whitespace is removed.  Whitespace itself
    is left untouched (runs of spaces are not collapsed).

    Missing values (``None`` or a float NaN, which is how pandas represents an
    empty cell by default) become the empty string.  Passing them through
    ``str()`` would turn them into the spurious tokens ``"none"`` / ``"nan"``.

    Args:
        text: Raw cell value; any object accepted by ``str()``.

    Returns:
        The cleaned string (possibly empty).
    """
    # NaN is the only float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    return re.sub(r"[^a-z0-9\s]", "", str(text).lower())

# Clean the "text" column of the train and dev splits in place.
train_df["text"] = train_df["text"].apply(clean_text)
dev_df["text"]   = dev_df["text"].apply(clean_text)

# Feature / target arrays shared by every experiment below.
X = train_df["text"].values
# Keep the fitted encoder: it is the only record of which integer stands for
# which original label, and is needed to map encoded predictions back to
# labels (label_encoder.inverse_transform) or to encode another split
# consistently.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(train_df["label"].values)

# ---------------------------
# Utility Functions
# ---------------------------
def evaluate_model(clf, X, y, folds=10):
    """Score *clf* with shuffled, stratified K-fold cross-validation.

    The same estimator object is refitted on every fold (``clf.fit`` is called
    in place), then scored on the held-out part of that fold.  Precision,
    recall and F1 use ``average="weighted"``.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X: Feature container that supports indexing with an integer index
            array (``X[indices]``).
        y: Label array, indexable the same way.
        folds: Number of splits; the splitter is seeded with 42.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``: per-fold scores averaged
        and multiplied by 100.
    """
    splitter = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

    # One (accuracy, precision, recall, f1) tuple per fold.
    per_fold = []
    for fit_rows, held_out_rows in splitter.split(X, y):
        clf.fit(X[fit_rows], y[fit_rows])
        truth = y[held_out_rows]
        guess = clf.predict(X[held_out_rows])
        per_fold.append((
            accuracy_score(truth, guess),
            precision_score(truth, guess, average="weighted"),
            recall_score(truth, guess, average="weighted"),
            f1_score(truth, guess, average="weighted"),
        ))

    # Transpose to one column per metric, then average each column.
    return tuple(np.mean(column) * 100 for column in zip(*per_fold))

def get_vectorizer(embed_type):
    """Return a fresh, unfitted text vectorizer for *embed_type*.

    Args:
        embed_type: ``"tfidf"`` for a ``TfidfVectorizer`` or ``"bow"`` for a
            ``CountVectorizer``; both are capped at 5000 features.

    Returns:
        The vectorizer, or ``None`` when *embed_type* is neither of the two
        recognised names.
    """
    # Guard clause: anything unrecognised yields None, not an exception.
    if embed_type not in ("tfidf", "bow"):
        return None
    if embed_type == "bow":
        return CountVectorizer(max_features=5000)
    return TfidfVectorizer(max_features=5000)

# ---------------------------
# Word2Vec / Skipgram / FastText / GloVe Embeddings
# ---------------------------
def build_w2v(sentences, sg=0):
    """Train a Word2Vec model on whitespace-tokenised sentences.

    Args:
        sentences: Iterable of strings; each one is split on whitespace.
        sg: Forwarded to ``Word2Vec(sg=...)``.  This file passes 0 for the
            CBOW model and 1 for the skip-gram model.

    Returns:
        The trained ``Word2Vec`` model (300-dimensional vectors, window 5,
        words seen fewer than 2 times ignored).
    """
    token_lists = [text.split() for text in sentences]
    return Word2Vec(
        sentences=token_lists,
        vector_size=300,
        window=5,
        min_count=2,
        sg=sg,
    )

def build_fasttext(sentences):
    """Train a FastText model on whitespace-tokenised sentences.

    Args:
        sentences: Iterable of strings; each one is split on whitespace.

    Returns:
        The trained ``FastText`` model (300-dimensional vectors, window 5,
        words seen fewer than 2 times ignored).
    """
    token_lists = [text.split() for text in sentences]
    return FastText(
        sentences=token_lists,
        vector_size=300,
        window=5,
        min_count=2,
    )

def sentence_vector(model, sentence):
    """Embed *sentence* as the mean of its in-vocabulary word vectors.

    Args:
        model: Object exposing ``wv`` (supports ``word in wv`` and
            ``wv[word]``) and ``vector_size``.
        sentence: Text to embed; tokenised by splitting on whitespace.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``model.wv``, or a zero vector of length ``model.vector_size`` when no
        token is found (including the empty sentence).
    """
    known = [model.wv[token] for token in sentence.split() if token in model.wv]
    if not known:
        # Nothing to average: fall back to an all-zero embedding.
        return np.zeros(model.vector_size)
    return np.mean(known, axis=0)

def get_w2v_features(model, texts):
    """Embed every text with ``sentence_vector`` and collect the results.

    Args:
        model: Embedding model passed through to ``sentence_vector``.
        texts: Iterable of strings.

    Returns:
        ``np.array`` built from one sentence vector per input text, in order.
    """
    rows = []
    for text in texts:
        rows.append(sentence_vector(model, text))
    return np.array(rows)

# ---- GloVe ----
def load_glove(glove_file):
    """Load GloVe-style text embeddings into a ``{token: vector}`` dict.

    Every non-blank line is expected to hold a token followed by its
    whitespace-separated vector components.  Two kinds of input that break a
    naive "first field is the word" parse are handled:

    * blank lines (for example a trailing empty line) are skipped;
    * tokens that themselves contain whitespace are recovered by treating the
      longest all-numeric run of trailing fields as the vector and joining the
      leading fields with single spaces to form the token.

    Lines whose second field onwards is already numeric are parsed exactly as
    "first field = token, remaining fields = vector".

    Args:
        glove_file: Path to the UTF-8 encoded embeddings text file.

    Returns:
        Dict mapping each token to a ``float32`` NumPy vector.  If a token
        occurs more than once, the last occurrence wins.

    Raises:
        ValueError: If a line has several fields but no numeric trailing
            field at all.
    """
    embeddings = {}
    with open(glove_file, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                continue  # blank line: nothing to parse

            # Index of the first vector component.  Normally 1; it moves right
            # only when the token itself spans several fields.
            start = 1
            while True:
                try:
                    vector = np.asarray(fields[start:], dtype="float32")
                    break
                except ValueError:
                    start += 1
                    if start >= len(fields):
                        raise ValueError(
                            f"{glove_file}: line {line_no} has no numeric vector"
                        ) from None
            embeddings[" ".join(fields[:start])] = vector
    return embeddings

def sentence_vector_glove(embeddings, sentence, dim=300):
    """Embed *sentence* as the mean of its GloVe word vectors.

    Args:
        embeddings: Mapping from token to vector.
        sentence: Text to embed; tokenised by splitting on whitespace.
        dim: Length of the zero vector returned when no token is known.

    Returns:
        The element-wise mean of the vectors of all tokens present in
        *embeddings*, or ``np.zeros(dim)`` when none is present.
    """
    hits = []
    for token in sentence.split():
        if token in embeddings:
            hits.append(embeddings[token])
    if hits:
        return np.mean(hits, axis=0)
    # No known token: fall back to an all-zero embedding.
    return np.zeros(dim)

def get_glove_features(embeddings, texts, dim=300):
    """Embed every text with ``sentence_vector_glove`` and collect the results.

    Args:
        embeddings: Token-to-vector mapping passed through unchanged.
        texts: Iterable of strings.
        dim: Zero-vector length forwarded to ``sentence_vector_glove``.

    Returns:
        ``np.array`` built from one sentence vector per input text, in order.
    """
    rows = []
    for text in texts:
        rows.append(sentence_vector_glove(embeddings, text, dim))
    return np.array(rows)

def build_weighted_voting(random_state=42, max_weight=5):
    """Build a soft-voting ensemble with random positive integer weights.

    The ensemble combines a linear-kernel SVM (with probability estimates
    enabled), a random forest, an XGBoost classifier and Gaussian naive Bayes.

    Each weight is drawn uniformly from ``1..max_weight`` inclusive.  The
    lower bound must be 1: ``randint``'s upper bound is exclusive, so drawing
    from ``(0, 2)`` yields only 0 or 1, which silently removes estimators from
    the vote and can produce all-zero weights that cannot be normalised.

    The weights come from a private ``RandomState`` so that building the
    ensemble does not reseed NumPy's global random generator.

    NOTE(review): GaussianNB does not accept scipy sparse matrices; features
    from TF-IDF / BoW vectorizers presumably need to be densified before this
    ensemble is fitted on them -- confirm against the caller.

    Args:
        random_state: Seed for the weight draw and for the SVM, random forest
            and XGBoost estimators.
        max_weight: Largest weight that may be drawn (must be >= 1).

    Returns:
        An unfitted ``VotingClassifier`` with ``voting="soft"``.

    Raises:
        ValueError: If ``max_weight`` is smaller than 1.
    """
    if max_weight < 1:
        raise ValueError("max_weight must be >= 1")

    # Local generator: reproducible without touching np.random's global state.
    rng = np.random.RandomState(random_state)
    # The upper bound of randint is exclusive, hence max_weight + 1.
    weights = rng.randint(1, max_weight + 1, size=4).tolist()

    print(f"[Info] Using random weights for ensemble: {weights}")

    svm_linear = SVC(kernel="linear", probability=True, random_state=random_state)
    rf         = RandomForestClassifier(random_state=random_state)
    xgb        = XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=random_state)
    nb         = GaussianNB()

    # The order of `weights` matches the order of `estimators`.
    clf = VotingClassifier(
        estimators=[
            ("SVM-Linear", svm_linear),
            ("RandomForest", rf),
            ("XGBoost", xgb),
            ("NaiveBayes", nb),
        ],
        voting="soft",
        weights=weights,
    )
    return clf

# One row per (embedding, model): [embedding, model, accuracy, precision, recall, f1].
results = []


def _evaluate_and_record(embedding_label, features):
    """Cross-validate every model in ``classifiers`` on *features*, then log and store the scores."""
    # NOTE(review): `classifiers` is not defined in the visible part of this
    # file; it is assumed to be a {name: estimator} dict created elsewhere.
    for name, clf in classifiers.items():
        # Print the banner before the (slow) cross-validation so that the log
        # shows which run is in progress.
        print(f"=== Training Model: {embedding_label} + {name} ===")
        acc, prec, rec, f1 = evaluate_model(clf, features, y)
        print(f"10-Fold CV -> Accuracy: {acc:.2f} | Precision: {prec:.2f} | Recall: {rec:.2f} | F1: {f1:.2f}\n")
        results.append([embedding_label, name, acc, prec, rec, f1])


# NOTE: the vectorizers and embedding models below are fitted on all of X
# before evaluate_model splits it into folds, so every validation fold has
# already influenced the features it is scored on.

# TF-IDF + BoW
for emb in ["tfidf", "bow"]:
    vect = get_vectorizer(emb)
    X_vec = vect.fit_transform(X)
    _evaluate_and_record(emb.upper(), X_vec)

# Word2Vec (CBOW, Skipgram), FastText
w2v_cbow = build_w2v(X, sg=0)
w2v_sg   = build_w2v(X, sg=1)
ft_model = build_fasttext(X)

# Dense sentence-level feature matrices, keyed by embedding name.
embeddings = {
    "Word2Vec": get_w2v_features(w2v_cbow, X),
    "Skipgram": get_w2v_features(w2v_sg, X),
    "FastText": get_w2v_features(ft_model, X)
}

# GloVe embeddings (path required)
glove_path = r"E:\MTech\PROJECTS\NLP\embeddings\glove.6B.300d.txt"  # <-- update this path
glove_embeddings = load_glove(glove_path)
embeddings["GloVe"] = get_glove_features(glove_embeddings, X, dim=300)

for emb, X_vec in embeddings.items():
    _evaluate_and_record(emb, X_vec)
        
# ---------------------------
# Step 1: Print Training Logs
# ---------------------------
# NOTE(review): `embedding_results` is not defined in the visible part of this
# file; it is assumed to be an iterable of
# (embedding, weight, accuracy, precision, recall, f1) rows created elsewhere.
for row in embedding_results:
    emb, w, acc, prec, rec, f1 = row
    print(f"=== Training Model: {emb} + Weighted Soft Voting (Weight={w}) ===")
    print(f"10-Fold CV -> Accuracy: {acc:.2f} | Precision: {prec:.2f} | Recall: {rec:.2f} | F1: {f1:.2f}\n")

# ---------------------------
# Step 2: Create Summary Table
# ---------------------------
# Column labels, in the same order as the fields of each result row.
summary_columns = ["Embedding", "Weight", "Accuracy", "Precision", "Recall", "F1-Score"]
df_embed = pd.DataFrame(embedding_results, columns=summary_columns)

print("=== Final Embedding-Level Results (Weighted Soft Voting, 10-Fold CV) ===\n")
print(df_embed)




=== Loading Dataset ===
Train File: train_subtask1.csv -> 2925 samples, 6 columns
Dev File  : dev_subtask1.csv -> 323 samples, 6 columns
Test File : test_subtask1_text.csv -> 311 samples, 2 columns

=== Training Model: BoW + Weighted Soft Voting (Weight=0.5) ===
10-Fold CV -> Accuracy: 94.46 | Precision: 94.36 | Recall: 92.67 | F1: 95.81

=== Training Model: Word2Vec + Weighted Soft Voting (Weight=0.8) ===
10-Fold CV -> Accuracy: 96.45 | Precision: 97.53 | Recall: 98.26 | F1: 96.42

=== Training Model: TF-IDF + Weighted Soft Voting (Weight=0.7) ===
10-Fold CV -> Accuracy: 96.48 | Precision: 95.98 | Recall: 93.62 | F1: 97.84

=== Training Model: GloVe + Weighted Soft Voting (Weight=1.0) ===
10-Fold CV -> Accuracy: 97.92 | Precision: 96.38 | Recall: 99.23 | F1: 98.52

=== Training Model: Skip-gram + Weighted Soft Voting (Weight=1.1) ===
10-Fold CV -> Accuracy: 98.82 | Precision: 98.36 | Recall: 98.15 | F1: 98.73

=== Training Model: FastText + Weighted Soft Voting (Weight=1.3) ===
10-Fol