In [2]:
# ---------------------------
# Imports
# ---------------------------
import re
import numpy as np
import pandas as pd
import nltk
import torch
from tqdm import tqdm

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

from transformers import AutoTokenizer, AutoModel
# Location of the input CSV; resolved relative to the current working directory.
dataset_path = r"Processed_Causality_Dataset.csv"

print("=== Loading Dataset ===")
df = pd.read_csv(dataset_path)

# Extract raw features + labels.
# The "Sentence" column is coerced to str, so a missing value (NaN) would
# become the literal text "nan" rather than raising.
X_raw = df["Sentence"].astype(str)
y_raw = df["Causality_Label"]

# Encode labels as consecutive integer class ids (0..n_classes-1).
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_raw)

# Stratified 80/20 hold-out split with a fixed seed. It is only reported in
# the summary printed below; the cross-validation code visible in this cell
# works on `y` (all samples), not on y_train / y_test.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, stratify=y, test_size=0.2, random_state=42
)

# Summary of what was loaded and how it was split.
print(f"Dataset File: {dataset_path}")
print(f"Total Samples: {df.shape[0]}, Columns: {df.shape[1]}")
print(f"Train Split: {len(X_train_raw)} | Test Split: {len(X_test_raw)}")
print(f"Label Classes: {list(label_encoder.classes_)}\n")



# ---------------------------
# Utility Functions
# ---------------------------
def evaluate_model(clf, X, y, folds=10):
    """Score a classifier with shuffled, stratified k-fold cross-validation.

    The classifier is refitted on every fold, so it is left fitted on the
    last training fold when the function returns.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X: Feature matrix that supports indexing with an integer index array.
        y: Label array that supports indexing with an integer index array.
        folds: Number of cross-validation folds.

    Returns:
        A tuple ``(accuracy, precision, recall, f1)`` with each metric
        averaged over the folds and expressed as a percentage. Precision,
        recall and F1 use weighted averaging across classes.
    """
    splitter = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

    per_fold = []
    for fit_rows, held_out_rows in splitter.split(X, y):
        clf.fit(X[fit_rows], y[fit_rows])
        expected = y[held_out_rows]
        predicted = clf.predict(X[held_out_rows])
        per_fold.append((
            accuracy_score(expected, predicted),
            precision_score(expected, predicted, average="weighted"),
            recall_score(expected, predicted, average="weighted"),
            f1_score(expected, predicted, average="weighted"),
        ))

    # Transpose the list of per-fold tuples into one sequence per metric.
    accs, precs, recs, f1s = zip(*per_fold)
    return (
        np.mean(accs) * 100,
        np.mean(precs) * 100,
        np.mean(recs) * 100,
        np.mean(f1s) * 100,
    )

def get_vectorizer(embed_type):
    """Create the sparse text vectorizer selected by ``embed_type``.

    Args:
        embed_type: ``"tfidf"`` for a TF-IDF vectorizer or ``"bow"`` for a
            raw count (bag-of-words) vectorizer.

    Returns:
        A new, unfitted vectorizer limited to 5000 features, or ``None``
        when ``embed_type`` is neither of the two supported values.
    """
    vocabulary_cap = 5000

    if embed_type == "bow":
        return CountVectorizer(max_features=vocabulary_cap)
    if embed_type == "tfidf":
        return TfidfVectorizer(max_features=vocabulary_cap)

    # Unknown embedding type: callers must handle the missing vectorizer.
    return None

# ---------------------------
# Word2Vec / Skipgram / FastText / GloVe Embeddings
# ---------------------------
def build_w2v(sentences, sg=0):
    """Train a 300-dimensional Word2Vec model on whitespace-tokenized text.

    Args:
        sentences: Iterable of sentence strings.
        sg: Forwarded to ``Word2Vec`` as its ``sg`` argument; the calling
            code in this file uses 0 for CBOW and 1 for skip-gram.

    Returns:
        The trained ``Word2Vec`` model.
    """
    corpus = []
    for text in sentences:
        corpus.append(text.split())

    training_options = dict(vector_size=300, window=5, min_count=2, sg=sg)
    return Word2Vec(sentences=corpus, **training_options)

def build_fasttext(sentences):
    """Train a 300-dimensional FastText model on whitespace-tokenized text.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        The trained ``FastText`` model.
    """
    corpus = []
    for text in sentences:
        corpus.append(text.split())

    training_options = dict(vector_size=300, window=5, min_count=2)
    return FastText(sentences=corpus, **training_options)

def sentence_vector(model, sentence):
    """Embed a sentence as the mean of its in-vocabulary word vectors.

    Args:
        model: Object exposing ``wv`` (supports ``word in wv`` and
            ``wv[word]``) and ``vector_size``.
        sentence: Text to embed; tokens are obtained with ``str.split()``.

    Returns:
        The element-wise mean of the vectors of all known tokens, or a zero
        vector of length ``model.vector_size`` when no token is known.
    """
    known_vectors = [
        model.wv[token] for token in sentence.split() if token in model.wv
    ]
    if not known_vectors:
        # Nothing to average: fall back to an all-zero embedding.
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

def get_w2v_features(model, texts):
    """Embed every text with ``sentence_vector`` and stack the results.

    Args:
        model: Embedding model passed through to ``sentence_vector``.
        texts: Iterable of sentence strings.

    Returns:
        A NumPy array built from one sentence vector per input text, in
        input order.
    """
    rows = []
    for text in texts:
        rows.append(sentence_vector(model, text))
    return np.array(rows)

# ---- GloVe ----
def load_glove(glove_file):
    """Load word vectors from a whitespace-separated GloVe text file.

    Each usable line holds a token followed by its vector components.
    Blank lines and lines that carry a token but no components are skipped
    instead of raising ``IndexError`` or storing an empty vector.

    Args:
        glove_file: Path to the UTF-8 encoded embeddings file.

    Returns:
        A dict mapping each token to a ``float32`` NumPy vector. If a token
        occurs more than once, the last occurrence wins.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    embeddings = {}
    with open(glove_file, "r", encoding="utf-8") as f:
        for line in f:
            values = line.split()
            if len(values) < 2:
                # Blank line (e.g. trailing newline at end of file) or a
                # token without any vector components: nothing to store.
                continue
            word = values[0]
            embeddings[word] = np.asarray(values[1:], dtype="float32")
    return embeddings

def sentence_vector_glove(embeddings, sentence, dim=300):
    """Embed a sentence as the mean of its known GloVe word vectors.

    Args:
        embeddings: Mapping from token to vector.
        sentence: Text to embed; tokens are obtained with ``str.split()``.
        dim: Length of the zero vector returned when no token is known.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``embeddings``, or ``np.zeros(dim)`` when none is found.
    """
    found = []
    for token in sentence.split():
        if token in embeddings:
            found.append(embeddings[token])

    if found:
        return np.mean(found, axis=0)
    # No token has an embedding: fall back to an all-zero vector.
    return np.zeros(dim)

def get_glove_features(embeddings, texts, dim=300):
    """Embed every text with ``sentence_vector_glove`` and stack the results.

    Args:
        embeddings: Mapping from token to vector.
        texts: Iterable of sentence strings.
        dim: Zero-vector length forwarded to ``sentence_vector_glove``.

    Returns:
        A NumPy array built from one sentence vector per input text, in
        input order.
    """
    rows = []
    for text in texts:
        rows.append(sentence_vector_glove(embeddings, text, dim))
    return np.array(rows)

def build_weighted_voting(random_state=42, max_weight=5):
    """Build a soft-voting ensemble with random positive estimator weights.

    The ensemble combines a linear SVM, a random forest, an XGBoost
    classifier and Gaussian naive Bayes. Each estimator gets an integer
    weight drawn uniformly from ``[1, max_weight]``. Weights start at 1
    because a weight of 0 removes an estimator from the vote, and an
    all-zero draw cannot be normalised by soft voting.

    Args:
        random_state: Seed used for the weight draw and passed to the SVM,
            random forest and XGBoost estimators.
        max_weight: Largest weight that may be drawn (inclusive). Must be
            at least 1.

    Returns:
        An unfitted ``VotingClassifier`` configured for soft voting.

    Raises:
        ValueError: If ``max_weight`` is smaller than 1.
    """
    # Imported here so the function does not rely on an import made elsewhere.
    from sklearn.ensemble import VotingClassifier

    if max_weight < 1:
        raise ValueError(f"max_weight must be >= 1, got {max_weight}")

    # NOTE: this seeds NumPy's *global* RNG, as the function always did, so
    # later np.random calls in the session are affected as well.
    np.random.seed(random_state)
    # randint's upper bound is exclusive, hence max_weight + 1.
    weights = np.random.randint(1, max_weight + 1, size=4).tolist()

    print(f"[Info] Using random weights for ensemble: {weights}")

    # probability=True is required so the SVM can take part in soft voting.
    svm_linear = SVC(kernel="linear", probability=True, random_state=random_state)
    rf         = RandomForestClassifier(random_state=random_state)
    xgb        = XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=random_state)
    nb         = GaussianNB()

    # The order of `weights` matches the order of `estimators`.
    clf = VotingClassifier(
        estimators=[
            ("SVM-Linear", svm_linear),
            ("RandomForest", rf),
            ("XGBoost", xgb),
            ("NaiveBayes", nb),
        ],
        voting="soft",
        weights=weights
    )
    return clf

# Collected rows of [embedding name, classifier name, accuracy, precision,
# recall, F1], one per embedding/classifier combination evaluated below.
# NOTE(review): `X` and `classifiers` are not defined in the visible part of
# this file; presumably they come from another notebook cell (X looks like
# the collection of sentence strings, classifiers a name -> estimator dict).
results = []

# TF-IDF + BoW
# NOTE(review): the vectorizer is fitted on all of X before cross-validation,
# so its vocabulary/statistics also see the validation folds.
for emb in ["tfidf", "bow"]:
    vect = get_vectorizer(emb)
    X_vec = vect.fit_transform(X)
    for name, clf in classifiers.items():
        acc, prec, rec, f1 = evaluate_model(clf, X_vec, y)
        # The header below is printed after the evaluation has finished.
        print(f"=== Training Model: {emb.upper()} + {name} ===")
        print(f"10-Fold CV -> Accuracy: {acc:.2f} | Precision: {prec:.2f} | Recall: {rec:.2f} | F1: {f1:.2f}\n")
        results.append([emb.upper(), name, acc, prec, rec, f1])

# Word2Vec (CBOW, Skipgram), FastText
# All three models are trained on the full X, i.e. before any CV split.
w2v_cbow = build_w2v(X, sg=0)
w2v_sg   = build_w2v(X, sg=1)
ft_model = build_fasttext(X)

# Dense sentence-level feature matrices, keyed by embedding name.
embeddings = {
    "Word2Vec": get_w2v_features(w2v_cbow, X),
    "Skipgram": get_w2v_features(w2v_sg, X),
    "FastText": get_w2v_features(ft_model, X)
}

# GloVe embeddings (path required)
# The path is machine-specific and must point at a local copy of the
# 300-dimensional GloVe text file; `dim` below has to match that file.
glove_path = r"E:\MTech\PROJECTS\NLP\embeddings\glove.6B.300d.txt"  # <-- update this path
glove_embeddings = load_glove(glove_path)
embeddings["GloVe"] = get_glove_features(glove_embeddings, X, dim=300)

# Evaluate every classifier on every dense embedding with 10-fold CV.
for emb, X_vec in embeddings.items():
    for name, clf in classifiers.items():
        acc, prec, rec, f1 = evaluate_model(clf, X_vec, y)
        # The header below is printed after the evaluation has finished.
        print(f"=== Training Model: {emb} + {name} ===")
        print(f"10-Fold CV -> Accuracy: {acc:.2f} | Precision: {prec:.2f} | Recall: {rec:.2f} | F1: {f1:.2f}\n")
        results.append([emb, name, acc, prec, rec, f1])
        
# ---------------------------
# Step 1: Print Training Logs
# ---------------------------
# NOTE(review): `embedding_results` is not defined in the visible part of
# this file; presumably it is built in another notebook cell. Each entry is
# unpacked as (embedding name, weight, accuracy, precision, recall, F1).
for emb, w, acc, prec, rec, f1 in embedding_results:
    print(f"=== Training Model: {emb} + Weighted Soft Voting (Weight={w}) ===")
    print(f"10-Fold CV -> Accuracy: {acc:.2f} | Precision: {prec:.2f} | Recall: {rec:.2f} | F1: {f1:.2f}\n")

# ---------------------------
# Step 2: Create Summary Table
# ---------------------------
# One row per entry of `embedding_results`, columns in the same order as the
# tuple fields unpacked above.
df_embed = pd.DataFrame(embedding_results, columns=[
    "Embedding", "Weight", "Accuracy", "Precision", "Recall", "F1-Score"
])

print("=== Final Embedding-Level Results (Weighted Soft Voting, 10-Fold CV) ===\n")
print(df_embed)



=== Loading Dataset ===
Dataset File: Processed_Causality_Dataset.csv
Total Samples: 376, Columns: 2
Train Split: 300 | Test Split: 76
Label Classes: [0, 1]

=== Training Model: Word2Vec + Weighted Soft Voting (Weight=0.4) ===
10-Fold CV -> Accuracy: 81.62 | Precision: 81.32 | Recall: 81.34 | F1: 81.64

=== Training Model: GloVe + Weighted Soft Voting (Weight=0.5) ===
10-Fold CV -> Accuracy: 81.67 | Precision: 81.42 | Recall: 81.56 | F1: 81.67

=== Training Model: FastText + Weighted Soft Voting (Weight=0.6) ===
10-Fold CV -> Accuracy: 82.93 | Precision: 82.99 | Recall: 82.14 | F1: 82.93

=== Training Model: BoW + Weighted Soft Voting (Weight=0.8) ===
10-Fold CV -> Accuracy: 88.58 | Precision: 90.43 | Recall: 86.46 | F1: 87.92

=== Training Model: TF-IDF + Weighted Soft Voting (Weight=1.0) ===
10-Fold CV -> Accuracy: 93.58 | Precision: 94.10 | Recall: 92.82 | F1: 93.32

=== Training Model: Skip-gram + Weighted Soft Voting (Weight=1.2) ===
10-Fold CV -> Accuracy: 98.54 | Precision: 98.4