In [9]:
# -*- coding: utf-8 -*-
"""
Transformer Embedding Ensemble with Weighted Soft Voting
Models: RoBERTa, BERT, BART, MiniLM, DistilBERT, DeBERTa
Classifiers: RandomForest, GaussianNB, XGBoost, Linear SVM
Evaluation: 10-fold Cross Validation
"""

# ---------------------------
# Imports
# ---------------------------
import re
import numpy as np
import pandas as pd
import nltk
import torch
from tqdm import tqdm

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

from transformers import AutoTokenizer, AutoModel

# NLTK data needed by `preprocess`: tokenizer tables, the English stop-word
# list and the WordNet lemmatizer corpus.
# "punkt_tab" is requested next to "punkt" because newer NLTK releases load the
# tokenizer tables from that package when `nltk.word_tokenize` is called; on
# older releases the extra request only logs a "not found" message.
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")
nltk.download("wordnet")


# ---------------------------
# 2. Preprocessing
# ---------------------------
# Shared by every `preprocess` call: built once here instead of per sentence.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def preprocess(text):
    """Lower-case, de-noise, tokenize, stop-word filter and lemmatize one sentence.

    Parameters
    ----------
    text : Any
        Raw sentence. It is coerced with ``str()`` first, so non-string values
        are accepted.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces.
    """
    text = str(text).lower()
    # URLs, @mentions and every non-letter character are replaced by a space
    # (not deleted) so that words separated only by punctuation or digits,
    # e.g. "cause-effect", stay separate tokens instead of being fused.
    text = re.sub(r"http\S+|www\S+|@\w+|[^a-zA-Z\s]", " ", text)
    tokens = nltk.word_tokenize(text)
    # Stop words are filtered on the surface form, before lemmatization.
    lemmas = [lemmatizer.lemmatize(tok) for tok in tokens if tok not in stop_words]
    return " ".join(lemmas)


# Both splits go through the same cleaning routine.
# X_train_raw / X_test_raw / label_encoder are created by the dataset-loading
# cell, which therefore has to be executed before this one.
X_train = X_train_raw.apply(preprocess)
X_test = X_test_raw.apply(preprocess)
num_classes = len(label_encoder.classes_)

# ---------------------------
# 3. Transformer Embeddings
# ---------------------------
print("Step 3: Generating embeddings...")

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Short alias -> Hugging Face checkpoint id. Insertion order matters to any
# consumer that iterates this mapping.
transformer_models = dict(
    roberta="roberta-base",
    bert="bert-base-uncased",
    bart="facebook/bart-base",
    minilm="nreimers/MiniLM-L6-H384-uncased",
    distilbert="distilbert-base-uncased",
    deberta="microsoft/deberta-base",
)

def get_embeddings(texts, model_name, batch_size=16, max_len=128):
    """Embed sentences with a pretrained transformer (first-token vectors).

    NOTE: this notebook also contains a ``get_embeddings(model_name, texts, ...)``
    with the first two parameters swapped; whichever cell ran last is the one
    bound to the name.

    Parameters
    ----------
    texts : sequence of str
        Sentences to embed. A pandas Series / NumPy array (anything exposing
        ``.tolist()``) or a plain list / tuple is accepted; a single string is
        treated as one sentence.
    model_name : str
        Checkpoint id handed to ``AutoTokenizer.from_pretrained`` and
        ``AutoModel.from_pretrained``.
    batch_size : int, default 16
        Number of sentences encoded per forward pass.
    max_len : int, default 128
        ``max_length`` given to the tokenizer (``truncation=True``).

    Returns
    -------
    numpy.ndarray
        One row per sentence, in input order; each row is
        ``last_hidden_state[:, 0, :]`` for that sentence.

    Raises
    ------
    ValueError
        If ``texts`` is empty (raised before any model is loaded).
    """
    # Materialise once into a plain Python list so batching is purely
    # positional and the tokenizer always receives a list of strings.
    if isinstance(texts, str):
        texts = [texts]
    elif hasattr(texts, "tolist"):
        texts = texts.tolist()
    else:
        texts = list(texts)

    if not texts:
        # Fail fast with a clear message instead of loading a model and then
        # hitting np.vstack([]) at the very end.
        raise ValueError("get_embeddings() received no texts to embed")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()

    all_embeddings = []
    with torch.no_grad():
        for i in tqdm(range(0, len(texts), batch_size), desc=f"Embedding {model_name}"):
            batch = texts[i:i+batch_size]
            encodings = tokenizer(
                batch, padding=True, truncation=True,
                max_length=max_len, return_tensors="pt"
            ).to(device)
            outputs = model(**encodings)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]  # first (CLS-position) token
            # Move each batch to host memory right away so only one batch of
            # activations lives on the device at a time.
            all_embeddings.append(cls_embeddings.cpu().numpy())
    return np.vstack(all_embeddings)

# Training split: one embedding matrix per checkpoint, then a column-wise
# concatenation in the order the checkpoints are listed in `transformer_models`.
train_embeddings = {
    name: get_embeddings(X_train, model_name)
    for name, model_name in transformer_models.items()
}
X_train_emb = np.hstack([train_embeddings[name] for name in train_embeddings])

# Test split: same procedure, same column order.
test_embeddings = {
    name: get_embeddings(X_test, model_name)
    for name, model_name in transformer_models.items()
}
X_test_emb = np.hstack([test_embeddings[name] for name in test_embeddings])



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Step 3: Generating embeddings...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Embedding roberta-base: 100%|██████████████████████████████████████████████████████████| 19/19 [00:02<00:00,  8.62it/s]
Embedding bert-base-uncased: 100%|█████████████████████████████████████████████████████| 19/19 [00:02<00:00,  8.80it/s]
Embedding facebook/bart-base: 100%|████████████████████████████████████████████████████| 19/19 [00:02<00:00,  6.87it/s]
Embedding nreimers/MiniLM-L6-H384-uncased: 100%|███████████████████████████████████████| 19/19 [00:00<00:00, 46.62it/s]
Embedding distilbert-base-uncased: 100%|███████████████████████████████████████████████| 19/19 [00:01<00:00, 14.61it/s]
Embedding microsoft/deberta-base: 100%|████████████████████████████████████████████████| 19/19 [00:03<00:00, 

In [4]:
# -*- coding: utf-8 -*-
"""
Transformer Embeddings + SVM-RBF Classifier
10-Fold Cross Validation with Best Hyperparameters
"""

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

# ---------------------------
# Config
# ---------------------------
# Reproducibility / evaluation knobs.
SEED = 42
RANDOM_STATE = SEED   # seed handed to the scikit-learn splitter
FOLDS = 10            # number of outer cross-validation folds

# Device string used when moving models and encoded batches.
DEVICE = "cpu" if not torch.cuda.is_available() else "cuda"

# Seed both RNGs up front.
torch.manual_seed(SEED)
np.random.seed(SEED)

# ---------------------------
# Step 1. Load Dataset
# ---------------------------
dataset_path = r"Processed_Causality_Dataset.csv"

print("=== Loading Dataset ===")
df = pd.read_csv(dataset_path)

# Sentences are forced to `str`; the label column is kept as read for encoding.
X_raw = df["Sentence"].astype(str)
y_raw = df["Causality_Label"]

# Map the label values to integer class ids (original values stay available
# through `label_encoder.classes_`).
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_raw)

# Stratified 80/20 hold-out split with a fixed seed.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# One call, one line per summary item (same text as four separate prints).
print(
    f"Dataset File: {dataset_path}",
    f"Total Samples: {df.shape[0]}, Columns: {df.shape[1]}",
    f"Train Split: {len(X_train_raw)} | Test Split: {len(X_test_raw)}",
    f"Label Classes: {list(label_encoder.classes_)}\n",
    sep="\n",
)





# Checkpoints to evaluate: display name -> Hugging Face checkpoint id.
# Iteration follows the order of the pairs below.
transformer_models = dict([
    ("BART", "facebook/bart-base"),
    ("DeBERTa", "microsoft/deberta-base"),
    ("RoBERTa", "roberta-base"),
    ("DistilBERT", "distilbert-base-uncased"),
    ("BERT-base", "bert-base-uncased"),
    ("MiniLM", "microsoft/MiniLM-L12-H384-uncased"),
])


# ---------------------------
# Embedding extraction
# ---------------------------
def get_embeddings(model_name, texts, batch_size=16, max_len=128):
    """Extract first-token (CLS-position) embeddings from a pretrained transformer.

    NOTE: this notebook also contains a ``get_embeddings(texts, model_name, ...)``
    with the first two parameters swapped; whichever cell ran last is the one
    bound to the name.

    Parameters
    ----------
    model_name : str
        Checkpoint id handed to ``AutoTokenizer.from_pretrained`` and
        ``AutoModel.from_pretrained``.
    texts : sequence of str
        Sentences to embed. A pandas Series / NumPy array (anything exposing
        ``.tolist()``) or a plain list / tuple is accepted; a single string is
        treated as one sentence.
    batch_size : int, default 16
        Number of sentences encoded per forward pass.
    max_len : int, default 128
        ``max_length`` given to the tokenizer (``truncation=True``).

    Returns
    -------
    numpy.ndarray
        One row per sentence, in input order; each row is
        ``last_hidden_state[:, 0, :]`` for that sentence.

    Raises
    ------
    ValueError
        If ``texts`` is empty (raised before any model is loaded).
    """
    # The tokenizer call below is given a plain list of strings. Passing a
    # Series / ndarray slice straight through (as before) is rejected by the
    # tokenizer, so the input is materialised into a list once, up front.
    if isinstance(texts, str):
        texts = [texts]
    elif hasattr(texts, "tolist"):
        texts = texts.tolist()
    else:
        texts = list(texts)

    if not texts:
        # Fail fast with a clear message instead of loading a model and then
        # hitting np.vstack([]) at the very end.
        raise ValueError("get_embeddings() received no texts to embed")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(DEVICE)
    model.eval()

    all_embeddings = []

    with torch.no_grad():
        for i in tqdm(range(0, len(texts), batch_size), desc=f"Embedding {model_name}"):
            batch_texts = texts[i:i+batch_size]
            enc = tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=max_len,
                return_tensors="pt"
            ).to(DEVICE)

            outputs = model(**enc)
            # Take the first hidden state of every sequence and move it to host
            # memory immediately, so only one batch stays on the device.
            embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            all_embeddings.append(embeddings)

    return np.vstack(all_embeddings)

# ---------------------------
# Cross-validation + SVM
# ---------------------------
def run_cv(X, y, model_name):
    """Stratified k-fold evaluation of an RBF SVM with a per-fold grid search.

    For each of the ``FOLDS`` outer folds, a 3-fold ``GridSearchCV`` (scored by
    macro F1) is fitted on the training part, and its best estimator is scored
    on the held-out part.

    Parameters
    ----------
    X : array
        Feature matrix; it is indexed with integer index arrays (``X[idx]``).
    y : array-like
        Class labels aligned with the rows of ``X``.
    model_name : str
        Only used in the progress messages.

    Returns
    -------
    tuple(dict, list[dict])
        ``(mean_results, all_results)``: the column-wise mean of the numeric
        per-fold values (keys ``fold``, ``acc``, ``prec``, ``rec``, ``f1``;
        metrics are percentages) and the list of per-fold records, each of
        which additionally carries ``best_params``.
    """
    # Candidate hyper-parameters for the RBF SVM.
    svm_search_space = {
        "C": [0.1, 1, 10],
        "gamma": ["scale", "auto", 0.01, 0.001, 0.005],
        "kernel": ["rbf"],
    }

    labels_arr = np.array(y)  # converted once; indexed per fold below
    outer_cv = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=RANDOM_STATE)

    fold_records = []
    for fold, (fit_rows, held_out_rows) in enumerate(outer_cv.split(X, y), start=1):
        print(f"\n=== {model_name} | Fold {fold}/{FOLDS} ===")

        # Inner model selection sees the training part of this fold only.
        grid = GridSearchCV(SVC(), svm_search_space, scoring="f1_macro", cv=3, n_jobs=-1)
        grid.fit(X[fit_rows], labels_arr[fit_rows])

        y_true = labels_arr[held_out_rows]
        y_hat = grid.best_estimator_.predict(X[held_out_rows])

        # All metrics are reported as percentages.
        acc = accuracy_score(y_true, y_hat) * 100
        prec = precision_score(y_true, y_hat, average="macro") * 100
        rec = recall_score(y_true, y_hat, average="macro") * 100
        f1 = f1_score(y_true, y_hat, average="macro") * 100

        print(f"Best Params: {grid.best_params_}")
        print(f"Fold {fold} -> Acc: {acc:.2f} | Prec: {prec:.2f} | Rec: {rec:.2f} | F1: {f1:.2f}")

        fold_records.append({
            "fold": fold,
            "acc": acc, "prec": prec, "rec": rec, "f1": f1,
            "best_params": grid.best_params_
        })

    # Average the numeric columns across folds (the dict-valued `best_params`
    # column is skipped by `numeric_only=True`).
    mean_results = pd.DataFrame(fold_records).mean(numeric_only=True).to_dict()

    return mean_results, fold_records

# ---------------------------
# Run pipeline
# ---------------------------
# Inputs for cross-validation. `texts` / `labels` were previously used without
# being defined anywhere in this notebook (NameError on a fresh kernel); they
# are now derived from the dataset loaded above. A plain list is passed because
# `get_embeddings` hands each batch directly to the tokenizer.
# NOTE(review): this runs CV on the full dataset (X_raw / y). If CV is meant to
# see the training split only, switch to X_train_raw / y_train - confirm.
texts = X_raw.tolist()
labels = y

# model display name -> mean CV metrics (percentages)
final_summary = {}

for model_name, model_ckpt in transformer_models.items():
    print("\n" + "="*20)
    print(f" Running {model_name} ({model_ckpt})")
    print("="*20)

    # Embeddings are computed once per checkpoint and reused by every fold.
    embeddings = get_embeddings(model_ckpt, texts)
    mean_results, all_folds = run_cv(embeddings, labels, model_name)

    # Save results
    final_summary[model_name] = {
        "accuracy": mean_results["acc"],
        "precision": mean_results["prec"],
        "recall": mean_results["rec"],
        "f1": mean_results["f1"],
    }

    # --- Print final results block ---
    print(f"\n>>> {model_name} Final CV Results ({FOLDS} folds)")
    print(f"Accuracy: {mean_results['acc']:.2f}")
    print(f"Precision: {mean_results['prec']:.2f}")
    print(f"Recall: {mean_results['rec']:.2f}")
    print(f"F1: {mean_results['f1']:.2f}")
    print("="*60)



=== Loading Dataset ===
Dataset File: Processed_Causality_Dataset.csv
Total Samples: 376, Columns: 2
Train Split: 300 | Test Split: 76
Label Classes: [0, 1]


Model Spec: 12 layers, 1024 hidden, 16 heads
Best Params: {'C': 1.0, 'gamma': 0.01, 'kernel': 'rbf'}

Epoch   1/100 - acc: 69.62% - prec: 70.00% - rec: 69.99% - f1: 70.22%
Epoch   2/100 - acc: 69.31% - prec: 70.18% - rec: 70.37% - f1: 70.24%
Epoch   3/100 - acc: 69.00% - prec: 70.58% - rec: 70.37% - f1: 70.85%
Epoch   4/100 - acc: 69.00% - prec: 70.88% - rec: 70.71% - f1: 70.78%
Epoch   5/100 - acc: 69.86% - prec: 71.21% - rec: 71.28% - f1: 71.32%
Epoch   6/100 - acc: 69.00% - prec: 71.51% - rec: 71.66% - f1: 71.47%
Epoch   7/100 - acc: 69.00% - prec: 71.92% - rec: 71.83% - f1: 71.70%
Epoch   8/100 - acc: 69.00% - prec: 71.68% - rec: 71.80% - f1: 72.28%
Epoch   9/100 - acc: 69.00% - prec: 72.71% - rec: 72.30% - f1: 72.20%
Epoch  10/100 - acc: 69.00% - prec: 72.13% - rec: 72.34% - f1: 72.17%
--- Fold 1 Final ---
Accuracy: 69.00% |