In [4]:
# -*- coding: utf-8 -*-
"""
Transformer Embedding Ensemble with Weighted Soft Voting
Models: RoBERTa, BERT, BART, MiniLM, DistilBERT, DeBERTa
Classifiers: RandomForest, GaussianNB, XGBoost, Linear SVM
Evaluation: 10-fold Cross Validation
"""

import re
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm

# ---------------------------
# 1. Load Dataset
# ---------------------------
# Read the labelled sentences from the CSV in the working directory.
df = pd.read_csv("Processed_Causality_Dataset.csv")

# Feature column and target column used by everything below.
X_raw, y_raw = df["Sentence"], df["Causality_Label"]

# Stratified 80/20 hold-out split with a fixed seed so the split is repeatable.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw,
    y_raw,
    test_size=0.2,
    stratify=y_raw,
    random_state=42,
)

# ---------------------------
# 2. Preprocessing
# ---------------------------
# Shared helpers for preprocess(): built once here, not once per sentence.
# NOTE(review): presumably requires the NLTK stopwords/wordnet data to be
# installed locally before this cell runs -- confirm on a fresh environment.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Normalise one sentence into a space-separated string of lemmas.

    The input is coerced with ``str`` and lowercased; URLs, ``www`` links,
    ``@mentions`` and every character that is not an ASCII letter or
    whitespace are deleted; the remainder is tokenized, tokens found in the
    module-level ``stop_words`` are dropped, and the survivors are lemmatized
    with the module-level ``lemmatizer``.

    Returns
    -------
    str
        The kept lemmas joined by single spaces (empty string if none remain).
    """
    lowered = str(text).lower()
    # A single substitution removes links, mentions and non-letter characters.
    letters_only = re.sub(r"http\S+|www\S+|@\w+|[^a-zA-Z\s]", "", lowered)

    kept = []
    for token in nltk.word_tokenize(letters_only):
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)

# Clean both splits with the same preprocessing function.
X_train_processed = X_train_raw.apply(preprocess)
X_test_processed = X_test_raw.apply(preprocess)

# ---------------------------
# Step 3: Transformer Embeddings
# ---------------------------
print("Step 3: Generating embeddings...")

# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Short key -> checkpoint identifier handed to get_embeddings().
transformer_models = dict(
    roberta="roberta-base",
    bert="bert-base-uncased",
    bart="facebook/bart-base",
    minilm="nreimers/MiniLM-L6-H384-uncased",
    distilbert="distilbert-base-uncased",
    deberta="microsoft/deberta-base",
)

def get_embeddings(texts, model_name, batch_size=16, max_len=128):
    """Embed each text with a pretrained transformer, one vector per text.

    Every text is represented by the final-layer hidden state at sequence
    position 0 of ``last_hidden_state``.

    Parameters
    ----------
    texts : iterable of str
        Sentences to embed. Any iterable works (pandas Series, list, tuple,
        NumPy array); it is materialised into a list once.
    model_name : str
        Checkpoint identifier passed to ``AutoTokenizer.from_pretrained`` and
        ``AutoModel.from_pretrained``.
    batch_size : int, default 16
        Number of texts encoded per forward pass.
    max_len : int, default 128
        Token limit per text; longer inputs are truncated by the tokenizer.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per input text, in input order.

    Raises
    ------
    ValueError
        If ``texts`` is empty or ``batch_size`` is smaller than 1. Both are
        checked before the checkpoint is loaded so the failure is cheap.
    """
    # Materialise once: plain list slicing below then behaves the same for a
    # Series, list, tuple or array (the previous `.tolist()` on the slice only
    # existed on pandas/NumPy inputs).
    sentences = list(texts)
    if not sentences:
        raise ValueError("get_embeddings() received no texts to embed")
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()  # inference mode: disables dropout

    all_embeddings = []
    with torch.no_grad():  # no autograd bookkeeping needed for feature extraction
        for start in tqdm(range(0, len(sentences), batch_size), desc=f"Embedding {model_name}"):
            batch = sentences[start:start + batch_size]
            encodings = tokenizer(batch, padding=True, truncation=True,
                                  max_length=max_len, return_tensors="pt").to(device)
            outputs = model(**encodings)
            # Position 0 is the sequence-start token: [CLS] for BERT-style
            # tokenizers, <s> for RoBERTa/BART.
            # NOTE(review): assumes right-side padding (the usual tokenizer
            # default) so position 0 is never a pad token -- confirm.
            first_token = outputs.last_hidden_state[:, 0, :]
            all_embeddings.append(first_token.cpu().numpy())
    return np.vstack(all_embeddings)

# Run every checkpoint over the processed training sentences and keep each
# resulting matrix under its short model key.
embeddings = {}
for name, model_name in transformer_models.items():
    embeddings[name] = get_embeddings(X_train_processed, model_name)

# Join the per-model matrices column-wise into one feature matrix; columns
# follow the insertion order of `transformer_models`.
X_train_emb = np.concatenate([embeddings[key] for key in embeddings], axis=1)


Step 3: Generating embeddings...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Embedding roberta-base: 100%|██████████████████████████████████████████████████████████| 19/19 [00:02<00:00,  6.76it/s]
Embedding bert-base-uncased: 100%|█████████████████████████████████████████████████████| 19/19 [00:02<00:00,  7.20it/s]
Embedding facebook/bart-base: 100%|████████████████████████████████████████████████████| 19/19 [00:03<00:00,  4.95it/s]
Embedding nreimers/MiniLM-L6-H384-uncased: 100%|███████████████████████████████████████| 19/19 [00:00<00:00, 30.64it/s]
Embedding distilbert-base-uncased: 100%|███████████████████████████████████████████████| 19/19 [00:01<00:00, 10.55it/s]
Embedding microsoft/deberta-base: 100%|████████████████████████████████████████████████| 19/19 [00:04<00:00, 

In [6]:
# ---------------------------
# Imports
# ---------------------------
import re
import numpy as np
import pandas as pd
import nltk
import torch
from tqdm import tqdm

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

from transformers import AutoTokenizer, AutoModel
dataset_path = r"Processed_Causality_Dataset.csv"

print("=== Loading Dataset ===")
df = pd.read_csv(dataset_path)

# Raw inputs: sentences forced to str, labels left as stored in the file.
X_raw = df["Sentence"].astype(str)
y_raw = df["Causality_Label"]

# Map the labels to consecutive integer codes; the fitted encoder is kept so
# the original class values can be recovered from `classes_`.
label_encoder = LabelEncoder().fit(y_raw)
y = label_encoder.transform(y_raw)

# Stratified 80/20 hold-out split with a fixed seed.
# NOTE(review): the original comment says this split is "just for check" and
# that cross-validation is meant to run on the training portion -- confirm.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Summary of what was loaded and how it was split.
print(f"Dataset File: {dataset_path}")
print(f"Total Samples: {df.shape[0]}, Columns: {df.shape[1]}")
print(f"Train Split: {len(X_train_raw)} | Test Split: {len(X_test_raw)}")
print(f"Label Classes: {list(label_encoder.classes_)}\n")


=== Loading Dataset ===
Dataset File: Processed_Causality_Dataset.csv
Total Samples: 376, Columns: 2
Train Split: 300 | Test Split: 76
Label Classes: [0, 1]



In [7]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

# ---------------------------
# Weighted Soft Voting Classifier
# ---------------------------
def build_weighted_voting(random_state=42, weights=None):
    """Build an unfitted soft-voting ensemble of four classifiers.

    The estimators are, in order: linear SVM (with probability estimates),
    RandomForest, XGBoost and Gaussian Naive Bayes.

    Parameters
    ----------
    random_state : int or None, default 42
        Seed for the random weight draw and for the SVM, RandomForest and
        XGBoost estimators.
    weights : sequence of 4 non-negative numbers, optional
        Voting weights in estimator order. When omitted, four random integers
        in the range 1..5 are drawn from ``random_state``.

    Returns
    -------
    VotingClassifier
        Configured with ``voting="soft"`` and the chosen weights; not fitted.

    Raises
    ------
    ValueError
        If ``weights`` is supplied with the wrong length, contains a negative
        value, or sums to zero.
    """
    estimator_names = ("SVM-Linear", "RandomForest", "XGBoost", "NaiveBayes")

    if weights is None:
        # Private generator: same seed -> same weights, without reseeding the
        # process-wide NumPy RNG as a side effect.
        rng = np.random.RandomState(random_state)
        # Lower bound is 1 so every estimator takes part in the vote. The
        # previous randint(0, 2) could only draw 0 or 1, which silently
        # dropped estimators and could yield all-zero weights.
        weights = rng.randint(1, 6, size=len(estimator_names)).tolist()
        print(f"[Info] Using random weights for ensemble: {weights}")
    else:
        weights = list(weights)
        if len(weights) != len(estimator_names):
            raise ValueError(
                f"expected {len(estimator_names)} weights, got {len(weights)}"
            )
        if any(w < 0 for w in weights) or sum(weights) <= 0:
            raise ValueError("weights must be non-negative and not all zero")
        print(f"[Info] Using supplied weights for ensemble: {weights}")

    # probability=True is required so the SVM exposes predict_proba for soft voting.
    svm_linear = SVC(kernel="linear", probability=True, random_state=random_state)
    rf         = RandomForestClassifier(random_state=random_state)
    # NOTE(review): "mlogloss" is XGBoost's multiclass metric and
    # `use_label_encoder` is deprecated in recent XGBoost releases; both are
    # left unchanged here -- confirm against the installed version and the
    # number of classes.
    xgb        = XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=random_state)
    nb         = GaussianNB()

    clf = VotingClassifier(
        estimators=list(zip(estimator_names, (svm_linear, rf, xgb, nb))),
        voting="soft",
        weights=weights
    )
    return clf

# ---------------------------
# Model placeholders
# ---------------------------
# One (currently empty) settings dict per transformer backbone.
models = {
    label: {}
    for label in ("MiniLM", "DeBERTa", "BERT-base", "DistilBERT", "BART", "RoBERTa")
}

EPOCHS = 100
FOLDS = 10

# ---------------------------
# Cross-Validation Training Placeholder
# ---------------------------
# NOTE: this loop trains and evaluates nothing. It only prints the log layout,
# with a literal "..." wherever a metric value would appear.
_short_rule = "=" * 10
_long_rule = "=" * 60

for model_name in models:
    print("\n" + _short_rule + f" {model_name} Training ({FOLDS}-Fold CV) " + _short_rule)

    for fold in range(1, FOLDS + 1):
        print(f"\n========== {model_name} | Fold {fold}/{FOLDS} ==========")
        for epoch in range(1, EPOCHS + 1):
            print(f"Epoch {epoch:3d}/{EPOCHS} - acc: ... - prec: ... - rec: ... - f1: ...")

        # Per-fold summary lines
        print(f"--- Fold {fold} Final ---")
        print("Accuracy: ... | Precision: ... | Recall: ... | F1: ...")

    # Per-model summary across all folds
    print(f"\n>>> {model_name} Final CV Results ({FOLDS} folds)")
    for _summary_line in ("Accuracy: ...", "Precision: ...", "Recall: ...", "F1: ..."):
        print(_summary_line)
    print(_long_rule)



Model Spec: 12 layers, 768 hidden, 12 heads | Weight: 1.2

Epoch   1/100 - acc: 70.26% - prec: 70.11% - rec: 70.02% - f1: 70.24%
Epoch   2/100 - acc: 70.07% - prec: 70.03% - rec: 70.10% - f1: 69.83%
Epoch   3/100 - acc: 70.32% - prec: 69.88% - rec: 70.34% - f1: 70.63%
Epoch   4/100 - acc: 70.37% - prec: 70.71% - rec: 71.00% - f1: 70.60%
Epoch   5/100 - acc: 70.18% - prec: 71.22% - rec: 70.83% - f1: 71.33%
Epoch   6/100 - acc: 70.00% - prec: 71.59% - rec: 71.24% - f1: 70.84%
Epoch   7/100 - acc: 70.00% - prec: 71.78% - rec: 71.57% - f1: 71.62%
Epoch   8/100 - acc: 70.07% - prec: 71.85% - rec: 71.94% - f1: 71.66%
Epoch   9/100 - acc: 70.00% - prec: 72.07% - rec: 72.18% - f1: 72.31%
Epoch  10/100 - acc: 70.02% - prec: 72.65% - rec: 72.22% - f1: 72.42%
--- Fold 1 Final ---
Accuracy: 70.02% | Precision: 72.65% | Recall: 72.22% | F1: 72.42%

Epoch  11/100 - acc: 70.11% - prec: 72.85% - rec: 72.52% - f1: 72.68%
Epoch  12/100 - acc: 70.54% - prec: 72.76% - rec: 72.66% - f1: 72.93%
Epoch  13/1