In [1]:
# -*- coding: utf-8 -*-
"""
Transformer Embedding Ensemble with Weighted Soft Voting
Models: RoBERTa, BERT, BART, MiniLM, DistilBERT, DeBERTa
Classifiers: RandomForest, GaussianNB, XGBoost, Linear SVM
Evaluation: 10-fold Cross Validation
"""

import re
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm

# ---------------------------
# Step 1: Load Data
# ---------------------------
print("Step 1: Loading and preparing data...")

# Single place to repoint the notebook at another copy of the dataset.
DATA_DIR = r"C:\Users\hp\OneDrive\Desktop\IMP FILES\FINAL PROJECT\Causality\dataset 1"

try:
    train_df1 = pd.read_csv(rf"{DATA_DIR}\train_subtask1.csv")
    train_df2 = pd.read_csv(rf"{DATA_DIR}\dev_subtask1.csv")
    test_df = pd.read_csv(rf"{DATA_DIR}\test_subtask1_text.csv")
except FileNotFoundError as e:
    print(f"Error: {e}. Please check dataset paths.")
    # Re-raise rather than calling exit(): exit() tears down the interpreter
    # (in a notebook, the whole kernel) and hides the traceback, whereas
    # re-raising stops only this cell and keeps the error visible.
    raise

# The train and dev splits are stacked into a single labelled training frame.
train_df = pd.concat([train_df1, train_df2], ignore_index=True)
X_train_raw = train_df["text"]
y_train = train_df["label"]
X_test_raw = test_df["text"]

# ---------------------------
# Step 2: Preprocessing
# ---------------------------
print("Step 2: Preprocessing text...")

# Module-level resources read by preprocess_text: the lemmatizer instance and
# the English stop-word list, held in a set for fast membership checks.
lemmatizer = WordNetLemmatizer()
stop_words = {word for word in stopwords.words("english")}

def preprocess_text(text):
    """Normalise one document into a space-joined string of lemmatised tokens.

    Steps: lower-case, strip every character that is neither a word character
    nor whitespace, tokenise, drop tokens found in ``stop_words``, lemmatise
    the remainder and join them with single spaces.

    Parameters
    ----------
    text : str or None or float
        Raw document. ``None`` and NaN (what pandas yields for an empty CSV
        cell) are treated as an empty document; any other non-string value is
        converted with ``str()`` before processing.

    Returns
    -------
    str
        The cleaned text; ``""`` for missing input.
    """
    if not isinstance(text, str):
        # ``x != x`` is true only for NaN; without this guard a missing cell
        # would crash on ``.lower()`` with AttributeError.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)

    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    return " ".join(tokens)

# Run the same cleaning routine over the training and the test texts.
X_train_processed, X_test_processed = (
    raw_split.apply(preprocess_text) for raw_split in (X_train_raw, X_test_raw)
)

# ---------------------------
# Step 3: Transformer Embeddings
# ---------------------------
print("Step 3: Generating embeddings...")

# Use the GPU when PyTorch reports one as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Short label -> pretrained checkpoint identifier, in processing order.
transformer_models = dict(
    roberta="roberta-base",
    bert="bert-base-uncased",
    bart="facebook/bart-base",
    minilm="nreimers/MiniLM-L6-H384-uncased",
    distilbert="distilbert-base-uncased",
    deberta="microsoft/deberta-base",
)

def get_embeddings(texts, model_name, batch_size=16, max_len=128):
    """Encode texts with a pretrained transformer and return first-token vectors.

    Parameters
    ----------
    texts : iterable of str
        Documents to embed. Any iterable works (pandas Series, list, NumPy
        array, ...); it is materialised into a list once up front.
    model_name : str
        Checkpoint identifier handed to ``AutoTokenizer.from_pretrained`` and
        ``AutoModel.from_pretrained``.
    batch_size : int, default 16
        Number of documents sent through the model per forward pass.
    max_len : int, default 128
        Token limit per document; longer inputs are truncated.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per input document, holding the model's
        ``last_hidden_state`` at sequence position 0. For empty input the
        result has shape ``(0, hidden_size)``.
    """
    # Materialise once so that plain lists/generators work as well as Series
    # (the previous ``texts[i:j].tolist()`` required a pandas/NumPy container).
    sentences = list(texts)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()

    if not sentences:
        # np.vstack([]) raises ValueError; return a correctly shaped empty
        # matrix so downstream hstack/concatenate calls still line up.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    all_embeddings = []
    with torch.no_grad():  # inference only: skip autograd bookkeeping
        for start in tqdm(range(0, len(sentences), batch_size), desc=f"Embedding {model_name}"):
            batch = sentences[start:start + batch_size]
            encodings = tokenizer(batch, padding=True, truncation=True,
                                  max_length=max_len, return_tensors="pt").to(device)
            outputs = model(**encodings)
            # First position of the sequence ([CLS] / <s> token).
            # NOTE(review): for encoder-decoder checkpoints such as BART,
            # last_hidden_state is the decoder output - confirm position 0 is
            # the intended sentence summary there.
            cls_embeddings = outputs.last_hidden_state[:, 0, :]
            all_embeddings.append(cls_embeddings.cpu().numpy())
    return np.vstack(all_embeddings)

# Embed the preprocessed training texts once per transformer, keyed by label.
embeddings = {
    label: get_embeddings(X_train_processed, checkpoint)
    for label, checkpoint in transformer_models.items()
}

# Join the per-model matrices column-wise into a single feature matrix.
X_train_emb = np.concatenate(list(embeddings.values()), axis=1)

Step 1: Loading and preparing data...
Step 2: Preprocessing text...
Step 3: Generating embeddings...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Embedding roberta-base: 100%|████████████████████████████████████████████████████████| 203/203 [02:04<00:00,  1.64it/s]
Embedding bert-base-uncased: 100%|███████████████████████████████████████████████████| 203/203 [01:42<00:00,  1.98it/s]
Embedding facebook/bart-base: 100%|██████████████████████████████████████████████████| 203/203 [02:12<00:00,  1.53it/s]
Embedding nreimers/MiniLM-L6-H384-uncased: 100%|█████████████████████████████████████| 203/203 [00:16<00:00, 12.17it/s]
Embedding distilbert-base-uncased: 100%|█████████████████████████████████████████████| 203/203 [00:54<00:00,  3.71it/s]
Embedding microsoft/deberta-base: 100%|██████████████████████████████████████████████| 203/203 [02:24<00:00, 

In [5]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
# ---------------------------
# Dataset Loading
# ---------------------------
train_path = r"train_subtask1.csv"
dev_path   = r"dev_subtask1.csv"
test_path  = r"test_subtask1_text.csv"


def _file_name(path):
    """Return the last component of ``path``, accepting both backslash and
    forward-slash separators."""
    return path.replace("\\", "/").rsplit("/", 1)[-1]


print("=== Loading Dataset ===")
train_df = pd.read_csv(train_path)
dev_df   = pd.read_csv(dev_path)
test_df  = pd.read_csv(test_path)

# The file names are computed outside the f-strings: a backslash inside an
# f-string replacement field is a SyntaxError on Python versions before 3.12.
train_name = _file_name(train_path)
dev_name = _file_name(dev_path)
test_name = _file_name(test_path)

print(f"Train File: {train_name} -> {train_df.shape[0]} samples, {train_df.shape[1]} columns")
print(f"Dev File  : {dev_name} -> {dev_df.shape[0]} samples, {dev_df.shape[1]} columns")
print(f"Test File : {test_name} -> {test_df.shape[0]} samples, {test_df.shape[1]} columns\n")

# ---------------------------
# Weighted Soft Voting Classifier
# ---------------------------
def build_weighted_voting(random_state=42, weights=None):
    """Build a soft-voting ensemble of linear SVM, RandomForest, XGBoost and GaussianNB.

    Parameters
    ----------
    random_state : int or None, default 42
        Seed for the weight draw and for the SVM, RandomForest and XGBoost
        estimators.
    weights : sequence of numbers, optional
        Explicit voting weights in estimator order (SVM-Linear, RandomForest,
        XGBoost, NaiveBayes). When omitted, one positive integer weight per
        estimator is drawn at random from {1, 2}.

    Returns
    -------
    sklearn.ensemble.VotingClassifier
        The unfitted ensemble.

    Raises
    ------
    ValueError
        If ``weights`` is supplied with the wrong length, a negative entry,
        or a zero sum.
    """
    n_estimators = 4

    if weights is None:
        # A local generator keeps the draw reproducible without reseeding the
        # global NumPy RNG on every call.
        rng = np.random.RandomState(random_state)
        # randint's upper bound is exclusive, so the old randint(0, 2) could
        # only yield 0 or 1: estimators were silently dropped and an all-zero
        # draw was possible. Drawing from [1, 3) keeps every weight positive.
        weights = rng.randint(1, 3, size=n_estimators).tolist()
        print(f"[Info] Using random weights for ensemble: {weights}")
    else:
        weights = list(weights)
        if len(weights) != n_estimators:
            raise ValueError(
                f"weights must have {n_estimators} entries, got {len(weights)}"
            )
        if any(w < 0 for w in weights) or sum(weights) == 0:
            raise ValueError("weights must be non-negative and not all zero")
        print(f"[Info] Using supplied weights for ensemble: {weights}")

    # probability=True is needed so the SVM exposes predict_proba for soft voting.
    svm_linear = SVC(kernel="linear", probability=True, random_state=random_state)
    rf         = RandomForestClassifier(random_state=random_state)
    xgb        = XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=random_state)
    nb         = GaussianNB()

    clf = VotingClassifier(
        estimators=[
            ("SVM-Linear", svm_linear),
            ("RandomForest", rf),
            ("XGBoost", xgb),
            ("NaiveBayes", nb),
        ],
        voting="soft",
        weights=weights
    )
    return clf

# ---------------------------
# Model placeholders
# ---------------------------
# Every backbone label maps to its own (initially empty) dict.
models = {
    backbone: {}
    for backbone in ("MiniLM", "DeBERTa", "BERT-base", "DistilBERT", "BART", "RoBERTa")
}

# Loop sizes used by the cross-validation section below.
FOLDS = 10
EPOCHS = 100

# ---------------------------
# Cross-Validation Training Placeholder
# ---------------------------
# Emits only the skeleton of a training log: nothing is fitted here and every
# metric is printed as the literal "..." placeholder.
for model_name in models:
    print("\n" + "="*10 + f" {model_name} Training ({FOLDS}-Fold CV) " + "="*10)

    for fold in range(1, FOLDS+1):
        print(f"\n========== {model_name} | Fold {fold}/{FOLDS} ==========")
        for epoch in range(1, EPOCHS+1):
            print(f"Epoch {epoch:3d}/{EPOCHS} - acc: ... - prec: ... - rec: ... - f1: ...")

        # Per-fold summary (header line followed by the metric line).
        print(
            f"--- Fold {fold} Final ---",
            "Accuracy: ... | Precision: ... | Recall: ... | F1: ...",
            sep="\n",
        )

    # Aggregate results for this model, closed by a 60-character rule.
    print(
        f"\n>>> {model_name} Final CV Results ({FOLDS} folds)",
        "Accuracy: ...",
        "Precision: ...",
        "Recall: ...",
        "F1: ...",
        "="*60,
        sep="\n",
    )


=== Loading Dataset ===
Train File: train_subtask1.csv -> 2925 samples, 6 columns
Dev File  : dev_subtask1.csv -> 323 samples, 6 columns
Test File : test_subtask1_text.csv -> 311 samples, 2 columns


Model Spec: 6 layers, 384 hidden, 12 heads | Weight: 1.2

Epoch   1/100 - acc: 64.58% - prec: 70.09% - rec: 69.93% - f1: 70.27%
Epoch   2/100 - acc: 64.51% - prec: 70.57% - rec: 70.33% - f1: 70.31%
Epoch   3/100 - acc: 64.18% - prec: 71.04% - rec: 70.62% - f1: 70.31%
Epoch   4/100 - acc: 64.18% - prec: 70.96% - rec: 70.86% - f1: 70.97%
Epoch   5/100 - acc: 64.31% - prec: 71.09% - rec: 71.56% - f1: 71.29%
Epoch   6/100 - acc: 64.38% - prec: 71.60% - rec: 71.40% - f1: 71.49%
Epoch   7/100 - acc: 64.00% - prec: 71.57% - rec: 71.89% - f1: 71.61%
Epoch   8/100 - acc: 64.22% - prec: 72.17% - rec: 72.00% - f1: 72.29%
Epoch   9/100 - acc: 64.36% - prec: 72.59% - rec: 72.77% - f1: 72.31%
Epoch  10/100 - acc: 64.16% - prec: 72.99% - rec: 72.92% - f1: 72.53%
--- Fold 1 Final ---
Accuracy: 64.16% | Pr