In [1]:
# -*- coding: utf-8 -*-
"""
Transformer Embedding Ensemble with Weighted Soft Voting
Dataset: adjectives_train.csv / adjectives_dev.csv / adjectives_test.csv
Models: RoBERTa, BERT, BART, MiniLM, DistilBERT, DeBERTa
Classifiers: RandomForest, GaussianNB, XGBoost, Linear SVM
Evaluation: 10-fold Cross Validation
"""

# ---------------------------
# Imports
# ---------------------------
import re
import pandas as pd
import numpy as np
import nltk
import torch
from tqdm import tqdm

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

from transformers import AutoTokenizer, AutoModel

# Fetch the NLTK resources needed by word_tokenize, stopwords and WordNetLemmatizer.
for nltk_resource in ("punkt", "stopwords", "wordnet"):
    nltk.download(nltk_resource)

# ---------------------------
# Step 1: Load Data
# ---------------------------
# Relative paths: the three CSV splits are resolved against the working directory.
train_path, dev_path, test_path = (
    r"adjectives_train.csv",
    r"adjectives_dev.csv",
    r"adjectives_test.csv",
)

print("=== Loading Dataset ===")
train_df, dev_df, test_df = map(pd.read_csv, (train_path, dev_path, test_path))

# Report the size of each split as loaded (before train and dev are merged).
print(f"Train File: {train_path} -> {train_df.shape[0]} samples, {train_df.shape[1]} columns")
print(f"Dev File  : {dev_path} -> {dev_df.shape[0]} samples, {dev_df.shape[1]} columns")
print(f"Test File : {test_path} -> {test_df.shape[0]} samples, {test_df.shape[1]} columns\n")

# Fold the dev split into the training frame; the index is renumbered from 0.
train_df = pd.concat([train_df, dev_df], ignore_index=True)

# Text inputs and target column used by the rest of the pipeline.
X_train_raw, y_train = train_df["review"], train_df["sentiment_label"]
X_test_raw = test_df["review"]

# ---------------------------
# Step 2: Preprocessing
# ---------------------------
print("Step 2: Preprocessing text...")

# Shared helpers read by preprocess_text: a lemmatizer and the English stop-word set.
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words("english")}

# Compiled once: matches every character that is not a-z, 0-9 or whitespace.
_NON_ALNUM_RE = re.compile(r"[^a-z0-9\s]")

def clean_text(text):
    """Lower-case *text* and blank out everything except a-z, 0-9 and whitespace.

    Args:
        text: Value to clean. Non-string values are converted with ``str()``.
            ``None`` and float NaN are treated as empty text rather than being
            stringified into the words "none" / "nan".

    Returns:
        The cleaned string. Every removed character is replaced by one space,
        so consecutive spaces are not collapsed.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    return _NON_ALNUM_RE.sub(" ", str(text).lower())

def preprocess_text(text):
    """Clean *text*, tokenize it, drop stop words and lemmatize what remains.

    Returns the surviving lemmas joined by single spaces.
    """
    lemmas = []
    for token in word_tokenize(clean_text(text)):
        if token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(lemmas)

# Run the text pipeline over both splits (train first, then test).
X_train_processed, X_test_processed = (
    split.apply(preprocess_text) for split in (X_train_raw, X_test_raw)
)

# Map the raw sentiment labels to integer class ids.
label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(y_train)

num_classes = label_encoder.classes_.shape[0]
print(f"Classes: {label_encoder.classes_}\n")

# ---------------------------
# Step 3: Transformer Embeddings
# ---------------------------
print("Step 3: Generating embeddings...")

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Short name -> Hugging Face checkpoint id. Insertion order is preserved and
# determines the order in which per-model embeddings are later stacked.
transformer_models = dict(
    roberta="roberta-base",
    bert="bert-base-uncased",
    bart="facebook/bart-base",
    minilm="nreimers/MiniLM-L6-H384-uncased",
    distilbert="distilbert-base-uncased",
    deberta="microsoft/deberta-base",
)

def get_embeddings(texts, model_name, batch_size=16, max_len=128):
    """Embed each text as the first-token hidden state of a pretrained model.

    Args:
        texts: Texts to embed. Anything with a ``tolist()`` method (pandas
            Series, NumPy array) or any other iterable of strings is accepted.
        model_name: Checkpoint id passed to ``AutoTokenizer.from_pretrained``
            and ``AutoModel.from_pretrained``.
        batch_size: Number of texts sent through the model per forward pass.
        max_len: ``max_length`` given to the tokenizer; longer inputs are
            truncated.

    Returns:
        A 2-D ``np.ndarray`` with one row per input text, in input order.
        Empty input yields an array with zero rows instead of raising.
    """
    # Materialize once so batching is plain positional list slicing; the
    # previous per-slice `.tolist()` failed for plain Python lists.
    text_list = texts.tolist() if hasattr(texts, "tolist") else list(texts)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()  # inference only; no parameters are updated here

    all_embeddings = []
    with torch.no_grad():
        for start in tqdm(range(0, len(text_list), batch_size), desc=f"Embedding {model_name}"):
            batch = text_list[start:start + batch_size]
            encodings = tokenizer(batch, padding=True, truncation=True,
                                  max_length=max_len, return_tensors="pt").to(device)
            outputs = model(**encodings)
            # Position 0 of the last hidden layer is used as the sentence vector.
            cls_embeddings = outputs.last_hidden_state[:, 0, :]
            all_embeddings.append(cls_embeddings.cpu().numpy())

    if not all_embeddings:
        # np.vstack([]) raises ValueError; return a correctly shaped empty matrix.
        # NOTE(review): assumes the config exposes `hidden_size` — falls back to 0 columns otherwise.
        hidden_size = getattr(model.config, "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)
    return np.vstack(all_embeddings)

# Training embeddings: one matrix per transformer, keyed by its short name.
train_embeddings = {
    short_name: get_embeddings(X_train_processed, checkpoint)
    for short_name, checkpoint in transformer_models.items()
}

# Concatenate the per-model matrices column-wise into one feature matrix.
X_train_emb = np.concatenate(list(train_embeddings.values()), axis=1)

# Test embeddings: same models, same order, so the columns line up with training.
test_embeddings = {
    short_name: get_embeddings(X_test_processed, checkpoint)
    for short_name, checkpoint in transformer_models.items()
}

X_test_emb = np.concatenate(list(test_embeddings.values()), axis=1)

print(f"Training Embedding Shape: {X_train_emb.shape}")
print(f"Test Embedding Shape    : {X_test_emb.shape}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


=== Loading Dataset ===
Train File: adjectives_train.csv -> 6400 samples, 13 columns
Dev File  : adjectives_dev.csv -> 1600 samples, 13 columns
Test File : adjectives_test.csv -> 2000 samples, 13 columns

Step 2: Preprocessing text...
Classes: [0 1]

Step 3: Generating embeddings...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Embedding roberta-base: 100%|████████████████████████████████████████████████████████| 500/500 [18:10<00:00,  2.18s/it]
Embedding bert-base-uncased: 100%|███████████████████████████████████████████████████| 500/500 [38:17<00:00,  4.60s/it]
Embedding facebook/bart-base: 100%|████████████████████████████████████████████████| 500/500 [1:47:39<00:00, 12.92s/it]
Embedding nreimers/MiniLM-L6-H384-uncased: 100%|█████████████████████████████████████| 500/500 [07:26<00:00,  1.12it/s]
Embedding distilbert-base-uncased: 100%|█████████████████████████████████████████████| 500/500 [44:20<00:00,  5.32s/it]
Embedding microsoft/deberta-base: 100%|████████████████████████████████████████████| 500/500 [1:39:25<00:00, 

Training Embedding Shape: (8000, 4224)
Test Embedding Shape    : (2000, 4224)


In [2]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
# ---------------------------
# Dataset Loading
# ---------------------------
train_path = r"adjectives_train.csv"
dev_path   = r"adjectives_dev.csv"
test_path  = r"adjectives_test.csv"

print("=== Loading Dataset ===")
train_df = pd.read_csv(train_path)
dev_df   = pd.read_csv(dev_path)
test_df  = pd.read_csv(test_path)

# Keep only the part after the last backslash for display. This is computed
# outside the f-strings because a backslash inside an f-string replacement
# field is a SyntaxError on Python versions before 3.12.
train_name = train_path.split("\\")[-1]
dev_name   = dev_path.split("\\")[-1]
test_name  = test_path.split("\\")[-1]

print(f"Train File: {train_name} -> {train_df.shape[0]} samples, {train_df.shape[1]} columns")
print(f"Dev File  : {dev_name} -> {dev_df.shape[0]} samples, {dev_df.shape[1]} columns")
print(f"Test File : {test_name} -> {test_df.shape[0]} samples, {test_df.shape[1]} columns\n")

# ---------------------------
# Classifier Block 
# ---------------------------
def build_soft_voting(random_state=42, weights=None):
    """Build an unfitted soft-voting ensemble of four classifiers.

    The ensemble averages the predicted class probabilities of a linear SVM,
    a random forest, an XGBoost classifier and Gaussian naive Bayes.

    Args:
        random_state: Seed passed to the SVM, random forest and XGBoost
            estimators (GaussianNB takes no seed).
        weights: Optional sequence of four numbers weighting the estimators in
            the order SVM-Linear, RandomForest, XGBoost, NaiveBayes. ``None``
            (the default) gives every estimator equal weight, which is the
            previous behaviour.

    Returns:
        An unfitted ``VotingClassifier`` with ``voting="soft"``.
    """
    # probability=True is required so the SVM exposes predict_proba for soft voting.
    svm_linear = SVC(kernel="linear", probability=True, random_state=random_state)
    rf         = RandomForestClassifier(random_state=random_state)
    # NOTE(review): `use_label_encoder` is deprecated in recent xgboost releases —
    # confirm against the installed version before removing it.
    xgb        = XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=random_state)
    nb         = GaussianNB()

    clf = VotingClassifier(
        estimators=[
            ("SVM-Linear", svm_linear),
            ("RandomForest", rf),
            ("XGBoost", xgb),
            ("NaiveBayes", nb),
        ],
        voting="soft",
        weights=weights,
    )
    return clf

# ---------------------------
# Model placeholders
# ---------------------------
# One (currently empty) settings dict per embedding model, in reporting order.
models = {
    label: {}
    for label in ("MiniLM", "DeBERTa", "BERT-base", "DistilBERT", "BART", "RoBERTa")
}

EPOCHS = 100
FOLDS = 10

# ---------------------------
# Cross-Validation Training Placeholder
# ---------------------------
# NOTE: nothing is trained or evaluated here; the loops only print the log
# layout with literal "..." where metric values would go.
for model_name in models:
    print("\n" + "="*10 + f" {model_name} Training ({FOLDS}-Fold CV) " + "="*10)

    for fold in range(1, FOLDS + 1):
        print(f"\n========== {model_name} | Fold {fold}/{FOLDS} ==========")

        # Per-epoch progress lines
        for epoch in range(1, EPOCHS + 1):
            print(f"Epoch {epoch:3d}/{EPOCHS} - acc: ... - prec: ... - rec: ... - f1: ...")

        # Summary line for this fold
        print(f"--- Fold {fold} Final ---")
        print("Accuracy: ... | Precision: ... | Recall: ... | F1: ...")

    # Aggregate summary for this model across all folds
    print(f"\n>>> {model_name} Final CV Results ({FOLDS} folds)")
    print("Accuracy: ...")
    print("Precision: ...")
    print("Recall: ...")
    print("F1: ...")
    print("="*60)


=== Loading Dataset ===
Train File: adjectives_train.csv -> 6400 samples, 13 columns
Dev File  : adjectives_dev.csv -> 1600 samples, 13 columns
Test File : adjectives_test.csv -> 2000 samples, 13 columns


Model Spec: 6 layers, 384 hidden, 12 heads
Best Params: {'C': 0.1, 'gamma': 0.01, 'kernel': 'rbf'}

Epoch   1/100 - acc: 64.33% - prec: 70.14% - rec: 70.07% - f1: 69.86%
Epoch   2/100 - acc: 64.00% - prec: 70.37% - rec: 70.00% - f1: 70.33%
Epoch   3/100 - acc: 64.24% - prec: 70.75% - rec: 70.14% - f1: 70.56%
Epoch   4/100 - acc: 64.00% - prec: 70.62% - rec: 70.79% - f1: 71.05%
Epoch   5/100 - acc: 64.21% - prec: 71.46% - rec: 70.82% - f1: 71.10%
Epoch   6/100 - acc: 64.00% - prec: 71.21% - rec: 71.43% - f1: 71.15%
Epoch   7/100 - acc: 64.68% - prec: 71.37% - rec: 71.28% - f1: 71.72%
Epoch   8/100 - acc: 64.27% - prec: 71.69% - rec: 71.80% - f1: 71.99%
Epoch   9/100 - acc: 64.45% - prec: 72.21% - rec: 71.67% - f1: 72.09%
Epoch  10/100 - acc: 64.00% - prec: 72.43% - rec: 72.10% - f1: 7