Model based on Utterance alone

In [None]:
import torch
import pandas as pd
import random
import nltk
from nltk.corpus import wordnet
from transformers import MarianMTModel, MarianTokenizer
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Download necessary NLTK resources
nltk.download("wordnet")
nltk.download("omw-1.4")

print("Loading MarianMT Models for Spanish Back Translation...")
src_lang = "Helsinki-NLP/opus-mt-en-es"  # English → Spanish
tgt_lang = "Helsinki-NLP/opus-mt-es-en"  # Spanish → English

src_tokenizer = MarianTokenizer.from_pretrained(src_lang)
src_model = MarianMTModel.from_pretrained(src_lang)
tgt_tokenizer = MarianTokenizer.from_pretrained(tgt_lang)
tgt_model = MarianMTModel.from_pretrained(tgt_lang)

def back_translate(sentence):
    """Translate sentence to Spanish and back to English."""
    tokens = src_tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=128)
    translated = src_model.generate(**tokens)
    spanish_text = src_tokenizer.batch_decode(translated, skip_special_tokens=True)[0]

    tokens = tgt_tokenizer(spanish_text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    back_translated = tgt_model.generate(**tokens)
    return tgt_tokenizer.batch_decode(back_translated, skip_special_tokens=True)[0]

def synonym_replacement(sentence, n=1):
    """Replace n random words in the sentence with synonyms."""
    words = sentence.split()
    new_words = words.copy()
    random_word_list = list(set(words))
    random.shuffle(random_word_list)

    num_replaced = 0
    for word in random_word_list:
        synonyms = wordnet.synsets(word)
        if synonyms:
            synonym = synonyms[0].lemmas()[0].name()
            if synonym.lower() != word.lower():
                new_words = [synonym if w == word else w for w in new_words]
                num_replaced += 1
            if num_replaced >= n:
                break
    return " ".join(new_words)

def augment_sentence(sentence):
    """Apply back translation and synonym replacement."""
    back_translated = back_translate(sentence)
    synonym_replaced = synonym_replacement(back_translated)
    return synonym_replaced

def load_mustard_data(file_path):
    """Load Mustard dataset and apply augmentation."""
    print("Loading dataset...")
    df = pd.read_csv(file_path)
    df = df[['utterance', 'sarcasm']]
    df['sarcasm'] = df['sarcasm'].astype(int)

    print("Applying data augmentation...")
    augmented_sentences = [augment_sentence(sentence) for sentence in df['utterance']]

    df_aug = pd.DataFrame({"utterance": augmented_sentences, "sarcasm": df['sarcasm']})
    df = pd.concat([df, df_aug]).reset_index(drop=True)  # Combine original and augmented data

    print(f"Final dataset size after augmentation: {len(df)} samples")
    return df

def preprocess_data(df, tokenizer):
    """Tokenize utterances using RoBERTa tokenizer."""
    print("Tokenizing dataset...")
    return tokenizer(df['utterance'].tolist(), padding=True, truncation=True, max_length=128)

def prepare_datasets(file_path, tokenizer):
    """Prepare train, validation, and test datasets."""
    df = load_mustard_data(file_path)

    print("Splitting dataset...")
    train_texts, test_texts, train_labels, test_labels = train_test_split(df['utterance'], df['sarcasm'], test_size=0.2, random_state=42)
    train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.1, random_state=42)

    print("Encoding datasets...")
    train_encodings = tokenizer(list(train_texts), padding=True, truncation=True, max_length=128)
    val_encodings = tokenizer(list(val_texts), padding=True, truncation=True, max_length=128)
    test_encodings = tokenizer(list(test_texts), padding=True, truncation=True, max_length=128)

    train_dataset = Dataset.from_dict({"input_ids": train_encodings['input_ids'], "attention_mask": train_encodings['attention_mask'], "labels": list(train_labels)})
    val_dataset = Dataset.from_dict({"input_ids": val_encodings['input_ids'], "attention_mask": val_encodings['attention_mask'], "labels": list(val_labels)})
    test_dataset = Dataset.from_dict({"input_ids": test_encodings['input_ids'], "attention_mask": test_encodings['attention_mask'], "labels": list(test_labels)})

    print("Datasets ready.")
    return DatasetDict({"train": train_dataset, "validation": val_dataset, "test": test_dataset}), list(test_labels)

def compute_metrics(pred):
    """Compute accuracy for evaluation."""
    predictions = torch.argmax(torch.tensor(pred.predictions), dim=1)
    labels = torch.tensor(pred.label_ids)
    accuracy = (predictions == labels).float().mean().item()
    return {"accuracy": accuracy}

# Load tokenizer and dataset
print("Loading RoBERTa tokenizer...")
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
datasets, true_labels = prepare_datasets("/content/sarcasm_data.csv", tokenizer)

# Load model
print("Loading RoBERTa model...")
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
model.classifier.apply(model._init_weights)  # Initialize classifier weights

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    learning_rate=2e-5,
    load_best_model_at_end=True,
    gradient_accumulation_steps=2,
    report_to="none"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["validation"],
    compute_metrics=compute_metrics,
)

# Train model
print("Training started...")
trainer.train()
print("Training complete.")

# Evaluate model
print("Evaluating model on test set...")
test_results = trainer.evaluate(datasets["test"])
print("Test Accuracy:", test_results["eval_accuracy"])

# Generate Predictions and Classification Report
print("Generating predictions...")
predictions = trainer.predict(datasets["test"])
pred_labels = torch.argmax(torch.tensor(predictions.predictions), dim=1).tolist()

print("\nClassification Report:")
print(classification_report(true_labels, pred_labels, target_names=["Non-Sarcastic", "Sarcastic"]))


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Loading MarianMT Models for Spanish Back Translation...




Loading RoBERTa tokenizer...
Loading dataset...
Applying data augmentation...
Final dataset size after augmentation: 1380 samples
Splitting dataset...
Encoding datasets...
Datasets ready.
Loading RoBERTa model...


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training started...


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.680913,0.621622
2,No log,0.573096,0.720721
3,No log,0.494252,0.747748
4,No log,0.436416,0.846847
5,No log,0.616677,0.72973


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.680913,0.621622
2,No log,0.573096,0.720721
3,No log,0.494252,0.747748
4,No log,0.436416,0.846847
5,No log,0.616677,0.72973
6,No log,0.795072,0.747748
7,No log,0.65582,0.801802
8,No log,0.74351,0.81982
9,No log,0.828362,0.828829


Training complete.
Evaluating model on test set...


Test Accuracy: 0.717391312122345
Generating predictions...

Classification Report:
               precision    recall  f1-score   support

Non-Sarcastic       0.71      0.74      0.72       137
    Sarcastic       0.73      0.70      0.71       139

     accuracy                           0.72       276
    macro avg       0.72      0.72      0.72       276
 weighted avg       0.72      0.72      0.72       276



Model based on Utterance and Context

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from nltk.corpus import wordnet
import random
from deep_translator import GoogleTranslator
import os

# Disable W&B logging
os.environ["WANDB_DISABLED"] = "true"

# Load dataset
print("Loading dataset...")
df = pd.read_csv("/content/sarcasm_data.csv")

print("Applying data augmentation...")
def synonym_replacement(sentence, n=2):
    words = sentence.split()
    new_words = words.copy()
    for _ in range(n):
        word_idx = random.randint(0, len(words) - 1)
        synonyms = wordnet.synsets(words[word_idx])
        if synonyms:
            new_words[word_idx] = synonyms[0].lemmas()[0].name()
    return ' '.join(new_words)

def back_translation(sentence, src_lang='en', mid_lang='es'):
    translated = GoogleTranslator(source='auto', target=mid_lang).translate(sentence)
    back_translated = GoogleTranslator(source=mid_lang, target=src_lang).translate(translated)
    return back_translated

# Apply data augmentation on utterance alone
df["augmented"] = df["utterance"].apply(lambda x: synonym_replacement(x))
df["back_translated"] = df["utterance"].apply(lambda x: back_translation(x))

# Prepare final dataset
print("Preparing dataset...")
df_augmented = df.melt(id_vars=["sarcasm"], value_vars=["utterance", "augmented", "back_translated"], var_name="type", value_name="text")

# Train-test split
print("Splitting dataset...")
train_texts, val_texts, train_labels, val_labels = train_test_split(df_augmented['text'], df_augmented['sarcasm'], test_size=0.2, random_state=42)

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

print("Tokenizing dataset...")
def tokenize_function(texts):
    return tokenizer(texts.tolist(), padding="max_length", truncation=True, max_length=128, return_tensors="pt")

train_encodings = tokenize_function(train_texts)
val_encodings = tokenize_function(val_texts)

class SarcasmDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels.iloc[idx])
        return item

train_dataset = SarcasmDataset(train_encodings, train_labels)
val_dataset = SarcasmDataset(val_encodings, val_labels)

print("Initializing model...")
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to=[]  # Disable W&B logging
)

print("Starting training...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()

print("Evaluating model...")
predictions = trainer.predict(val_dataset)
preds = torch.argmax(torch.tensor(predictions.predictions), dim=1).numpy()
print(classification_report(val_labels, preds))

# Save model
print("Saving model...")
model.save_pretrained("./roberta_sarcasm_model")
tokenizer.save_pretrained("./roberta_sarcasm_model")
print("Training complete!")


Loading dataset...
Applying data augmentation...
Preparing dataset...
Splitting dataset...
Tokenizing dataset...
Initializing model...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss
1,No log,0.592693
2,No log,0.503097
3,0.478400,0.600597


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Evaluating model...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


              precision    recall  f1-score   support

           0       0.75      0.97      0.85       200
           1       0.96      0.70      0.81       214

    accuracy                           0.83       414
   macro avg       0.86      0.83      0.83       414
weighted avg       0.86      0.83      0.83       414

Saving model...
Training complete!


Model Based on Context and Utterane Embedded with Speakers

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from nltk.corpus import wordnet
import random
from deep_translator import GoogleTranslator
import os

# Disable W&B logging
os.environ["WANDB_DISABLED"] = "true"

# Load dataset
print("Loading dataset...")
df = pd.read_csv("/content/sarcasm_data.csv")

# Function to add speaker to utterance and context
def add_speaker_info(row):
    # Add speaker to utterance
    row["utterance"] = f"{row['speaker']}: {row['utterance']}"

    # Add corresponding speakers to context
    row["context"] = [f"{sp}: {ctx}" for sp, ctx in zip(eval(row["context_speakers"]), eval(row["context"]))]

    return row

# Apply speaker modification
df = df.apply(add_speaker_info, axis=1)

# Save updated dataset
df.to_csv("/content/sarcasm_data_modified.csv", index=False)

print("Dataset updated successfully!")

# Data augmentation functions
print("Applying data augmentation...")

def synonym_replacement(sentence, n=2):
    words = sentence.split()
    new_words = words.copy()
    for _ in range(n):
        word_idx = random.randint(0, len(words) - 1)
        synonyms = wordnet.synsets(words[word_idx])
        if synonyms:
            new_words[word_idx] = synonyms[0].lemmas()[0].name()
    return ' '.join(new_words)

def back_translation(sentence, src_lang='en', mid_lang='es'):
    translated = GoogleTranslator(source='auto', target=mid_lang).translate(sentence)
    back_translated = GoogleTranslator(source=mid_lang, target=src_lang).translate(translated)
    return back_translated

# Apply data augmentation on utterance alone
df["augmented"] = df["utterance"].apply(lambda x: synonym_replacement(x))
df["back_translated"] = df["utterance"].apply(lambda x: back_translation(x))

# Prepare final dataset
print("Preparing dataset...")
df_augmented = df.melt(id_vars=["sarcasm"], value_vars=["utterance", "augmented", "back_translated"],
                        var_name="type", value_name="text")

# Train-test split
print("Splitting dataset...")
train_texts, val_texts, train_labels, val_labels = train_test_split(df_augmented['text'],
                                                                    df_augmented['sarcasm'],
                                                                    test_size=0.2, random_state=42)

# Load tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Tokenization function
print("Tokenizing dataset...")
def tokenize_function(texts):
    return tokenizer(texts.tolist(), padding="max_length", truncation=True, max_length=128, return_tensors="pt")

train_encodings = tokenize_function(train_texts)
val_encodings = tokenize_function(val_texts)

# Custom dataset class
class SarcasmDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels.iloc[idx])
        return item

# Create dataset objects
train_dataset = SarcasmDataset(train_encodings, train_labels)
val_dataset = SarcasmDataset(val_encodings, val_labels)

# Initialize model
print("Initializing model...")
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to=[]  # Disable W&B logging
)

# Trainer
print("Starting training...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()

# Evaluate model
print("Evaluating model...")
predictions = trainer.predict(val_dataset)
preds = torch.argmax(torch.tensor(predictions.predictions), dim=1).numpy()
print(classification_report(val_labels, preds))

# Save model
print("Saving model...")
model.save_pretrained("./roberta_sarcasm_model")
tokenizer.save_pretrained("./roberta_sarcasm_model")
print("Training complete!")


Loading dataset...
Dataset updated successfully!
Applying data augmentation...
Preparing dataset...
Splitting dataset...
Tokenizing dataset...
Initializing model...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss
1,No log,0.504256
2,No log,0.333875
3,0.446100,0.296002


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Evaluating model...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


              precision    recall  f1-score   support

           0       0.90      0.94      0.92       200
           1       0.95      0.90      0.92       214

    accuracy                           0.92       414
   macro avg       0.92      0.92      0.92       414
weighted avg       0.92      0.92      0.92       414

Saving model...
Training complete!


Model Based on Summarized Context

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_scheduler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm import tqdm
import random
from nltk.corpus import wordnet
from googletrans import Translator
import asyncio

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load Dataset
def load_data(file_path):
    df = pd.read_csv(file_path)
    required_columns = {'utterance_preprocessed', 'summary', 'speaker', 'sarcasm'}
    assert required_columns.issubset(df.columns), \
        f"Dataset must contain the following columns: {required_columns}"
    return df

# Synonym Replacement Function
def synonym_replacement(text, n=1):
    words = text.split()
    random_indices = random.sample(range(len(words)), min(n, len(words)))
    new_words = words.copy()
    for idx in random_indices:
        synonyms = []
        for syn in wordnet.synsets(words[idx]):
            for lemma in syn.lemmas():
                if lemma.name().lower() != words[idx].lower():
                    synonyms.append(lemma.name().replace('_', ' '))
        if synonyms:
            new_words[idx] = random.choice(synonyms)
    return ' '.join(new_words)

# Back Translation Function (Async)
async def back_translate(text):
    """
    Perform back-translation on the input text.
    Translates the text to Spanish and back to English using Google Translator.
    """
    translator = Translator()
    try:
        # Translate to Spanish ('es') and back to English ('en')
        translated = await translator.translate(text, src='en', dest='es')
        back_translated = await translator.translate(translated.text, src='es', dest='en')
        return back_translated.text
    except Exception as e:
        print(f"Translation error: {e}")
        return text

# Wrap async function with create_task (handles event loop)
def run_back_translate(text):
    loop = asyncio.get_event_loop()
    task = loop.create_task(back_translate(text))
    return task

# Augment Data Function
def augment_data(df, n_replacements=1):
    if 'utterance_preprocessed' not in df.columns:
        raise ValueError("The DataFrame does not contain a column named 'utterance_preprocessed'.")
    augmented_df = df.copy()
    augmented_df['utterance_preprocessed'] = augmented_df['utterance_preprocessed'].apply(
        lambda x: synonym_replacement(x, n=n_replacements) if isinstance(x, str) else x
    )
    back_translated_df = df.copy()
    back_translated_df['utterance_preprocessed'] = back_translated_df['utterance_preprocessed'].apply(run_back_translate)
    return pd.concat([df, augmented_df, back_translated_df]).reset_index(drop=True)

# Custom Dataset Class
class SarcasmDataset(Dataset):
    def __init__(self, data, tokenizer, speaker_map, max_len=128):
        self.data = data
        self.tokenizer = tokenizer
        self.speaker_map = speaker_map
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        # Ensure the 'utterance_preprocessed' is a string before passing to tokenizer
        utterance = str(row['utterance_preprocessed'])
        inputs = self.tokenizer(
            utterance,
            row['summary'],
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        speaker_id = self.speaker_map[row['speaker']]
        label = torch.tensor(row['sarcasm'], dtype=torch.long)

        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'speaker_id': torch.tensor(speaker_id, dtype=torch.long),
            'labels': label
        }

# Sarcasm Detection Model with Speaker Embeddings
class SarcasmDetectionModel(torch.nn.Module):
    def __init__(self, roberta_model_name, num_labels, speaker_vocab_size, speaker_embedding_dim):
        super(SarcasmDetectionModel, self).__init__()
        self.roberta = RobertaForSequenceClassification.from_pretrained(roberta_model_name, num_labels=num_labels)
        self.speaker_embedding = torch.nn.Embedding(speaker_vocab_size, speaker_embedding_dim)
        self.classifier = torch.nn.Linear(self.roberta.config.hidden_size + speaker_embedding_dim, num_labels)

    def forward(self, input_ids, attention_mask, speaker_ids):
        roberta_outputs = self.roberta.roberta(input_ids, attention_mask=attention_mask)
        cls_output = roberta_outputs.last_hidden_state[:, 0, :]
        speaker_embeds = self.speaker_embedding(speaker_ids)
        combined_features = torch.cat((cls_output, speaker_embeds), dim=1)
        logits = self.classifier(combined_features)
        return logits

# Load Data
data_path = "/content/sarcasm_data.csv"  # Replace with your dataset path
df = load_data(data_path)

# Data Augmentation
df = augment_data(df, n_replacements=3)

# Map Speakers to IDs
unique_speakers = df['speaker'].unique()
speaker_map = {speaker: idx for idx, speaker in enumerate(unique_speakers)}

# Train-Test Split
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-large")

# Datasets and DataLoaders
train_dataset = SarcasmDataset(train_df, tokenizer, speaker_map)
val_dataset = SarcasmDataset(val_df, tokenizer, speaker_map)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Model Initialization
model = SarcasmDetectionModel(
    roberta_model_name="roberta-large",
    num_labels=2,
    speaker_vocab_size=len(speaker_map),
    speaker_embedding_dim=100  # Increased embedding dimension
)
model.to(device)

# Optimizer with Layer-wise Learning Rate Decay
optimizer_grouped_parameters = [
    {
        "params": [param for name, param in model.roberta.named_parameters() if "layer" in name],
        "lr": 1e-5,
    },
    {"params": model.classifier.parameters(), "lr": 2e-4},
    {"params": model.speaker_embedding.parameters(), "lr": 1e-3},
]
optimizer = AdamW(optimizer_grouped_parameters)

# Scheduler and Loss
epochs = 15  # Increased epochs
num_training_steps = len(train_loader) * epochs
num_warmup_steps = int(0.1 * num_training_steps)

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)
criterion = torch.nn.CrossEntropyLoss(label_smoothing=0.1)

# Training Loop with Gradient Accumulation
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        speaker_ids = batch['speaker_id'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask, speaker_ids)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1} Loss: {total_loss / len(train_loader):.4f}")

# Evaluation Loop
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        speaker_ids = batch['speaker_id'].to(device)
        labels = batch['labels'].to(device)

        logits = model(input_ids, attention_mask, speaker_ids)
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(all_labels, all_preds)
print(f"Validation Accuracy: {accuracy:.4f}")
print(classification_report(all_labels, all_preds))


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/15: 100%|██████████| 104/104 [01:50<00:00,  1.06s/it]


Epoch 1 Loss: 0.7168


Epoch 2/15: 100%|██████████| 104/104 [01:50<00:00,  1.06s/it]


Epoch 2 Loss: 0.6499


Epoch 3/15: 100%|██████████| 104/104 [01:50<00:00,  1.06s/it]


Epoch 3 Loss: 0.6016


Epoch 4/15: 100%|██████████| 104/104 [01:50<00:00,  1.06s/it]


Epoch 4 Loss: 0.5422


Epoch 5/15: 100%|██████████| 104/104 [01:50<00:00,  1.06s/it]


Epoch 5 Loss: 0.3750


Epoch 6/15: 100%|██████████| 104/104 [01:50<00:00,  1.06s/it]


Epoch 6 Loss: 0.2508


Epoch 7/15: 100%|██████████| 104/104 [01:50<00:00,  1.06s/it]


Epoch 7 Loss: 0.2223


Epoch 8/15: 100%|██████████| 104/104 [01:50<00:00,  1.06s/it]


Epoch 8 Loss: 0.2167


Epoch 9/15: 100%|██████████| 104/104 [01:50<00:00,  1.06s/it]


Epoch 9 Loss: 0.2131


Epoch 10/15: 100%|██████████| 104/104 [01:50<00:00,  1.06s/it]


Epoch 10 Loss: 0.2108


Epoch 11/15: 100%|██████████| 104/104 [01:50<00:00,  1.06s/it]


Epoch 11 Loss: 0.2084


Epoch 12/15: 100%|██████████| 104/104 [01:50<00:00,  1.06s/it]


Epoch 12 Loss: 0.2077


Epoch 13/15: 100%|██████████| 104/104 [01:50<00:00,  1.06s/it]


Epoch 13 Loss: 0.2071


Epoch 14/15: 100%|██████████| 104/104 [01:50<00:00,  1.06s/it]


Epoch 14 Loss: 0.2065


Epoch 15/15: 100%|██████████| 104/104 [01:50<00:00,  1.06s/it]


Epoch 15 Loss: 0.2070
Validation Accuracy: 0.9928
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       200
           1       1.00      0.99      0.99       214

    accuracy                           0.99       414
   macro avg       0.99      0.99      0.99       414
weighted avg       0.99      0.99      0.99       414

