## Sarcasm Detection Tuning:

In [None]:
import json
import re
import random
import numpy as np
import pandas as pd
import torch
import os

from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)

from nltk.corpus import wordnet
import nltk
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

nltk.download('wordnet')

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)


[nltk_data] Downloading package wordnet to /root/nltk_data...


### Data Preparation (Loading, Cleaning, Augmentation, Tokenization)

In [None]:
# --- Data Loading and Preprocessing ---
path = "Sarcasm_Headlines_Dataset_v2.json"
data = []
# Assuming the file is in the current working directory
with open(path, "r") as f:
    for line in f:
        try:
            data.append(json.loads(line))
        except json.JSONDecodeError as e:
            print(f"Skipping malformed JSON line: {line.strip()}. Error: {e}")
df = pd.DataFrame(data)
df = df[["headline", "is_sarcastic"]]

def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", "", text)  # remove special chars
    text = re.sub(r"\s+", " ", text).strip()
    return text

df["clean_headline"] = df["headline"].apply(clean_text)

def synonym_replacement(sentence, n=1):
    words = sentence.split()
    new_words = words.copy()
    random_word_list = list(set([w for w in words if len(w) > 3]))

    if len(random_word_list) == 0:
        return sentence

    for _ in range(n):
        # Choose a random word from the list
        word_to_replace = random.choice(random_word_list)
        synonyms = wordnet.synsets(word_to_replace)
        if not synonyms:
            continue

        # Choose a random synonym for better diversity
        synonym_words = random.choice(synonyms).lemma_names()
        synonym = synonym_words[0].replace("_", " ")

        # Find the index of the word to replace in the original words list
        # This handles multiple occurrences, replacing only the first one found.
        try:
            idx = new_words.index(word_to_replace)
            new_words[idx] = synonym
        except ValueError:
            continue # Should not happen if word_to_replace came from words

    return " ".join(new_words)

# Augmentation (only sarcastic data is augmented, as in the original notebook)
augmented_rows = []
for idx, row in df.iterrows():
    if row["is_sarcastic"] == 1:
        aug = synonym_replacement(row["clean_headline"])
        augmented_rows.append({"clean_headline": aug, "is_sarcastic": 1})

aug_df = pd.DataFrame(augmented_rows)
df_augmented = pd.concat([df[["clean_headline", "is_sarcastic"]], aug_df])
df_augmented.reset_index(drop=True, inplace=True)

# Data Splitting
train_df, temp_df = train_test_split(df_augmented, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# --- Tokenization and Dataset Prep ---
model_ckpt = "distilbert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_ckpt)

def tokenize(batch):
    return tokenizer(
        batch["clean_headline"],
        truncation=True,
        padding=False,
        max_length=32
    )

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

ds = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})

ds = ds.map(tokenize, batched=True)
# Rename target column to 'labels' as required by Hugging Face models
ds = ds.rename_column("is_sarcastic", "labels")

ds.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"]
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define compute_metrics function for evaluation
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)

    acc = accuracy_score(labels, preds)
    # Compute metrics for the positive class (sarcastic = 1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')

    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

comparison_results = []
print(f"Data loaded: Train={len(ds['train'])}, Validation={len(ds['validation'])}, Test={len(ds['test'])}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


Map:   0%|          | 0/33802 [00:00<?, ? examples/s]

Map:   0%|          | 0/4225 [00:00<?, ? examples/s]

Map:   0%|          | 0/4226 [00:00<?, ? examples/s]

Data loaded: Train=33802, Validation=4225, Test=4226


### Experiment 1: Baseline (LR $2\times 10^{-5}$, 3 Epochs)

In [None]:
# Re-initialize model for Run 1
model_1 = BertForSequenceClassification.from_pretrained(model_ckpt, num_labels=2)

training_args_1 = TrainingArguments(
    output_dir="./bert_sarcasm_baseline",
    learning_rate=2e-5, # Baseline LR
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3, # Baseline Epochs
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    report_to="none",
)

trainer_1 = Trainer(
    model=model_1,
    args=training_args_1,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

print("Starting Training for Baseline...")
trainer_1.train()

print("\n--- Baseline Test Set Evaluation ---")
metrics_1 = trainer_1.evaluate(ds["test"])
metrics_1['Experiment'] = 'distilbert-base-uncased (LR 2e-5, Epochs 3)'
comparison_results.append(metrics_1)

# Get detailed classification report for baseline
preds_1 = trainer_1.predict(ds["test"]).predictions.argmax(-1)
labels = ds["test"]["labels"]
print("\nClassification Report (distilbert-base-uncased):")
print(classification_report(labels, preds_1))


You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['bert.embeddings.LayerNorm.bias', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.embeddings.word_embeddings.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.outp

Starting Training for Baseline...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.305,0.387697,0.828402,0.939893,0.781562,0.853447
2,0.2634,0.314455,0.883787,0.870308,0.961496,0.913632
3,0.1588,0.312081,0.894911,0.902031,0.937431,0.91939



--- Baseline Test Set Evaluation ---



Classification Report (distilbert-base-uncased):
              precision    recall  f1-score   support

           0       0.88      0.83      0.85      1498
           1       0.91      0.94      0.92      2728

    accuracy                           0.90      4226
   macro avg       0.89      0.88      0.89      4226
weighted avg       0.90      0.90      0.90      4226



### Experiment 2: Tuned (LR $1\times 10^{-5}$, 4 Epochs)

In [None]:
# Re-initialize model for Run 2
# This is crucial to ensure both runs start from the same pre-trained state.
model_2 = BertForSequenceClassification.from_pretrained(model_ckpt, num_labels=2)

training_args_2 = TrainingArguments(
    output_dir="./bert_sarcasm_tuned_lr_epochs",
    learning_rate=1e-5, # Tuned LR (Lower)
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4, # Tuned Epochs (Higher)
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    report_to="none",
)

trainer_2 = Trainer(
    model=model_2,
    args=training_args_2,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

print("Starting Training for Tuned LR/Epochs...")
trainer_2.train()

print("\n--- Tuned Test Set Evaluation ---")
metrics_2 = trainer_2.evaluate(ds["test"])
metrics_2['Experiment'] = 'distilbert-base-uncased (LR 1e-5, Epochs 4)'
comparison_results.append(metrics_2)

# Get detailed classification report for tuned run
preds_2 = trainer_2.predict(ds["test"]).predictions.argmax(-1)
print("\nClassification Report (distilbert-base-uncased (LR 1e-5, Epochs 4)):")
print(classification_report(labels, preds_2))


You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['bert.embeddings.LayerNorm.bias', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.embeddings.word_embeddings.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0

Starting Training for Tuned LR/Epochs...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3547,0.398752,0.825562,0.931838,0.784524,0.851859
2,0.3135,0.300409,0.87858,0.885211,0.930766,0.907417
3,0.2284,0.314648,0.888994,0.889121,0.944095,0.915784
4,0.1938,0.322728,0.891834,0.89507,0.941133,0.917524



--- Tuned Test Set Evaluation ---



Classification Report (distilbert-base-uncased (LR 1e-5, Epochs 4)):
              precision    recall  f1-score   support

           0       0.87      0.81      0.84      1498
           1       0.90      0.93      0.92      2728

    accuracy                           0.89      4226
   macro avg       0.88      0.87      0.88      4226
weighted avg       0.89      0.89      0.89      4226



### Experiment 3: RoBERTa-base (LR $1\times 10^{-5}$, 4 Epochs)


In [None]:
from transformers import AutoTokenizer, RobertaForSequenceClassification, AutoModelForSequenceClassification

model_ckpt_3 = "roberta-base"
tokenizer_3 = AutoTokenizer.from_pretrained(model_ckpt_3)
model_3 = RobertaForSequenceClassification.from_pretrained(model_ckpt_3, num_labels=2)

training_args_3 = TrainingArguments(
    output_dir="./bert_sarcasm_tuned_lr_epochs",
    learning_rate=1e-5, # Tuned LR (Lower)
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4, # Tuned Epochs (Higher)
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    report_to="none",
)

trainer_3 = Trainer(
    model=model_3,
    args=training_args_3,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    tokenizer=tokenizer_3,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

print("Starting Training for Tuned LR/Epochs...")
trainer_3.train()

print("\n--- Tuned Test Set Evaluation ---")
metrics_3 = trainer_3.evaluate(ds["test"])
metrics_3['Experiment'] = 'roberta-base (LR 1e-5, Epochs 4)'
comparison_results.append(metrics_3)

# Get detailed classification report for tuned run
preds_3 = trainer_2.predict(ds["test"]).predictions.argmax(-1)
print("\nClassification Report (roberta-base (LR 1e-5, Epochs 4)):")
print(classification_report(labels, preds_3))


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_3 = Trainer(


Starting Training for Tuned LR/Epochs...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4281,0.42401,0.818698,0.825648,0.908182,0.864951
2,0.3694,0.378477,0.840947,0.84542,0.919289,0.880809
3,0.3345,0.371596,0.855858,0.863953,0.919289,0.890762
4,0.3094,0.358997,0.857751,0.871813,0.911514,0.891222



--- Tuned Test Set Evaluation ---



Classification Report (roberta-base (LR 1e-5, Epochs 4)):
              precision    recall  f1-score   support

           0       0.87      0.81      0.84      1498
           1       0.90      0.93      0.92      2728

    accuracy                           0.89      4226
   macro avg       0.88      0.87      0.88      4226
weighted avg       0.89      0.89      0.89      4226



### Experiment 4: BERT-base-uncased (LR $2\times 10^{-5}$, 3 Epochs)

In [None]:
model_ckpt_4 = "bert-base-uncased"
tokenizer_4 = AutoTokenizer.from_pretrained(model_ckpt_4)
model_4 = BertForSequenceClassification.from_pretrained(model_ckpt_4, num_labels=2)

training_args_4 = TrainingArguments(
    output_dir="./bert_sarcasm_tuned_lr_epochs",
    learning_rate=1e-5, # Tuned LR (Lower)
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4, # Tuned Epochs (Higher)
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    report_to="none",
)

trainer_4 = Trainer(
    model=model_4,
    args=training_args_4,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    tokenizer=tokenizer_4,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

print("Starting Training for Tuned LR/Epochs...")
trainer_4.train()

print("\n--- Tuned Test Set Evaluation ---")
metrics_4 = trainer_4.evaluate(ds["test"])
metrics_4['Experiment'] = 'bert-base-uncased (LR 1e-5, Epochs 4)'
comparison_results.append(metrics_4)

# Get detailed classification report for tuned run
preds_4 = trainer_2.predict(ds["test"]).predictions.argmax(-1)
print("\nClassification Report (bert-base-uncased (LR 1e-5, Epochs 4)):")
print(classification_report(labels, preds_4))

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_4 = Trainer(


Starting Training for Tuned LR/Epochs...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1992,0.196893,0.931834,0.936347,0.958534,0.947311
2,0.1356,0.22714,0.946036,0.943349,0.974084,0.95847
3,0.0782,0.220453,0.953136,0.961979,0.964828,0.963401
4,0.0381,0.222162,0.958343,0.965009,0.970011,0.967504



--- Tuned Test Set Evaluation ---



Classification Report (bert-base-uncased (LR 1e-5, Epochs 4)):
              precision    recall  f1-score   support

           0       0.87      0.81      0.84      1498
           1       0.90      0.93      0.92      2728

    accuracy                           0.89      4226
   macro avg       0.88      0.87      0.88      4226
weighted avg       0.89      0.89      0.89      4226



### Final Comparison of Test Set Metrics

In [None]:
# Display the comparison table
comparison_df = pd.DataFrame(comparison_results)
comparison_df = comparison_df[['Experiment', 'eval_loss', 'eval_accuracy', 'eval_precision', 'eval_recall', 'eval_f1']]
comparison_df.columns = ['Experiment', 'Loss', 'Accuracy', 'Precision (Sarcastic)', 'Recall (Sarcastic)', 'F1-Score (Sarcastic)']
print("\n--- FINAL TUNING COMPARISON ---")
print(comparison_df.to_markdown(index=False, floatfmt=".4f"))


--- FINAL TUNING COMPARISON ---
| Experiment                                  |   Loss |   Accuracy |   Precision (Sarcastic) |   Recall (Sarcastic) |   F1-Score (Sarcastic) |
|:--------------------------------------------|-------:|-----------:|------------------------:|---------------------:|-----------------------:|
| distilbert-base-uncased (LR 2e-5, Epochs 3) | 0.2993 |     0.8982 |                  0.9086 |               0.9366 |                 0.9224 |
| distilbert-base-uncased (LR 1e-5, Epochs 4) | 0.3268 |     0.8888 |                  0.8978 |               0.9340 |                 0.9156 |
| roberta-base (LR 1e-5, Epochs 4)            | 0.3583 |     0.8587 |                  0.8851 |               0.8977 |                 0.8914 |
| bert-base-uncased (LR 1e-5, Epochs 4)       | 0.2075 |     0.9602 |                  0.9661 |               0.9725 |                 0.9693 |
