<h2 style="color:#1a237e; font-weight:bold; margin-top:20px;">Sentiment Analysis with English & Urdu: Classical to Transformer Models</h2>

**Paper:** *The Evolution of Sentiment Analysis: From Statistical Models to Pretrained Multilingual Transformers*  
**Authors:** Samee Arif, Moaiz Abrar

**Key Questions:**
1. How do different model types compare for English vs Urdu?
2. What is the impact of pretrained vs custom word embeddings?
3. Does cross-lingual training help Urdu sentiment classification?

<h2 style="color:#1a237e; font-weight:bold; margin-top:20px;">Setup and Imports</h2>

In [None]:
import re

import evaluate
import numpy as np
import torch
from datasets import load_dataset
from peft import LoraConfig
from transformers import AutoTokenizer, AutoModelForSequenceClassification, default_data_collator, TrainingArguments
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import DataCollatorForSeq2Seq
from transformers import Trainer
from transformers import BitsAndBytesConfig
from trl import SFTTrainer

In [None]:
# Pick the best available accelerator: CUDA GPU first, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Dataset language; interpolated into the Hub dataset names below
# (e.g. f"sameearif/imdb-{language}"). Presumably "urdu" selects the Urdu
# variant - verify that the corresponding dataset exists.
language = "english"

# Maximum sequence length (in tokens) read by convert_to_features; kept equal to
# the 128-token limit that tokenize_function uses for the encoder-only models.
MAX_LENGTH = 128

In [None]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch with the module-level ``tokenizer``.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        examples: Mapping with a ``"text"`` entry (a batch of strings when used
            with ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer call returns for that batch.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=128,
        padding="max_length",
    )

def compute_metrics(eval_pred):
    """Accuracy for the sequence-classification models.

    Args:
        eval_pred: ``(logits, labels)`` pair; the predicted class is the argmax
            of the logits over the last axis.

    Returns:
        dict: ``{"accuracy": value}`` where ``value`` comes from the module-level
        ``accuracy`` metric object.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    result = accuracy.compute(predictions=predicted_classes, references=references)
    return {"accuracy": result["accuracy"]}

def compute_metrics_mt5(eval_pred):
    """Compute exact-match accuracy for the text-to-text (mT5) sentiment task.

    Generated token ids and reference label ids are decoded with the
    module-level ``tokenizer``. The first integer found in each decoded string
    is taken as the class id; a string without any integer maps to ``-1``.

    Args:
        eval_pred: ``(predictions, label_ids)`` pair. ``predictions`` is an
            array of token ids, or a tuple whose first element is that array.
            Positions equal to ``-100`` in either array are treated as padding.

    Returns:
        dict: ``{"accuracy": float}``, the fraction of examples whose parsed
        prediction equals the parsed label (``0.0`` for an empty batch).
    """
    preds, labels = eval_pred
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is not a valid token id, so map it to the pad token before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    def extract_int(text):
        # Drop sentinel tokens and any other <...> markup left after decoding.
        text = re.sub(r'<extra_id_\d+>', '', text)
        text = re.sub(r'<.*?>', '', text)
        match = re.search(r'\d+', text.strip())
        return int(match.group()) if match else -1

    cleaned_preds = np.array([extract_int(text) for text in decoded_preds], dtype=np.int64)
    cleaned_labels = np.array([extract_int(text) for text in decoded_labels], dtype=np.int64)

    if cleaned_labels.size == 0:
        return {"accuracy": 0.0}
    return {"accuracy": float(np.mean(cleaned_preds == cleaned_labels))}

def compute_metrics_llama(eval_pred):
    """Compute exact-match accuracy for the causal-LM (Llama) sentiment task.

    Predictions and labels are decoded with the module-level ``tokenizer``. The
    first (optionally negative) integer in each decoded string is taken as the
    class id; a string without any integer maps to ``-1``.

    Args:
        eval_pred: ``(predictions, label_ids)`` pair. ``predictions`` may be
            3-D logits (arg-maxed over the last axis here), an array of token
            ids, or a tuple whose first element is one of those.

    Returns:
        dict: ``{"accuracy": float}``, the fraction of examples whose parsed
        prediction equals the parsed label (``0.0`` for an empty batch).
    """
    preds, labels = eval_pred
    if isinstance(preds, tuple):
        preds = preds[0]
    preds = np.asarray(preds)
    # Only raw logits need an argmax; already-reduced token ids pass through.
    if preds.ndim == 3:
        preds = np.argmax(preds, axis=-1)

    # -100 is not a valid token id, so map it to the pad token before decoding.
    labels = np.asarray(labels)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    def extract_int(s):
        # Drop any <...> markup that survived decoding, then take the first integer.
        s = re.sub(r'<extra_id_\d+>', '', s)
        s = re.sub(r'<.*?>', '', s)
        m = re.search(r'-?\d+', s.strip())
        return int(m.group()) if m else -1

    cleaned_preds = np.array([extract_int(t) for t in decoded_preds], dtype=np.int64)
    cleaned_labels = np.array([extract_int(t) for t in decoded_labels], dtype=np.int64)

    if cleaned_labels.size == 0:
        return {"accuracy": 0.0}
    # Plain numpy mean keeps the notebook free of an extra sklearn dependency.
    return {"accuracy": float(np.mean(cleaned_preds == cleaned_labels))}

def add_prefix(example):
    """Turn a classification example into a text-to-text pair.

    Args:
        example: Mapping with ``"text"`` and ``"label"`` entries.

    Returns:
        dict: ``"input"`` is the text prefixed with ``"Sentiment: "`` and
        ``"output"`` is the label rendered as a string.
    """
    prompt = f"Sentiment: {example['text']}"
    target = f"{example['label']}"
    return {"input": prompt, "output": target}

def convert_to_features(example_batch, max_length=None):
    """Tokenize a batch of ``input``/``output`` text pairs for seq2seq training.

    Args:
        example_batch: Mapping with ``"input"`` and ``"output"`` lists of strings
            (as produced by ``add_prefix``).
        max_length: Length every sequence is truncated/padded to. When omitted,
            the module-level ``MAX_LENGTH`` is used if it is defined, otherwise
            128 (the limit ``tokenize_function`` uses).

    Returns:
        dict: ``input_ids`` and ``attention_mask`` of the inputs, plus ``labels``
        holding the target token ids with padding positions set to ``-100``.
    """
    if max_length is None:
        try:
            max_length = MAX_LENGTH
        except NameError:
            # No notebook-level constant defined; fall back to the same limit
            # tokenize_function hard-codes.
            max_length = 128

    input_encodings = tokenizer(
        example_batch['input'], truncation=True, padding="max_length", max_length=max_length
    )
    target_encodings = tokenizer(
        example_batch['output'], truncation=True, padding="max_length", max_length=max_length
    )

    # Mask padded target positions with -100 so they are excluded from the loss;
    # compute_metrics_mt5 maps -100 back to the pad token before decoding.
    pad_id = tokenizer.pad_token_id
    labels = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in target_encodings['input_ids']
    ]

    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': labels,
    }

def apply_chat_template(example, tokenizer):
    """Render a chat example to a single string stored under ``"text"``.

    If the conversation does not start with a system message, an empty one is
    prepended to ``example["messages"]`` in place before rendering.

    Args:
        example: Mapping with a ``"messages"`` list of ``{"role", "content"}`` dicts.
        tokenizer: Object exposing ``apply_chat_template(messages, tokenize=False)``.

    Returns:
        The same ``example`` object, with the rendered string added as ``"text"``.
    """
    conversation = example["messages"]
    first_role = conversation[0]["role"]
    if first_role != "system":
        # In-place prepend: the caller's list gains the empty system turn too.
        conversation.insert(0, {"role": "system", "content": ""})
    rendered = tokenizer.apply_chat_template(conversation, tokenize=False)
    example["text"] = rendered
    return example

<h2 style="color:#1a237e; font-weight:bold; margin-top:20px;">Fine-tuning and Evaluation</h2>

<h3 style="color:#2a4d8f; font-weight:bold; margin-top:15px;">1. BERT Large Uncased</h3>

In [None]:
# Experiment settings for the BERT run.
model_name = "google-bert/bert-large-uncased"
dataset_name = f"sameearif/imdb-{language}"
learning_rate = 5e-5
epochs = 10

# Load the data, the tokenizer and a 2-class classification head.
raw_dataset = load_dataset(dataset_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Tokenize every split with the tokenizer loaded above
# (tokenize_function reads the module-level `tokenizer`).
train_dataset, eval_dataset, test_dataset = (
    raw_dataset[split].map(tokenize_function, batched=True)
    for split in ("train", "validation", "test")
)

# Metric objects; compute_metrics looks up `accuracy` by name.
accuracy = evaluate.load("accuracy")
f1_score = evaluate.load("f1")

In [None]:
# Fine-tune with per-epoch evaluation/checkpointing and reload the checkpoint
# with the best validation accuracy at the end.
args = TrainingArguments(
    # Explicit per-model directory so runs of different models do not share checkpoints.
    output_dir=f"outputs/{model_name.split('/')[-1]}-{language}",
    num_train_epochs=epochs,
    learning_rate=learning_rate,
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    # fp16 mixed precision needs a CUDA device; the config cell may select MPS or CPU.
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
)
data_collator = default_data_collator
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

In [None]:
# Evaluate on the held-out test split. The result is stored under its own name:
# rebinding `accuracy` would replace the metric object that compute_metrics
# calls, so this cell could not be run a second time.
test_metrics = trainer.evaluate(eval_dataset=test_dataset)
print(test_metrics)

<h3 style="color:#2a4d8f; font-weight:bold; margin-top:15px;">2. XLM-Roberta Large</h3>

In [None]:
# Experiment settings for the XLM-RoBERTa run.
model_name = "FacebookAI/xlm-roberta-large"
dataset_name = f"sameearif/imdb-{language}"
learning_rate = 5e-5
epochs = 10

# Load the data, the tokenizer and a 2-class classification head.
raw_dataset = load_dataset(dataset_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Tokenize every split with the tokenizer loaded above
# (tokenize_function reads the module-level `tokenizer`).
train_dataset, eval_dataset, test_dataset = (
    raw_dataset[split].map(tokenize_function, batched=True)
    for split in ("train", "validation", "test")
)

# Metric object; compute_metrics looks up `accuracy` by name.
accuracy = evaluate.load("accuracy")

In [None]:
# Fine-tune with per-epoch evaluation/checkpointing and reload the checkpoint
# with the best validation accuracy at the end.
args = TrainingArguments(
    # Explicit per-model directory so runs of different models do not share checkpoints.
    output_dir=f"outputs/{model_name.split('/')[-1]}-{language}",
    num_train_epochs=epochs,
    learning_rate=learning_rate,
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    # fp16 mixed precision needs a CUDA device; the config cell may select MPS or CPU.
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
)
data_collator = default_data_collator
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

In [None]:
# Evaluate on the held-out test split. The result is stored under its own name:
# rebinding `accuracy` would replace the metric object that compute_metrics
# calls, so this cell could not be run a second time.
test_metrics = trainer.evaluate(eval_dataset=test_dataset)
print(test_metrics)

<h3 style="color:#2a4d8f; font-weight:bold; margin-top:15px;">3. mT5 Large</h3>

In [None]:
# Experiment settings for the mT5 run.
dataset_name = f"sameearif/imdb-{language}"
model_name = "google/mt5-large"
epochs = 10
learning_rate = 5e-5

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Recast classification as text-to-text: "Sentiment: <review>" -> "<label>".
raw_dataset = load_dataset(dataset_name)
for split_name in ("train", "validation", "test"):
    raw_dataset[split_name] = raw_dataset[split_name].map(add_prefix, load_from_cache_file=False, num_proc=8)

# Encode all three splits. The test split is built here as well; otherwise the
# later evaluation cell would pick up `test_dataset` left over from the
# previous section, which was tokenized for a different model.
train_dataset = raw_dataset["train"].map(convert_to_features, batched=True, load_from_cache_file=False, num_proc=8)
eval_dataset = raw_dataset["validation"].map(convert_to_features, batched=True, load_from_cache_file=False, num_proc=8)
test_dataset = raw_dataset["test"].map(convert_to_features, batched=True, load_from_cache_file=False, num_proc=8)

# Expose only the model inputs, as torch tensors.
columns = ['input_ids', 'attention_mask', 'labels']
train_dataset.set_format(type='torch', columns=columns)
eval_dataset.set_format(type='torch', columns=columns)
test_dataset.set_format(type='torch', columns=columns)

In [None]:
# Fine-tune mT5; evaluation generates text (predict_with_generate) so that
# compute_metrics_mt5 can parse the predicted label out of the decoded output.
training_args = Seq2SeqTrainingArguments(
    # Explicit per-model directory so runs of different models do not share checkpoints.
    output_dir=f"outputs/{model_name.split('/')[-1]}-{language}",
    num_train_epochs=epochs,
    # Pass the value configured in the setup cell instead of silently relying
    # on the library default.
    learning_rate=learning_rate,
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    predict_with_generate=True,
    generation_max_length=10,
)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics_mt5,
)
trainer.train()

In [None]:
# Evaluate on the held-out test split. `test_dataset` must be the mT5-encoded
# test split (input_ids / attention_mask / labels). The result is stored under
# its own name so the `accuracy` metric object is never overwritten.
test_metrics = trainer.evaluate(eval_dataset=test_dataset)
print(test_metrics)

<h3 style="color:#2a4d8f; font-weight:bold; margin-top:15px;">4. Llama-3.1-8B-Instruct</h3>

In [None]:
# Experiment settings for the Llama run.
model_name = "meta-llama/Llama-3.1-8B-Instruct"
dataset_name = f"sameearif/imdb-{language}-llama"

dataset = load_dataset(dataset_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Ids of the tokenizer's EOS token and of the "<|eot_id|>" token.
terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]

# Render every conversation to a single "text" string and drop the original columns.
column_names = list(dataset["train"].features)
dataset = dataset.map(
    apply_chat_template,
    fn_kwargs={"tokenizer": tokenizer},
    remove_columns=column_names,
    num_proc=8,
    desc="Applying chat template",
)

# Fall back to EOS for padding when the tokenizer defines no pad token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.model_max_length = 512

train_dataset, eval_dataset, test_dataset = (
    dataset[split] for split in ("train", "validation", "test")
)

# 4-bit NF4 quantization with double quantization and bfloat16 compute.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Keyword arguments later handed to the trainer as `model_init_kwargs`.
model_kwargs = {
    "torch_dtype": "auto",
    "use_cache": False,
    "device_map": "auto",
    "quantization_config": quantization_config,
}

In [None]:
# Name used for the local checkpoint directory and as the Hub repository id.
trained_model_id = f"{model_name.split('/')[-1]}-imdb-{language}"


def _argmax_logits(logits, labels):
    """Reduce logits to token ids before the trainer accumulates them.

    Keeping full-vocabulary logits for the whole evaluation set would be very
    memory hungry; compute_metrics_llama accepts already-reduced ids.
    """
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)


training_args = TrainingArguments(
    output_dir=trained_model_id,
    bf16=True,
    do_eval=True,
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    learning_rate=2.0e-05,
    lr_scheduler_type="cosine",
    num_train_epochs=2,
    per_device_eval_batch_size=8,
    per_device_train_batch_size=8,
    hub_model_id=trained_model_id,
)
peft_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)
# NOTE(review): dataset_text_field / packing / max_seq_length / model_init_kwargs
# are passed straight to SFTTrainer as in the original cell; newer TRL releases
# appear to expect them on SFTConfig instead - confirm against the installed version.
trainer = SFTTrainer(
    # The checkpoint name defined in the setup cell (`model_name`); the trainer
    # instantiates the model itself using `model_init_kwargs`.
    model=model_name,
    model_init_kwargs=model_kwargs,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",
    tokenizer=tokenizer,
    packing=True,
    peft_config=peft_config,
    max_seq_length=tokenizer.model_max_length,
    # The metric callback belongs to the trainer, not to TrainingArguments.
    compute_metrics=compute_metrics_llama,
    preprocess_logits_for_metrics=_argmax_logits,
    # `terminators` is intentionally not passed here: stop-token ids are a
    # generation-time setting (e.g. model.generate(eos_token_id=terminators)).
)
trainer.train()