In [None]:
!nvidia-smi


In [None]:
!nvidia-smi

In [None]:
!pip install -q transformers datasets evaluate accelerate -U


In [None]:
!pip install --upgrade transformers

In [None]:
import torch
import random
import numpy as np
from datasets import load_dataset

# One seed for every random number generator the notebook relies on, so that
# subset selection and training runs can be repeated.
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

_cuda_ok = torch.cuda.is_available()
if _cuda_ok:
    torch.cuda.manual_seed_all(SEED)

# Device string used when placing the model: GPU when present, otherwise CPU.
device = "cuda" if _cuda_ok else "cpu"
device


In [None]:
amazon = load_dataset("amazon_polarity")
amazon_train_split = amazon['train']

# 1. Dataset structure
print(amazon)

# 2. Column names
print("Columns:", amazon_train_split.column_names)

# 3. Split sizes
print("Train size =", len(amazon_train_split))
print("Test size =", len(amazon['test']))

# 4. First few rows (each row is fetched once and its fields printed)
for i in range(3):
    row = amazon_train_split[i]
    print(f"\nSample {i}:")
    print("  Title:", row['title'])
    print("  Content:", row['content'])
    print("  Label:", row['label'])

# 5. Label counts over the whole training split (the full label column is read)
from collections import Counter
sample_labels = Counter(amazon_train_split['label'])
print("\nLabel distribution in Amazon Train:", sample_labels)


In [None]:
TRAIN_SIZE, TEST_SIZE = 4000, 1000

# A seeded shuffle followed by taking the first N rows gives a repeatable
# random subset of each split.
amazon_train = amazon['train'].shuffle(seed=SEED).select(range(TRAIN_SIZE))
amazon_test = amazon['test'].shuffle(seed=SEED).select(range(TEST_SIZE))

(len(amazon_train), len(amazon_test))


In [None]:
from transformers import AutoTokenizer

# Token cap applied when encoding reviews, and the checkpoint that supplies
# both the tokenizer and (later) the classifier weights.
MAX_LEN = 128
MODEL_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess(batch):
    """Tokenize a batch of Amazon reviews.

    Each review is encoded as "<title> <content>"; a missing (falsy) title or
    content is treated as an empty string. The joined texts are passed to the
    notebook-level `tokenizer` with truncation at `MAX_LEN` tokens, and the
    tokenizer's result is returned unchanged.
    """
    joined = [
        (title or "") + " " + (content or "")
        for title, content in zip(batch['title'], batch['content'])
    ]
    return tokenizer(joined, truncation=True, max_length=MAX_LEN)

def _to_model_inputs(split):
    """Tokenize one Amazon split and give it the column layout the Trainer is fed.

    The raw text columns are dropped, "label" is renamed to "labels", and the
    dataset is switched to torch-formatted output.
    """
    encoded = split.map(preprocess, batched=True, remove_columns=["title", "content"])
    encoded = encoded.rename_column("label", "labels")
    encoded.set_format("torch")
    return encoded


amazon_train = _to_model_inputs(amazon_train)
amazon_test = _to_model_inputs(amazon_test)


In [None]:
from transformers import (
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)
import evaluate
import numpy as np

# Sequence classifier with two output labels, loaded from MODEL_NAME and moved
# to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
model = model.to(device)

# Metric objects used by compute_metrics below.
f1 = evaluate.load("f1")
accuracy = evaluate.load("accuracy")

def compute_metrics(pred):
    """Score a (logits, labels) pair with accuracy and weighted F1.

    The predicted class of each row is the index of its largest logit. Scores
    come from the notebook-level `accuracy` and `f1` metric objects and are
    returned as a dict with the keys "accuracy" and "f1".
    """
    logits, labels = pred
    predicted = np.argmax(logits, axis=1)

    acc_result = accuracy.compute(predictions=predicted, references=labels)
    f1_result = f1.compute(predictions=predicted, references=labels, average="weighted")

    return {"accuracy": acc_result['accuracy'], "f1": f1_result['f1']}

# Batch collator built from the current tokenizer; it is passed to every Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [None]:
from transformers import (TrainingArguments, Trainer, IntervalStrategy)

# First run: fine-tune on the small Amazon subset for two epochs, evaluating
# and checkpointing once per epoch and logging the training loss every 50 steps.
training_args = TrainingArguments(
    output_dir="amazon_model",
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    eval_strategy=IntervalStrategy.EPOCH,
    save_strategy=IntervalStrategy.EPOCH,
    logging_strategy="steps",
    logging_steps=50,
    load_best_model_at_end=True,
    # Mixed precision is only enabled when a GPU is present; the notebook falls
    # back to CPU otherwise, where requesting fp16 makes TrainingArguments fail.
    fp16=torch.cuda.is_available(),
    report_to="none"  # cleaner output
)

# NOTE(review): eval_dataset is the test subset, so the checkpoint kept by
# load_best_model_at_end is chosen on the same data that is later reported as
# the test result — consider a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=amazon_train,
    eval_dataset=amazon_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [None]:
# Run the training configured in the previous cell; the value returned by
# train() is shown as the cell result.
train_output = trainer.train()
train_output


In [None]:
# Evaluate the trainer's model on the Amazon test subset and show the metrics.
amazon_results = trainer.evaluate(amazon_test)
amazon_results


In [None]:
imdb = load_dataset("imdb")
imdb_train_split = imdb['train']

# Structure
print(imdb)

# Columns
print("Columns:", imdb_train_split.column_names)

# Sizes
print("Train size =", len(imdb_train_split))
print("Test size =", len(imdb['test']))

# Sample reviews (first 300 characters of each)
for i in range(3):
    review = imdb_train_split[i]
    print(f"\nIMDB Sample {i}:")
    print("  Text:", review['text'][:300], "...")
    print("  Label:", review['label'])

# Label counts over the whole IMDB training split
from collections import Counter
print("\nLabel distribution:", Counter(imdb_train_split['label']))


In [None]:
IMDB_TEST = 2000
imdb_test = imdb['test'].shuffle(seed=SEED).select(range(IMDB_TEST))


def preprocess_imdb(batch):
    """Tokenize a batch of IMDB reviews with the notebook-level tokenizer.

    Reviews are read from the "text" column and truncated to MAX_LEN tokens.
    """
    encoded = tokenizer(batch['text'], truncation=True, max_length=MAX_LEN)
    return encoded


# Same column layout as the Amazon data: no raw text, "labels" column, torch output.
imdb_test = (
    imdb_test
    .map(preprocess_imdb, batched=True, remove_columns=["text"])
    .rename_column("label", "labels")
)
imdb_test.set_format("torch")


In [None]:
# Baseline before any IMDB fine-tuning: evaluate the Amazon-trained model on
# the IMDB test subset.
imdb_results_before = trainer.evaluate(imdb_test)
imdb_results_before


In [None]:
IMDB_TRAIN = 4000
imdb_train = imdb['train'].shuffle(seed=SEED).select(range(IMDB_TRAIN))

# Same tokenization and column layout as the IMDB test subset.
imdb_train = imdb_train.map(preprocess_imdb, batched=True, remove_columns=["text"])
imdb_train = imdb_train.rename_column("label", "labels")
imdb_train.set_format("torch")

ft_args = TrainingArguments(
    output_dir="imdb_finetuned",
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    # Mixed precision is only enabled when a GPU is present; on the CPU
    # fallback, requesting fp16 makes TrainingArguments fail.
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True,
    report_to="none"
)

# NOTE(review): eval_dataset is the IMDB test subset, so best-checkpoint
# selection and the reported test metrics use the same data.
ft_trainer = Trainer(
    model=model,   # continue training the same (Amazon-trained) model
    args=ft_args,
    train_dataset=imdb_train,
    eval_dataset=imdb_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

ft_trainer.train()


In [None]:
# Evaluate on the IMDB test subset again, now after the IMDB fine-tuning run.
imdb_results_after = ft_trainer.evaluate(imdb_test)
imdb_results_after


In [None]:
# Amazon run repeated with the epoch count raised from 2 to 15.
# NOTE(review): `model` is the same object that the earlier cells already
# trained (Amazon, then IMDB), so this run continues from those weights rather
# than from the pretrained checkpoint — confirm that is intended.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="amazon_model",
    num_train_epochs=15,                # increased from 2 to 15
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50,
    load_best_model_at_end=True,
    # Mixed precision is only enabled when a GPU is present; on the CPU
    # fallback, requesting fp16 makes TrainingArguments fail.
    fp16=torch.cuda.is_available(),
    report_to="none"                    # clean output
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=amazon_train,
    eval_dataset=amazon_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


In [None]:
# Run the 15-epoch configuration from the previous cell and show the value
# returned by train().
train_output = trainer.train()
train_output

In [None]:
# Amazon run repeated with the epoch count raised to 50.
# NOTE(review): `model` still carries the weights produced by the previous
# runs, so this continues training instead of starting over — confirm that is
# intended.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="amazon_model",
    num_train_epochs=50,                # increased from 15 to 50
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50,
    load_best_model_at_end=True,
    # Mixed precision is only enabled when a GPU is present; on the CPU
    # fallback, requesting fp16 makes TrainingArguments fail.
    fp16=torch.cuda.is_available(),
    report_to="none"                    # clean output
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=amazon_train,
    eval_dataset=amazon_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [None]:
# Run the 50-epoch configuration from the previous cell and show the value
# returned by train().
train_output = trainer.train()
train_output

In [None]:
# Larger Amazon sample for the higher-accuracy run. These are fresh selections
# from the raw splits, replacing the earlier 4,000/1,000 subsets.
TRAIN_SIZE, TEST_SIZE = 80000, 8000

amazon_train = amazon['train'].shuffle(seed=SEED).select(range(TRAIN_SIZE))
amazon_test = amazon['test'].shuffle(seed=SEED).select(range(TEST_SIZE))


In [None]:
# Switch the backbone to RoBERTa. The tokenizer changes with it, so everything
# derived from the previous tokenizer has to be rebuilt here as well.
MODEL_NAME = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# The enlarged Amazon subsets selected in the previous cell are still raw text.
# Encode them with the RoBERTa tokenizer (`preprocess` looks up the global
# `tokenizer` when it is called) and give them the column layout used by the
# earlier runs; without this the Trainer receives no input_ids.
amazon_train = amazon_train.map(preprocess, batched=True, remove_columns=["title", "content"])
amazon_test  = amazon_test.map(preprocess, batched=True, remove_columns=["title", "content"])

amazon_train = amazon_train.rename_column("label", "labels")
amazon_test  = amazon_test.rename_column("label", "labels")

amazon_train.set_format("torch")
amazon_test.set_format("torch")

# Rebuild the collator so batches are padded by the RoBERTa tokenizer instead
# of the DistilBERT tokenizer it was originally created with.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [None]:
from transformers import TrainingArguments

# RoBERTa / Amazon run: 4 epochs, learning rate 2e-5 with 1000 warmup steps,
# weight decay 0.1, evaluation and checkpointing once per epoch.
training_args = TrainingArguments(
    output_dir="amazon_roberta_model",
    num_train_epochs=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    learning_rate=2e-5,
    warmup_steps=1000,
    weight_decay=0.1,                  # regularization
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=100,
    load_best_model_at_end=True,
    # Mixed precision speeds up GPU training but is only enabled when a GPU is
    # present; on the CPU fallback, requesting fp16 makes TrainingArguments fail.
    fp16=torch.cuda.is_available(),
    gradient_accumulation_steps=2,     # 16 per device x 2 accumulation steps = 32 per update
    report_to="none"
)


In [None]:
# Train RoBERTa on the enlarged Amazon subset.
# Requires `amazon_train` / `amazon_test` to be tokenized with the current
# (RoBERTa) tokenizer and `data_collator` to be built from that same tokenizer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=amazon_train,
    eval_dataset=amazon_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()


In [None]:
# Result of the cell above: RoBERTa trained on 80,000 Amazon samples for 4 epochs reached 96.05% accuracy.
# This model is now ready to be used on the IMDB dataset.

In [None]:
# Load the IMDB dataset and describe it
from datasets import load_dataset
imdb = load_dataset("imdb")
imdb_train_split = imdb['train']

# Dataset structure
print(imdb)

# Columns
print("Columns:", imdb_train_split.column_names)

# Sizes
print("Train size =", len(imdb_train_split))
print("Test size =", len(imdb['test']))

# Example reviews (first 300 characters of each)
for i in range(3):
    review = imdb_train_split[i]
    print(f"\nIMDB Sample {i}:")
    print("Text:", review['text'][:300], "...")
    print("Label:", review['label'])

# Label counts over the whole IMDB training split
from collections import Counter
print("\nLabel distribution:", Counter(imdb_train_split['label']))


In [None]:
# Reproducible IMDB subsets: seeded shuffle, then the first N rows of each split.
IMDB_TRAIN, IMDB_TEST = 10000, 5000

imdb_train = imdb['train'].shuffle(seed=SEED).select(range(IMDB_TRAIN))
imdb_test = imdb['test'].shuffle(seed=SEED).select(range(IMDB_TEST))

(len(imdb_train), len(imdb_test))


In [None]:
# Preprocess IMDB with the same tokenizer that the current model was trained with
def preprocess_imdb(batch):
    """Tokenize a batch of IMDB reviews with the notebook-level tokenizer.

    Reviews are read from the "text" column and truncated to MAX_LEN tokens.
    """
    encoded = tokenizer(batch['text'], truncation=True, max_length=MAX_LEN)
    return encoded


def _prepare_imdb_split(split):
    """Tokenize one IMDB split, rename "label" to "labels" and emit torch tensors."""
    prepared = split.map(preprocess_imdb, batched=True, remove_columns=["text"])
    prepared = prepared.rename_column("label", "labels")
    prepared.set_format("torch")
    return prepared


imdb_train = _prepare_imdb_split(imdb_train)
imdb_test = _prepare_imdb_split(imdb_test)


In [None]:
# Baseline before any IMDB fine-tuning: evaluate the Amazon-trained RoBERTa
# model directly on the IMDB test subset.
imdb_results_before = trainer.evaluate(imdb_test)
imdb_results_before


In [None]:
# Fine-tune the Amazon-trained RoBERTa model on IMDB
from transformers import TrainingArguments, Trainer

ft_args = TrainingArguments(
    output_dir="roberta_imdb_finetuned",
    num_train_epochs=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    learning_rate=2e-5,
    warmup_steps=500,
    weight_decay=0.1,                    # regularization
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=100,
    load_best_model_at_end=True,
    # Mixed precision is only enabled when a GPU is present; on the CPU
    # fallback, requesting fp16 makes TrainingArguments fail.
    fp16=torch.cuda.is_available(),
    gradient_accumulation_steps=2,       # 16 per device x 2 accumulation steps = 32 per update
    report_to="none"
)

# NOTE(review): eval_dataset is the IMDB test subset, so best-checkpoint
# selection and the reported test metrics use the same data.
ft_trainer = Trainer(
    model=model,            # same RoBERTa model, training continues from the Amazon weights
    args=ft_args,
    train_dataset=imdb_train,
    eval_dataset=imdb_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

ft_trainer.train()


In [None]:
# Evaluate on the IMDB test subset again, now after the IMDB fine-tuning run.
imdb_results_after = ft_trainer.evaluate(imdb_test)
imdb_results_after


In [None]:
!pip install transformers datasets accelerate torch matplotlib


In [None]:
# Per-epoch history filled by train_amazon(): one entry is appended to each
# list at the end of every epoch.
amazon_train_losses, amazon_train_accs = [], []
amazon_val_losses, amazon_val_accs = [], []


In [None]:
from tqdm import tqdm
import torch
import torch.nn.functional as F

def train_amazon(model, train_loader, val_loader, optimizer, device, epochs=15):
    """Run a manual train/validate loop and record per-epoch metrics.

    Every batch yielded by the loaders must be a mapping of tensors. It is
    moved to ``device`` and unpacked into ``model(**inputs)``; it has to
    contain a ``"labels"`` entry, and the model output has to expose ``.loss``
    and ``.logits``.

    After each epoch the mean loss and the accuracy for both loaders are
    appended to the notebook-level lists ``amazon_train_losses``,
    ``amazon_train_accs``, ``amazon_val_losses`` and ``amazon_val_accs``, and a
    summary line is printed. Nothing is returned.

    Losses are averaged per sample: each batch loss is weighted by the batch
    size (assuming ``outputs.loss`` is the mean over the batch), so a short
    final batch does not skew the epoch value. A loader that yields no samples
    records ``nan`` for its metrics instead of raising ZeroDivisionError.
    """
    model.to(device)

    for epoch in range(epochs):
        # -------- TRAINING LOOP -------- #
        model.train()
        loss_sum = 0.0
        correct = 0
        total = 0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} Training"):
            optimizer.zero_grad()

            inputs = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**inputs)
            loss = outputs.loss

            loss.backward()
            optimizer.step()

            batch_size = inputs["labels"].size(0)
            loss_sum += loss.item() * batch_size  # undo the per-batch mean
            preds = outputs.logits.argmax(dim=1)
            correct += (preds == inputs["labels"]).sum().item()
            total += batch_size

        train_loss = loss_sum / total if total else float("nan")
        train_accuracy = correct / total if total else float("nan")

        amazon_train_losses.append(train_loss)
        amazon_train_accs.append(train_accuracy)

        # -------- VALIDATION LOOP -------- #
        model.eval()
        val_loss_sum = 0.0
        val_correct = 0
        val_total = 0

        with torch.no_grad():
            for batch in val_loader:
                inputs = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**inputs)

                batch_size = inputs["labels"].size(0)
                val_loss_sum += outputs.loss.item() * batch_size
                preds = outputs.logits.argmax(dim=1)
                val_correct += (preds == inputs["labels"]).sum().item()
                val_total += batch_size

        val_loss = val_loss_sum / val_total if val_total else float("nan")
        val_accuracy = val_correct / val_total if val_total else float("nan")

        amazon_val_losses.append(val_loss)
        amazon_val_accs.append(val_accuracy)

        print(f"Epoch {epoch+1}/{epochs} | Train Acc: {train_accuracy:.4f} | "
              f"Val Acc: {val_accuracy:.4f}")


In [None]:
# Per-epoch history filled by train_imdb(): one entry is appended to each list
# at the end of every epoch.
imdb_train_losses, imdb_train_accs = [], []
imdb_val_losses, imdb_val_accs = [], []


In [None]:
def train_imdb(model, train_loader, val_loader, optimizer, device, epochs=5):
    """Run a manual train/validate loop and record per-epoch IMDB metrics.

    Every batch yielded by the loaders must be a mapping of tensors. It is
    moved to ``device`` and unpacked into ``model(**inputs)``; it has to
    contain a ``"labels"`` entry, and the model output has to expose ``.loss``
    and ``.logits``.

    After each epoch the mean loss and the accuracy for both loaders are
    appended to the notebook-level lists ``imdb_train_losses``,
    ``imdb_train_accs``, ``imdb_val_losses`` and ``imdb_val_accs``, and a
    summary line is printed. Nothing is returned.

    Losses are averaged per sample: each batch loss is weighted by the batch
    size (assuming ``outputs.loss`` is the mean over the batch), so a short
    final batch does not skew the epoch value. A loader that yields no samples
    records ``nan`` for its metrics instead of raising ZeroDivisionError.
    """
    model.to(device)

    for epoch in range(epochs):
        # TRAIN LOOP
        model.train()
        loss_sum = 0.0
        correct = 0
        total = 0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} Training"):
            optimizer.zero_grad()
            inputs = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**inputs)

            loss = outputs.loss
            loss.backward()
            optimizer.step()

            batch_size = inputs["labels"].size(0)
            loss_sum += loss.item() * batch_size  # undo the per-batch mean
            preds = outputs.logits.argmax(dim=1)
            correct += (preds == inputs["labels"]).sum().item()
            total += batch_size

        train_loss = loss_sum / total if total else float("nan")
        train_acc = correct / total if total else float("nan")

        imdb_train_losses.append(train_loss)
        imdb_train_accs.append(train_acc)

        # VALIDATION LOOP
        model.eval()
        val_loss_sum = 0.0
        val_correct = 0
        val_total = 0

        with torch.no_grad():
            for batch in val_loader:
                inputs = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**inputs)

                batch_size = inputs["labels"].size(0)
                val_loss_sum += outputs.loss.item() * batch_size
                preds = outputs.logits.argmax(dim=1)
                val_correct += (preds == inputs["labels"]).sum().item()
                val_total += batch_size

        val_loss = val_loss_sum / val_total if val_total else float("nan")
        val_acc = val_correct / val_total if val_total else float("nan")

        imdb_val_losses.append(val_loss)
        imdb_val_accs.append(val_acc)

        print(f"Epoch {epoch+1}/{epochs} | Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}")


In [None]:
import matplotlib.pyplot as plt

# --- AMAZON MODEL PLOTS (manual-loop history) ---
# Draws the per-epoch lists that train_amazon() appends to; epochs are numbered from 1.

epochs = range(1, len(amazon_train_losses) + 1)

fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: loss
loss_ax.plot(epochs, amazon_train_losses, label='Training Loss')
loss_ax.plot(epochs, amazon_val_losses, label='Validation Loss')
loss_ax.set_title('Amazon Dataset - Loss Curve')
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

# Right panel: accuracy
acc_ax.plot(epochs, amazon_train_accs, label='Training Accuracy')
acc_ax.plot(epochs, amazon_val_accs, label='Validation Accuracy')
acc_ax.set_title('Amazon Dataset - Accuracy Curve')
acc_ax.set_xlabel('Epochs')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend()

plt.show()
# Author's note: this plot is not used.


In [None]:
# Metrics logged by the current `trainer` (training-loss entries and evaluation entries).
history = trainer.state.log_history

train_loss = []
eval_loss = []
eval_accuracy = []

# Trainer.evaluate() also appends its metrics to log_history, stamped with the
# global step at the time of the call. Keep only the first evaluation recorded
# per step, so evaluate() calls made after training (for example on the IMDB
# test subset) do not appear as extra points on the validation curve.
seen_eval_steps = set()

for entry in history:
    if "loss" in entry and "epoch" in entry:
        train_loss.append(entry["loss"])

    if "eval_loss" in entry or "eval_accuracy" in entry:
        step = entry.get("step")
        if step is not None and step in seen_eval_steps:
            continue
        seen_eval_steps.add(step)

    if "eval_loss" in entry:
        eval_loss.append(entry["eval_loss"])
    if "eval_accuracy" in entry:
        eval_accuracy.append(entry["eval_accuracy"])


In [None]:
import matplotlib.pyplot as plt

# Training loss is logged every `logging_steps` steps while evaluation runs
# once per epoch, so the series have different lengths. Each one is plotted
# against the epoch stored with its log entry instead of its list index;
# otherwise the two loss curves do not line up. Entries without an epoch are
# placed at 0, and each x series is cut to the length of the values it
# accompanies.
train_x = [e["epoch"] for e in history if "loss" in e and "epoch" in e][:len(train_loss)]
eval_x = [e.get("epoch", 0.0) for e in history if "eval_loss" in e][:len(eval_loss)]
acc_x = [e.get("epoch", 0.0) for e in history if "eval_accuracy" in e][:len(eval_accuracy)]

plt.figure(figsize=(14,5))

# LOSS
plt.subplot(1,2,1)
plt.plot(train_x, train_loss, label="Train Loss")
plt.plot(eval_x, eval_loss, label="Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.title("Loss Curve")

# ACCURACY
plt.subplot(1,2,2)
plt.plot(acc_x, eval_accuracy, label="Validation Accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()
plt.title("Validation Accuracy")

plt.show()
# Generic version of the Amazon plot further down; both read the same trainer history.


In [None]:
# Metrics logged by the RoBERTa Amazon trainer.
amazon_history = trainer.state.log_history

amazon_train_loss = []
amazon_eval_loss = []
amazon_eval_accuracy = []

# Trainer.evaluate() also appends its metrics to log_history, stamped with the
# global step at the time of the call. The earlier trainer.evaluate(imdb_test)
# call would therefore add an IMDB point to the Amazon validation curve. Keep
# only the first evaluation recorded per step to leave such calls out.
seen_eval_steps = set()

for entry in amazon_history:
    if "loss" in entry and "epoch" in entry:
        amazon_train_loss.append(entry["loss"])

    if "eval_loss" in entry or "eval_accuracy" in entry:
        step = entry.get("step")
        if step is not None and step in seen_eval_steps:
            continue
        seen_eval_steps.add(step)

    if "eval_loss" in entry:
        amazon_eval_loss.append(entry["eval_loss"])
    if "eval_accuracy" in entry:
        amazon_eval_accuracy.append(entry["eval_accuracy"])

In [None]:
# Plot the Amazon curves
import matplotlib.pyplot as plt

# Training loss is logged every `logging_steps` steps while evaluation runs
# once per epoch, so the series have different lengths. Each one is plotted
# against the epoch stored with its log entry instead of its list index, so the
# x-axis really is "Epoch" and the two loss curves line up. Entries without an
# epoch are placed at 0, and each x series is cut to the length of its values.
train_x = [e["epoch"] for e in amazon_history if "loss" in e and "epoch" in e][:len(amazon_train_loss)]
eval_x = [e.get("epoch", 0.0) for e in amazon_history if "eval_loss" in e][:len(amazon_eval_loss)]
acc_x = [e.get("epoch", 0.0) for e in amazon_history if "eval_accuracy" in e][:len(amazon_eval_accuracy)]

plt.figure(figsize=(14,5))

# Loss
plt.subplot(1,2,1)
plt.plot(train_x, amazon_train_loss, label='Train Loss')
plt.plot(eval_x, amazon_eval_loss, label='Validation Loss')
plt.title("Amazon - Loss Curve")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()

# Accuracy
plt.subplot(1,2,2)
plt.plot(acc_x, amazon_eval_accuracy, label='Validation Accuracy')
plt.title("Amazon - Accuracy Curve")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()

plt.show()


In [None]:
# Extract metrics for the IMDB fine-tuning run
imdb_history = ft_trainer.state.log_history

imdb_train_loss = []
imdb_eval_loss = []
imdb_eval_accuracy = []

# Trainer.evaluate() also appends its metrics to log_history, stamped with the
# global step at the time of the call. The ft_trainer.evaluate(imdb_test) call
# made after training would therefore repeat the final point. Keep only the
# first evaluation recorded per step.
seen_eval_steps = set()

for entry in imdb_history:
    if "loss" in entry and "epoch" in entry:
        imdb_train_loss.append(entry["loss"])

    if "eval_loss" in entry or "eval_accuracy" in entry:
        step = entry.get("step")
        if step is not None and step in seen_eval_steps:
            continue
        seen_eval_steps.add(step)

    if "eval_loss" in entry:
        imdb_eval_loss.append(entry["eval_loss"])
    if "eval_accuracy" in entry:
        imdb_eval_accuracy.append(entry["eval_accuracy"])

In [None]:
# Plot the IMDB curves
import matplotlib.pyplot as plt

# Training loss is logged every `logging_steps` steps while evaluation runs
# once per epoch, so the series have different lengths. Each one is plotted
# against the epoch stored with its log entry instead of its list index, so the
# x-axis really is "Epoch" and the two loss curves line up. Entries without an
# epoch are placed at 0, and each x series is cut to the length of its values.
train_x = [e["epoch"] for e in imdb_history if "loss" in e and "epoch" in e][:len(imdb_train_loss)]
eval_x = [e.get("epoch", 0.0) for e in imdb_history if "eval_loss" in e][:len(imdb_eval_loss)]
acc_x = [e.get("epoch", 0.0) for e in imdb_history if "eval_accuracy" in e][:len(imdb_eval_accuracy)]

plt.figure(figsize=(14,5))

# Loss
plt.subplot(1,2,1)
plt.plot(train_x, imdb_train_loss, label='Train Loss')
plt.plot(eval_x, imdb_eval_loss, label='Validation Loss')
plt.title("IMDB - Loss Curve")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()

# Accuracy
plt.subplot(1,2,2)
plt.plot(acc_x, imdb_eval_accuracy, label='Validation Accuracy')
plt.title("IMDB - Accuracy Curve")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()

plt.show()
