In [None]:
!pip install -q torch torchvision torchaudio
!pip install -q transformers datasets accelerate scikit-learn wandb

In [None]:
import torch
import numpy as np
import pandas as pd

from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

# Report the PyTorch build and, when a CUDA device is visible, its name.
cuda_ready = torch.cuda.is_available()
print("Torch:", torch.__version__)
print("CUDA available:", cuda_ready)
if cuda_ready:
    print("GPU:", torch.cuda.get_device_name(0))

# Log in to Weights & Biases up front: the TrainingArguments in this notebook
# set report_to=["wandb"], so training metrics are sent there.
import wandb
wandb.login()

In [None]:
# Fetch the sentiment dataset from the Hugging Face Hub and show its splits.
dataset_id = "syedkhalid076/Sentiment-Analysis"
dataset = load_dataset(dataset_id)
dataset

In [None]:
# Peek at the first three training records to check the column layout.
for sample_idx in range(3):
    print(dataset["train"][sample_idx])

In [None]:
# Checkpoint to fine-tune and the label vocabulary for the classification head.
model_name = "bert-base-uncased"
id2label = {0: "negative", 1: "neutral", 2: "positive"}
label2id = {label: idx for idx, label in id2label.items()}
num_labels = len(id2label)  # negative, neutral, positive

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# Attach human-readable label names to the model config.
model.config.id2label = id2label
model.config.label2id = label2id

In [None]:
def tokenize_function(example):
    """Tokenize the ``"text"`` field of ``example`` with the notebook's tokenizer.

    Inputs are truncated and padded to a fixed length of 128 tokens.

    Args:
        example: Mapping with a ``"text"`` entry (a batch of texts when used
            with ``dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    texts = example["text"]
    encoded = tokenizer(texts, truncation=True, max_length=128, padding="max_length")
    return encoded

# Tokenize every split in batches and drop the raw text column, which is no
# longer needed once the token ids exist.
tokenized_dataset = dataset.map(tokenize_function, batched=True).remove_columns(["text"])

# Return torch tensors when indexing, then show one encoded training example.
tokenized_dataset.set_format("torch")
tokenized_dataset["train"][0]

In [None]:
# Work on a subset to keep training time manageable.
train_subset_size = 40000
val_subset_size = 8000
subset_seed = 42  # fixed so the same rows are drawn on every run

train_split = tokenized_dataset["train"]
val_split = tokenized_dataset["validation"]

# Shuffle before selecting so the subset is a random sample rather than the
# first N rows (which may be ordered by label or source), and clamp the size so
# a split smaller than the requested subset does not raise an IndexError.
small_train = train_split.shuffle(seed=subset_seed).select(
    range(min(train_subset_size, len(train_split)))
)
small_val = val_split.shuffle(seed=subset_seed).select(
    range(min(val_subset_size, len(val_split)))
)

len(small_train), len(small_val)

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 from ``(logits, labels)``.

    Args:
        eval_pred: Pair of ``(logits, labels)``; the predicted class is the
            argmax over the last axis of ``logits``.

    Returns:
        Dict with keys ``"accuracy"`` and ``"f1_macro"``.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return {
        "accuracy": accuracy_score(references, predicted),
        "f1_macro": f1_score(references, predicted, average="macro"),
    }

In [None]:
# Tunable run settings.
batch_size = 16
output_dir = "./bert_sentiment_output"
logging_dir = "./logs"

training_args = TrainingArguments(
    # Where checkpoints and logs go.
    output_dir=output_dir,
    logging_dir=logging_dir,
    logging_steps=50,
    report_to=["wandb"],
    # Optimisation hyper-parameters.
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate and checkpoint once per epoch; reload the best checkpoint at the end.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [None]:
import inspect

# Newer transformers releases take the tokenizer through `processing_class`
# and deprecate the old `tokenizer` keyword. Pick whichever name the installed
# Trainer accepts so this cell works without a deprecation warning (or a
# TypeError once the old keyword is removed).
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train,
    eval_dataset=small_val,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)
trainer

In [None]:
# Fine-tune the model on the training subset.
train_result = trainer.train()

# Persist the fine-tuned model and tokenizer side by side so the directory can
# be reloaded later with from_pretrained.
trainer.save_model("./best_bert_sentiment")
tokenizer.save_pretrained("./best_bert_sentiment")

# Keep the training summary as the cell's last expression so it is actually
# displayed (a bare expression in the middle of a cell is silently discarded).
train_result

In [None]:
# Score the full validation split (metrics keep the default "eval_" prefix).
val_metrics = trainer.evaluate(eval_dataset=tokenized_dataset["validation"])
print("Validation metrics:", val_metrics)

# Score the held-out test split under its own "test_" prefix so its metrics are
# not reported under the same keys as the validation metrics (in the returned
# dict and in the experiment tracker).
test_metrics = trainer.evaluate(
    eval_dataset=tokenized_dataset["test"],
    metric_key_prefix="test",
)
print("Test metrics:", test_metrics)

In [None]:
# Run the model over the test split and turn logits into class ids.
predictions = trainer.predict(tokenized_dataset["test"])
pred_labels = np.argmax(predictions.predictions, axis=-1)
true_labels = predictions.label_ids

label_names = ["negative", "neutral", "positive"]

# Pass the full label set explicitly: otherwise a class that never appears in
# either the true or predicted labels is dropped from the matrix, and building
# the 3x3 DataFrame below would fail with a shape mismatch.
cm = confusion_matrix(
    true_labels,
    pred_labels,
    labels=list(range(len(label_names))),
)

# Rows are true classes, columns are predicted classes.
cm_df = pd.DataFrame(
    cm,
    index=pd.Index(label_names, name="true"),
    columns=pd.Index(label_names, name="predicted"),
)
cm_df

In [None]:
label_names = ["negative", "neutral", "positive"]

def predict_sentiment(text):
    """Classify a single text with the fine-tuned model.

    Args:
        text: The string to classify.

    Returns:
        Tuple ``(label, probs)`` where ``label`` is the entry of
        ``label_names`` with the highest probability and ``probs`` is a NumPy
        array of softmax probabilities for the first (only) input.
    """
    # Run on whatever device the model already lives on instead of hard-coding
    # "cuda": this avoids re-moving the model on every call and keeps inputs
    # and weights on the same device on non-CUDA accelerators as well.
    device = next(model.parameters()).device

    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    model.eval()  # disable dropout for inference
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=-1).cpu().numpy()[0]
    pred_id = int(np.argmax(probs))
    return label_names[pred_id], probs

# Smoke-test the classifier on one example per expected sentiment.
for demo_text in (
    "This movie was absolutely amazing!",
    "It was okay, nothing special.",
    "Terrible experience, I hated it.",
):
    print(predict_sentiment(demo_text))

In [None]:
import shutil

# Bundle the saved model directory into best_bert_sentiment.zip.
shutil.make_archive(
    base_name="best_bert_sentiment",
    format="zip",
    root_dir="best_bert_sentiment",
)

In [None]:
# Offer the archive as a browser download when running in Google Colab.
# Outside Colab the google.colab package is not installed, so report where the
# file is instead of failing the final cell with an ImportError.
try:
    from google.colab import files
except ImportError:
    print("Not running in Google Colab; the archive is available locally as best_bert_sentiment.zip")
else:
    files.download("best_bert_sentiment.zip")