# Sentiment Analysis on IMDb Dataset
This notebook demonstrates a simple end-to-end pipeline for sentiment analysis using BERT and Hugging Face Transformers.

We will:
1. Load the IMDb dataset
2. Perform basic EDA (exploratory data analysis) with Seaborn
3. Fine-tune a BERT model using the Trainer API
4. Save the model and tokenizer
5. Load the saved model for inference


# Installing all dependencies

In [None]:
!pip install numpy pandas seaborn matplotlib torch transformers datasets evaluate

# 1. Imports and basic setup

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
from collections import Counter

import torch
from datasets import load_dataset
from evaluate import load as load_metric
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    pipeline
)

sns.set_style("whitegrid")
%matplotlib inline

# Check if we have a GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 2. Load the IMDb dataset

In [None]:
# Load the IMDb reviews dataset and show which splits it contains
dataset = load_dataset("imdb")
print(dataset)

# pandas copies of the train and test splits, used for EDA
df_train, df_test = (dataset[split_name].to_pandas() for split_name in ("train", "test"))

# Preview the first training rows
df_train.head()

# 3. Basic EDA

## 3.1. Check class distribution

In [None]:
# Readable class names for plotting: label 1 -> 'Positive', any other label -> 'Negative'
df_train['sentiment'] = np.where(df_train['label'] == 1, 'Positive', 'Negative')

sns.countplot(x='sentiment', data=df_train)
plt.title("Sentiment Distribution in Training Set")
plt.show()

# Exact per-class counts to accompany the bar chart
positive_count = df_train['label'].eq(1).sum()
negative_count = df_train['label'].eq(0).sum()
print(f"Positive reviews: {positive_count}, Negative reviews: {negative_count}")

## 3.2. Analyze text length

In [None]:
# Review length measured in whitespace-separated words
df_train['text_length'] = df_train['text'].str.split().str.len()
print(df_train['text_length'].describe())

# Length histogram, coloured by the 'sentiment' column
sns.histplot(x='text_length', hue='sentiment', bins=50, data=df_train)
plt.title("Distribution of Text Length")
plt.show()

# 4. Prepare data for fine-tuning BERT

## 4.1. Load BertTokenizer

In [None]:
# Tokenizer that matches the "bert-base-uncased" checkpoint used for the model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(example):
    """Tokenize the ``text`` field of a batch of examples.

    Sequences are truncated and padded to ``max_length=128`` so every encoded
    example has the same length.
    """
    texts = example["text"]
    return tokenizer(texts, max_length=128, padding='max_length', truncation=True)


# batched=True hands tokenize_function a batch of examples per call.
# NOTE(review): the whole `dataset` object is mapped, so every split it contains
# is tokenized, not only the ones used later — confirm this is intended.
encoded_dataset = dataset.map(tokenize_function, batched=True)

## 4.2. Split into train/val/test

In [None]:
# The original test split is kept as-is for the final evaluation
test_dataset = encoded_dataset['test']
train_full = encoded_dataset['train']

# Hold out 10% of the training split for validation; the fixed seed makes the split repeatable
split_data = train_full.train_test_split(seed=42, test_size=0.1)
train_dataset, val_dataset = split_data['train'], split_data['test']

print(f"Train size: {len(train_dataset)}")
print(f"Validation size: {len(val_dataset)}")
print(f"Test size: {len(test_dataset)}")

## 4.3. Load the model

In [None]:
# BERT sequence-classification model from the "bert-base-uncased" checkpoint,
# configured for the two sentiment labels
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2
)
# Assign the result rather than leaving the call as the cell's last expression,
# so the notebook does not print the full module repr as cell output.
model = model.to(device)

# 5. Fine-tuning with Trainer

In [None]:
# Metric objects are created once here and reused by compute_metrics
accuracy_metric = load_metric("accuracy")
f1_metric = load_metric("f1")


def compute_metrics(eval_pred):
    """Compute accuracy and F1 from a ``(logits, labels)`` pair.

    The predicted class for each example is the argmax over the last axis
    of the logits.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    scoring_inputs = {
        "predictions": np.argmax(logits, axis=-1),
        "references": labels,
    }
    return {
        "accuracy": accuracy_metric.compute(**scoring_inputs)["accuracy"],
        "f1": f1_metric.compute(**scoring_inputs)["f1"],
    }

# Training configuration. Evaluation and checkpointing both happen once per
# epoch, and the best checkpoint is reloaded when training ends.
training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir="./results",
    logging_dir="./logs",
    # training schedule
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # evaluation and checkpointing
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # logging
    logging_steps=100,
    report_to="none"
)

# Train on the 90% training portion and validate on the held-out 10%
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()

# 6. Evaluation on the test set

In [None]:
# Evaluate the trained model on the IMDb test split
test_metrics = trainer.evaluate(eval_dataset=test_dataset)
print(f"Evaluation on test set: {test_metrics}")

# 7. Save the model and tokenizer


In [None]:
# Attach readable class names to the model config before saving,
# so they are written out together with the model
model.config.id2label = {0: "NEGATIVE", 1: "POSITIVE"}
model.config.label2id = {"NEGATIVE": 0, "POSITIVE": 1}

# Model and tokenizer are written to the same directory so both can be
# reloaded from a single path
model_dir = "output"
trainer.save_model(output_dir=model_dir)
tokenizer.save_pretrained(save_directory=model_dir)

# 8. Inference with the saved model


In [None]:
# Reload the tokenizer and model from the directory they were saved to
loaded_tokenizer = BertTokenizer.from_pretrained(model_dir)

# Move the reloaded model to the selected device and switch it to evaluation mode
loaded_model = BertForSequenceClassification.from_pretrained(model_dir).to(device).eval()

def predict_sentiment(text_list):
    """Classify one or more texts with the reloaded model.

    Args:
        text_list: A single string, or a list of strings, to classify.

    Returns:
        A list with one dict per input text, in input order. Each dict holds
        the original ``'text'`` and the predicted ``'label'`` name looked up
        in ``loaded_model.config.id2label``. An empty input returns ``[]``.
    """
    # Accept a bare string as a batch of one
    if isinstance(text_list, str):
        text_list = [text_list]
    # Nothing to classify: return early instead of sending an empty batch
    # through the tokenizer and the model
    if len(text_list) == 0:
        return []

    inputs = loaded_tokenizer(
        text_list,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    ).to(device)
    # Inference only, so gradient tracking is disabled
    with torch.no_grad():
        outputs = loaded_model(**inputs)
    # Index of the highest-scoring class for each text
    preds = torch.argmax(outputs.logits, dim=-1)

    return [
        {'text': text, 'label': loaded_model.config.id2label[label_id]}
        for text, label_id in zip(text_list, preds.tolist())
    ]

# One clearly positive and one clearly negative review as a quick sanity check
sample_texts = [
    "I absolutely loved this movie. It's fantastic!",
    "Worst film ever. Completely boring and pointless.",
]

predictions = predict_sentiment(sample_texts)
for prediction in predictions:
    print(prediction)

# 9. Conclusions
- The IMDb dataset is balanced, containing roughly an equal number of positive and negative reviews.
- Simple EDA shows that many reviews are relatively long.
- Fine-tuning BERT for just 2 epochs can already yield high accuracy.
- Setting `id2label` and `label2id` allows us to get human-readable labels (POSITIVE/NEGATIVE) instead of LABEL_0/LABEL_1.
- We can now deploy this model by loading it in any environment that supports PyTorch and Hugging Face Transformers.

Thank you for checking this notebook!