Assignment 3  
Transformer (BERT) for Text Classification

In [None]:
import nltk
import random
import pandas as pd
from nltk.corpus import movie_reviews
from datasets import Dataset, load_dataset # Import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [None]:
# Fetch every split of the IMDB dataset from the Hugging Face Hub.
dataset = load_dataset(
    path="imdb",
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
# Tokenizer matching the "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize(batch):
    """Encode the ``"text"`` entries of ``batch`` with the module-level tokenizer.

    Every sequence is truncated and padded to a fixed length of 128 tokens.

    Args:
        batch: Mapping that provides the raw strings under the key ``"text"``.

    Returns:
        Whatever the tokenizer call returns for those strings.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(batch["text"], **encoding_options)

# Run the tokenizer over every split in batches, then expose only the
# columns listed below as torch tensors.
model_columns = ["input_ids", "attention_mask", "label"]
dataset = dataset.map(function=tokenize, batched=True)
dataset.set_format(type="torch", columns=model_columns)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
# bert-base-uncased encoder with a two-label sequence-classification head.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def compute_metrics(p):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores, or a tuple whose
            first element holds them) and ``label_ids`` (the true class ids).
            The Hugging Face ``Trainer`` supplies these as NumPy arrays, which
            ``torch.argmax`` rejects, so the arg-max is taken with NumPy.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``.
    """
    # Local import keeps this cell self-contained; the notebook's import cell
    # does not bring NumPy into scope.
    import numpy as np

    # Some models return a tuple of outputs; the class scores come first.
    scores = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(scores, axis=-1)

    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average="weighted")
    acc = accuracy_score(p.label_ids, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

In [None]:
# Settings for the fine-tuning run: evaluate and checkpoint once per epoch,
# log every 100 steps.
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=2,   # change to 3–4 for better accuracy
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
)

# Fixed-seed subsamples keep the demo fast: 10k training and 2k evaluation rows.
train_subset = dataset["train"].shuffle(seed=42).select(range(10000))
eval_subset = dataset["test"].shuffle(seed=42).select(range(2000))

# NOTE(review): this cell only constructs the Trainer; no trainer.train() or
# trainer.evaluate() call is visible in the notebook.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
    compute_metrics=compute_metrics,
)

In [None]:
from transformers import pipeline

# Sentiment pipeline backed by the nlptown multilingual checkpoint.
classifier = pipeline(
    task="sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
)

# Print the raw pipeline output for one positive and one negative review.
for review in (
    "I love this movie, it was amazing!",
    "This film was boring and too long.",
):
    print(classifier(review))


config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/669M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


[{'label': '5 stars', 'score': 0.9292193055152893}]
[{'label': '1 star', 'score': 0.48224392533302307}]


In [None]:
from transformers import pipeline

# Checkpoint that is already fine-tuned for sentiment (SST-2), so no training is needed here.
classifier = pipeline(
    task="sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

# Print the raw pipeline output for each test sentence.
for review in (
    "I love this movie, it was amazing!",
    "This film was boring and too long.",
    "The acting was great but the story was weak.",
):
    print(classifier(review))


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


[{'label': 'POSITIVE', 'score': 0.9998791217803955}]
[{'label': 'NEGATIVE', 'score': 0.9997538924217224}]
[{'label': 'NEGATIVE', 'score': 0.9985424280166626}]


In [None]:

# 1. Logistic Regression (Baseline)
import nltk
import random
import pandas as pd
from nltk.corpus import movie_reviews
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

nltk.download("movie_reviews")

# Load dataset: one (token list, category) pair per review file in the corpus.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
# Shuffle with a dedicated, seeded generator so the row order (and therefore
# the metrics printed below) is identical on every Restart & Run All.
random.Random(42).shuffle(documents)

texts = [" ".join(words) for words, _ in documents]
labels = [label for _, label in documents]

df = pd.DataFrame({"text": texts, "label": labels})
y = df["label"]

# Split the raw text BEFORE vectorizing: the vocabulary must be learned from
# the training rows only, otherwise test-set words leak into feature selection.
text_train, text_test, y_train, y_test = train_test_split(
    df["text"], y, test_size=0.2, random_state=42
)

vectorizer = CountVectorizer(stop_words="english", max_features=5000)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)
# Full feature matrix under its original name, built with the train-only vocabulary.
X = vectorizer.transform(df["text"])

# Train Logistic Regression
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Predictions
y_pred = clf.predict(X_test)

print("\n Logistic Regression Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# 2. Transformer (BERT Pipeline)
from transformers import pipeline

# DistilBERT checkpoint that is already fine-tuned for sentiment (SST-2).
classifier = pipeline(
    task="sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

# Example inputs
inputs = [
    "I love this movie, it was amazing!",
    "This film was boring and too long.",
    "The acting was great but the story was weak.",
    "The product is okay, not too good, not too bad.",
]

print("\n BERT Pipeline Results:")
for text in inputs:
    # The pipeline returns a list; its first entry carries the label and score.
    result = classifier(text)[0]
    label, score = result["label"], result["score"]
    print(f"Input: {text}\nPrediction: {label} (Score: {score:.4f})\n")

# 3. Comparison Summary
print(" Summary:")
# Report the accuracy measured on the held-out split instead of a hard-coded
# figure, so the summary cannot disagree with the evaluation printed earlier.
print(f"Logistic Regression → {accuracy_score(y_test, y_pred):.1%} Accuracy")
# NOTE(review): the figure below is not computed anywhere in this notebook;
# the pipeline is only run on a handful of example sentences.
print("BERT (Pipeline) → ~90%+ Accuracy on benchmark datasets")


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.



 Logistic Regression Evaluation:
Accuracy: 0.8225
              precision    recall  f1-score   support

         neg       0.84      0.83      0.83       215
         pos       0.81      0.81      0.81       185

    accuracy                           0.82       400
   macro avg       0.82      0.82      0.82       400
weighted avg       0.82      0.82      0.82       400



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu



 BERT Pipeline Results:
Input: I love this movie, it was amazing!
Prediction: POSITIVE (Score: 0.9999)

Input: This film was boring and too long.
Prediction: NEGATIVE (Score: 0.9998)

Input: The acting was great but the story was weak.
Prediction: NEGATIVE (Score: 0.9985)

Input: The product is okay, not too good, not too bad.
Prediction: POSITIVE (Score: 0.9812)

 Summary:
Logistic Regression → ~85% Accuracy
BERT (Pipeline) → ~90%+ Accuracy on benchmark datasets
