In [1]:
!pip install --upgrade datasets
from IPython.display import clear_output
clear_output()
print('Completed')

Completed


Set a device

In [2]:
import torch
seed = 42

if torch.accelerator.is_available():
	device = torch.accelerator.current_accelerator().type
	torch.set_default_device(device)
	def set_seed(seed=seed):
		torch.cuda.manual_seed_all(seed)
	set_seed()
else:
	device = 'cpu'
	def set_seed(seed=seed):
		torch.manual_seed(seed)
	set_seed()

device

'cuda'

## Loading a tokenizer

In [3]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained(
	'bert-base-uncased',  # Use something like BERT for classification
	num_labels=2,  # Binary classification - positive/negative
).to(device)

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
# # 'google/gemma-3-4b-it' 'meta-llama/Llama-3.2-3B-Instruct', 'meta-llama/Llama-3.2-11B-Vision-Instruct', 'bert-base-uncased', 'gpt2'
# inputs = tokenizer(['User: Hi! Assistant:'], return_tensors='pt').to(device)
# inputs['input_ids'][:5]  # test the tokenizer

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Load a dataset

In [4]:
from datasets import load_dataset
import torch
from torch.utils.data import DataLoader, Dataset

dataset = load_dataset('imdb')  # Load IMDB dataset
# split into train/test
dataset = dataset['train'].train_test_split(test_size=0.2, shuffle=True, seed=seed)
print(f'Dataset loaded: {len(dataset["train"])} training samples & {len(dataset["test"])} test samples')

Dataset loaded: 20000 training samples & 5000 test samples


In [5]:
class IMDBDataset(Dataset):
	def __init__(self, data, tokenizer, max_length=100):
		self.data = data
		self.tokenizer = tokenizer
		self.max_length = max_length

	def __len__(self):
		return len(self.data)

	def __getitem__(self, idx):
		text = self.data[idx]['text']
		label = self.data[idx]['label']
		encoding = self.tokenizer(text, truncation=True, padding='max_length',
			 max_length=self.max_length, return_tensors='pt').to(device)
		return {
			'input_ids': encoding['input_ids'].squeeze(0),
			'attention_mask': encoding['attention_mask'].squeeze(0),
			'label': torch.tensor(label)
		}

batch_size = 32
generator = torch.Generator(device=device).manual_seed(seed)

train_data = IMDBDataset(dataset['train'], tokenizer)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, generator=generator)

test_data = IMDBDataset(dataset['test'], tokenizer)
test_loader  = DataLoader(test_data, batch_size=batch_size, shuffle=True, generator=generator)

## Create a model

In [6]:
# create a 3-layered neural network with ReLU activation
import torch.nn as nn
import torch.optim as optim

# model = nn.Sequential(
# 	nn.Linear(input_dim, hidden_dim),
# 	nn.ReLU(),
# 	nn.Linear(hidden_dim, output_dim),
# 	nn.Softmax(dim=1)
# ).to(device)
class Classifier(nn.Module):
	def __init__(self, model, hidden_dim=768, output_dim=2, dropout_rate=0.1):
		super(Classifier, self).__init__()
		self.model = model
		for param in self.model.parameters():
			# # Freeze BERT parameters to speed up training (optional)
			# param.requires_grad = False
			param.requires_grad = True

		# self.dropout = nn.Dropout(dropout_rate)
		# self.classifier = nn.Sequential(
		# 	nn.Linear(hidden_dim, hidden_dim // 2),
		# 	nn.ReLU(),
		# 	nn.Dropout(dropout_rate),
		# 	nn.Linear(hidden_dim // 2, output_dim)
		# )

	def forward(self, input_ids, attention_mask):
		# Get BERT embeddings - just use the [CLS] token representation
		outputs = self.model(input_ids, attention_mask)
		# with torch.no_grad():
		# 	outputs = self.model(input_ids, attention_mask)
		# 	pooled_output = outputs.last_hidden_state[:, 0, :]  # Get the [CLS] token representation

		# # Apply dropout and classification
		# x = self.dropout(pooled_output)
		# logits = self.classifier(x)

		logits = outputs.logits
		return logits

model = Classifier(model).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(
	model.parameters(),  # [p for p in model.parameters() if p.requires_grad],
	lr=2e-5,
	weight_decay=0.01
)
num_epochs = 5  # Increase number of epochs for better performance
total_steps = len(train_loader) * num_epochs
# scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

from transformers import get_linear_schedule_with_warmup
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps,
)

## Train the Model

In [9]:
from sklearn.metrics import accuracy_score

def train_epoch(model, dataloader, optimizer, criterion, device):
	set_seed()
	model.train()
	epoch_loss = 0
	all_preds = []
	all_labels = []

	for batch in dataloader:
		input_ids = batch['input_ids'].to(device)
		attention_mask = batch['attention_mask'].to(device)
		labels = batch['label'].to(device)

		# Forward pass
		optimizer.zero_grad()
		outputs = model(input_ids, attention_mask)
		loss = criterion(outputs, labels)

		# Backward pass
		loss.backward()
		torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
		optimizer.step()

		# Record metrics
		epoch_loss += loss.item()
		preds = torch.argmax(outputs, dim=1).cpu().numpy() # Can be done on only a CPU
		all_preds.extend(preds)
		all_labels.extend(labels.cpu().numpy())

	# Calculate accuracy
	accuracy = accuracy_score(all_labels, all_preds)
	return epoch_loss / len(dataloader), accuracy

def evaluate(model, dataloader, criterion, device):
	set_seed()
	model.eval()  # set model to evaluation mode
	epoch_loss = 0
	all_preds = []
	all_labels = []

	with torch.no_grad():
		for batch in dataloader:
			input_ids = batch['input_ids'].to(device)
			attention_mask = batch['attention_mask'].to(device)
			labels = batch['label'].to(device)

			# Forward pass
			outputs = model(input_ids, attention_mask)
			loss = criterion(outputs, labels)

			epoch_loss += loss.item()
			preds = torch.argmax(outputs, dim=1).cpu().numpy()
			all_preds.extend(preds)
			all_labels.extend(labels.cpu().numpy())

	accuracy = accuracy_score(all_labels, all_preds)
	return epoch_loss / len(dataloader), accuracy

In [10]:
best_accuracy = 0
best_loss = float('inf')
best_model_state = None

print('Training...')
for epoch in range(num_epochs):
	train_loss, train_acc = train_epoch(model, train_loader, optimizer, criterion, device)
	print('.')
	val_loss, val_acc = evaluate(model, test_loader, criterion, device)
	print('.')

	scheduler.step()  # Update learning rate

	if val_acc > best_accuracy:  # Save best model
		best_accuracy = val_acc
		best_loss = val_loss
		best_model_state = model.state_dict().copy()

	print(f'Epoch {epoch+1}/{num_epochs}:')
	print(f'  Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}')
	print(f'  Val Loss: {val_loss:.4f}, Val Accuracy: {val_acc:.4f}')

# Load best model and evaluation results
if best_model_state:
	model.load_state_dict(best_model_state)
	print(f'Loaded best model with validation accuracy: {best_accuracy:.4f}, Loss: {best_loss:.4f}')

	# # Final Evaluation - on the full test set
	# final_loss, final_accuracy = evaluate(model, test_loader, criterion, device)
	# print(f'Final Test Results:')
	# print(f'  Loss: {final_loss:.4f}, Accuracy: {final_accuracy:.4f}')

Training...
.
.
Epoch 1/5:
  Train Loss: 0.7100, Train Accuracy: 0.4837
  Val Loss: 0.7080, Val Accuracy: 0.4936
.
.
Epoch 2/5:
  Train Loss: 0.7040, Train Accuracy: 0.4959
  Val Loss: 0.6973, Val Accuracy: 0.5084
.
.
Epoch 3/5:
  Train Loss: 0.6921, Train Accuracy: 0.5284
  Val Loss: 0.6798, Val Accuracy: 0.5820
.
.
Epoch 4/5:
  Train Loss: 0.6713, Train Accuracy: 0.5965
  Val Loss: 0.6425, Val Accuracy: 0.6638
.
.
Epoch 5/5:
  Train Loss: 0.6158, Train Accuracy: 0.6893
  Val Loss: 0.5680, Val Accuracy: 0.7478
Loaded best model with validation accuracy: 0.7478, Loss: 0.5680


## Predict on custom text

In [11]:
def predict_sentiment(text, model, tokenizer, device):
	# Tokenize input text
	encoded_text = tokenizer(text, truncation=True,
        padding='max_length', max_length=512, return_tensors='pt')

	input_ids = encoded_text['input_ids'].to(device)
	attention_mask = encoded_text['attention_mask'].to(device)

	# Make prediction
	model.eval()
	with torch.no_grad():
		outputs = model(input_ids, attention_mask)
		prediction = torch.argmax(outputs, dim=1).item()

	sentiment = 'Positive' if prediction == 1 else 'Negative'
	return sentiment


# Test with some example reviews
test_reviews = [
	'This movie was fantastic! I loved it.',
	'The acting was terrible and the plot made no sense.',
	'A masterpiece of cinema that will be remembered for decades.',
	'I wasted two hours of my life watching this garbage.',
]

print('\nTesting with example reviews:')
for review in test_reviews:
	sentiment = predict_sentiment(review, model, tokenizer, device)
	print(f'Review: {review}')
	print(f'Predicted sentiment: {sentiment}\n')

# # Save the model (optional)
# torch.save(model.state_dict(), 'bert_sentiment_classifier.pt')
# print('Model saved to bert_sentiment_classifier.pt')


Testing with example reviews:
Review: This movie was fantastic! I loved it.
Predicted sentiment: Positive

Review: The acting was terrible and the plot made no sense.
Predicted sentiment: Negative

Review: A masterpiece of cinema that will be remembered for decades.
Predicted sentiment: Positive

Review: I wasted two hours of my life watching this garbage.
Predicted sentiment: Positive

