# Using BERT Model to Classify IMDB Reviews

In [1]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
from tqdm import tqdm


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/safiaboutaleb/Developer/cs178-group-project/IMDB_ven/lib/python3.11/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/safiaboutaleb/Developer/cs178-group-project/IMDB_ven/lib/python3.11/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/Users/safiaboutaleb/Developer/cs178-group-project/IMDB_ven/lib/py

In [2]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [3]:
# load the dataframes
train_reviews = pd.read_csv("train_review_data.csv", usecols=["review", "label"])
test_reviews = pd.read_csv("test_review_data.csv", usecols=["review", "label"])

In [4]:
train_reviews.head()

Unnamed: 0,review,label
0,For a movie that gets no respect there sure ar...,1
1,Bizarre horror movie filled with famous faces ...,1
2,"A solid, if unremarkable film. Matthau, as Ein...",1
3,It's a strange feeling to sit alone in a theat...,1
4,"You probably all already know this by now, but...",1


In [5]:
test_reviews.head()

Unnamed: 0,review,label
0,"Based on an actual story, John Boorman shows t...",1
1,This is a gem. As a Film Four production - the...,1
2,"I really like this show. It has drama, romance...",1
3,This is the best 3-D experience Disney has at ...,1
4,"Of the Korean movies I've seen, only three had...",1


In [6]:
'''
create a dataloader 
* This will help us organize our movie reviews and their sentiment for our BERT model.
* This will tokenize the text, handle the length of the reviews, and mange the batching and shuffling 
'''

class IMDBDataset(Dataset):
    def __init__(self, reviews, labels, tokenizer, max_len):
        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, item):
        review = str(self.reviews[item])
        label = self.labels[item]
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "label": torch.tensor(label, dtype=torch.long)
        }

max_len = 512 # BERT has a max capacity of 512 length
train_dataset = IMDBDataset(
    train_reviews["review"].tolist(), 
    train_reviews["label"].tolist(), 
    tokenizer, 
    max_len
)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

In [7]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# training the model
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()

num_epochs = 3 # start off with 3 for now

# check if a compatiable GPU is avaibale, if now we will run on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model.to(device)  # move model to the selected device
model.train()

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    progress_bar = tqdm(train_loader, desc="Training", leave=False)
    
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        
		# backward pass
        loss.backward()
        optimizer.step()
        
		# update progress bar with current loss
        progress_bar.set_postfix(loss=loss.item())

print("Training complete!")



Epoch 1/3


Training:   0%|          | 0/1563 [1:57:15<?, ?it/s, loss=0.805] 

KeyboardInterrupt: 

In [None]:
# evaluate model here using test data
test_dataset = IMDBDataset(
    test_reviews["review"].tolist(), 
    test_reviews["label"].tolist(), 
    tokenizer, 
    max_len
)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True)

true_labels = []
predicted_labels = []

model.eval()
with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        
        predictions = torch.argmax(outputs.logits, dim=1)
        
		# store the true labels and the predicted labels
        true_labels.extend(labels.cpu().numpy())
        predicted_labels.extend(predictions.cpu().numpy())

In [None]:
# calculate accuracy
accuracy = accuracy_score(true_labels, predicted_labels)
print(f"Test Accuracy: {accuracy:.2f}")