In [35]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

## HugginFace's Tokenizer

In [36]:
import tqdm as notebook_tqdm
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [37]:
new_sentence = 'follow the white rabbit neo'
new_tokens = tokenizer.tokenize(new_sentence)
new_tokens

['follow', 'the', 'white', 'rabbit', 'neo']

In [38]:
new_ids = tokenizer.convert_tokens_to_ids(new_tokens)
new_ids

[3582, 1996, 2317, 10442, 9253]

In [39]:
new_ids = tokenizer.encode(new_sentence)
new_ids

[101, 3582, 1996, 2317, 10442, 9253, 102]

In [40]:
tokenizer.convert_ids_to_tokens(new_ids)

['[CLS]', 'follow', 'the', 'white', 'rabbit', 'neo', '[SEP]']

In [41]:
tokenizer.encode(new_sentence, add_special_tokens=False)

[3582, 1996, 2317, 10442, 9253]

In [42]:
tokenizer(new_sentence, add_special_tokens=False, return_tensors='pt')

{'input_ids': tensor([[ 3582,  1996,  2317, 10442,  9253]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}

In [43]:
sentence1 = 'follow the white rabbit neo'
sentence2 = 'no one can be told what the matrix is'
joined_sentences = tokenizer(sentence1, sentence2)
joined_sentences

{'input_ids': [101, 3582, 1996, 2317, 10442, 9253, 102, 2053, 2028, 2064, 2022, 2409, 2054, 1996, 8185, 2003, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [44]:
print(tokenizer.convert_ids_to_tokens(joined_sentences['input_ids']))

['[CLS]', 'follow', 'the', 'white', 'rabbit', 'neo', '[SEP]', 'no', 'one', 'can', 'be', 'told', 'what', 'the', 'matrix', 'is', '[SEP]']


In [45]:
separate_sentences = tokenizer([sentence1, sentence2], padding=True)
separate_sentences

{'input_ids': [[101, 3582, 1996, 2317, 10442, 9253, 102, 0, 0, 0, 0], [101, 2053, 2028, 2064, 2022, 2409, 2054, 1996, 8185, 2003, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [46]:
print(tokenizer.convert_ids_to_tokens(separate_sentences['input_ids'][0]))
print(separate_sentences['attention_mask'][0])

['[CLS]', 'follow', 'the', 'white', 'rabbit', 'neo', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]


In [47]:
first_sentences = [sentence1, 'another first sentence']
second_sentences = [sentence2, 'a second sentence here']
batch_of_pairs = tokenizer(first_sentences, second_sentences)
first_input = tokenizer.convert_ids_to_tokens(batch_of_pairs['input_ids'][0])
second_input = tokenizer.convert_ids_to_tokens(batch_of_pairs['input_ids'][1])
print(first_input)
print(second_input)

['[CLS]', 'follow', 'the', 'white', 'rabbit', 'neo', '[SEP]', 'no', 'one', 'can', 'be', 'told', 'what', 'the', 'matrix', 'is', '[SEP]']
['[CLS]', 'another', 'first', 'sentence', '[SEP]', 'a', 'second', 'sentence', 'here', '[SEP]']


In [48]:
def load_imdb_data(data_file):
    df = pd.read_csv(data_file)
    texts = df['review'].tolist()
    labels = [1 if sentiment == "positive" else 0 for sentiment in df['sentiment'].tolist()]
    return texts, labels

https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews?resource=download&select=IMDB+Dataset.csv

In [49]:
import os
import zipfile  # ✅ Import zipfile


# Install Kaggle API if not installed
!pip install -q kaggle

# Download the dataset
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

# Unzip the file
with zipfile.ZipFile("imdb-dataset-of-50k-movie-reviews.zip", "r") as zip_ref:
    zip_ref.extractall("imdb_data")

# ✅ Unzip the downloaded file
with zipfile.ZipFile("imdb-dataset-of-50k-movie-reviews.zip", "r") as zip_ref:
    zip_ref.extractall("imdb_data")

# ✅ Verify extraction
print("Files in extracted folder:", os.listdir("imdb_data"))



# Load dataset in Pandas
import pandas as pd
df = pd.read_csv("imdb_data/IMDB Dataset.csv", encoding="utf-8")

# Display first few rows
print(df.head())

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
imdb-dataset-of-50k-movie-reviews.zip: Skipping, found more recently modified local copy (use --force to force download)
Files in extracted folder: ['IMDB Dataset.csv']
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [50]:
class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)
        return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'label': torch.tensor(label)}

In [51]:
class BERTClassifier(nn.Module):
    def __init__(self, bert_model_name, num_classes):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        x = self.dropout(pooled_output)
        logits = self.fc(x)
        return logits

In [52]:
def train(model, data_loader, optimizer, scheduler, device):
    model.train()
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = nn.CrossEntropyLoss()(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

In [53]:
def evaluate(model, data_loader, device):
    model.eval()
    predictions = []
    actual_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())
    return accuracy_score(actual_labels, predictions), classification_report(actual_labels, predictions)

In [54]:
def predict_sentiment(text, model, tokenizer, device, max_length=128):
    model.eval()
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)
    return "positive" if preds.item() == 1 else "negative"

In [55]:
# Set up parameters
bert_model_name = 'bert-base-uncased'
num_classes = 2
max_length = 128
batch_size = 16
num_epochs = 4
learning_rate = 2e-5

In [56]:
# from sklearn.model_selection import train_test_split
# train_texts, val_texts, train_labels, val_labels = train_test_split(review, sentiment, test_size=0.2, random_state=42)

import pandas as pd
from sklearn.model_selection import train_test_split

# Extract the text (reviews) and labels (sentiment)
texts = df["review"].tolist()  # ✅ Extract review column
labels = df["sentiment"].map({"positive": 1, "negative": 0}).tolist()  # ✅ Convert sentiment to numerical labels

# Split the dataset into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)


In [57]:
from transformers import BertTokenizer  # ✅ Import BertTokenizer

# Define the model name (e.g., 'bert-base-uncased')
bert_model_name = "bert-base-uncased"

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained(bert_model_name)


from torch.utils.data import Dataset

# ✅ Define the dataset class
class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": self.labels[idx],
        }


# ✅ Set a value for max_length
max_length = 128  # You can change this value based on your model

# ✅ Now this will work!
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)



# ✅ Now this will work!
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)

from torch.utils.data import DataLoader  # ✅ Import DataLoader

# ✅ Define batch size (if not already defined)
batch_size = 16  # You can change this based on your system's memory

# ✅ Create DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)


##########################################################################################

tokenizer = BertTokenizer.from_pretrained(bert_model_name)
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

In [58]:
import torch
import torch.nn as nn
from transformers import BertModel

# ✅ Define BERT Classifier
class BERTClassifier(nn.Module):
    def __init__(self, bert_model_name, num_classes):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output  # Extract the pooled output
        x = self.dropout(pooled_output)
        x = self.fc(x)  # Final classification layer
        return x

##############################################################################


num_classes = 2
import torch
from torch.optim import AdamW  # ✅ Use PyTorch's AdamW (instead of deprecated transformers version)
from transformers import get_scheduler  # ✅ Import scheduler function
from transformers import get_linear_schedule_with_warmup

# ✅ Define training parameters
num_epochs = 3  # Adjust as needed
learning_rate = 2e-5

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTClassifier(bert_model_name, num_classes).to(device)

# ✅ Define optimizer
optimizer = AdamW(model.parameters(), lr=learning_rate)

# ✅ Define learning rate scheduler
total_steps = len(train_dataloader) * num_epochs  # Ensure train_dataloader is defined
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=total_steps)




optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [59]:
import torch

import torch  # ✅ Ensure PyTorch is imported
import torch.nn as nn  # ✅ Import PyTorch's neural network module

# ✅ Define the training function
def train(model, train_dataloader, optimizer, scheduler, device):
    model.train()  # Set model to training mode
    total_loss = 0

    for batch in train_dataloader:
        optimizer.zero_grad()  # Reset gradients

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        # ✅ Ensure loss function is correctly defined
        loss_fn = nn.CrossEntropyLoss()
        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs, labels)

        loss.backward()  # Compute gradients
        optimizer.step()  # Update weights
        scheduler.step()  # Update learning rate
        total_loss += loss.item()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Training Loss: {avg_loss:.4f}")



    import torch
from sklearn.metrics import accuracy_score, classification_report

# ✅ Define the evaluation function
def evaluate(model, val_dataloader, device):
    model.eval()  # Set model to evaluation mode
    predictions = []
    true_labels = []

    with torch.no_grad():  # Disable gradient calculation
        for batch in val_dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids, attention_mask)
            preds = torch.argmax(outputs, dim=1).cpu().numpy()  # Convert to numpy array
            labels = labels.cpu().numpy()

            predictions.extend(preds)
            true_labels.extend(labels)

    # ✅ Calculate accuracy
    accuracy = accuracy_score(true_labels, predictions)

    # ✅ Generate classification report
    report = classification_report(true_labels, predictions, target_names=["Negative", "Positive"])

    return accuracy, report
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, scheduler, device)  # ✅ Train the model
    accuracy, report = evaluate(model, val_dataloader, device)  # ✅ Now this works!
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)  # ✅ Print classification report

#########################################################################

# for epoch in range(num_epochs):
#     print(f"Epoch {epoch + 1}/{num_epochs}")
#     train(model, train_dataloader, optimizer, scheduler, device)
#     accuracy, report = evaluate(model, val_dataloader, device)
#     print(f"Validation Accuracy: {accuracy:.4f}")
#     print(report)

Epoch 1/3
Training Loss: 0.3099
Validation Accuracy: 0.8775
              precision    recall  f1-score   support

    Negative       0.94      0.81      0.87      4961
    Positive       0.83      0.95      0.89      5039

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000

Epoch 2/3
Training Loss: 0.1691
Validation Accuracy: 0.8970
              precision    recall  f1-score   support

    Negative       0.88      0.92      0.90      4961
    Positive       0.91      0.88      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000

Epoch 3/3
Training Loss: 0.0765
Validation Accuracy: 0.8983
              precision    recall  f1-score   support

    Negative       0.90      0.89      0.90      4961
    Positive       0.90      0.90      0.90      5039

  

In [60]:
torch.save(model.state_dict(), "bert_classifier.pth")

In [61]:
# Test sentiment prediction
test_text = "The movie was great and I really enjoyed the performances of the actors."
sentiment = predict_sentiment(test_text, model, tokenizer, device)
print("The movie was great and I really enjoyed the performances of the actors.")
print(f"Predicted sentiment: {sentiment}")

The movie was great and I really enjoyed the performances of the actors.
Predicted sentiment: positive


In [62]:
# Test sentiment prediction
test_text = "Worst movie of the year."
sentiment = predict_sentiment(test_text, model, tokenizer, device)
print("Worst movie of the year.")
print(f"Predicted sentiment: {sentiment}")

Worst movie of the year.
Predicted sentiment: negative


In [63]:
#just load the modwl and use it for inference
import torch
import torch.nn as nn
from transformers import BertModel  # Make sure you have transformers installed

# 1. Define your model class (exactly the same as when you saved it)
class BERTClassifier(nn.Module):
    def __init__(self, bert_model_name, num_classes):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        x = self.dropout(pooled_output)
        logits = self.fc(x)
        return logits


# 2. Create an instance of your model (matching the saved model's architecture)
bert_model_name = "bert-base-uncased"
num_classes = 2
model = BERTClassifier(bert_model_name, num_classes)

# 3. Load the saved state dictionary
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") #put the device before loading
model.to(device) #put the device before loading

try:
    model.load_state_dict(torch.load("bert_classifier.pth", map_location=device))
    print("Model loaded successfully!")
except FileNotFoundError:
    print("Error: Saved model file not found.")
except RuntimeError as e: # Catch potential size mismatch errors
    print(f"Error loading model: {e}")

# 4. (Optional) Put the model in evaluation mode
model.eval()  # Important for inference


  model.load_state_dict(torch.load("bert_classifier.pth", map_location=device))


Model loaded successfully!


BERTClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [64]:
# Test sentiment prediction
test_text = "i like that."
sentiment = predict_sentiment(test_text, model, tokenizer, device)
print(test_text)
print(f"Predicted sentiment: {sentiment}")

i like that.
Predicted sentiment: positive
