In [2]:
pip install datasets

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-an

In [3]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader
from datasets import load_dataset

# Load dataset from datasets library
dataset = load_dataset("imdb")

# Select 1000 observations for training and evaluation
train_dataset = dataset["train"].select(range(12460,12550))  # Select 1000 samples for training
eval_dataset = dataset["test"].select(range(12460,12550))   # Select 1000 samples for evaluation

# Define the BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize input text and add special tokens
train_encodings = tokenizer(train_dataset["text"], padding=True, truncation=True, return_tensors='pt')
eval_encodings = tokenizer(eval_dataset["text"], padding=True, truncation=True, return_tensors='pt')

# Add labels
train_labels = torch.tensor(train_dataset["label"])
eval_labels = torch.tensor(eval_dataset["label"])

# Create DataLoader
train_dataset = torch.utils.data.TensorDataset(train_encodings.input_ids, train_labels)
eval_dataset = torch.utils.data.TensorDataset(eval_encodings.input_ids, eval_labels)

train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
eval_dataloader = DataLoader(eval_dataset, batch_size=1)

# Fine-tuning BERT
optimizer = AdamW(model.parameters(), lr=2e-5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move model to device
model.to(device)

for epoch in range(3):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids, labels = batch
        input_ids, labels = input_ids.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, labels=labels)  # Make sure to use keyword arguments
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    print("Epoch {} Loss: {}".format(epoch+1, total_loss))


# Save the fine-tuned model
model.save_pretrained("fine_tuned_bert")

print("Fine-tuning completed and model saved.")

# Fine-tuning BERT
optimizer = AdamW(model.parameters(), lr=2e-5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move model to device
model.to(device)

for epoch in range(3):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids, labels = batch
        input_ids, labels = input_ids.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    print("Epoch {} Loss: {}".format(epoch+1, total_loss))



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch 1 Loss: 17.029699385166168
Epoch 2 Loss: 15.890617072582245
Epoch 3 Loss: 15.589321732521057
Fine-tuning completed and model saved.




Epoch 1 Loss: 15.97495025396347
Epoch 2 Loss: 14.721604704856873
Epoch 3 Loss: 10.309476643800735


In [4]:
# Define a function to calculate accuracy
def calculate_accuracy(model, dataloader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for input_ids, labels in dataloader:
            input_ids, labels = input_ids.to(device), labels.to(device)
            outputs = model(input_ids=input_ids)
            _, predicted = torch.max(outputs.logits, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return correct / total

# After training, save the fine-tuned model
model.save_pretrained("fine_tuned_bert")

# Load fine-tuned model for inference
model = BertForSequenceClassification.from_pretrained('fine_tuned_bert')
model.to(device)

# Create a function for text inference
def predict_sentiment(text):
    model.eval()
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    inputs = {key: val.to(device) for key, val in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    probabilities = torch.softmax(logits, dim=1)
    sentiment = torch.argmax(probabilities, dim=1).item()
    return "Positive" if sentiment == 1 else "Negative"

# Example usage of the predict_sentiment function with multiple movie review texts
movie_review_texts = [
    "I recently watched the movie 'Inception' directed by Christopher Nolan and starring Leonardo DiCaprio. I must say, I was completely blown away by the film.",
    "The plot is incredibly intricate and keeps you on the edge of your seat from start to finish.",
    "Marlon Brando's performance was not that much legendary."
]

for text in movie_review_texts:
    predicted_sentiment = predict_sentiment(text)
    print("Text:", text)
    print("Predicted sentiment:", predicted_sentiment)
    print()




Text: I recently watched the movie 'Inception' directed by Christopher Nolan and starring Leonardo DiCaprio. I must say, I was completely blown away by the film.
Predicted sentiment: Positive

Text: The plot is incredibly intricate and keeps you on the edge of your seat from start to finish.
Predicted sentiment: Positive

Text: Marlon Brando's performance was not that much legendary.
Predicted sentiment: Negative

