# BERT Models for Sentiment Analysis and Q&A

# Sentiment Analysis using BERT

In [None]:
!pip install datasets


Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.22.2-py3-none-

In [None]:
from datasets import load_dataset

# Fetch the SST-2 dataset by name; the recorded output below shows it being downloaded
# and split into train / validation / test.
sst2 = load_dataset("sst2")


# The rest of this notebook treats "label" as binary: 0 = negative, 1 = positive.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/5.27k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [None]:

# Number of training examples used for this demonstration.
# Raise it (or set it to None to use the whole split) for a more meaningful model.
N_SAMPLES = 100

# Slice the split first so only the requested rows are read. Indexing the column and
# slicing afterwards would materialise the whole training column (~67k rows) just to
# keep the first N_SAMPLES of them.
train_rows = sst2["train"][:N_SAMPLES]

# Plain Python lists of sentences and their integer sentiment labels.
texts = train_rows["sentence"]
labels = train_rows["label"]


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
import numpy as np

# Define the dataset class
class CustomDataset(Dataset):
    """Map-style dataset that pairs raw texts with integer class labels.

    Tokenization happens lazily, one example per ``__getitem__`` call, with every
    example padded or truncated to ``max_length`` tokens.

    Args:
        texts: Indexable collection of input texts (items are passed through ``str``).
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Object exposing an ``encode_plus`` method that can return
            PyTorch tensors.
        max_length: Fixed sequence length every example is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of examples, i.e. the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors.

        Returns:
            dict with flattened ``input_ids`` and ``attention_mask`` tensors and a
            ``label`` tensor of dtype ``torch.long``.
        """
        sentence = str(self.texts[idx])
        target = self.labels[idx]

        encode_options = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(sentence, **encode_options)

        # The tokenizer output is flattened to 1-D so the DataLoader can stack
        # examples into a batch.
        sample = {
            field: encoded[field].reshape(-1)
            for field in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(target, dtype=torch.long)
        return sample

# Labels follow the convention 0 = negative, 1 = positive.
# Hold out 20% of the examples for validation; the fixed seed keeps the split reproducible.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Initialize the BERT tokenizer and model (two output classes)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Define dataset and dataloaders
train_dataset = CustomDataset(train_texts, train_labels, tokenizer, max_length=128)
train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)

# Define optimizer.
# `transformers.AdamW` is deprecated, so the PyTorch implementation is used instead.
# `eps` and `weight_decay` are set explicitly to the defaults of the deprecated class
# so the optimisation hyper-parameters stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Training loop
NUM_EPOCHS = 3  # 3 epochs for demonstration

model.train()
for epoch in range(NUM_EPOCHS):
    print(f"Epoch {epoch + 1}/{NUM_EPOCHS}")
    epoch_loss = 0.0
    num_train_steps = 0
    for batch in train_dataloader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        # Deliberately NOT called `labels`: that name holds the full list of dataset
        # labels used by train_test_split, and overwriting it with a batch tensor makes
        # this cell fail when it is run a second time.
        batch_labels = batch['label']

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_train_steps += 1

    # Report the mean loss over the epoch; the loss of the last batch alone is very
    # noisy with a batch size this small.
    print(f"Training loss: {epoch_loss / max(num_train_steps, 1)}")

# Validation
model.eval()
val_dataset = CustomDataset(val_texts, val_labels, tokenizer, max_length=128)
val_dataloader = DataLoader(val_dataset, batch_size=2, shuffle=False)

total_val_loss = 0.0
num_val_steps = 0
# No gradients are needed for evaluation.
with torch.no_grad():
    for batch in val_dataloader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        batch_labels = batch['label']

        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        total_val_loss += outputs.loss.item()
        num_val_steps += 1

# An empty validation set yields NaN instead of raising ZeroDivisionError.
avg_val_loss = total_val_loss / num_val_steps if num_val_steps else float("nan")
print(f"Validation loss: {avg_val_loss}")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Training loss: 0.7842108011245728
Epoch 2/3
Training loss: 0.789749026298523
Epoch 3/3
Training loss: 0.29087576270103455
Validation loss: 0.35953704118728635


In [None]:
# Sentences whose sentiment should be predicted
texts_to_score = [
    "Interstellar movie is fantastic!",
    "Oppenheimer is not a great movie",
]

# Encode all sentences as one padded batch of PyTorch tensors
tokenized_texts = tokenizer(
    texts_to_score,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt",
)

# Forward pass only; gradient tracking is switched off for inference
with torch.no_grad():
    outputs = model(**tokenized_texts)

# Convert logits to per-class probabilities and pick the most likely class per sentence
logits = outputs.logits
predicted_probs = logits.softmax(dim=1)
predicted_classes = predicted_probs.argmax(dim=1)

# One line per sentence: class 1 is reported as positive, anything else as negative
for text, label in zip(texts_to_score, predicted_classes):
    sentiment = "negative" if label != 1 else "positive"
    print(f"Text: {text} | Sentiment: {sentiment}")

Text: Interstellar movie is fantastic! | Sentiment: positive
Text: Oppenheimer is not a great movie | Sentiment: negative


# Q&A using BERT

In [None]:
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer

# Checkpoint shared by the question-answering model and its tokenizer
# (going by its name: BERT-large, uncased, fine-tuned on SQuAD).
QA_CHECKPOINT = 'bert-large-uncased-whole-word-masking-finetuned-squad'

# NOTE: `model` and `tokenizer` are rebound here, replacing the sentiment classifier
# and its tokenizer defined earlier in the notebook.

# Model
model = BertForQuestionAnswering.from_pretrained(QA_CHECKPOINT)

# Tokenizer
tokenizer = BertTokenizer.from_pretrained(QA_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Question to ask
question = "What are the primary colors?"

# Context passage in which the answer should be found
paragraph = "The primary colors are red, blue, and yellow. These colors are considered primary because they can be combined in various proportions to create a wide range of other colors, including secondary colors (green, purple, and orange) and tertiary colors."

# Encode question and paragraph together as a single sequence pair
encoding = tokenizer.encode_plus(question, paragraph)

# `inputs`: token IDs (vocabulary indices) of the encoded pair.
# `sentence_embedding`: token-type (segment) IDs, one per token. Despite the variable
# name these are integer IDs, not embeddings; the name is kept because later cells use it.
inputs, sentence_embedding = (
    encoding[field] for field in ('input_ids', 'token_type_ids')
)

# Human-readable tokens (words, word pieces, punctuation) for each token ID
tokens = tokenizer.convert_ids_to_tokens(inputs)


In [None]:
# Score every input token as a possible start / end of the answer span.
#
# The model returns a dict-like output object rather than a plain tuple. Unpacking it
# directly (`start_scores, end_scores = model(...)`) iterates over its field names and
# binds the strings 'start_logits' and 'end_logits', so the logits are read by
# attribute instead.
with torch.no_grad():  # inference only, no gradients needed
    qa_output = model(
        input_ids=torch.tensor([inputs]),  # token IDs, with a batch dimension added
        token_type_ids=torch.tensor([sentence_embedding]),  # segment IDs, batched
    )

start_scores, end_scores = qa_output.start_logits, qa_output.end_logits


In [None]:
# Run the QA model on the encoded question/paragraph pair.
# Gradient tracking is disabled, as in the sentiment inference cell: this is pure
# inference, so building an autograd graph would only waste memory.
with torch.no_grad():
    output = model(
        input_ids=torch.tensor([inputs]),  # token IDs, with a batch dimension added
        token_type_ids=torch.tensor([sentence_embedding]),  # segment IDs, batched
    )

# Position of the highest-scoring answer start token, as a plain Python int
start_index = torch.argmax(output.start_logits).item()

# Position of the highest-scoring answer end token (inclusive), as a plain Python int
end_index = torch.argmax(output.end_logits).item()


In [None]:
# Cut the predicted span out of the token list (end_index is inclusive) ...
answer_tokens = tokens[start_index:end_index + 1]
# ... and glue the tokens together with single spaces
answer = ' '.join(answer_tokens)


In [None]:
# Merge WordPiece fragments back into whole words.
# A token starting with '##' continues the previous word, so it is appended to that
# word without its '##' prefix; every other token starts a new word.
merged_words = []
for piece in answer.split():
    is_continuation = piece.startswith('##')
    if is_continuation and merged_words:
        merged_words[-1] += piece[2:]
    elif is_continuation:
        # A fragment at the very start has no word to attach to; keep it on its own.
        merged_words.append(piece[2:])
    else:
        merged_words.append(piece)

# Joining once at the end avoids the leading space that prefixing every word with
# ' ' used to produce.
corrected_answer = ' '.join(merged_words)

# Printing the corrected answer
print(corrected_answer)


 red , blue , and yellow
