[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](http://colab.research.google.com/github/sk-classroom/asc-bert/blob/main/assignments/assignment_01.ipynb)

We will learn how to generate word embeddings using BERT. BERT produces contextualized word embeddings, where the embeddings are computed based on the context of the word. Thus, a single word can have different embeddings based on its context. 

# Preparation

In [1]:
# If you haven't installed the required packages, please install them using pip
# %pip install transformers plotly
# %pip install --upgrade nbformat
# %pip install torch
# 


In [2]:
import pandas as pd
import numpy as np
import transformers
import torch
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.decomposition import PCA
import plotly.express as px
import tqdm as notebook_tqdm
import re
import os

In [4]:
#  Define the model and tokenizer
model = transformers.BertModel.from_pretrained(
    "bert-base-uncased",output_hidden_states=True
)
tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased")


With BERT, we need to prepare text in ways that BERT can understand. 
Specifically, we prepend it with ```[CLS]``` and append ```[SEP]```. We will then convert the text to a tensor of token ids, which is ready to be fed into the model. 



In [5]:
def prepare_text(text):
    text = "[CLS] " + text + " [SEP]"
    tokenized_text = tokenizer.tokenize(text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    segments_ids = torch.ones((1, len(indexed_tokens)), dtype=torch.long)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensor = segments_ids.clone()
    return tokenized_text, tokens_tensor, segments_tensor

> What is segment tensor?
BERT models are designed to process sentence pairs, differentiated by 0s and 1s to indicate the first and second sentence respectively. In the case of single-sentence inputs, we assign a vector of 1s to each token, indicating they all belong to the first sentence.

Let's get the BERT embeddings for the sentence "Bank is located in the city of London". 

First, let's prepare the text for BERT. 

In [6]:
text = "Bank is located in the city of London"
tokenized_text, tokens_tensor, segments_tensor = prepare_text(text)

Then, let's get the BERT embeddings for each token. 

In [7]:
outputs = model(tokens_tensor, segments_tensor)

The output includes `loss`, `logits`, and `hidden_states`. We will use `hidden_states`, which contains the embeddings of the tokens. 



In [8]:
hidden_states = outputs.hidden_states

print("how many layers? ", len(hidden_states))
print("Shape? ", hidden_states[0].shape)

how many layers?  13
Shape?  torch.Size([1, 10, 768])


The hidden states are a list of 13 tensors, where each tensor is of shape (batch_size, sequence_length, hidden_size). The first tensor is the input embeddings, and the subsequent tensors are the hidden states of the BERT layers. 

So, we have 13 choice of hidden states. Deep layers close to the output capture the context of the word from the previous layers.

Here we will take the average over the last four hidden states for each token. 

In [9]:
# TODO: Compute the embedding of the token
emb = torch.zeros((len(tokenized_text),768),dtype=torch.float32)
n_layers=4
for i in range(n_layers):
  emb+=hidden_states[-i-1].squeeze(0)
  emb/=n_layers
emb.shape

torch.Size([10, 768])

emb is of shape (sequence_length, hidden_size). Let us summarize the embeddings of the tokens into a function. 

In [10]:
def get_bert_embeddings(text):
    tokenized_text, tokens_tensor, segments_tensor = prepare_text(text)
    outputs = model(tokens_tensor, segments_tensor)
    hidden_states = outputs.hidden_states
    emb =  torch.zeros((len(tokenized_text),768),dtype=torch.float32)
    n_layers=4 #4 for a1, 1 for a2
    for i in range(n_layers):
      emb+=hidden_states[-i-1].squeeze(0)
      emb/=n_layers
    return emb, tokenized_text

# Embedding
Let's embed the text and get the embedding of the focal word 

In [11]:
labels = []  # label
emb = []  # embedding
sentences = []  # sentence

# TODO: Go through the data and get the embedding of the focal word.
for i,(word_pos, sentence, label) in train_data.iterrows():
    _emb, _tokenized_text=get_bert_embeddings(sentence)
    _emb=_emb[word_pos]
    emb.append(_emb)
    sentences.append(sentence)
    labels.append(label)
emb=torch.stack(emb)
labels=np.array(labels)

NameError: name 'train_data' is not defined

# Results 

Let's plot the embeddings of the focal word. 

In [None]:
def plot_result(emb, labels, sentences):
    xy = PCA(n_components=2).fit_transform(emb)

    fig = px.scatter(
        x=xy[:, 0],
        y=xy[:, 1],
        color=labels,
        hover_data=[sentences],
        title="PCA of Word Embeddings",
    )
    fig.update_layout(width=700, height=500)
    fig.update_traces(
        marker=dict(size=12, line=dict(width=2, color="DarkSlateGrey")),
        selector=dict(mode="markers"),
    )
    fig.show()

emb1 = emb.detach().numpy()
plot_result(emb1, labels, sentences)

# Trial on other data

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


# def save_assignment(emb, labels, assignment_id, data_dir):
#     K = len(set(labels))
#     xy = LinearDiscriminantAnalysis(n_components=K - 1).fit_transform(emb, labels)
#     xy_df = pd.DataFrame(xy)
#     xy_df["label"] = labels
#     xy_df.to_csv(f"{data_dir}/eval_test_{assignment_id}.csv", index=False)

In [12]:
#  Define the model and tokenizer
model = transformers.BertModel.from_pretrained(
    "bert-base-uncased",output_hidden_states=True
)
tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased")

def prepare_text(text):
    text = "[CLS] " + text + " [SEP]"
    tokenized_text = tokenizer.tokenize(text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    segments_ids = torch.ones((1, len(indexed_tokens)), dtype=torch.long)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensor = segments_ids.clone()
    return tokenized_text, tokens_tensor, segments_tensor


how many layers?  13
Shape?  torch.Size([1, 10, 768])


In [21]:
with open('../Text data/a-midsummer-nights-dream_TXT_FolgerShakespeare.txt', 'r') as file:
    text = file.read()
text = re.sub(r'\n', ' ', text)  # Remove newline characters
text = re.sub(r'\s+', ' ', text)  # Remove extra whitespaces

# Split the textcorpus into a list of sentence strings
text_sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)

print(text_sentences[:1])

["A Midsummer Night's Dream by William Shakespeare Edited by Barbara A."]


In [30]:
def get_bert_embeddings(text):
    tokenized_text, tokens_tensor, segments_tensor = prepare_text(text)
    outputs = model(tokens_tensor, segments_tensor)
    hidden_states = outputs.hidden_states
    emb =  torch.zeros((len(tokenized_text),768),dtype=torch.float32)
    n_layers=4 #4 for a1, 1 for a2
    for i in range(n_layers):
      emb+=hidden_states[-i-1].squeeze(0)
      emb/=n_layers
    return emb, tokenized_text

In [33]:
_emb, _tokenized_text=get_bert_embeddings(text_sentences[0])
_emb

tensor([[-3.6686e-01, -1.2793e-01, -4.8854e-01,  ..., -8.6284e-02,
          8.3359e-02,  1.9924e-01],
        [-1.3591e-01,  6.0254e-02, -2.1157e-01,  ..., -1.8625e-01,
         -2.9574e-02,  3.0529e-01],
        [ 1.8012e-01, -1.9307e-01, -4.3354e-02,  ..., -3.5496e-03,
          2.1768e-02, -1.9646e-01],
        ...,
        [-1.1118e-01, -8.1374e-02, -1.2472e-01,  ...,  2.6131e-01,
          7.4739e-02,  2.3104e-01],
        [-4.7866e-01, -4.2583e-01, -4.1932e-01,  ...,  8.7290e-02,
         -1.7970e-01, -7.3816e-02],
        [ 3.9807e-03,  1.2268e-04, -9.7863e-03,  ..., -8.0885e-03,
         -1.5587e-02,  9.2318e-03]], grad_fn=<DivBackward0>)

In [31]:
for sentence in text_sentences:
    _emb, _tokenized_text=get_bert_embeddings(sentence)
    emb.append(_emb)

emb=torch.stack(emb)

AttributeError: 'Tensor' object has no attribute 'append'

In [13]:
# TODO: Compute the embedding of the token
emb = torch.zeros((len(tokenized_text),768),dtype=torch.float32)
n_layers=4
for i in range(n_layers):
  emb+=hidden_states[-i-1].squeeze(0)
  emb/=n_layers
emb.shape

def get_bert_embeddings(text):
    tokenized_text, tokens_tensor, segments_tensor = prepare_text(text)
    outputs = model(tokens_tensor, segments_tensor)
    hidden_states = outputs.hidden_states
    emb =  torch.zeros((len(tokenized_text),768),dtype=torch.float32)
    n_layers=4 #4 for a1, 1 for a2
    for i in range(n_layers):
      emb+=hidden_states[-i-1].squeeze(0)
      emb/=n_layers
    return emb, tokenized_text