# Warning :
# Do "File -> Save a copy in Drive" before you start modifying the notebook, otherwise your modifications will not be saved.

# BERT for Sentiment Analysis
# A) Compute BERT embedding for each review => CLS token

In [1]:
! pip install transformers



In [2]:
import transformers
import torch
import tensorflow as tf

# Downloading large review movie dataset (25000 reviews)

In [3]:
!wget https://thome.isir.upmc.fr/classes/RITAL/json_pol.json

--2024-02-25 06:28:17--  https://thome.isir.upmc.fr/classes/RITAL/json_pol.json
Resolving thome.isir.upmc.fr (thome.isir.upmc.fr)... 134.157.18.247
Connecting to thome.isir.upmc.fr (thome.isir.upmc.fr)|134.157.18.247|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 32663547 (31M) [application/json]
Saving to: ‘json_pol.json’


2024-02-25 06:28:20 (18.2 MB/s) - ‘json_pol.json’ saved [32663547/32663547]



In [4]:
import json
from collections import Counter

# Loading json
file = './json_pol.json'
with open(file,encoding="utf-8") as f:
    data = json.load(f)


# Quick Check
counter = Counter((x[1] for x in data))
print("Number of reviews : ", len(data))
print("----> # of positive : ", counter[1])
print("----> # of negative : ", counter[0])
print("")
print(data[0])

Number of reviews :  25000
----> # of positive :  12500
----> # of negative :  12500

['Although credit should have been given to Dr. Seuess for stealing the story-line of "Horton Hatches The Egg", this was a fine film. It touched both the emotions and the intellect. Due especially to the incredible performance of seven year old Justin Henry and a script that was sympathetic to each character (and each one\'s predicament), the thought provoking elements linger long after the tear jerking ones are over. Overall, superior acting from a solid cast, excellent directing, and a very powerful script. The right touches of humor throughout help keep a "heavy" subject from becoming tedious or difficult to sit through. Lastly, this film stands the test of time and seems in no way dated, decades after it was released.', 1]


# Getting the Tokenizer

In [5]:
model_name = "rttl-ai/bert-base-uncased-yelp-reviews"


from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/468 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

# Experiment the Tokenizer on the first train review

In [6]:
maxL = 512 # Max length of the sequence

string_tokenized = tokenizer.encode_plus(data[0][0], return_tensors="pt",
                                        add_special_tokens=True,  # add '[CLS]' and '[SEP]'
                            max_length=maxL,  # set max length
                            truncation=True,  # truncate longer messages
                            #pad_to_max_length=True
                            padding='max_length',  # add padding
                            return_attention_mask=True)

The output of the tokenizer string_tokenized (class BatchEncoding) returns two elements:


*   string_tokenized['input_ids']: the index of each token in the dictionary
*   string_tokenized['attention_mask']: a binary mask (0 to ignore the token, 1 to consider it). This is because we need tensor a fixed length and we have reviews with a variable number of words



In [7]:
print(string_tokenized['input_ids'])
print(string_tokenized['attention_mask'])

tensor([[  101,  2348,  4923,  2323,  2031,  2042,  2445,  2000,  2852,  1012,
          7367, 15808,  2015,  2005, 11065,  1996,  2466,  1011,  2240,  1997,
          1000, 18469, 11300,  2229,  1996,  8288,  1000,  1010,  2023,  2001,
          1037,  2986,  2143,  1012,  2009,  5028,  2119,  1996,  6699,  1998,
          1996, 24823,  1012,  2349,  2926,  2000,  1996,  9788,  2836,  1997,
          2698,  2095,  2214,  6796,  2888,  1998,  1037,  5896,  2008,  2001,
         13026,  2000,  2169,  2839,  1006,  1998,  2169,  2028,  1005,  1055,
          3653, 14808, 24996,  1007,  1010,  1996,  2245,  4013, 22776,  3787,
         26577,  2146,  2044,  1996,  7697, 22387,  3924,  2024,  2058,  1012,
          3452,  1010,  6020,  3772,  2013,  1037,  5024,  3459,  1010,  6581,
          9855,  1010,  1998,  1037,  2200,  3928,  5896,  1012,  1996,  2157,
         12817,  1997,  8562,  2802,  2393,  2562,  1037,  1000,  3082,  1000,
          3395,  2013,  3352,  6945,  6313,  2030,  

# Let's tokenize the whole dataset

In [8]:
import numpy as np

maxL = 512
temb = 768

inputs_tokens = []
attention_masks = []

for i in range(len(data)):
    if(i%2500==0):
        print(i)
    string_tokenized = tokenizer.encode_plus(data[i][0], return_tensors="pt",
                            add_special_tokens=True,  # add '[CLS]' and '[SEP]'
                            max_length=maxL,  # set max length
                            truncation=True,  # truncate longer messages
                            #pad_to_max_length=True
                            padding='max_length',  # add padding
                            return_attention_mask=True)

    # APPEND inputs token and input masks. YOUR CODE HERE
    inputs_tokens.append(string_tokenized['input_ids']) # ~~
    attention_masks.append(string_tokenized['attention_mask'])
    #E*19 + E*21 code transformers #last ligne

0
2500
5000
7500
10000
12500
15000
17500
20000
22500


# Let's create a 'TensorDataSet' FOR THE SAMPLES where each element is a triplet composed of token word index, token mask, and label

In [9]:
# Converting input tokens to torch tensors
inputs_tokenst = torch.cat(inputs_tokens, dim=0)
attention_maskst = torch.cat(attention_masks, dim=0)



# Converting labels to numpy then torch tensor
y = torch.zeros((len(data),))
for i in range(len(data)):
    y[i] = data[i][1]
#y = torch.from_numpy(y)

from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
dataset = TensorDataset(inputs_tokenst, attention_maskst, y)

# Lets download a BERT model for word embedding

In [10]:
from transformers import BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained(model_name)

config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [11]:
print(model)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

**You can use the BERT model for directly predicting polarity.** Let us apply that on the first review which has been tokenized with string_tokenized.

In [12]:
# Some preliminary test
import torch
import numpy as np
b_input_ids = string_tokenized['input_ids']
b_input_mask = string_tokenized['attention_mask']

model.eval()

output = model(input_ids=b_input_ids,attention_mask=b_input_mask, output_hidden_states=True)
print(output.logits) # The output of the logit of the two classes (polarity pos/neg)
last_hidden_states = output.hidden_states[-1] # The last layer before the class prediction: tensor of size nBatch (1 here) x MaxL (512) x temb (768)
print(last_hidden_states.shape)
print(last_hidden_states[0,0,1:10]) # The first 10 value of the first elements (=[CLS] TOKEN)
print(f" norm cls token={np.linalg.norm(last_hidden_states.detach().numpy()[0,0,:])}")

tensor([[ 1.9588,  2.7916,  0.5612, -2.2449, -3.3843]],
       grad_fn=<AddmmBackward0>)
torch.Size([1, 512, 768])
tensor([ 0.1124, -0.2079, -0.1135,  0.0599,  0.2796, -0.1295,  0.0430,  0.0620,
         0.0106], grad_fn=<SliceBackward0>)
 norm cls token=12.6652250289917


In [None]:
# If you need to clean GPU memory
#import gc
#gc.collect()
#torch.cuda.empty_cache()

# Most important STEP: we want to extract the [CLS] representation (1st token of the last layer before logits) for each review, and store it.  

In [16]:
# create DataLoaders with samplers
tb = int(100) # batch size
dataloader = DataLoader(dataset, batch_size=tb,shuffle=False) # dataloader
nbTrain = len(data)
features = np.zeros((nbTrain, temb))
nbtach = int(nbTrain/tb)
print(f"nb batches={nbtach}")
# Comuting CLS features
model.cuda()
for idx,batch in enumerate(dataloader):
        # Unpack this training batch from our dataloader:
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        if(idx%10==0):
            print(f"batch {idx} / {nbtach}")
        b_input_ids = batch[0].cuda()
        b_input_mask = batch[1].cuda()
        b_labels = batch[2].cuda().long()

        with torch.no_grad():
            # forward propagation (evaluate model on training batch)
            output = model(input_ids=b_input_ids,
                                 attention_mask=b_input_mask,
                                 #labels=b_labels,
                               output_hidden_states=True)
            last_hidden_states = output.hidden_states[-1] # WARNING: it is now a batch of size tbatch x nToken x embsize
            cls_representations = last_hidden_states[:, 0, :]
            features[idx*tb:idx*tb+tb,:] =  cls_representations.detach().cpu().numpy()# YOUR CODE HERE. Think in applying .detach().cpu().numpy()


nb batches=250
batch 0 / 250
batch 10 / 250
batch 20 / 250
batch 30 / 250
batch 40 / 250
batch 50 / 250
batch 60 / 250
batch 70 / 250
batch 80 / 250
batch 90 / 250
batch 100 / 250
batch 110 / 250
batch 120 / 250
batch 130 / 250
batch 140 / 250
batch 150 / 250
batch 160 / 250
batch 170 / 250
batch 180 / 250
batch 190 / 250
batch 200 / 250
batch 210 / 250
batch 220 / 250
batch 230 / 250
batch 240 / 250


# Now save the embedding of each review into disk!

In [17]:
# Saving the features and labels
import pickle
# Open a file and use dump()
with open('data.pkl', 'wb') as file:
    # A new file will be created
    pickle.dump([features,y], file)

In [18]:
import pickle

# Open the file in binary mode
with open('data.pkl', 'rb') as file:
    # Call load method to deserialze
    [features, y] = pickle.load(file)

In [19]:
import numpy as np
print(features.shape[0])
print(y)
print(np.linalg.norm(features[10]))

25000
tensor([1., 1., 1.,  ..., 0., 0., 0.])
12.189715586068397


# B) Train a logistic regression model on top of extracted embeddings. Conclude on the performances of BERT for the sentiment classification task

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

np.random.seed(0)

rs=10
[X_train, X_test, y_train, y_test]  = train_test_split(features, y, test_size=0.5, random_state=rs, shuffle=True)

# Entrainer lr
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# predictions
y_pred = model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.92672


# C) Fine-tuning BERT for sentiment classification

In [22]:
# We will fine-tune a smaller model
model_name = "haisongzhang/roberta-tiny-cased"
#model_name = "rttl-ai/bert-base-uncased-yelp-reviews"

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)


tokenizer_config.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/460 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

# Let's tokenize the whole dataset

In [23]:
maxL = 512 # Max length of the sequence

string_tokenized = tokenizer.encode_plus(data[0][0], return_tensors="pt",
                                        add_special_tokens=True,  # add '[CLS]' and '[SEP]'
                            max_length=maxL,  # set max length
                            truncation=True,  # truncate longer messages
                            #pad_to_max_length=True
                            padding='max_length',  # add padding
                            return_attention_mask=True)

In [24]:
import numpy as np

maxL = 512


inputs_tokens = []
attention_masks = []

for i in range(len(data)):
    if(i%2500==0):
        print(i)
    string_tokenized = tokenizer.encode_plus(data[i][0], return_tensors="pt",
                                        add_special_tokens=True,  # add '[CLS]' and '[SEP]'
                            max_length=maxL,  # set max length
                            truncation=True,  # truncate longer messages
                            #pad_to_max_length=True
                            padding='max_length',  # add padding
                            return_attention_mask=True)

    inputs_tokens.append(string_tokenized['input_ids'])
    attention_masks.append(string_tokenized['attention_mask'])

0
2500
5000
7500
10000
12500
15000
17500
20000
22500


# Let's create 'TensorDataSets' FOR THE TRAIN/TEST SAMPLES where each element is a triplet composed of token word index, token mask, and label

In [25]:
import torch
# Converting input tokens to torch tensors
inputs_tokens = torch.cat(inputs_tokens, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
# Converting labels to torch tensor
y = torch.zeros((len(data),2), dtype=torch.float)
for i in range(len(data)):
    y[i][data[i][1]] = 1

from sklearn.model_selection import train_test_split

np.random.seed(0)
rs=10

inputs_tokens_train, inputs_tokens_test, attention_masks_train, attention_masks_test, y_train, y_test =train_test_split(inputs_tokens, attention_masks, y, test_size=0.5, random_state=rs)

print(inputs_tokens_train.shape)
print(inputs_tokens_test.shape)

print(attention_masks_train.shape)
print(attention_masks_test.shape)

print(y_train.shape)
print(y_test.shape)



torch.Size([12500, 512])
torch.Size([12500, 512])
torch.Size([12500, 512])
torch.Size([12500, 512])
torch.Size([12500, 2])
torch.Size([12500, 2])


In [26]:
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler

dataset_train = TensorDataset(inputs_tokens_train,  attention_masks_train, y_train)
dataset_test = TensorDataset(inputs_tokens_test,  attention_masks_test, y_test)

# Lets download a BERT model for word embedding

In [27]:
from transformers import BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained(model_name)
print(model)

pytorch_model.bin:   0%|          | 0.00/112M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at haisongzhang/roberta-tiny-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 512, padding_idx=0)
      (position_embeddings): Embedding(512, 512)
      (token_type_embeddings): Embedding(2, 512)
      (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-3): 4 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=512, out_features=512, bias=True)
              (key): Linear(in_features=512, out_features=512, bias=True)
              (value): Linear(in_features=512, out_features=512, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=512, out_features=512, bias=True)
              (LayerNorm): LayerNorm((512,), eps=1e-12, e

# FINE-TUNING THE MODEL

In [None]:
#import gc
#gc.collect()
#torch.cuda.empty_cache()

In [28]:
# Fonction to compute the accuracy on train/test sets
def accuracy(model, dataloader):
  model.eval()
  nbgood =0
  for idx,batch in enumerate(dataloader):
    b_input_ids = batch[0].cuda()
    b_input_mask = batch[1].cuda()
    b_labels = batch[2].cuda()

    with torch.no_grad():
      pred = model(input_ids=b_input_ids, attention_mask=b_input_mask)
      yhat = pred.logits.argmax(axis=1)
      ytrue = b_labels.argmax(axis=1)
      nbgood += (yhat==ytrue).sum()

  acc = nbgood / 125.0
  return acc.item()


In [29]:
import torch.nn as nn
import torch.optim as optim
tb = int(25) # batch size
# create DataLoaders train/test
train_dataloader = DataLoader(dataset_train, batch_size=tb,shuffle=True)
test_dataloader = DataLoader(dataset_test, batch_size=tb,shuffle=False)

nbepochs =2
loss = nn.CrossEntropyLoss() # cross entropy loss
optimizer = optim.Adam(model.parameters(), lr=1e-4)

model.train()
model.cuda()

# TRAINING LOOP
for e in range(nbepochs): # LOOP over epochs
  for idx,batch in enumerate(train_dataloader): # LOOP over batches
    b_input_ids = batch[0].cuda()
    b_input_mask = batch[1].cuda()
    b_labels = batch[2].cuda()

    # TODO: ZERO the gradient accumulator - YOUR CODE HERE
    optimizer.zero_grad()

    # TODO: Compute prediction (fodward pass) - YOUR CODE HERE
    outputs = model(input_ids=b_input_ids, attention_mask=b_input_mask)
    logits = outputs.logits

    # TODO: Compute loss (cross entropy) between predictions and labels - YOUR CODE HERE
    batch_loss = loss()(logits, b_labels)

    # TODO: Compute gradients (backward pass) - YOUR CODE HERE
    batch_loss.backward()

    # TODO: update parameters
    optimizer.step()

  print("epoch",e," acc train=",accuracy(model,train_dataloader)," acc test=",accuracy(model,test_dataloader) ) # Computing performances at the end of each epoch



epoch 0  acc train= 94.85600280761719  acc test= 89.59200286865234
epoch 1  acc train= 98.4320068359375  acc test= 89.85600280761719
