## **Load pre-trained BERT model and tokenizer from Hugging Face**



In [None]:
## install hugging face tranformers lib. and torch lib.
!pip install transformers==3.5.1
!pip install torch==1.4.0

In [1]:
import torch
from transformers import BertModel,BertTokenizer

In [5]:
# Checkpoint identifier used for both the model and the tokenizer.
# "bert-base-uncased" is the BERT base variant that is not case sensitive.
# Model card: https://huggingface.co/bert-base-uncased
Model_Name = "bert-base-uncased"

In [108]:
## Both objects are loaded from the same checkpoint name so that they match each other.
## Tokenizer that was used for pre-training bert-base-uncased
tokenizer = BertTokenizer.from_pretrained(Model_Name)
## Pre-trained bert-base-uncased weights
model = BertModel.from_pretrained(Model_Name)

In [109]:
## Show the tokenizer's configuration; the recorded output lists the vocabulary size,
## the maximum model length, the padding side and the special tokens
print(tokenizer)

PreTrainedTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})


## **Pass sentence input to BERT for pre-processing**

In [110]:
# Example sentence that is tokenized and embedded in the cells below.
# NOTE(review): 'Tranformers' looks like a misspelling of 'Transformers'; the recorded
# tokenization below ('tran', '##form', '##ers') depends on this exact spelling, so
# changing it requires re-running the notebook to refresh the outputs.
input_sentence = 'I love Tranformers'

In [111]:
# encode_plus performs the whole pre-processing in a single call:
#   1) splits the sentence into WordPiece (subword) tokens
#   2) adds the special tokens [CLS] and [SEP], then pads with [PAD] (id 0) at the end
#      when the number of tokens is smaller than pad_max_length.
#      Here the sequence has 7 tokens, so two 0s are appended to input_ids; those
#      positions must not be attended to, hence the attention mask also ends with two 0s.
#   3) converts the tokens into their vocabulary IDs

pad_max_length = 9
encoded_tokens = tokenizer.encode_plus(
    input_sentence,
    max_length=pad_max_length,
    padding="max_length",
    truncation=True,
    add_special_tokens=True,
    return_attention_mask=True,
    return_tensors="pt",
)
print(encoded_tokens)

{'input_ids': tensor([[  101,  1045,  2293, 25283, 14192,  2545,   102,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0]])}


In [112]:
# Translate the token IDs back into their token strings, one sequence at a time.
# BERT uses a WordPiece tokenizer, which splits the word 'Tranformers' into multiple subwords.
for sequence_ids in encoded_tokens["input_ids"]:
    sequence_tokens = tokenizer.convert_ids_to_tokens(sequence_ids)
    print(sequence_tokens)

['[CLS]', 'i', 'love', 'tran', '##form', '##ers', '[SEP]', '[PAD]', '[PAD]']


## **Get Embedding of Input from BERT**

In [113]:
# Pass input_ids and attention_mask to the pre-trained model.
# return_dict=True makes the model return a named output object, so the tensors are read
# by attribute instead of by tuple position. Tuple-unpacking the result is fragile: on
# transformers versions that return a dict-like output by default, unpacking silently
# yields the field names (strings) instead of tensors.
# torch.no_grad() skips autograd bookkeeping, which is not needed to extract embeddings.
with torch.no_grad():
    bert_output = model(encoded_tokens["input_ids"],
                        attention_mask=encoded_tokens["attention_mask"],
                        return_dict=True)
last_hidden_state = bert_output.last_hidden_state
pooler_output = bert_output.pooler_output

In [114]:
## pooler_output is one vector per sequence derived from the [CLS] token
## (per the Hugging Face docs: the final [CLS] hidden state passed through a dense layer + tanh).
## It is an aggregated representation of the entire sequence, of dimension 768
pooler_output.shape

torch.Size([1, 768])

In [98]:
## hidden features are of size (batch_size, num_of_tokens, embedding_dimension_of_each_token)
last_hidden_state.shape

torch.Size([1, 9, 768])

In [None]:
## Get the embedding of an individual token: first index = sequence in the batch, second = token position.
## Positions follow the token list printed above; only the last expression of the cell is displayed.

last_hidden_state[0][0] # position 0: the special [CLS] token, not a word of the sentence
last_hidden_state[0][1] # position 1: 'i', the first actual token of the sentence

## **Get embeddings from all encoder layers**

In [100]:
## Reload the model with output_hidden_states=True so that the forward pass also returns
## the hidden states of every layer.
## The Model_Name constant is reused (instead of repeating the checkpoint string) so this
## model always comes from the same checkpoint as the tokenizer.
## Note: this rebinds `model`; from here on the forward pass produces an extra hidden_states output.
model = BertModel.from_pretrained(Model_Name, output_hidden_states=True)

In [101]:
# Pass the same inputs to get the embeddings from all encoder layers.
# return_dict=True makes the model return a named output object, so each tensor is read
# by attribute instead of by tuple position. Tuple-unpacking the result is fragile: on
# transformers versions that return a dict-like output by default, unpacking silently
# yields the field names (strings) instead of tensors.
# torch.no_grad() skips autograd bookkeeping, which is not needed to extract embeddings.
with torch.no_grad():
    bert_output = model(encoded_tokens["input_ids"],
                        attention_mask=encoded_tokens["attention_mask"],
                        return_dict=True)
last_hidden_state = bert_output.last_hidden_state
pooler_output = bert_output.pooler_output
hidden_states = bert_output.hidden_states  # one tensor per layer (only set when output_hidden_states=True)

In [102]:
## 13 layers (12 BERT encoder layers + 1 input/embedding layer)
len(hidden_states)

13

In [103]:
## Shape of the first entry of hidden_states: (batch_size, num_of_tokens, embedding_dimension).
## NOTE(review): index 0 is presumably the embedding-layer output — confirm against the BertModel docs.
hidden_states[0].shape

torch.Size([1, 9, 768])