In [1]:
# install huggingface Transformers [https://huggingface.co/transformers/installation.html]

# Many transformer based models in a single library: https://github.com/huggingface/transformers#model-architectures
! pip install transformers

# This week: we will use HuggingFace BERT implementations.
# Next sessions: Build an encoder-decoder seq-seq Transfomer from scratch using TF/Keras.

Collecting transformers
  Downloading transformers-4.10.2-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 5.0 MB/s 
[?25hCollecting huggingface-hub>=0.0.12
  Downloading huggingface_hub-0.0.17-py3-none-any.whl (52 kB)
[K     |████████████████████████████████| 52 kB 1.6 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 57.0 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 54.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 35.1 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: Py

In [2]:
# Reference: https://medium.com/tensorflow/using-tensorflow-2-for-state-of-the-art-natural-language-processing-102445cda54a
# Ref: https://huggingface.co/transformers/notebooks.html

In [3]:
%tensorflow_version 2.x
import tensorflow as tf
print(tf.__version__)

2.6.0


## Tokenization

In [19]:
# Tokenization: map words to ids
# Refer: https://colab.research.google.com/github/huggingface/transformers/blob/master/notebooks/01-training-tokenizers.ipynb#scrollTo=LgktNYt7ADPS

# simple example
s = """qpr keeper day heads for preston queens park rangers 
      keeper chris day is set to join preston on a month s loan.  
      day has been displaced by the arrival of simon royce  
      who is in his second month on loan from charlton.
      qpr have also signed italian generoso rossi. r s 
      manager ian holloway said:  some might say 
      it s a risk as he can t be recalled during 
      that month and simon royce can now be recalled by charlton.  
      but i have other irons in the fire. i have had a  yes  from a 
      couple of others should i need them.   
      day s rangers contract expires in the summer. 
      meanwhile  holloway is hoping to complete the 
      signing of middlesbrough defender andy davies - 
      either permanently or again on loan - before saturday s match at ipswich. 
      davies impressed during a recent loan spell at loftus road. holloway is
       also chasing bristol city midfielder tom doherty."""
words = s.split(" ")  # Split over space
vocabulary = dict(enumerate(set(words)))  # Map storing the word to it's corresponding id

print(vocabulary)

# Problems: cat(1123) vs cats(1346)

{0: '', 1: 'heads', 2: 'recalled', 3: 'can', 4: 's', 5: 'is', 6: 'royce', 7: 'during', 8: 'have', 9: 'second', 10: 'i', 11: 'match', 12: 'as', 13: 'summer.', 14: 'and', 15: 'said:', 16: 'at', 17: 'couple', 18: 'is\n', 19: 'signing', 20: 'hoping', 21: 'join', 22: 'contract', 23: 'irons', 24: 'manager', 25: 'fire.', 26: 'meanwhile', 27: 'ipswich.', 28: 't', 29: 'to', 30: 'he', 31: 'on', 32: 'risk', 33: 'now', 34: 'the', 35: 'recent', 36: 'doherty.', 37: 'a', 38: 'chris', 39: 'spell', 40: 'before', 41: 'might', 42: 'r', 43: 'bristol', 44: 'again', 45: 'say', 46: 'loftus', 47: 'month', 48: 'should', 49: 'in', 50: 'holloway', 51: 'his', 52: 'arrival', 53: 'but', 54: 'had', 55: 'preston', 56: 'italian', 57: 'need', 58: 'others', 59: 'loan', 60: 'yes', 61: 'displaced', 62: 'some', 63: 'signed', 64: 'middlesbrough', 65: 'charlton.\n', 66: 'keeper', 67: 'generoso', 68: 'loan.', 69: 'davies', 70: 'of', 71: 'saturday', 72: 'either', 73: 'chasing', 74: 'defender', 75: 'impressed', 76: 'tom', 77: '

### Sub-tokenization

- Why? : fast vs faster, cat vs cats
- example: cats --**bold text**> [cat, ##s]
- Image: https://nlp.fast.ai/images/multifit_vocabularies.png

<img src="https://nlp.fast.ai/images/multifit_vocabularies.png" alt="Smiley face" height="75%" width="75%">


### Tokenization in huggingface

In [20]:
from transformers import BertTokenizer
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-cased") 

In [21]:
# Refer BERT architecture from the previous videos in the course.

#https://huggingface.co/transformers/main_classes/tokenizer.html
print(bert_tokenizer.cls_token)

[CLS]


In [22]:
enc = bert_tokenizer.encode(s)
print(enc)

print(bert_tokenizer.decode(enc))

[101, 186, 1643, 1197, 13852, 1285, 4075, 1111, 3073, 6186, 5746, 1116, 2493, 2079, 1733, 13852, 22572, 4889, 1285, 1110, 1383, 1106, 2866, 3073, 6186, 1113, 170, 2370, 188, 4891, 119, 1285, 1144, 1151, 13577, 1118, 1103, 4870, 1104, 27466, 7130, 187, 7341, 2093, 1150, 1110, 1107, 1117, 1248, 2370, 1113, 4891, 1121, 22572, 1813, 13464, 119, 186, 1643, 1197, 1138, 1145, 1878, 1122, 19457, 1179, 5565, 5864, 1186, 187, 13159, 1182, 119, 187, 188, 2618, 178, 1389, 11134, 4164, 1163, 131, 1199, 1547, 1474, 1122, 188, 170, 3187, 1112, 1119, 1169, 189, 1129, 6901, 1219, 1115, 2370, 1105, 27466, 7130, 187, 7341, 2093, 1169, 1208, 1129, 6901, 1118, 22572, 1813, 13464, 119, 1133, 178, 1138, 1168, 3926, 1116, 1107, 1103, 1783, 119, 178, 1138, 1125, 170, 4208, 1121, 170, 2337, 1104, 1639, 1431, 178, 1444, 1172, 119, 1285, 188, 2079, 1733, 2329, 4252, 20082, 1116, 1107, 1103, 2247, 119, 17527, 11134, 4164, 1110, 4717, 1106, 2335, 1103, 6086, 1104, 2243, 1116, 12725, 6289, 8919, 1105, 1183, 5358, 24

In [23]:
print(bert_tokenizer.decode([117]))
print(bert_tokenizer.decode([106]))

,
!


In [24]:
enc = bert_tokenizer.encode("I see many cats and dogs")
print(enc)

print(bert_tokenizer.decode(enc))

[101, 146, 1267, 1242, 11771, 1105, 6363, 102]
[CLS] I see many cats and dogs [SEP]


## BERT Models
- DistillBERT
- RoBERTa
- https://miro.medium.com/max/2000/1*IFVX74cEe8U5D1GveL1uZA.png 
<img src="https://miro.medium.com/max/2000/1*IFVX74cEe8U5D1GveL1uZA.png " alt="Smiley face" height="75%" width="75%">

- https://miro.medium.com/max/1400/1*bSUO_Qib4te1xQmBlQjWaw.png
<img src="https://miro.medium.com/max/1400/1*bSUO_Qib4te1xQmBlQjWaw.png " alt="Smiley face" height="75%" width="75%">

- General Language Understanding Evaluation (GLUE)  : https://gluebenchmark.com/


In [10]:
import tensorflow as tf

# Refer: https://huggingface.co/transformers/model_doc/distilbert.html#

from transformers import DistilBertTokenizer, TFDistilBertModel

distil_bert = 'distilbert-base-uncased' # Name of the pretrained models

#DistilBERT 
tokenizer = DistilBertTokenizer.from_pretrained(distil_bert)
model = TFDistilBertModel.from_pretrained(distil_bert)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_transform', 'activation_13', 'vocab_layer_norm', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


### Extract features using BERT

In [11]:
# obtain the 768-dim vector correpsoding to [CLS] which is a sentence vector

e = tokenizer.encode("Hello, my dog is cute")
print(e)

input = tf.constant(e)[None, :]  # Batch size 1 
print(input)
print(type(input)) # shape: [1,8]

output = model(input)

print(type(output))
print(len(output))
print(output) #shape[1,8,768]


[101, 7592, 1010, 2026, 3899, 2003, 10140, 102]
tf.Tensor([[  101  7592  1010  2026  3899  2003 10140   102]], shape=(1, 8), dtype=int32)
<class 'tensorflow.python.framework.ops.EagerTensor'>
<class 'transformers.modeling_tf_outputs.TFBaseModelOutput'>
1
TFBaseModelOutput(last_hidden_state=<tf.Tensor: shape=(1, 8, 768), dtype=float32, numpy=
array([[[-1.82963774e-01, -7.40540549e-02,  5.02674095e-02, ...,
         -1.12606734e-01,  4.44930971e-01,  4.09413040e-01],
        [ 7.06680119e-04,  1.48253337e-01,  3.43283057e-01, ...,
         -8.60397667e-02,  6.94747686e-01,  4.33527604e-02],
        [-5.07205844e-01,  5.30855238e-01,  3.71626318e-01, ...,
         -5.62874436e-01,  1.37556717e-01,  2.84752369e-01],
        ...,
        [-4.22513336e-01,  5.73145002e-02,  2.43383020e-01, ...,
         -1.52226627e-01,  2.44624183e-01,  6.41548455e-01],
        [-4.93844360e-01, -1.88954756e-01,  1.26407951e-01, ...,
          6.32405356e-02,  3.69128406e-01, -5.82516044e-02],
        [ 8.3

In [12]:
#[CLS] corresponding vector
print((output[0])[0,0,:])  # shape: 768 dim vector

tf.Tensor(
[-1.82963774e-01 -7.40540549e-02  5.02674095e-02 -3.49530458e-01
 -7.28532523e-02 -2.63872445e-01  2.39293307e-01  4.79842156e-01
 -2.14802250e-01 -1.89516425e-01  8.99829119e-02 -1.29189178e-01
 -1.11275822e-01  3.16634476e-01 -8.25903788e-02  9.26224291e-02
 -2.09082961e-02  4.74875957e-01  1.28833681e-01  3.18708085e-03
 -1.53505534e-01 -3.57001662e-01  9.89357010e-04 -3.92756052e-03
  1.38443802e-02 -5.49409352e-02  8.45260695e-02  1.36564374e-01
  2.18252391e-01 -1.96798757e-01  2.47997232e-02  1.75569355e-01
 -3.97218466e-02 -1.10777520e-01  5.48526756e-02  6.07530065e-02
  1.72000788e-02 -1.07415304e-01 -8.76946002e-02  2.12042034e-01
 -4.05893624e-02 -3.17955948e-02  1.37657106e-01 -1.39004663e-01
 -4.68873233e-03 -3.97633076e-01 -2.60034633e+00 -1.08741559e-01
  4.86709736e-02 -3.61387700e-01  3.71814340e-01 -7.61096030e-02
  3.23911458e-02  2.31666565e-01  2.63016075e-01  3.18299800e-01
 -3.87970716e-01  2.98111439e-01 -4.93028015e-02 -3.59301530e-02
  1.58540770e-

In [13]:
# How about hidden layer outputs

#https://huggingface.co/transformers/model_doc/distilbert.html#distilbertconfig
from transformers import  DistilBertConfig

config = DistilBertConfig.from_pretrained(distil_bert, output_hidden_states=True)


e = tokenizer.encode("Hello, my dog is cute")
input = tf.constant(e)[None, :]  # Batch size 1 
model = TFDistilBertModel.from_pretrained(distil_bert, config=config)
print(model.config) # Every model has a config file 



Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_transform', 'activation_13', 'vocab_layer_norm', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.10.2",
  "vocab_size": 30522
}



In [14]:
output = model(input)
print(len(output))

2


In [15]:
print(output[0])

tf.Tensor(
[[[-1.82963774e-01 -7.40540549e-02  5.02674095e-02 ... -1.12606734e-01
    4.44930971e-01  4.09413040e-01]
  [ 7.06680119e-04  1.48253337e-01  3.43283057e-01 ... -8.60397667e-02
    6.94747686e-01  4.33527604e-02]
  [-5.07205844e-01  5.30855238e-01  3.71626318e-01 ... -5.62874436e-01
    1.37556717e-01  2.84752369e-01]
  ...
  [-4.22513336e-01  5.73145002e-02  2.43383020e-01 ... -1.52226627e-01
    2.44624183e-01  6.41548455e-01]
  [-4.93844360e-01 -1.88954756e-01  1.26407951e-01 ...  6.32405356e-02
    3.69128406e-01 -5.82516044e-02]
  [ 8.32686663e-01  2.49481991e-01 -4.54395175e-01 ...  1.19975343e-01
   -3.92573088e-01 -2.77853400e-01]]], shape=(1, 8, 768), dtype=float32)


In [16]:
output[0].shape

TensorShape([1, 8, 768])

In [17]:
output[1][0].shape

TensorShape([1, 8, 768])

In [18]:
print(type(output[1]))
print(len(output[1])) # 7 Why?
print(output[1][6]) # Shape:(1,8,768)

<class 'tuple'>
7
tf.Tensor(
[[[-1.82963774e-01 -7.40540549e-02  5.02674095e-02 ... -1.12606734e-01
    4.44930971e-01  4.09413040e-01]
  [ 7.06680119e-04  1.48253337e-01  3.43283057e-01 ... -8.60397667e-02
    6.94747686e-01  4.33527604e-02]
  [-5.07205844e-01  5.30855238e-01  3.71626318e-01 ... -5.62874436e-01
    1.37556717e-01  2.84752369e-01]
  ...
  [-4.22513336e-01  5.73145002e-02  2.43383020e-01 ... -1.52226627e-01
    2.44624183e-01  6.41548455e-01]
  [-4.93844360e-01 -1.88954756e-01  1.26407951e-01 ...  6.32405356e-02
    3.69128406e-01 -5.82516044e-02]
  [ 8.32686663e-01  2.49481991e-01 -4.54395175e-01 ...  1.19975343e-01
   -3.92573088e-01 -2.77853400e-01]]], shape=(1, 8, 768), dtype=float32)
