## Pre-Trained DistilBERT

In [168]:
import torch
from transformers import DistilBertModel, DistilBertTokenizer, DistilBertConfig

In [169]:
# Load the pretrained base checkpoint (~65M params).
# Tokenizer and model are both built from the same checkpoint name.
checkpoint = 'distilbert-base-cased'
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
model = DistilBertModel.from_pretrained(checkpoint)

I0625 12:00:04.451119 4671026624 tokenization_utils.py:1022] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt from cache at /Users/shaharazulay/.cache/torch/transformers/5e8a2b4893d13790ed4150ca1906be5f7a03d6c4ddf62296c383f6db42814db2.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1
I0625 12:00:05.227613 4671026624 configuration_utils.py:265] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-cased-config.json from cache at /Users/shaharazulay/.cache/torch/transformers/774d52b0be7c2f621ac9e64708a8b80f22059f6d0e264e1bdc4f4d71c386c4ea.f44aaaab97e2ee0f8d9071a5cd694e19bf664237a92aea20ebe04ddb7097b494
I0625 12:00:05.229297 4671026624 configuration_utils.py:301] Model config DistilBertConfig {
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  

### Tokenize

In [170]:
# Example sentence that the following cells tokenize and embed
input_sentence = "I can't help it, I love DNA!"

In [171]:
# Convert the sentence to a list of vocabulary ids; add_special_tokens=True asks the
# tokenizer to also add the model's special marker ids around the sentence
tokens = tokenizer.encode(input_sentence, add_special_tokens=True)
tokens

[101, 146, 1169, 112, 189, 1494, 1122, 117, 146, 1567, 5394, 106, 102]

In [172]:
# Turn the id list into a tensor and add a leading batch dimension,
# giving a 2-D (batch, tokens) tensor for the model
input_ids = torch.tensor(tokens).unsqueeze(0)  # Batch size 1
input_ids

tensor([[ 101,  146, 1169,  112,  189, 1494, 1122,  117,  146, 1567, 5394,  106,
          102]])

In [173]:
# Inference only: run the forward pass under torch.no_grad() so autograd does not
# record a computation graph (saves memory; the returned tensors carry no grad history).
with torch.no_grad():
    outputs = model(input_ids)

# outputs[0] holds one hidden vector per input token: (batch, tokens, hidden)
outputs[0].shape

torch.Size([1, 13, 768])

In [174]:
# For sentence embeddings we take the embedding of the first ([CLS]) token:
# index 0 along the batch axis (the only sequence) and index 0 along the token axis.
sentence_embedding = outputs[0][0, 0]
sentence_embedding.shape

torch.Size([768])

## Training a BPE Tokenizer

In [179]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.normalizers import Lowercase, NFKC, Sequence
from tokenizers.pre_tokenizers import WhitespaceSplit
from tokenizers.trainers import BpeTrainer

### Load some input corpus

In [176]:
BIG_FILE_URL = 'https://raw.githubusercontent.com/dscape/spell/master/test/resources/big.txt'

# Download the training corpus and save it as big.txt in the working directory.
from requests import get

# Fetch first and open the output file only after a successful response, so a failed
# download never creates (or truncates an existing) big.txt that the training cell
# would then read as an empty corpus.
response = get(BIG_FILE_URL, timeout=60)  # timeout: fail instead of hanging on a stalled connection

if response.status_code == 200:
    with open('big.txt', 'wb') as big_f:
        big_f.write(response.content)
else:
    print("Unable to get the file: {}".format(response.reason))


### Create a Tokenizer class

In [180]:
# Start from an untrained Byte-Pair Encoding model wrapped in a Tokenizer.
# Note: this rebinds `tokenizer`, replacing the DistilBERT tokenizer loaded earlier.
tokenizer = Tokenizer(BPE())

# Normalize text before tokenizing: unicode NFKC first, then lower-casing
tokenizer.normalizer = Sequence([NFKC(), Lowercase()])

# Pre-tokenize with a white-space split
tokenizer.pre_tokenizer = WhitespaceSplit()

In [181]:
# Set up the trainer with the details of the vocabulary to generate:
# the target vocabulary size, and a progress display while training
trainer = BpeTrainer(vocab_size=2000, show_progress=True)

In [182]:
# Fit the BPE model on the downloaded corpus file.
# NOTE(review): the (trainer, files) argument order presumably matches the tokenizers
# release this notebook was run with; newer releases appear to expect
# train(files, trainer) — confirm against the installed version.
tokenizer.train(trainer, ["big.txt"])

# Report how many entries the learned vocabulary ended up with
vocab_size_after_training = tokenizer.get_vocab_size()
print("Trained vocab size: {}".format(vocab_size_after_training))

Trained vocab size: 2000


In [183]:
# Save the tokenizer 
tokenizer.model.save('.')

['./vocab.json', './merges.txt']

### Look at what the tokenizer did

In [184]:
# Print the saved merge rules (relies on IPython automagic for `cat`)
cat merges.txt

#version: 0.2 - Trained by `huggingface/tokenizers`
t h
th e
i n
a n
e r
o n
r e
a t
e n
e d
i s
an d
o f
o u
t o
a s
e s
o r
in g
a r
a l
i t
l e
h e
i c
i on
o m
s t
s e
a d
w h
l y
b e
en t
a c
o w
i m
i d
w i
r o
c h
v e
th at
u t
l l
n o
f or
w as
s u
wi th
l d
h is
v er
c e
t er
a m
a y
g h
i r
c on
u n
r i
l o
s ,
h a
at ion
l i
u r
d e
al l
y ou
m o
c t
no t
a b
h ad
f r
p e
re s
a g
h er
n e
p o
s o
u s
f e
b y
on e
c om
ou n
u l
e x
a in
h im
es s
the r
e m
gh t
a re
a p
i l
b ut
er e
ic h
- -
s h
p r
ou t
w e
fr om
p er
' s
o l
ou ld
e t
i f
e ,
es t
s a
wh ich
s .
p ro
o d
t r
u p
q u
an t
m an
p l
s he
d er
w ere
. .
ou s
a k
ou r
g o
d is
th is
wh o
s i
ar t
m e
the y
f o
es ,
m ent
d o
i ll
en d
. "
n ow
ar d
oun d
c o
u st
at e
m y
ha ve
ed ,
sa id
g r
u m
in d
o k
se l
the re
ac t
a f
o p
ar y
pr in

### Load the saved tokenizer

In [185]:
# Rebuild the BPE model from the saved vocabulary and merges files and swap it into
# the existing tokenizer; only the `model` attribute is reassigned here
tokenizer.model = BPE('vocab.json', 'merges.txt')

### Tokenize

In [None]:
# Same example sentence as in the DistilBERT section, so the two tokenizations can be compared
input_sentence = "I can't help it, I love DNA!"

In [187]:
# Encode the sentence with the trained BPE tokenizer
encoding = tokenizer.encode(input_sentence)

# Show the token strings the sentence was split into
print("Encoded string: {}".format(encoding.tokens))

Encoded string: ['i', "can't", 'help', 'it,', 'i', 'love', 'd', 'na', '!']


In [188]:
# Integer vocabulary ids, one per entry of encoding.tokens
encoding.ids

[44, 1818, 1555, 739, 44, 873, 39, 347, 0]