<a href="https://colab.research.google.com/github/saprmarks/mlab/blob/main/days/w2d1/w2d1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [2]:
# if running on Google colab
!pip install einops
import torch as t
from torch import einsum
from einops import rearrange, repeat, reduce
import math

from google.colab import drive
drive.mount('/content/gdrive')
%cd /content/gdrive/MyDrive/mlab/

!pip install transformers
!pip install torchtyping
import days.w2d1.bert_tests as bert_tests

# if running elsewhere, install dependencies (einops, transformers, torchtyping), then:
"""
import torch as t
from torch import einsum
from einops import rearrange, repeat, reduce
import math
import bert_tests # this command might need to be fiddled with depending on where this file is stored
"""

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting einops
  Downloading einops-0.4.1-py3-none-any.whl (28 kB)
Installing collected packages: einops
Successfully installed einops-0.4.1
Mounted at /content/gdrive
/content/gdrive/MyDrive/mlab
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 35.7 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 38.8 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 65.2 MB/s 
Collecting huggingface-hu

'\nimport torch as t\nfrom torch import einsum\nfrom einops import rearrange, repeat, reduce\nimport math\nimport bert_tests # this command might need to be fiddled with depending on where this file is stored\n'

# Day 1 Part 1: Attention

In [3]:
# outputs pre-softmax attention scores
# as a Tensor of shape [batch_size, num_heads, seq_length (key), seq_length (query)]
def raw_attention_scores(token_activations, num_heads, project_query, project_key):
  """Compute scaled dot-product attention scores for every head, before softmax.

  Args:
    token_activations: input handed to both projections. The rearrange pattern
      requires each projection's output to be [batch, seq_length, num_heads * head_size].
    num_heads: number of heads the projected feature axis is split into.
    project_query: callable producing the query features.
    project_key: callable producing the key features.

  Returns:
    Tensor laid out as [batch, num_heads, key position, query position], already
    divided by sqrt(head_size).
  """
  split_heads = 'b sl (nh hs) -> b nh sl hs'
  q = rearrange(project_query(token_activations), split_heads, nh=num_heads)
  k = rearrange(project_key(token_activations), split_heads, nh=num_heads)
  # head_size is whatever remains on the last axis once the heads are split off
  scale = math.sqrt(q.size(-1))
  dot_products = einsum('bhqi,bhki->bhkq', q, k)
  return dot_products / scale

bert_tests.test_attention_pattern_fn(raw_attention_scores)

attention pattern raw MATCH!!!!!!!!
 SHAPE (2, 12, 3, 3) MEAN: -0.006871 STD: 0.1121 VALS [0.04628 -0.03086 -0.1037 -0.04798 0.1952 0.2377 0.09306 -0.06558 0.0466 0.1442...]


In [4]:
def bert_attention(token_activations, num_heads, attention_pattern, project_value, project_output):
  """Apply an attention pattern to the projected values and merge the heads.

  Args:
    token_activations: input handed to project_value. The rearrange pattern
      requires the projection's output to be [batch, seq_length, num_heads * head_size].
    num_heads: number of heads the value features are split into.
    attention_pattern: pre-softmax scores indexed as [batch, head, key, query].
    project_value: callable producing the value features.
    project_output: callable applied to the re-merged heads.

  Returns:
    Whatever project_output returns for the [batch, seq_length, num_heads * head_size]
    tensor of attended values.
  """
  per_head_values = rearrange(
      project_value(token_activations), 'b sl (nh hs) -> b nh sl hs', nh=num_heads)
  # softmax runs over the key axis (dim -2), so the weights for each query sum to 1
  weights = attention_pattern.softmax(-2)
  mixed = einsum('bhki,bhkq->bhqi', per_head_values, weights)
  merged_heads = rearrange(mixed, 'b nh sl hs -> b sl (nh hs)')
  return project_output(merged_heads)

bert_tests.test_attention_fn(bert_attention)

attention MATCH!!!!!!!!
 SHAPE (2, 3, 768) MEAN: 0.004217 STD: 0.1183 VALS [-0.01917 0.07135 -0.1339 -0.1011 -0.03352 -0.01437 -0.09924 0.1358 0.06904 0.03049...]


In [5]:
from torch import nn

class MultiHeadedSelfAttention(nn.Module):
  """Multi-headed self-attention built from four linear projections.

  Args:
    num_heads: number of attention heads.
    hidden_size: size of the input and output feature axis.
    head_size: per-head feature size. Defaults to 64, the value that used to be
      hard-coded here, so existing callers and checkpoints are unaffected. Pass
      e.g. hidden_size // num_heads for smaller models.
  """
  def __init__(self, num_heads, hidden_size, head_size=64):
    super().__init__()
    self.head_size = head_size
    self.num_heads = num_heads
    inner_size = num_heads * self.head_size
    self.project_query = nn.Linear(hidden_size, inner_size)
    self.project_key   = nn.Linear(hidden_size, inner_size)
    self.project_value = nn.Linear(hidden_size, inner_size)
    self.project_output= nn.Linear(inner_size, hidden_size)

  def forward(self, input):
    """Return the attention output for `input`; queries, keys and values all come from `input`."""
    raw_scores = raw_attention_scores(input, self.num_heads, self.project_query, self.project_key)
    return bert_attention(input, self.num_heads, raw_scores, self.project_value, self.project_output)

bert_tests.test_bert_attention(MultiHeadedSelfAttention)


bert MATCH!!!!!!!!
 SHAPE (2, 3, 768) MEAN: -0.001554 STD: 0.1736 VALS [-0.08316 -0.09165 -0.03188 -0.03013 0.1001 0.09549 -0.1046 0.07742 0.0424 0.05553...]


# Day 1 Part 2: Transformer Encoder block

In [6]:
from torch.nn.functional import gelu

def bert_mlp(token_activations, linear_1, linear_2):
  """Two-layer MLP: linear_1, then GELU, then linear_2.

  Args:
    token_activations: input handed to linear_1.
    linear_1: callable for the first layer.
    linear_2: callable for the second layer.

  Returns:
    The output of linear_2.
  """
  hidden = linear_1(token_activations)
  activated = gelu(hidden)
  return linear_2(activated)

bert_tests.test_bert_mlp(bert_mlp)

bert mlp MATCH!!!!!!!!
 SHAPE (2, 3, 768) MEAN: -0.0001934 STD: 0.1044 VALS [-0.1153 0.1189 -0.0813 0.1021 0.0296 0.06182 0.0341 0.1446 0.2622 -0.08507...]


In [7]:
class BertMLP(nn.Module):
  """Module wrapper around bert_mlp that owns the two linear layers.

  Args:
    input_size: feature size accepted by linear_1 and produced by linear_2.
    intermediate_size: feature size between the two layers.
  """
  def __init__(self, input_size, intermediate_size):
    super().__init__()
    # expand to the intermediate width, then project back to the input width
    self.linear_1 = nn.Linear(in_features=input_size, out_features=intermediate_size)
    self.linear_2 = nn.Linear(in_features=intermediate_size, out_features=input_size)

  def forward(self, input):
    """Run `input` through linear_1 -> GELU -> linear_2 via bert_mlp."""
    first, second = self.linear_1, self.linear_2
    return bert_mlp(input, first, second)

In [8]:
class LayerNorm(nn.Module):
  """Layer normalisation over the last axis with a learned scale and shift.

  Args:
    normalized_dim: size of the last axis; `weight` and `bias` have this shape.
    eps: constant added to the variance before the square root for numerical
      stability. Defaults to 1e-5, the value that used to be hard-coded.
  """
  def __init__(self, normalized_dim, eps=1e-5):
    super().__init__()
    self.eps = eps
    self.weight = nn.Parameter(t.ones(normalized_dim))
    self.bias   = nn.Parameter(t.zeros(normalized_dim))

  def forward(self, input):
    """Normalise `input` to zero mean / unit variance on the last axis, then scale and shift."""
    centered = input - input.mean(-1, keepdim=True)
    # unbiased=False: divide by N rather than N - 1
    variance = centered.var(-1, keepdim=True, unbiased=False)
    normalized = centered / (variance + self.eps).sqrt()
    return normalized * self.weight + self.bias

bert_tests.test_layer_norm(LayerNorm)


layer norm MATCH!!!!!!!!
 SHAPE (20, 10) MEAN: -1.907e-08 STD: 1.003 VALS [0.6906 -0.84 1.881 1.711 -0.5116 -0.9577 -0.1387 -0.6943 -0.6741 -0.4662...]


In [9]:
class BertBlock(nn.Module):
  """One encoder block: self-attention and an MLP, each followed by a residual add and LayerNorm.

  Args:
    hidden_size: feature size of the block's input and output.
    intermediate_size: hidden width of the MLP.
    num_heads: number of attention heads.
    dropout: dropout probability applied to the MLP output only.
  """
  def __init__(self, hidden_size, intermediate_size, num_heads, dropout):
    super().__init__()
    self.attention = MultiHeadedSelfAttention(num_heads, hidden_size)
    self.layer_norm1 = LayerNorm(hidden_size)
    self.mlp = BertMLP(hidden_size, intermediate_size)
    self.dropout = nn.Dropout(dropout)
    self.layer_norm2 = LayerNorm(hidden_size)

  def forward(self, input):
    """Return the block output for `input`."""
    # attention sub-layer: residual connection, then normalise
    attended = self.attention(input)
    post_attn = self.layer_norm1(input + attended)
    # MLP sub-layer: dropout on the MLP output, residual connection, then normalise
    mlp_out = self.dropout(self.mlp(post_attn))
    return self.layer_norm2(post_attn + mlp_out)

bert_tests.test_bert_block(BertBlock)


bert MATCH!!!!!!!!
 SHAPE (2, 3, 768) MEAN: 1.656e-09 STD: 1 VALS [0.007132 -0.04372 0.6502 -0.5972 -1.097 0.7267 0.1275 -0.6035 -0.2226 0.2145...]


# Day 1 Part 3: BERT Embedding

In [10]:
class Embedding(nn.Module):
  """Lookup table mapping integer ids to rows of a learned matrix.

  Args:
    vocab_size: number of rows in the table.
    embed_size: length of each embedding vector.
  """
  def __init__(self, vocab_size, embed_size):
    super().__init__()
    # initialised from a standard normal distribution
    initial_table = t.randn(vocab_size, embed_size)
    self.emb_matrix = nn.Parameter(initial_table)

  def forward(self, input):
    """Index the table with `input`; the result has input's shape plus a trailing embed_size axis."""
    vectors = self.emb_matrix[input]
    return vectors

bert_tests.test_embedding(Embedding)

embedding MATCH!!!!!!!!
 SHAPE (2, 3, 5) MEAN: -0.06748 STD: 1.062 VALS [1.176 -0.1914 0.8212 1.047 -0.481 0.7106 -1.304 -1.307 -0.438 -0.2764...]


In [11]:
def bert_embedding(
    input_ids,      # : [batch, seqlen]
    token_type_ids, # : [batch, seqlen]
    position_embedding,   # : Embedding
    token_embedding,      # : Embedding
    token_type_embedding, # : Embedding,
    layer_norm, # : LayerNorm,
    dropout     # : nn.Dropout
):
  """Sum token, token-type and position embeddings, then apply dropout and layer norm.

  Positions are 0..seqlen-1, created on input_ids' device, and the position
  embeddings are broadcast across the batch by the addition.

  Returns:
    layer_norm(dropout(token + token_type + position embeddings)).
  """
  num_positions = input_ids.size(1)
  position_ids = t.arange(0, num_positions, device=input_ids.device)
  token_part = token_embedding(input_ids)
  type_part = token_type_embedding(token_type_ids)
  position_part = position_embedding(position_ids)
  summed = token_part + type_part + position_part
  return layer_norm(dropout(summed))

bert_tests.test_bert_embedding_fn(bert_embedding)

bert embedding MATCH!!!!!!!!
 SHAPE (2, 3, 768) MEAN: 8.278e-10 STD: 1 VALS [-1.319 -0.4378 -2.074 0.9679 0.9274 1.479 -0.501 -1.9 -0.212 0.7961...]


In [12]:
class BertEmbedding(nn.Module):
  """Module wrapper around bert_embedding that owns the three tables, the layer norm and the dropout.

  Args:
    vocab_size: number of rows in the token table.
    hidden_size: embedding width, also the size normalised by the layer norm.
    max_position_embeddings: number of rows in the position table.
    type_vocab_size: number of rows in the token-type table.
    dropout: dropout probability.
  """
  def __init__(self, vocab_size, hidden_size, max_position_embeddings, type_vocab_size, dropout):
    super().__init__()
    # the three tables are created in this order (each draws from the RNG)
    self.token_embedding      = Embedding(vocab_size, hidden_size)
    self.position_embedding   = Embedding(max_position_embeddings, hidden_size)
    self.token_type_embedding = Embedding(type_vocab_size, hidden_size)
    self.layer_norm = LayerNorm(hidden_size)
    self.dropout = nn.Dropout(dropout)

  def forward(self, input_ids, token_type_ids):
    """Embed `input_ids` / `token_type_ids` using this module's tables via bert_embedding."""
    return bert_embedding(
        input_ids=input_ids,
        token_type_ids=token_type_ids,
        position_embedding=self.position_embedding,
        token_embedding=self.token_embedding,
        token_type_embedding=self.token_type_embedding,
        layer_norm=self.layer_norm,
        dropout=self.dropout,
    )
    
bert_tests.test_bert_embedding(BertEmbedding)

bert embedding MATCH!!!!!!!!
 SHAPE (2, 3, 768) MEAN: 1.242e-09 STD: 1 VALS [-0.009385 -0.4919 0.9852 -0.3535 -3.624 1.333 1.163 1.449 1.063 0.246...]


# Day 1 Part 4: Putting it all together

In [13]:
class Bert(nn.Module):
  """Embedding, a stack of BertBlocks, and a head (linear -> GELU -> layer norm -> unembed).

  Args:
    vocab_size: token vocabulary size; also the size of the output logits.
    hidden_size: feature width used throughout the model.
    max_position_embeddings: number of rows in the position table.
    type_vocab_size: number of rows in the token-type table.
    dropout: dropout probability passed to the embedding and to every block.
    intermediate_size: hidden width of each block's MLP.
    num_heads: attention heads per block.
    num_layers: number of BertBlocks.
  """
  def __init__(
      self, vocab_size, hidden_size, max_position_embeddings, type_vocab_size,
      dropout, intermediate_size, num_heads, num_layers
):
    super().__init__()
    self.embedding = BertEmbedding(
        vocab_size, hidden_size, max_position_embeddings, type_vocab_size, dropout)
    blocks = []
    for _ in range(num_layers):
      blocks.append(BertBlock(hidden_size, intermediate_size, num_heads, dropout))
    self.transformer = nn.Sequential(*blocks)
    self.linear = nn.Linear(hidden_size, hidden_size)
    self.layer_norm = LayerNorm(hidden_size)
    self.unembed = nn.Linear(hidden_size, vocab_size)

  def forward(self, input_ids):
    """Return the vocabulary logits for `input_ids`."""
    # every position is given token type 0
    token_type_ids = t.zeros(*input_ids.shape, dtype=int, device=input_ids.device)
    hidden = self.embedding(input_ids, token_type_ids)
    hidden = self.transformer(hidden)
    hidden = self.layer_norm(gelu(self.linear(hidden)))
    return self.unembed(hidden)

bert_tests.test_bert(Bert)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

bert MATCH!!!!!!!!
 SHAPE (1, 4, 28996) MEAN: 0.003031 STD: 0.5765 VALS [-0.5742 -0.432 0.1186 -0.7165 -0.5261 0.4967 1.223 0.3165 -0.3247 -0.5716...]


# Day 1 Part 5: Load pretrained weights

In [14]:
my_bert = Bert(
    vocab_size=28996, hidden_size=768, max_position_embeddings=512, 
    type_vocab_size=2, dropout=0.1, intermediate_size=3072, 
    num_heads=12, num_layers=12
)
pretrained_bert = bert_tests.get_pretrained_bert()

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
import re
def mapkey(k):
  """Translate a pretrained state-dict key into the matching key of this notebook's Bert.

  The rewrites are applied one after another and order matters (for example
  'layer_norm' is first renamed to 'layer_norm1', and the 'lm_head.layer_norm1'
  rule then relies on that). Keys starting at 'classification' collapse to ''.
  """
  # (is_regex, pattern, replacement), applied strictly top to bottom
  rewrites = (
      (False, '_embedding.weight', '_embedding.emb_matrix'),
      (False, '.pattern', ''),
      (False, 'out', 'output'),
      # rename 'layer_norm' unless it directly follows 'dual.' or 'ding.'
      (True,  r'(?<!(dual\.|ding\.))layer_norm', 'layer_norm1'),
      (True,  r'residual\.mlp(?=[1-9])', 'mlp.linear_'),
      (True,  r'residual\.layer_norm', 'layer_norm2'),
      (False, 'lm_head.mlp', 'linear'),
      (False, 'lm_head.layer_norm1', 'layer_norm'),
      (False, 'lm_head.unembedding', 'unembed'),
      (True,  r'classification.*', ''),
  )
  for is_regex, pattern, replacement in rewrites:
    if is_regex:
      k = re.sub(pattern, replacement, k)
    else:
      k = k.replace(pattern, replacement)
  return k

for k in pretrained_bert.state_dict(): 
  if mapkey(k) not in my_bert.state_dict(): print(k)

# should only display the classification heads

classification_head.weight
classification_head.bias


In [16]:
# Build a state dict for my_bert by renaming every pretrained key with mapkey.
load_dict = {}
for k,v in pretrained_bert.state_dict().items():
  load_dict[mapkey(k)] = v
# mapkey collapses the classification-head keys to '', so they share one entry;
# drop it because Bert has no such parameters
load_dict.pop('') # get rid of the data for the classification heads
# load_state_dict reports any key mismatch between load_dict and my_bert
my_bert.load_state_dict(load_dict)

<All keys matched successfully>

In [17]:
bert_tests.test_same_output(my_bert, pretrained_bert, tol=1e-4)

comparing Berts MATCH!!!!!!!!
 SHAPE (10, 20, 28996) MEAN: -2.732 STD: 2.413 VALS [-5.65 -6.041 -6.096 -6.062 -5.946 -5.777 -5.977 -6.015 -6.028 -5.935...]


# Day 2 Part 0: Tokenization

In [18]:
import transformers
tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-cased")

In [19]:
uncased_tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-uncased")
print(tokenizer.decode(uncased_tokenizer("Hi, my name is bert").input_ids))

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

[CLS] colleges 天 largest happened smile donation [SEP]


# Day 2 Part 1: Inference

In [20]:
# get the predictions of model on input at positions in idx
def get_logits(tokens, model, idx):
  """Run `model` on one token sequence and return its output at the positions in `idx`.

  The model is put in eval mode for the call (so dropout does not perturb the
  predictions) and gradients are not tracked. The previous train/eval mode is
  restored afterwards. Objects without a `training` attribute are called as-is.

  Args:
    tokens: sequence of token ids; a batch axis of size 1 is added.
    model: callable taking a [1, seq_len] LongTensor.
    idx: sequence of positions to select from the model output.

  Returns:
    model(tokens)[0, idx].
  """
  tokens = t.LongTensor(tokens).unsqueeze(0)
  idx = t.LongTensor(idx)
  was_training = getattr(model, 'training', False)
  if was_training:
    model.eval()
  try:
    with t.no_grad():
      return model(tokens)[0,idx]
  finally:
    # leave the model in the mode the caller had it in
    if was_training:
      model.train()

def get_masked_logits(input, tokenizer, model):
  """Tokenize `input` and return the model's output at every mask-token position.

  Args:
    input: text handed to `tokenizer`.
    tokenizer: callable whose result exposes `.input_ids`; must also provide `mask_token_id`.
    model: model forwarded to get_logits.

  Returns:
    The result of get_logits for the positions whose id equals tokenizer.mask_token_id,
    in order of appearance.
  """
  token_ids = tokenizer(input).input_ids
  mask_positions = []
  for position, token_id in enumerate(token_ids):
    if token_id == tokenizer.mask_token_id:
      mask_positions.append(position)
  return get_logits(token_ids, model, mask_positions)

# k is the number of guesses to show for each masked token
def show_completions(input, tokenizer, model, k):
  """Print the k most probable completions for every masked token in `input`.

  Args:
    input: text containing mask tokens.
    tokenizer: tokenizer used to encode `input` and to decode the predicted ids.
    model: model passed on to get_masked_logits.
    k: number of guesses printed per masked token.
  """
  probs = get_masked_logits(input, tokenizer, model).softmax(-1)
  num_masked = probs.size(0)
  if num_masked == 0:
    return
  # top-k does not depend on the loop variable, so compute it once for all
  # masked positions instead of once per position
  guesses = t.topk(probs, k)
  for i in range(num_masked):
    print("Masked token %d:" % i)
    for j in range(k):
      print("   %2d%% " % (100 * guesses.values[i,j]), tokenizer.decode(guesses.indices[i,j]))

show_completions("I woke up and got out of [MASK] feeling very [MASK].", tokenizer, my_bert, 5)


Masked token 0:
   98%  bed
    0%  there
    0%  it
    0%  here
    0%  sleep
Masked token 1:
   14%  tired
    4%  sleepy
    3%  happy
    3%  good
    3%  exhausted


# Day 2 Part 2: Fine tuning on classification

In [None]:
class BertClassifier(nn.Module):
  """Bert-shaped model with an extra classification head on the first token.

  Builds the same submodules as Bert (embedding, transformer, linear, layer_norm,
  unembed) plus `dropout` and `classifier`.

  Args:
    vocab_size, hidden_size, max_position_embeddings, type_vocab_size, dropout,
    intermediate_size, num_heads, num_layers: same meaning as for Bert.
    num_classes: number of outputs of the classification head.
  """
  def __init__(
      self, vocab_size, hidden_size, max_position_embeddings, type_vocab_size,
      dropout, intermediate_size, num_heads, num_layers, num_classes
):
    super().__init__()
    self.embedding = BertEmbedding(
        vocab_size, hidden_size, max_position_embeddings, type_vocab_size, dropout)
    blocks = []
    for _ in range(num_layers):
      blocks.append(BertBlock(hidden_size, intermediate_size, num_heads, dropout))
    self.transformer = nn.Sequential(*blocks)
    self.linear = nn.Linear(hidden_size, hidden_size)
    self.layer_norm = LayerNorm(hidden_size)
    self.unembed = nn.Linear(hidden_size, vocab_size)
    self.dropout = nn.Dropout(dropout)
    self.classifier = nn.Linear(hidden_size, num_classes)

  def forward(self, input_ids):
    """Return (vocabulary logits, class logits) for `input_ids`."""
    # every position is given token type 0
    token_type_ids = t.zeros(*input_ids.shape, dtype=int, device=input_ids.device)
    embedded = self.embedding(input_ids, token_type_ids)
    trans_out = self.transformer(embedded)
    # language-model head over every position
    head_hidden = self.layer_norm(gelu(self.linear(trans_out)))
    logits = self.unembed(head_hidden)
    # classification head reads only the first position of each sequence
    first_token = trans_out[:,0]
    classes = self.classifier(self.dropout(first_token))
    return (logits, classes)

bert_tests.test_bert_classification(BertClassifier)

bert MATCH!!!!!!!!
 SHAPE (1, 4, 28996) MEAN: 0.003031 STD: 0.5765 VALS [-0.5742 -0.432 0.1186 -0.7165 -0.5261 0.4967 1.223 0.3165 -0.3247 -0.5716...]
bert MATCH!!!!!!!!
 SHAPE (1, 2) MEAN: 0.09479 STD: 1.411 VALS [-0.903 1.093]


In [None]:
!pip install torchtext
!pip install torchdata



In [None]:
import torchtext
data_train, data_test = torchtext.datasets.IMDB(root='.data', split=('train', 'test'))

In [None]:
import random
# return a list of batches, each batch consisting of (tokenized data, labels)
def preprocess_imdb_data(dataset, tokenizer, batch_size, max_seq_len=512):
  """Turn (label, review) pairs into shuffled, length-bucketed training batches.

  Reviews are sorted by character length so each batch is padded only to its own
  longest member. The order of the batches is then shuffled. Examples left over
  after the last full batch are dropped.

  Args:
    dataset: iterable of (label, review_text) pairs. Labels may be the strings
      'neg'/'pos' or the integers 1/2 (1 = negative, 2 = positive), which is
      the encoding newer torchtext releases are understood to use.
    tokenizer: callable accepting a list of strings plus padding/max_length/truncation
      keywords and returning an object with `.input_ids`.
    batch_size: number of reviews per batch.
    max_seq_len: truncation length passed to the tokenizer.

  Returns:
    List of (token_ids LongTensor, labels LongTensor) tuples; labels are 0
    for negative and 1 for positive.

  Raises:
    ValueError: if a label is not one of the recognised encodings. Previously
      every label other than 'neg' was silently treated as positive.
  """
  label_to_class = {'neg': 0, 'pos': 1, 1: 0, 2: 1}
  dataset = sorted(dataset, key=lambda x: len(x[1])) # sort by review length
  batches = []
  for i in range(len(dataset) // batch_size):
    batch = dataset[i*batch_size:(i+1)*batch_size]
    reviews = [x[1] for x in batch]
    tokenized_reviews = tokenizer(reviews, padding='longest', max_length=max_seq_len, truncation=True).input_ids
    tokenized_reviews = t.LongTensor(tokenized_reviews)
    try:
      sentiments = [label_to_class[x[0]] for x in batch]
    except KeyError as err:
      raise ValueError("unrecognised IMDB label: %r" % (err.args[0],)) from None
    sentiments = t.LongTensor(sentiments)
    batches.append((tokenized_reviews, sentiments))
  random.shuffle(batches)
  return batches

train_batches = preprocess_imdb_data(data_train, tokenizer, batch_size=8)
test_batches  = preprocess_imdb_data(data_test,  tokenizer, batch_size=8)

KeyboardInterrupt: ignored

In [None]:
model = BertClassifier(
    vocab_size=28996, hidden_size=768, max_position_embeddings=512, 
    type_vocab_size=2, dropout=0.1, intermediate_size=3072, 
    num_heads=12, num_layers=12, num_classes=2
)

# import weights to the bert part of the model
load_dict = {}
for k,v in pretrained_bert.state_dict().items():
  load_dict[mapkey(k)] = v
load_dict.pop('') # get rid of the data for the classification heads
# Load into `model` (the classifier being fine-tuned), not `my_bert`: previously
# the pretrained weights never reached `model`, so it trained from random init.
# strict=False because the new classifier head has no pretrained counterpart.
load_result = model.load_state_dict(load_dict, strict=False)
# only the freshly initialised classifier head may be absent from the checkpoint
assert not load_result.unexpected_keys, load_result.unexpected_keys
assert set(load_result.missing_keys) <= {'classifier.weight', 'classifier.bias'}, load_result.missing_keys

In [None]:
def sentiment_train(batches, model, steps=300, lr=1e-5):
  """Fine-tune `model` on classification batches with Adam and cross-entropy.

  Args:
    batches: iterable of (token_ids, labels) tensor pairs.
    model: nn.Module whose forward returns a pair; the second element is used as
      the class logits.
    steps: maximum number of optimisation steps. Exactly this many batches are
      used at most (the old check ran one extra batch).
    lr: Adam learning rate.

  Side effects:
    Moves the model to the GPU when one is available, puts it in train mode, and
    prints the mean loss every 10 batches.
  """
  t.cuda.empty_cache()
  device = t.device("cuda:0" if t.cuda.is_available() else "cpu")
  model.train()
  model.to(device)

  optimizer = t.optim.Adam(model.parameters(), lr=lr)
  loss_fn = nn.CrossEntropyLoss()
  running_loss = 0.
  for n, batch in enumerate(batches):
    # checked before the update so that at most `steps` updates are performed
    if n >= steps: break
    optimizer.zero_grad()
    data, labels = batch
    data = data.to(device)
    labels = labels.to(device)
    _, classes = model(data)
    loss = loss_fn(classes, labels)
    loss.backward()
    optimizer.step()
    running_loss += loss.item()
    if n % 10 == 9:
      print("Batch %d loss: %f" % (n + 1, running_loss / 10))
      running_loss = 0.

sentiment_train(train_batches, model)

In [None]:
def get_accuracy(data, labels, model):
  """Fraction of examples in one batch that the model classifies correctly.

  Args:
    data: batch handed to `model`.
    labels: tensor of 0/1 labels, one per example.
    model: callable returning a pair whose second element holds the class logits.

  Returns:
    A scalar tensor: the number of correct predictions divided by labels.size(0).
    An example is predicted as class 1 when the softmax probability of class 0
    is below 0.5.
  """
  with t.no_grad():
    unused_logits, class_logits = model(data)
  prob_class0 = class_logits.softmax(-1)[:,0]
  predicted_positive = prob_class0 < .5
  num_correct = (predicted_positive == labels).sum()
  return num_correct / labels.size(0)

def sentiment_test(batches, model, steps=100):
  """Evaluate `model` on classification batches and print a running accuracy.

  Args:
    batches: iterable of (token_ids, labels) tensor pairs.
    model: nn.Module accepted by get_accuracy.
    steps: maximum number of batches to evaluate. Exactly this many are used at
      most (the old check evaluated one extra batch).

  Side effects:
    Moves the model to the GPU when one is available, puts it in eval mode, and
    prints the mean accuracy every 10 batches.
  """
  t.cuda.empty_cache()
  device = t.device("cuda:0" if t.cuda.is_available() else "cpu")
  model.to(device)
  model.eval()

  running_accuracy = 0.
  for n, batch in enumerate(batches):
    # checked before evaluating so that at most `steps` batches are scored
    if n >= steps: break
    data, labels = batch
    data = data.to(device)
    labels = labels.to(device)
    running_accuracy += get_accuracy(data, labels, model)
    if n % 10 == 9:
      print("Batch %d accuracy: %f" % (n + 1, running_accuracy / 10))
      running_accuracy = 0.

sentiment_test(test_batches, model)



I could not get this to work :'(

# Day 2 Part 3: Training from scratch on masked language modeling

In [39]:
!pip install torchdata
!pip install torchtext
import torchtext
data_train, data_test = torchtext.datasets.WikiText2(split = ('train', 'test'))

def batch_mlm_data(dataset, tokenizer, batch_size=16, seq_len=256):
  """Tokenize a text dataset into shuffled fixed-length batches for masked language modelling.

  Args:
    dataset: iterable of strings; joined with newlines, with '<unk>' rewritten to '[UNK]'.
    tokenizer: callable whose result exposes `.input_ids` for the joined text.
    batch_size: sequences per batch.
    seq_len: tokens per sequence.

  Returns:
    LongTensor of shape [num_batches, batch_size, seq_len]. Trailing tokens that
    do not fill a whole batch are discarded, and the sequences are randomly permuted.
  """
  corpus = '\n'.join(dataset).replace('<unk>', '[UNK]')
  token_ids = t.LongTensor(tokenizer(corpus).input_ids)

  # keep only as many tokens as fill a whole number of batches
  tokens_per_batch = batch_size * seq_len
  usable = token_ids.size(0) // tokens_per_batch * tokens_per_batch
  sequences = rearrange(token_ids[:usable], '(x sl) -> x sl', sl=seq_len)
  # shuffle whole sequences, then group them into batches
  shuffled = sequences[t.randperm(sequences.size(0))]
  return rearrange(shuffled, '(n b) sl -> n b sl', b=batch_size)

train_batches = batch_mlm_data(data_train, tokenizer, batch_size=32)
test_batches  = batch_mlm_data(data_test, tokenizer)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [22]:
tiny_bert = Bert(
    vocab_size = tokenizer.vocab_size, 
    hidden_size = 256, 
    max_position_embeddings = 256, 
    type_vocab_size = 2,
    dropout = .1,
    intermediate_size = 1024, 
    num_heads = 8, 
    num_layers = 2
)

In [46]:
def mask_data(data, tokenizer, mask_prob=.15):
  """Randomly replace tokens with the mask token for masked language modelling.

  Args:
    data: tensor of token ids; it is not modified.
    tokenizer: object providing `mask_token_id`.
    mask_prob: independent probability of masking each position.

  Returns:
    (masked, mask_ids, labels): a copy of `data` with the chosen positions set to
    the mask id, the boolean mask of chosen positions (on data's device), and
    the original ids at those positions.
  """
  # draw the mask on data's device so the indexing below never mixes devices
  mask_ids = t.rand(data.shape, device=data.device) < mask_prob
  masked = data.clone()
  masked[mask_ids] = tokenizer.mask_token_id
  labels = data[mask_ids]
  return (masked, mask_ids, labels)

def mlm_train(model, tokenizer, batches, lr=3e-5, epochs=2, print_rate=50):
  """Train `model` on masked language modelling with Adam and cross-entropy.

  Args:
    model: nn.Module mapping a batch of token ids to per-position vocabulary logits.
    tokenizer: passed to mask_data (provides the mask token id).
    batches: tensor indexed as batches[n] to obtain one batch of token ids.
    lr: Adam learning rate.
    epochs: number of passes over `batches`; fresh masks are drawn on every step.
    print_rate: the mean loss is printed every `print_rate` batches.

  Side effects:
    Moves the model to the GPU when one is available, puts it in train mode, and
    prints progress.
  """
  t.cuda.empty_cache()
  device = t.device("cuda:0" if t.cuda.is_available() else "cpu")
  model.to(device)
  model.train()

  optimizer = t.optim.Adam(model.parameters(), lr=lr)
  loss_fn = t.nn.CrossEntropyLoss()

  num_batches = batches.size(0)
  for epoch in range(epochs):
    print("Epoch %d:" % epoch)
    running_loss = 0.
    for n in range(num_batches):
      optimizer.zero_grad()
      batch = batches[n].to(device)
      masked, mask_ids, labels = mask_data(batch, tokenizer)
      # the loss is computed only at the masked positions
      predictions = model(masked)[mask_ids]
      loss = loss_fn(predictions, labels)
      loss.backward()
      optimizer.step()
      running_loss += loss.detach().cpu()
      if n % print_rate == print_rate - 1:
        print("Batch %d loss: %f" % (n+1, running_loss / print_rate))
        running_loss = 0.

mlm_train(tiny_bert, tokenizer, train_batches, lr=1e-3, epochs=10, print_rate=100)

Epoch 0:
Batch 100 loss: 6.343158
Batch 200 loss: 6.291035
Epoch 1:
Batch 100 loss: 6.011588
Batch 200 loss: 5.923726
Epoch 2:
Batch 100 loss: 5.747748
Batch 200 loss: 5.676980
Epoch 3:
Batch 100 loss: 5.562121
Batch 200 loss: 5.496933
Epoch 4:
Batch 100 loss: 5.370947
Batch 200 loss: 5.341330
Epoch 5:
Batch 100 loss: 5.236574
Batch 200 loss: 5.190966
Epoch 6:
Batch 100 loss: 5.110602
Batch 200 loss: 5.064290
Epoch 7:
Batch 100 loss: 4.982419
Batch 200 loss: 4.954484
Epoch 8:
Batch 100 loss: 4.870395
Batch 200 loss: 4.848655
Epoch 9:
Batch 100 loss: 4.780831
Batch 200 loss: 4.740136


In [47]:
show_completions("I woke up and got out of [MASK] feeling very [MASK].", tokenizer, tiny_bert, 5)
show_completions("My name is Sam[MASK] I like go on [MASK] hikes.", tokenizer, tiny_bert, 5)


Masked token 0:
   31%  the
    8%  a
    4%  their
    2%  his
    2%  this
Masked token 1:
   17%  down
    2%  [UNK]
    1%  back
    1%  up
    1%  time
Masked token 0:
   24%  ,
    9%  and
    6%  that
    6%  .
    3%  ;
Masked token 1:
   34%  the
   17%  a
    7%  her
    3%  their
    2%  this


In [49]:
%cd days/w2d1

/content/gdrive/MyDrive/mlab/days/w2d1


In [50]:
t.save(tiny_bert, "tiny_bert")

In [51]:
!ls

bert_run_sol.py  bert_tao.py	__pycache__  utils.py
bert_sol.py	 bert_tests.py	tiny_bert    w2d1.ipynb
