In [1]:
# imports

from transformers import BertTokenizer, BertForMaskedLM, BertForSequenceClassification
import torch
from torch.nn import Softmax

INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


In [2]:
# Cased checkpoint: capitalisation may matter when judging grammar / spelling.
BERT_MODEL = 'bert-base-cased'

# BERT with the masked-language-model head, plus the tokenizer loaded from the
# same checkpoint name so both refer to the same vocabulary.
model = BertForMaskedLM.from_pretrained(BERT_MODEL)
bert_tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# note the decoder's output size is the size of the tokenizer's vocab. It is crucial to use a matching tokenizer
model.cls

BertOnlyMLMHead(
  (predictions): BertLMPredictionHead(
    (transform): BertPredictionHeadTransform(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    )
    (decoder): Linear(in_features=768, out_features=28996, bias=True)
  )
)

In [4]:
bert_tokenizer.vocab_size  # Looks good!

28996

In [5]:
def top_predictions(phrase, top_n=1):
    """Print and return the ``top_n``-th most likely token at every position.

    The phrase is wrapped in pad tokens, encoded with ``bert_tokenizer`` and
    passed through the masked-LM ``model``. For each encoded position the
    token ranked ``top_n`` by the MLM head (1 = most likely) is printed
    together with its softmax probability.

    Args:
        phrase: Text to score.
        top_n: 1-based rank of the prediction to report at each position.

    Returns:
        1-D tensor of token ids, one per encoded input position.

    Raises:
        ValueError: If ``top_n`` is smaller than 1 (rank 0 would otherwise
            silently select the *least* likely token via index ``-0``).
    """
    if top_n < 1:
        raise ValueError(f'top_n must be >= 1, got {top_n}')

    # add a pad token before and after the phrase.
    #  I find this helps as BERT often will neglect the first and last token otherwise
    phrase = f'{bert_tokenizer.pad_token} {phrase} {bert_tokenizer.pad_token}'

    # get the input_ids from the tokenizer, on the same device as the model
    input_ids = bert_tokenizer.encode(phrase, return_tensors="pt").to(model.device)

    # Inference only: no autograd graph, and no labels (the loss was never used).
    with torch.no_grad():
        prediction_scores = model(input_ids).logits

    # topk is sorted in descending order, so the last of the top_n entries is
    # the top_n-th most confident token; shape (batch, seq).
    ranked_tokens = prediction_scores.topk(top_n, dim=2).indices[:, :, -1]

    # Probability of exactly those tokens under a softmax over the vocabulary.
    vocab_probas = Softmax(dim=2)(prediction_scores)
    ranked_probas = vocab_probas.gather(2, ranked_tokens.unsqueeze(-1)).squeeze(-1)

    predicted_tokens = ranked_tokens.reshape(-1)
    token_probas = ranked_probas.reshape(-1)

    for proba, token in zip(token_probas, predicted_tokens):
        print(f'Token: {bert_tokenizer.decode([token])} ({token})  Probability: {proba:.4f}')

    return predicted_tokens
        

In [6]:
top_predictions('Last time I went here, me bill was too high.', 1)

Token: . (119)  Probability: 0.0636
Token: " (107)  Probability: 0.9721
Token: Last (4254)  Probability: 0.8593
Token: time (1159)  Probability: 0.9999
Token: I (146)  Probability: 0.9995
Token: went (1355)  Probability: 0.4761
Token: here (1303)  Probability: 0.9999
Token: , (117)  Probability: 1.0000
Token: my (1139)  Probability: 0.9564
Token: bill (4550)  Probability: 0.9953
Token: was (1108)  Probability: 0.9999
Token: too (1315)  Probability: 1.0000
Token: high (1344)  Probability: 0.9989
Token: . (119)  Probability: 1.0000
Token: " (107)  Probability: 0.9807
Token: . (119)  Probability: 1.0000


tensor([ 119,  107, 4254, 1159,  146, 1355, 1303,  117, 1139, 4550, 1108, 1315,
        1344,  119,  107,  119])

In [7]:
top_predictions('My wonderful teacher is so great!', 1)

Token: . (119)  Probability: 0.0563
Token: " (107)  Probability: 0.9262
Token: My (1422)  Probability: 0.9989
Token: wonderful (7310)  Probability: 0.9551
Token: teacher (3218)  Probability: 0.9954
Token: is (1110)  Probability: 0.9981
Token: so (1177)  Probability: 0.9991
Token: great (1632)  Probability: 0.9953
Token: ! (106)  Probability: 1.0000
Token: " (107)  Probability: 0.9189
Token: . (119)  Probability: 0.9683


tensor([ 119,  107, 1422, 7310, 3218, 1110, 1177, 1632,  106,  107,  119])

In [8]:
top_predictions('My wonderful teacher is so great!', 2)  # 2nd choice  for wonderful is brilliant

Token: , (117)  Probability: 0.0202
Token: ' (112)  Probability: 0.0596
Token: my (1139)  Probability: 0.0006
Token: brilliant (8431)  Probability: 0.0154
Token: instructor (10332)  Probability: 0.0009
Token: was (1108)  Probability: 0.0014
Token: very (1304)  Probability: 0.0004
Token: wonderful (7310)  Probability: 0.0027
Token: . (119)  Probability: 0.0000
Token: ' (112)  Probability: 0.0763
Token: ! (106)  Probability: 0.0311


tensor([  117,   112,  1139,  8431, 10332,  1108,  1304,  7310,   119,   112,
          106])

In [9]:
top_predictions('My wonderful teacher is so great!', 3)  # 3rd choice  for wonderful is great

Token: the (1103)  Probability: 0.0174
Token: . (119)  Probability: 0.0066
Token: The (1109)  Probability: 0.0002
Token: great (1632)  Probability: 0.0093
Token: Teacher (14208)  Probability: 0.0007
Token: isn (2762)  Probability: 0.0001
Token: such (1216)  Probability: 0.0002
Token: brilliant (8431)  Probability: 0.0005
Token: ? (136)  Probability: 0.0000
Token: ! (106)  Probability: 0.0039
Token: ? (136)  Probability: 0.0004


tensor([ 1103,   119,  1109,  1632, 14208,  2762,  1216,  8431,   136,   106,
          136])

In [10]:
# Lookahead prediction

def look_ahead(phrase, top_n=3):
    """Print the top candidates for a mask token appended to ``phrase``.

    A mask token and a pad token are appended to the phrase, the result is
    encoded with ``bert_tokenizer`` and scored by the masked-LM ``model``.
    For every rank from 1 to ``top_n`` the predictions at the last three
    encoded positions are printed with their softmax probabilities.

    Args:
        phrase: Text to continue.
        top_n: Number of ranked candidates to print per position (default 3).

    Returns:
        1-D tensor holding, for every encoded position, the token id ranked
        ``top_n`` (i.e. the tokens of the last rank that was printed).

    Raises:
        ValueError: If ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        raise ValueError(f'top_n must be >= 1, got {top_n}')

    # add a mask token at the end
    phrase = f'{phrase} {bert_tokenizer.mask_token} {bert_tokenizer.pad_token}'

    # get the input_ids from the tokenizer, on the same device as the model
    input_ids = bert_tokenizer.encode(phrase, return_tensors="pt").to(model.device)

    # Inference only: no autograd graph, and no labels (the loss was never used).
    with torch.no_grad():
        prediction_scores = model(input_ids).logits

    # Rank the vocabulary once instead of re-sorting on every loop iteration.
    # topk is sorted descending: column i-1 holds the i-th most confident token.
    top_tokens = prediction_scores.topk(top_n, dim=2).indices  # (batch, seq, top_n)
    top_probas = Softmax(dim=2)(prediction_scores).gather(2, top_tokens)

    # Only the last three positions are shown; presumably these are the mask,
    # the pad and the tokenizer's closing special token -- TODO confirm.
    first_shown = input_ids.shape[1] - 3

    for i in range(1, top_n + 1):
        print(f'Top Score {i}')
        predicted_tokens = top_tokens[:, :, i - 1].reshape(-1)
        token_probas = top_probas[:, :, i - 1].reshape(-1)

        for proba, token in list(zip(token_probas, predicted_tokens))[first_shown:]:
            print(f'Token: {bert_tokenizer.decode([token])} ({token})  Probability: {proba:.4f}')
        print()
    return predicted_tokens


In [11]:
look_ahead('Can we split the')

Top Score 1
Token: time (1159)  Probability: 0.0528
Token: ? (136)  Probability: 0.9924
Token: . (119)  Probability: 0.9999

Top Score 2
Token: money (1948)  Probability: 0.0303
Token: . (119)  Probability: 0.0056
Token: ? (136)  Probability: 0.0001

Top Score 3
Token: numbers (2849)  Probability: 0.0271
Token: ! (106)  Probability: 0.0015
Token: ! (106)  Probability: 0.0000



tensor([1103,  117, 1284, 2866, 1412, 2849,  106,  106])

In [12]:
look_ahead('Where are we')

Top Score 1
Token: going (1280)  Probability: 0.8487
Token: ? (136)  Probability: 0.9920
Token: . (119)  Probability: 0.9986

Top Score 2
Token: now (1208)  Probability: 0.0605
Token: . (119)  Probability: 0.0046
Token: ? (136)  Probability: 0.0013

Top Score 3
Token: headed (2917)  Probability: 0.0298
Token: ! (106)  Probability: 0.0032
Token: ; (132)  Probability: 0.0000



tensor([ 107,  117, 1231, 1128, 2917,  106,  132])

In [13]:
look_ahead('This class is kind of')

Top Score 1
Token: unique (3527)  Probability: 0.0218
Token: . (119)  Probability: 0.9514
Token: . (119)  Probability: 0.9967

Top Score 2
Token: fun (4106)  Probability: 0.0216
Token: ; (132)  Probability: 0.0225
Token: ? (136)  Probability: 0.0010

Top Score 3
Token: special (1957)  Probability: 0.0206
Token: ! (106)  Probability: 0.0186
Token: ! (106)  Probability: 0.0008



tensor([ 107,  117, 4370, 1108, 1472, 4106, 1957,  106,  106])

In [14]:
# Fine-tune a BERT sequence classifier on CoLA (the Corpus of Linguistic Acceptability)

# https://nyu-mll.github.io/CoLA/


In [15]:
import pandas as pd

# Read the tab-separated CoLA file. header=None means no row is treated as a
# header, so the four column names are supplied explicitly.
cola_df = pd.read_csv(
    "../data/cola.tsv",
    sep='\t',
    header=None,
    names=['sentence_source', 'label', 'label_notes', 'sentence'],
)

# Show how many rows were loaded.
print('Number of training sentences: {:,}\n'.format(len(cola_df)))

# Peek at 10 randomly chosen rows.
cola_df.sample(10)


Number of training sentences: 8,551



Unnamed: 0,sentence_source,label,label_notes,sentence
4856,ks08,0,*,Who do you think that has given the tickets to...
2310,l-93,0,*,Harriet alternated folk songs and pop songs to...
29,gj04,1,,Fred tracked the leak to its source.
7962,ad03,1,,Who's there?
2029,rhl07,1,,I sent the package to London.
4782,ks08,1,,Who do you believe invited Sara?
3167,l-93,1,,Sharon fainted.
7685,sks13,0,*,John convinced the rice to be cooked by Bill.
325,bc01,1,,"Louise is unhappy, isn't she?"
7130,sks13,0,*,This girl in the red coat will put a picture o...


In [16]:
from nlp import load_dataset, Dataset


# Use a fixed-seed 1,000-row sample of the CoLA frame.
cola_dataset = Dataset.from_pandas(cola_df.sample(1000, random_state=42))

# Dataset has a built in train test split method. Seed it as well: without a
# seed the 80/20 split (and every metric computed on it) changes on each run.
cola_dataset = cola_dataset.train_test_split(test_size=0.2, seed=42)

train_set = cola_dataset['train']
test_set = cola_dataset['test']

def preprocess(data):
    """Tokenize the ``sentence`` field of a batch with ``bert_tokenizer``.

    ``padding=True`` pads the sequences in the batch so the input matrices are
    the same length; ``truncation=True`` truncates anything longer than the
    tokenizer's limit (512 tokens, per the original note on this cell).
    """
    sentences = data['sentence']
    encoded = bert_tokenizer(sentences, padding=True, truncation=True)
    return encoded

# Tokenize each split in a single batch (batch_size == number of rows in the split).
train_set = train_set.map(preprocess, batch_size=len(train_set), batched=True)
test_set = test_set.map(preprocess, batch_size=len(test_set), batched=True)

# Return only the listed columns, formatted as torch tensors.
for split in (train_set, test_set):
    split.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [17]:
# BERT with a 2-class sequence-classification head on top of the same checkpoint.
sequence_classification_model = BertForSequenceClassification.from_pretrained(
    BERT_MODEL, num_labels=2,
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False # Whether the model returns all hidden-states.
)

# Freeze the embeddings and all but the last 2 encoder layers in BERT to speed
# up training. Selecting modules by structure replaces the former magic slice
# `list(...parameters())[:165]`; for a 12-layer BERT-base that is the same set
# (5 embedding tensors + 10 layers x 16 tensors), and it stays correct if the
# checkpoint is swapped for one with a different depth.
N_TRAINABLE_ENCODER_LAYERS = 2
bert_encoder_stack = sequence_classification_model.bert
frozen_modules = [
    bert_encoder_stack.embeddings,
    *bert_encoder_stack.encoder.layer[:-N_TRAINABLE_ENCODER_LAYERS],
]
for frozen_module in frozen_modules:
    for param in frozen_module.parameters():
        param.requires_grad = False  # disable training in BERT

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [18]:
from datasets import load_metric
import numpy as np

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Score an evaluation pass with the module-level ``metric``.

    ``eval_pred`` unpacks into (logits, labels). The predicted class is the
    argmax over the last axis of the logits, and the result of
    ``metric.compute`` on predictions vs. labels is returned.
    """
    class_scores, gold_labels = eval_pred
    return metric.compute(
        predictions=np.argmax(class_scores, axis=-1),
        references=gold_labels,
    )


from transformers import Trainer, TrainingArguments

# Hyperparameters for the fine-tuning run.
batch_size, epochs = 32, 4
warmup_steps, weight_decay = 50, 0.02

# Where results and logs go, how long to train, and how often to log
# (logging_strategy='steps' with logging_steps=1, plus the first step).
training_args = TrainingArguments(
    output_dir='./gs/results',
    logging_dir='./gs/logs',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    logging_strategy='steps',
    logging_steps=1,
    logging_first_step=True,
)

# The trainer ties together the classifier, the arguments above, both data
# splits and the accuracy callback.
trainer = Trainer(
    model=sequence_classification_model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_set,
    eval_dataset=test_set,
)

In [19]:
# Get initial metrics
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 200
  Batch size = 32


{'eval_loss': 0.9257231950759888,
 'eval_accuracy': 0.225,
 'eval_runtime': 10.2968,
 'eval_samples_per_second': 19.423,
 'eval_steps_per_second': 0.68}

In [20]:
trainer.train()

***** Running training *****
  Num examples = 800
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 100


Step,Training Loss
1,0.9048
2,0.903
3,0.8712
4,0.7954
5,0.9393
6,0.9243
7,0.8234
8,0.8013
9,0.8443
10,0.7983




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=100, training_loss=0.6203889310359955, metrics={'train_runtime': 314.013, 'train_samples_per_second': 10.191, 'train_steps_per_second': 0.318, 'total_flos': 50977766976000.0, 'train_loss': 0.6203889310359955, 'epoch': 4.0})

In [21]:
# Get fine-tuned metrics
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 200
  Batch size = 32


{'eval_loss': 0.4896756708621979,
 'eval_accuracy': 0.805,
 'eval_runtime': 14.3165,
 'eval_samples_per_second': 13.97,
 'eval_steps_per_second': 0.489,
 'epoch': 4.0}

In [22]:
def is_grammatically_correct(text):
    """Return the classifier's probability for label 1 given ``text``.

    The text is encoded with ``bert_tokenizer`` and scored by
    ``sequence_classification_model``; the logits are turned into
    probabilities with a softmax and the entry for class index 1 is returned.
    (In the CoLA sample shown in this notebook, label 1 rows carry no ``*``
    note, i.e. they are the acceptable sentences.)

    Args:
        text: Sentence to score.

    Returns:
        Probability of class 1 as a Python float.
    """
    classifier = sequence_classification_model

    # The encoded ids must live on the same device as the model.
    input_ids = bert_tokenizer.encode(text, return_tensors='pt').to(classifier.device)

    # Score deterministically regardless of which cell ran last: the model may
    # still be in training mode (dropout active) after fine-tuning. The
    # previous mode is restored afterwards so later training is unaffected.
    was_training = classifier.training
    classifier.eval()
    try:
        with torch.no_grad():  # inference only, no autograd graph needed
            logits = classifier(input_ids).logits
    finally:
        classifier.train(was_training)

    return float(Softmax(dim=1)(logits)[0][1])
    
    

In [23]:
is_grammatically_correct('Me bar tab is too high')

0.49089136719703674

In [24]:
is_grammatically_correct('My bar tab is too high')

0.6457576751708984

In [25]:
top_predictions('Me bar tab is to high', 1)

Token: . (119)  Probability: 0.0747
Token: " (107)  Probability: 0.2778
Token: me (1143)  Probability: 0.2538
Token: bar (2927)  Probability: 0.8779
Token: ta (27629)  Probability: 0.9995
Token: ##b (1830)  Probability: 0.9972
Token: is (1110)  Probability: 0.9868
Token: to (1106)  Probability: 0.8474
Token: high (1344)  Probability: 0.9143
Token: . (119)  Probability: 0.8100
Token: . (119)  Probability: 0.9988


tensor([  119,   107,  1143,  2927, 27629,  1830,  1110,  1106,  1344,   119,
          119])