# BERT for grammar / spell check

In [1]:
# imports

from transformers import BertTokenizer, BertForMaskedLM, BertForSequenceClassification
import torch
from torch.nn import Softmax

In [2]:
BERT_MODEL = 'bert-base-cased'  # using a cased tokenizer because case may matter in grammar / spelling

# load up a tokenizer and BERT with MLM head
bert_tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)
model = BertForMaskedLM.from_pretrained(BERT_MODEL)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# note the decoder's output size is the size of the tokenizer's vocab. It is crucial to use a matching tokenizer
model.cls

BertOnlyMLMHead(
  (predictions): BertLMPredictionHead(
    (transform): BertPredictionHeadTransform(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (transform_act_fn): GELUActivation()
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    )
    (decoder): Linear(in_features=768, out_features=28996, bias=True)
  )
)

In [4]:
bert_tokenizer.vocab_size  # Looks good!

28996

In [5]:
def top_predictions(phrase, top_n=1):
    # add a pad token before and after the phrase. 
    #  I find this helps as BERT often will neglect the first and last token otherwise
    phrase = f'{bert_tokenizer.pad_token} {phrase} {bert_tokenizer.pad_token}'
    
    input_ids = bert_tokenizer.encode(phrase, return_tensors="pt")  # get the input_ids from the tokenizer
    
    outputs = model(input_ids)  # run the input ids against BERT
    
    # Get the nth most confident predicted tokens from the MLM head
    prediction_scores = outputs.logits
    predicted_tokens = prediction_scores.argsort()[:,:,-top_n].reshape(-1,)
    
    # Get the probability for each token
    token_probas = Softmax(dim=2)(prediction_scores.sort().values)[:,:,-top_n].reshape(-1, )
    
    for proba, token in zip(token_probas, predicted_tokens):
        print(f'Token: {bert_tokenizer.decode([token])} ({token})  Probability: {proba:.4f}')
        
    return predicted_tokens
        

In [6]:
top_predictions('Last time I went here, me bill was too high.', 1)

Token: . (119)  Probability: 0.0636
Token: " (107)  Probability: 0.9721
Token: Last (4254)  Probability: 0.8593
Token: time (1159)  Probability: 0.9999
Token: I (146)  Probability: 0.9995
Token: went (1355)  Probability: 0.4761
Token: here (1303)  Probability: 0.9999
Token: , (117)  Probability: 1.0000
Token: my (1139)  Probability: 0.9564
Token: bill (4550)  Probability: 0.9953
Token: was (1108)  Probability: 0.9999
Token: too (1315)  Probability: 1.0000
Token: high (1344)  Probability: 0.9989
Token: . (119)  Probability: 1.0000
Token: " (107)  Probability: 0.9807
Token: . (119)  Probability: 1.0000


tensor([ 119,  107, 4254, 1159,  146, 1355, 1303,  117, 1139, 4550, 1108, 1315,
        1344,  119,  107,  119])

In [7]:
top_predictions('My wonderful teacher is so great!', 1)

Token: . (119)  Probability: 0.0563
Token: " (107)  Probability: 0.9262
Token: My (1422)  Probability: 0.9989
Token: wonderful (7310)  Probability: 0.9551
Token: teacher (3218)  Probability: 0.9954
Token: is (1110)  Probability: 0.9981
Token: so (1177)  Probability: 0.9991
Token: great (1632)  Probability: 0.9953
Token: ! (106)  Probability: 1.0000
Token: " (107)  Probability: 0.9189
Token: . (119)  Probability: 0.9683


tensor([ 119,  107, 1422, 7310, 3218, 1110, 1177, 1632,  106,  107,  119])

In [8]:
top_predictions('My wonderful teacher is so great!', 2)  # 2nd choice  for wonderful is brilliant

Token: , (117)  Probability: 0.0202
Token: ' (112)  Probability: 0.0596
Token: my (1139)  Probability: 0.0006
Token: brilliant (8431)  Probability: 0.0154
Token: instructor (10332)  Probability: 0.0009
Token: was (1108)  Probability: 0.0014
Token: very (1304)  Probability: 0.0004
Token: wonderful (7310)  Probability: 0.0027
Token: . (119)  Probability: 0.0000
Token: ' (112)  Probability: 0.0763
Token: ! (106)  Probability: 0.0311


tensor([  117,   112,  1139,  8431, 10332,  1108,  1304,  7310,   119,   112,
          106])

In [9]:
top_predictions('My wonderful teacher is so great!', 3)  # 3rd choice  for wonderful is great

Token: the (1103)  Probability: 0.0174
Token: . (119)  Probability: 0.0066
Token: The (1109)  Probability: 0.0002
Token: great (1632)  Probability: 0.0093
Token: Teacher (14208)  Probability: 0.0007
Token: isn (2762)  Probability: 0.0001
Token: such (1216)  Probability: 0.0002
Token: brilliant (8431)  Probability: 0.0005
Token: ? (136)  Probability: 0.0000
Token: ! (106)  Probability: 0.0039
Token: ? (136)  Probability: 0.0004


tensor([ 1103,   119,  1109,  1632, 14208,  2762,  1216,  8431,   136,   106,
          136])

In [10]:
# Lookahead prediction

def look_ahead(phrase):
    # add a mask token at the end
    phrase = f'{phrase} {bert_tokenizer.mask_token} {bert_tokenizer.pad_token}'
    
    input_ids = bert_tokenizer.encode(phrase, return_tensors="pt")  # get the input_ids from the tokenizer
    
    outputs = model(input_ids)  # run the input ids against BERT
    
    # Get the nth most confident predicted tokens from the MLM head
    prediction_scores = outputs.logits
    
    for i in range(1, 4):
        print(f'Top Score {i}')
        predicted_tokens = prediction_scores.argsort()[:,:,-i].reshape(-1,)

        # Get the probability for each token
        token_probas = Softmax(dim=2)(prediction_scores.sort().values)[:,:,-i].reshape(-1, )

        for proba, token in list(zip(token_probas, predicted_tokens))[input_ids.shape[1] - 3:]:
            print(f'Token: {bert_tokenizer.decode([token])} ({token})  Probability: {proba:.4f}')
        print()
    return predicted_tokens


In [11]:
look_ahead('Can we split the')

Top Score 1
Token: time (1159)  Probability: 0.0528
Token: ? (136)  Probability: 0.9924
Token: . (119)  Probability: 0.9999

Top Score 2
Token: money (1948)  Probability: 0.0303
Token: . (119)  Probability: 0.0056
Token: ? (136)  Probability: 0.0001

Top Score 3
Token: numbers (2849)  Probability: 0.0271
Token: ! (106)  Probability: 0.0015
Token: ! (106)  Probability: 0.0000



tensor([1103,  117, 1284, 2866, 1412, 2849,  106,  106])

In [12]:
look_ahead('Where are we')

Top Score 1
Token: going (1280)  Probability: 0.8487
Token: ? (136)  Probability: 0.9920
Token: . (119)  Probability: 0.9986

Top Score 2
Token: now (1208)  Probability: 0.0605
Token: . (119)  Probability: 0.0046
Token: ? (136)  Probability: 0.0013

Top Score 3
Token: headed (2917)  Probability: 0.0298
Token: ! (106)  Probability: 0.0032
Token: ; (132)  Probability: 0.0000



tensor([ 107,  117, 1231, 1128, 2917,  106,  132])

In [13]:
look_ahead('This class is kind of')

Top Score 1
Token: unique (3527)  Probability: 0.0218
Token: . (119)  Probability: 0.9514
Token: . (119)  Probability: 0.9967

Top Score 2
Token: fun (4106)  Probability: 0.0216
Token: ; (132)  Probability: 0.0225
Token: ? (136)  Probability: 0.0010

Top Score 3
Token: special (1957)  Probability: 0.0206
Token: ! (106)  Probability: 0.0186
Token: ! (106)  Probability: 0.0008



tensor([ 107,  117, 4370, 1108, 1472, 4106, 1957,  106,  106])

In [14]:
# try fine-tuned model on Cola

# https://nyu-mll.github.io/CoLA/


In [15]:
import pandas as pd

# Load the dataset into a pandas dataframe.
cola_df = pd.read_csv("../data/cola.tsv", delimiter='\t', header=None, names=['sentence_source', 'label', 'label_notes', 'sentence'])

# Report the number of sentences.
print('Number of training sentences: {:,}\n'.format(cola_df.shape[0]))

# Display 10 random rows from the data.
cola_df.sample(10)


Number of training sentences: 8,551



Unnamed: 0,sentence_source,label,label_notes,sentence
5989,c_13,1,,Has Bill eaten his tuna?
1558,r-67,1,,Joe is taller than Mary.
8099,ad03,0,*,The child wail
3536,ks08,1,,They can run.
1608,r-67,1,,Tom says that it's going to rain but I don't b...
5085,ks08,1,,It's mainly his attitude which convinced the t...
8062,ad03,1,,He thought that Dracula was the Prince of Dark...
1315,r-67,1,,Did Merv show up and did you play chess?
4064,ks08,1,,John bothers me.
2719,l-93,1,,They lent me a bicycle.


In [16]:
from nlp import load_dataset, Dataset
from transformers import DataCollatorWithPadding

cola_dataset = Dataset.from_pandas(cola_df.sample(3000, random_state=42))

# We will pad our dataset so that our input matrices are the same length and truncate anything longer than 512 tokens
def preprocess_function(data):
    return bert_tokenizer(data['sentence'], truncation=True)

cola_dataset = cola_dataset.map(preprocess_function, batched=True)

# Dataset has a built in train test split method
cola_dataset = cola_dataset.train_test_split(test_size=0.2)

# DataCollatorWithPadding creates batch of data. It also dynamically pads text to the 
#  length of the longest element in the batch, making them all the same length. 
#  It's possible to pad your text in the tokenizer function with padding=True, dynamic padding is more efficient.
data_collator = DataCollatorWithPadding(tokenizer=bert_tokenizer)

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [17]:
sequence_classification_model = BertForSequenceClassification.from_pretrained(
    BERT_MODEL, num_labels=2,
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False # Whether the model returns all hidden-states.
)
sequence_classification_model.config.id2label = {0: 'INCORRECT', 1: 'CORRECT'}

# freeze all but the last 2 encoder layers in BERT to speed up training
for name, param in sequence_classification_model.bert.named_parameters():
    if 'encoder.layer.10' in name:
        break
    param.requires_grad = False  # disable training in BERT

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [18]:
from datasets import load_metric
import numpy as np
from transformers import Trainer, TrainingArguments

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

batch_size = 32
epochs = 2

training_args = TrainingArguments(
    output_dir='./gs/results',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size * 2,
    logging_dir='./gs/logs',
    logging_strategy='steps',
    logging_steps=10,
    logging_first_step=True,
    evaluation_strategy='epoch',
    eval_steps=1,
    save_strategy='epoch'
)

# Define the trainer: 

trainer = Trainer(
    model=sequence_classification_model,
    args=training_args,
    train_dataset=cola_dataset['train'],
    eval_dataset=cola_dataset['test'],
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

In [19]:
# Get initial metrics
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 600
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: label_notes, sentence, __index_level_0__, sentence_source. If label_notes, sentence, __index_level_0__, sentence_source are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.


{'eval_loss': 0.6792384386062622,
 'eval_accuracy': 0.6216666666666667,
 'eval_runtime': 68.016,
 'eval_samples_per_second': 8.821,
 'eval_steps_per_second': 0.147}

In [20]:
trainer.train()

***** Running training *****
  Num examples = 2400
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 150
The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: label_notes, sentence, __index_level_0__, sentence_source. If label_notes, sentence, __index_level_0__, sentence_source are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5815,0.576743,0.716667
2,0.4822,0.577665,0.74


***** Running Evaluation *****
  Num examples = 600
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: label_notes, sentence, __index_level_0__, sentence_source. If label_notes, sentence, __index_level_0__, sentence_source are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
Saving model checkpoint to ./gs/results/checkpoint-75
Configuration saved in ./gs/results/checkpoint-75/config.json
Model weights saved in ./gs/results/checkpoint-75/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 600
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: label_notes, sentence, __index_level_0__, sentence_source. If label_notes, sentence, __index_level_0__, sentence_source are not expected by `BertForSequenceClassifica

TrainOutput(global_step=150, training_loss=0.5357241650422414, metrics={'train_runtime': 761.5255, 'train_samples_per_second': 6.303, 'train_steps_per_second': 0.197, 'total_flos': 58048876588800.0, 'train_loss': 0.5357241650422414, 'epoch': 2.0})

In [21]:
# Get fine-tuned metrics
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 600
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: label_notes, sentence, __index_level_0__, sentence_source. If label_notes, sentence, __index_level_0__, sentence_source are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.


{'eval_loss': 0.5776650905609131,
 'eval_accuracy': 0.74,
 'eval_runtime': 68.3487,
 'eval_samples_per_second': 8.779,
 'eval_steps_per_second': 0.146,
 'epoch': 2.0}

In [22]:
# Save the best model
trainer.save_model()

Saving model checkpoint to ./gs/results
Configuration saved in ./gs/results/config.json
Model weights saved in ./gs/results/pytorch_model.bin


In [27]:
from transformers import pipeline

# make a classification pipeline
pipe = pipeline("text-classification", './gs/results', tokenizer=BERT_MODEL, return_all_scores=True)

loading configuration file ./gs/results/config.json
Model config BertConfig {
  "_name_or_path": "./gs/results",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "INCORRECT",
    "1": "CORRECT"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": null,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.19.4",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading configuration file ./gs/results/config.json
Model config BertConfig {
  "_name_or_path": "./gs/results",
  "arc

In [32]:
print(pipe('Me bar tab is to high.'))

print(pipe('Me bar tab is too high.'))

print(pipe('My bar tab is too high.'))

[[{'label': 'INCORRECT', 'score': 0.8543031215667725}, {'label': 'CORRECT', 'score': 0.14569686353206635}]]
[[{'label': 'INCORRECT', 'score': 0.36100929975509644}, {'label': 'CORRECT', 'score': 0.6389906406402588}]]
[[{'label': 'INCORRECT', 'score': 0.0773123949766159}, {'label': 'CORRECT', 'score': 0.9226875901222229}]]
