# Advanced Natural Language Engineering - MRSCC

This assignment asks us to compete in the Microsoft Research Sentence Completion Challenge (MRSCC; Zweig and Burges, 2011). The challenge requires a system to predict which word, from a set of 5 candidates, is the most likely to complete a sentence.

### Loading challenge data

For this challenge we are provided with:

1.   A training corpus of 19th century novels data (522 files)
2.   1040 sentences with one missing word and 5 options to choose from

This dataset was constructed from Project Gutenberg data. Seed sentences were selected from five of Sir
Arthur Conan Doyle’s Sherlock Holmes novels, and then imposter words were suggested with the
aid of a language model trained on over 500 19th century novels. The strategy for competing in this challenge will be to create training and validation data from the complete corpus. This will then help us make predictions in the unseen MRSCC challenge data.

# 1. Data Preprocessing

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%capture
!sudo apt-get install libdb++-dev
!export BERKELEYDB_DIR=/usr
!pip3 install bsddb3
!pip install gutenberg
!pip install transformers
!pip install datasets
!pip install nltk
!pip install pytorch-lightning
!pip install transformers
import os
import random
import re
import nltk
nltk.download('punkt')
import pandas as pd
import math
import numpy as np
from nltk import word_tokenize as tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
import operator
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup, DistilBertForMaskedLM, DistilBertTokenizer, RobertaForMaskedLM, RobertaTokenizer
import pytorch_lightning as pl
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

In [None]:
import os
import random
mrscc_dir = '/content/drive/MyDrive/university/2021/ANLE/lab2resources/sentence-completion'

def get_train_val(training_dir=mrscc_dir,split=1):
    """Shuffle the entries of ``training_dir`` and split them into two lists.

    Args:
        training_dir: directory whose entries are listed.
        split: fraction of the entries that goes into the first list.

    Returns:
        Tuple ``(first, second)``: the first ``int(n * split)`` shuffled
        filenames and the remaining ones.
    """
    # os.listdir returns entries in arbitrary, filesystem-dependent order.
    # Sorting first makes the seeded shuffle below produce the same split
    # on every machine and every run.
    filenames = sorted(os.listdir(training_dir))
    n = len(filenames)
    print("There are {} files in the training directory: {}".format(n,training_dir))
    random.seed(7)  # fixed seed so the split is the same every time
    random.shuffle(filenames)
    index = int(n * split)
    return filenames[:index], filenames[index:]

trainingdir=os.path.join(mrscc_dir,"Holmes_Training_Data/")
training,testing=get_train_val(trainingdir)

There are 522 files in the training directory: /content/drive/MyDrive/university/2021/ANLE/lab2resources/sentence-completion/Holmes_Training_Data/


In [None]:
len(training)

522

In [None]:
def processfiles(files, training_dir, filter="Conan Doyle"):
  """Read corpus files and keep a subset of them with headers stripped.

  A file is kept when its text matches ``filter`` (case-insensitive regex
  search) or when its position in ``files`` is even. Files that cannot be
  decoded are reported and skipped.

  Args:
    files: filenames relative to ``training_dir``.
    training_dir: directory containing the files.
    filter: regular expression searched for in each file's text.

  Returns:
    List with the kept texts, each passed through ``strip_headers`` and
    ``str.strip``.
  """
  texts = []
  for i, afile in enumerate(files):
      # Only the read can raise UnicodeDecodeError, so keep the try minimal.
      try:
          with open(os.path.join(training_dir,afile)) as instream:
            # One read instead of repeated `text += line` concatenation.
            text = instream.read()
      except UnicodeDecodeError:
          print("UnicodeDecodeError processing {}: ignoring rest of file".format(afile))
          continue
      if re.search(filter, text, re.IGNORECASE) or i%2==0:
          print("sherlock found or random index triggered at {}".format(i))
          texts.append(strip_headers(text).strip())
  return texts

In [None]:
texts = processfiles(training, trainingdir)

In [None]:
len(texts)

265

In [None]:
print(texts[0])

In [None]:
!nvidia-smi

Sun Apr 18 16:11:47 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.67       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   49C    P0    29W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Prepare Data

In [None]:
from transformers import AutoTokenizer
import datasets
from datasets import load_dataset, Dataset

In [None]:
texts_dict = {'text':[t for t in texts]}

In [None]:
datasets = Dataset.from_dict(texts_dict)

In [None]:
datasets = datasets.train_test_split(test_size=0.1)

In [None]:
model_checkpoint = "roberta-base"
from transformers import AutoTokenizer
def tokenize_function(examples):
    """Run the module-level ``tokenizer`` over the ``"text"`` field of ``examples``."""
    return tokenizer(examples["text"])
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=2, remove_columns=["text"])

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=481.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355863.0, style=ProgressStyle(descript…


 

HBox(children=(FloatProgress(value=0.0, description='#0', max=1.0, style=ProgressStyle(description_width='init…

 

HBox(children=(FloatProgress(value=0.0, description='#1', max=1.0, style=ProgressStyle(description_width='init…



 

HBox(children=(FloatProgress(value=0.0, description='#0', max=1.0, style=ProgressStyle(description_width='init…

 

HBox(children=(FloatProgress(value=0.0, description='#1', max=1.0, style=ProgressStyle(description_width='init…





In [None]:
from itertools import chain

block_size = 128
def group_texts(examples):
    """Concatenate a batch of tokenised texts and cut it into fixed-size blocks.

    Args:
        examples: mapping from column name to a list of per-text lists.
            It must contain an ``"input_ids"`` column.

    Returns:
        Dict with the same columns, each a list of ``block_size``-long
        chunks, plus a ``"labels"`` column that is a copy of the
        ``"input_ids"`` chunk list. The tail of the batch that does not fill
        a whole block is dropped.
    """
    # chain.from_iterable flattens in linear time; `sum(lists, [])` rebuilds
    # the accumulated list on every addition, which is quadratic.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported
    # it instead of this drop, you can customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=2,
)

 

HBox(children=(FloatProgress(value=0.0, description='#0', max=1.0, style=ProgressStyle(description_width='init…

 

HBox(children=(FloatProgress(value=0.0, description='#1', max=1.0, style=ProgressStyle(description_width='init…



  

HBox(children=(FloatProgress(value=0.0, description='#0', max=1.0, style=ProgressStyle(description_width='init…

HBox(children=(FloatProgress(value=0.0, description='#1', max=1.0, style=ProgressStyle(description_width='init…





In [None]:
# quick check to see if we lost any samples
text_token_total = 0
for text in tokenized_datasets['train']:
  text_token_total+=len(text['input_ids'])

print(f"we should have {int(text_token_total/128)} samples of data in our train dataset")
print(f"we have {len(lm_datasets['train'])} samples in our dataset")

we should have 204655 samples of data in our train dataset
we have 204654 samples in our dataset


In [None]:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
print([lm_datasets['train'].__getitem__(0)])
print(data_collator([lm_datasets['train'].__getitem__(0)]))

In [None]:
lm_datasets

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels'],
        num_rows: 204654
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'labels'],
        num_rows: 29046
    })
})

# Hugging Face Trainer

In [None]:
from transformers import AutoModelForMaskedLM
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

# pre-trained checkpoints
# checkpoint dir for 82.3 model /content/drive/MyDrive/university/2021/ANLE/roberta mrscc
# trained_checkpoint = '/content/drive/MyDrive/university/2021/ANLE/roberta mrscc'

# training from base model
model_checkpoint = "roberta-base"

# declare model and model arguments
trainer_model = AutoModelForMaskedLM.from_pretrained(model_checkpoint, return_dict=True)
# collator configured with a 15% masking probability; it is handed to the Trainer below
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

# "test-mlm" is the first positional argument
# (presumably the output directory — confirm against the transformers docs)
training_args = TrainingArguments(
    "test-mlm",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs = 3,
    per_device_train_batch_size = 32,
    save_steps=20000
)

# the "test" split of lm_datasets is used as the evaluation set
trainer = Trainer(
    model=trainer_model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["test"],
    data_collator=data_collator,
)

# train model
trainer.train()

In [None]:
# trainer.save_model('/content/drive/MyDrive/university/2021/ANLE/roberta mrscc v1.1')

# Pytorch lightning

Pytorch lightning setup

In [None]:
# unfortunately hugging face datasets are unpicklable which means they won't run natively with ray tune, here we convert to pandas
# NOTE(review): `collated_datasets` is not defined anywhere in the visible part
# of this notebook — confirm which cell creates it before running this one.
train,val = pd.DataFrame(),pd.DataFrame()
train['input_ids'] = collated_datasets['train']['input_ids']
train['attention_mask'] = collated_datasets['train']['attention_mask']
train['labels'] = collated_datasets['train']['labels']
# the "test" split is stored in the `val` frame
val['input_ids'] = collated_datasets['test']['input_ids']
val['attention_mask'] = collated_datasets['test']['attention_mask']
val['labels'] = collated_datasets['test']['labels']

In [None]:
train

In [None]:
class RoBERTaDataset(torch.utils.data.Dataset):
  """Map-style torch dataset over a DataFrame of tokenised blocks.

  The base class is spelled out as ``torch.utils.data.Dataset`` because the
  bare name ``Dataset`` is re-bound by ``from datasets import load_dataset,
  Dataset`` earlier in the notebook, which would make this class inherit from
  the Hugging Face dataset type instead of the torch one.

  Args:
    df: frame with ``input_ids``, ``attention_mask`` and ``labels`` columns.
  """

  _COLUMNS = ("input_ids", "attention_mask", "labels")

  def __init__(self, df: pd.DataFrame):
    self.df = df

  def __len__(self):
    return len(self.df)

  def __getitem__(self, index: int):
    """Return row ``index`` as a dict of flattened tensors, one per column."""
    row = self.df.iloc[index]
    return {name: torch.tensor(row[name]).flatten() for name in self._COLUMNS}

In [None]:
test = RoBERTaDataset(train.head())
test.__getitem__(0)

In [None]:
# Plain PyTorch training loop over the pandas-backed dataset.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
train_dataset = RoBERTaDataset(train)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
# Move to the detected device; `model.cuda()` would crash on a CPU-only
# runtime even though `device` already handles that case.
model.to(device)
model.train()
# Named `optimizer` so the `torch.optim` module imported as `optim` at the top
# of the notebook is not overwritten.
optimizer = AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        # with labels supplied, the first output element is the loss
        loss = outputs[0]
        loss.backward()
        optimizer.step()

In [None]:
class RoBERTaDataModule(pl.LightningDataModule):
  """Lightning data module serving the train and validation splits.

  Args:
    train: training data (a DataFrame or an already-built dataset).
    val: validation data (same kinds as ``train``); also used as the test set.
    batch_size: batch size for every dataloader.
  """

  def __init__(self, train, val,batch_size=16):
    super().__init__()
    self.train = train
    self.val = val
    self.batch_size = batch_size

  @staticmethod
  def _as_dataset(data):
    """Wrap a DataFrame in ``RoBERTaDataset``; pass anything else through."""
    # A DataFrame cannot be indexed by row position the way DataLoader does.
    # Other inputs are used as-is — assumed to be map-style datasets that
    # yield tensors; TODO confirm for Hugging Face datasets.
    if isinstance(data, pd.DataFrame):
      return RoBERTaDataset(data)
    return data

  def setup(self, stage=None):
    """Build the datasets from the data given to the constructor.

    ``stage`` is accepted (and ignored) so the hook matches the signature
    Lightning calls it with; calling ``setup()`` without arguments still works.
    """
    # Use the instance's own data, not the notebook globals `train` / `val`.
    self.train_dataset = self._as_dataset(self.train)
    self.test_dataset = self._as_dataset(self.val)

  def train_dataloader(self):
    return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=2)

  def val_dataloader(self):
    return DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=2)

  def test_dataloader(self):
    return DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=2)

In [None]:
from transformers import DataCollatorForLanguageModeling
class RoBERTaMaskedLMModel(pl.LightningModule):
  """Lightning wrapper around ``RobertaForMaskedLM``.

  The pretrained weights are loaded from the module-level ``model_name``.

  Args:
    config: dict providing ``'lr'`` and ``'w_decay'`` (read in
      ``configure_optimizers``).
  """

  def __init__(self, config: dict):
    super().__init__()
    self.roberta = RobertaForMaskedLM.from_pretrained(model_name, return_dict=True)
    self.config = config

  def forward(self, input_ids, attention_mask, labels=None):
    """Run the wrapped model and return its output object."""
    return self.roberta(input_ids, attention_mask=attention_mask, labels=labels)

  def _batch_loss(self, batch):
    """Forward one batch (with labels) and return the model's loss."""
    output = self(
        input_ids=batch['input_ids'],
        attention_mask=batch['attention_mask'],
        labels=batch['labels'],
    )
    return output.loss

  def training_step(self, batch, batch_index):
    loss = self._batch_loss(batch)
    # Logged under its own key: the previous "validation loss " key was a
    # copy-paste slip that mixed training values into the validation metric.
    self.log("train loss", loss, prog_bar = True, logger=True)
    return {"loss":loss}

  def validation_step(self, batch, batch_index):
    loss = self._batch_loss(batch)
    self.log("validation loss ", loss, prog_bar = True, logger=True)
    return {"loss":loss}

  def training_epoch_end(self, outputs):
    """Log the mean of the per-step training losses as ``ptl/train_loss``."""
    avg_loss = torch.stack([x["loss"] for x in outputs]).mean()
    self.log("ptl/train_loss", avg_loss)

  def validation_epoch_end(self, outputs):
    """Log the mean of the per-step validation losses as ``ptl/val_loss``."""
    avg_loss = torch.stack([x["loss"] for x in outputs]).mean()
    self.log("ptl/val_loss", avg_loss)

  def configure_optimizers(self):
    """Return an AdamW optimizer built from ``config['lr']`` / ``config['w_decay']``."""
    optimizer = AdamW(self.parameters(), lr=self.config['lr'], weight_decay=self.config['w_decay'])
    # TODO: no learning-rate scheduler is configured yet
    return optimizer

In [None]:
config = {
    "lr":1e-5,
    "w_decay":0.01,
    "n_epochs":3,
    # 32 max for colab GPUs
    "batch_size":16,
    "gpus":1,
}
model_name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForMaskedLM.from_pretrained(model_checkpoint)             

In [None]:
data_module = RoBERTaDataModule(collated_datasets['train'], collated_datasets['test'], batch_size=config['batch_size'])
data_module.setup()

In [None]:
# `trainer.fit` needs a LightningModule. The global `model` at this point is a
# plain `RobertaForMaskedLM`, so fit the Lightning wrapper class instead.
lightning_model = RoBERTaMaskedLMModel(config)
trainer = pl.Trainer(max_epochs=config['n_epochs'],gpus=config['gpus'],progress_bar_refresh_rate=30)
trainer.fit(lightning_model, data_module)

In [None]:
%load_ext tensorboard
%tensorboard --logdir lightning_logs

# HyperParam Tuning with Ray Tune

In [None]:
%%capture
!pip install "ray[tune]"

In [None]:
import shutil
import tempfile
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.utilities.cloud_io import load as pl_load
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler, PopulationBasedTraining
from ray.tune.integration.pytorch_lightning import TuneReportCallback, TuneReportCheckpointCallback
import math

In [None]:
callback = TuneReportCallback({
    "loss": "ptl/val_loss",
}, on="validation_end")

In [None]:
def train_tune(config, gpus=0):
  """Train one Ray Tune trial with the hyper-parameters in ``config``.

  Args:
    config: dict with ``'batch_size'`` and ``'n_epochs'``, plus whatever
      ``RoBERTaMaskedLMModel`` reads from it. ``'gpus'`` is used when present.
    gpus: number of GPUs to use when ``config`` has no ``'gpus'`` entry.
  """
  # The tokenizer used to be re-loaded here on every trial but was never
  # used, so that load (and the unused local `model_name`) is gone.
  model = RoBERTaMaskedLMModel(config)
  data_module = RoBERTaDataModule(train, val, batch_size=config['batch_size'])
  data_module.setup()
  trainer = pl.Trainer(
      max_epochs=config['n_epochs'],
      # config value wins as before; the parameter is the fallback
      gpus=config.get('gpus', gpus),
      progress_bar_refresh_rate=60,
      logger=TensorBoardLogger(save_dir=tune.get_trial_dir(), name="", version="."),
      callbacks=[callback],
      num_sanity_val_steps=10)
  trainer.fit(model, data_module)

In [None]:
def tune_roberta(config, num_samples=3, gpus_per_trial=0):
  """Run a Ray Tune search over ``config`` with an ASHA scheduler.

  Args:
    config: search space / fixed settings passed to every trial.
    num_samples: number of trials to sample.
    gpus_per_trial: GPUs reserved for, and passed to, each trial.

  Returns:
    The object returned by ``tune.run``.
  """
  scheduler = ASHAScheduler(
      metric='loss',
      mode='min',
      grace_period=3,
      reduction_factor=2)

  reporter = CLIReporter(
      parameter_columns=["lr", 'w_decay'],
      metric_columns=["loss", "training_iteration"])

  trainable = tune.with_parameters(
      train_tune,
      gpus=gpus_per_trial)
  analysis = tune.run(
      trainable,
      resources_per_trial={
          "cpu": 1,
          "gpu": gpus_per_trial
      },
      config=config,
      scheduler=scheduler,
      progress_reporter=reporter,
      num_samples=num_samples,
      name="tune_roberta")
  # Hand the result back: callers assign `analysis = tune_roberta(...)` and
  # previously always received None.
  return analysis

In [None]:
config = {
    "lr":tune.choice([1e-6, 1e-5, 1e-4]),
    "w_decay":tune.choice([0.0001, 0.001, 0.01]),
    "n_epochs":10,
    # 32 max for colab GPUs
    "batch_size":32,
    "gpus":1
}

model_name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(model_name)
analysis = tune_roberta(config, num_samples=10, gpus_per_trial=1)

In [None]:
import pickle

In [None]:
%reload_ext tensorboard
%tensorboard --logdir ~/ray_results

# Test RoBERTa model on task **with (and without)** finetuning 

Two different strategies were used. 

1. Missing word was covered with one < mask > token e.g. the flowers were placed < mask > on the table
2. Each candidate word was inserted into the sentence in turn. The sentence is then tokenised and every token belonging to the candidate word is masked, e.g. the flowers were placed *instantaneously* on the table will have several < mask > tokens because *instantaneously* is split into several token IDs


In [None]:
import pandas as pd, csv
import re
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
questions=pd.read_csv(os.path.join(mrscc_dir,"testing_data.csv"))
answers=pd.read_csv(os.path.join(mrscc_dir,"test_answer.csv"))
choices = ['a','b','c','d','e']
questions.rename(columns={'a)':'a','b)':'b','c)':'c','d)':'d','e)':'e'}, inplace=True)
# Build three parallel columns: the correct word, the question with the
# correct word filled in, and the question with the blank turned into the
# tokenizer's mask token.
word_answers, question_with_answer, question_with_mask = [], [], []
for index,row in questions.iterrows():
  answer = answers.iloc[index].answer
  word_answers.append(row[answer])
  # Literal str.replace instead of re.sub: the replacement comes from data,
  # and re.sub would interpret any backslash in it as an escape/group reference.
  question_with_answer.append(row.question.replace("_____", row[answer]))
  question_with_mask.append(row.question.replace("_____", tokenizer.mask_token))
questions['answer'] = word_answers
questions['question_with_answer'] = question_with_answer
questions['question'] = question_with_mask
questions.head()

In [None]:
questions.head()

In [None]:
# replace the mask placeholder in every question with each candidate word
def correct_mask(df, target="<mask>"):
  """Build, for every row, the question with each candidate word filled in.

  Args:
    df: frame with a ``question`` column and one column per entry of the
      module-level ``choices``.
    target: placeholder text to replace. It used to be ignored in favour of a
      hard-coded ``'<mask>'``; the default keeps that behaviour.

  Returns:
    One list per row, holding one filled-in sentence per choice (in
    ``choices`` order).
  """
  return [
      [row.question.replace(target, row[choice]) for choice in choices]
      for _, row in df.iterrows()
  ]

replaced = correct_mask(questions)
questions['replaced'] = replaced

In [None]:
questions.head()

In [None]:
questions.replaced.iloc[0]

Multi Mask

In [None]:
import numpy as np
def _find_last_sublist(haystack, needle):
  """Return the start index of the last occurrence of ``needle`` in ``haystack``, or None."""
  width = len(needle)
  if width == 0:
    return None
  for start in range(len(haystack) - width, -1, -1):
    if haystack[start:start + width] == needle:
      return start
  return None


def make_encodings(df, tokenizer):
  """Encode every candidate sentence and mask the candidate word's tokens.

  For each row and each choice, the filled-in sentence from ``row.replaced``
  is encoded (padded/truncated to 64 tokens), the candidate word's token ids
  are located in it, and those positions are overwritten with the mask id.

  Args:
    df: frame with a ``replaced`` column and one column per entry of the
      module-level ``choices``.
    tokenizer: tokenizer providing ``encode``, ``encode_plus`` and
      ``mask_token_id``.

  Returns:
    Tuple ``(sent_encodings, word_encodings, mask_ix)`` of per-row lists, each
    holding one entry per choice: the masked sentence encoding, the word's
    token ids, and the masked positions.
  """
  import warnings

  sent_encodings, word_encodings, mask_ix = [], [], []
  for _, row in df.iterrows():
    row_sents, row_words, row_masks = [], [], []
    for word, sent in zip(row[choices], row.replaced):
      encoded_sent = tokenizer.encode_plus(sent,add_special_tokens=True,
                          return_tensors='pt', truncation=True,
                          padding='max_length', max_length=64,
                          return_attention_mask = True)
      sent_ids = encoded_sent['input_ids'][0].tolist()

      # we have to add in an extra space before the target word as the BPE tokenizer
      # that RoBERTa uses consumes greedily and will break up target words without the space
      encoded_word = tokenizer.encode(" " + str(word), add_special_tokens=False)
      # as before, the last occurrence wins when the word appears several times
      start = _find_last_sublist(sent_ids, encoded_word)

      if start is None:
        # A word that does not follow a space (sentence start, right after a
        # quote mark, ...) is tokenised without the space marker, so retry
        # with the bare word before giving up.
        bare_word = tokenizer.encode(str(word), add_special_tokens=False)
        bare_start = _find_last_sublist(sent_ids, bare_word)
        if bare_start is not None:
          encoded_word, start = bare_word, bare_start

      if start is None:
        # Previous behaviour (masking from position 0) is kept, but no longer
        # silently: these rows score the wrong positions.
        warnings.warn(
            "could not locate candidate {!r} in encoded sentence; "
            "masking from position 0".format(word))
        start = 0

      tokens_to_mask_ix = list(range(start, start + len(encoded_word)))
      encoded_sent['input_ids'][0][tokens_to_mask_ix] = tokenizer.mask_token_id
      row_sents.append(encoded_sent)
      row_words.append(encoded_word)
      row_masks.append(tokens_to_mask_ix)

    sent_encodings.append(row_sents)
    word_encodings.append(row_words)
    mask_ix.append(row_masks)
  return sent_encodings, word_encodings, mask_ix

sent_encodings, word_encodings, mask_ix = make_encodings(questions.iloc[:2], tokenizer)

In [None]:
sent_encodings, word_encodings, mask_ix = make_encodings(questions, tokenizer)
questions['s_encodings'] = sent_encodings
questions['w_encodings'] = word_encodings
questions['mask_ix'] = mask_ix

In [None]:
# define original model and re-trained model and test both
checkpoint = '/content/drive/MyDrive/university/2021/ANLE/roberta mrscc'
model_original = RobertaForMaskedLM.from_pretrained(checkpoint)

In [None]:
# model = trainer_model
# model.cpu()

In [None]:
# multi mask
import torch
def make_prediction_for_row(row):
  """Score every candidate of one question with the module-level ``model_original``.

  For each candidate, the masked sentence is run through the model and the
  logit of each of the candidate's tokens is read at its masked position; the
  candidate's score is the mean of those logits.

  Args:
    row: question row with ``w_encodings``, ``s_encodings`` and ``mask_ix``.

  Returns:
    Dict with ``"original"`` (one float score per candidate) and
    ``"retrained"`` (left empty; scoring a second model is not enabled here).
  """
  scores = {"original":[],"retrained":[]}
  for word_tokens,sent_encoding,mask_ix in zip(row.w_encodings, row.s_encodings, row.mask_ix):
    # Inference only: no_grad avoids building an autograd graph for each of
    # the forward passes.
    with torch.no_grad():
      logits_original = model_original(sent_encoding['input_ids'], attention_mask=sent_encoding['attention_mask']).logits
    # logit of each word token at the position where it was masked
    so = [logits_original[0, mask, token] for token, mask in zip(word_tokens, mask_ix)]
    # avg score
    scores['original'].append(float(torch.mean(torch.stack(so))))
  return scores

scores = make_prediction_for_row(questions.iloc[0])
print(scores['original'])
print(scores['retrained'])

In [None]:
import numpy as np
prediction_o = np.argmax(scores['original'])
print(choices[prediction_o])

In [None]:
po,pr=[],[]
pr_ix = []
for index, row in questions.iterrows():
  scores = make_prediction_for_row(row)
  prediction_o = np.argmax(scores['original'])
  po.append(choices[prediction_o])

In [None]:
correct = 0
for prediction,answer in zip(po, answers.answer):
  if prediction==answer:
    correct += 1
print("Percentage of original predictions which are correct {}".format(correct/len(questions)))

In [None]:
answer_analysis = pd.DataFrame()
answer_analysis['question'] = questions.question
answer_analysis['prediction'] = po
answer_analysis['answer'] = answers.answer
answer_analysis['is_correct'] = (answer_analysis['prediction']==answer_analysis['answer'])
answer_analysis

In [None]:
pred_counts = answer_analysis.prediction.value_counts()
answer_counts = answer_analysis.answer.value_counts()

In [None]:
answer_counts
answer_counts=dict(answer_counts)

In [None]:
class_pred_proportions = {c:0 for c in choices}
for index,row in answer_analysis.iterrows():
  if row.is_correct:
    class_pred_proportions[row.answer] += 1
class_pred_proportions = {c:v/answer_counts.get(c) for c,v in class_pred_proportions.items()}
print(class_pred_proportions)

In [None]:
correct_counts = answer_analysis.is_correct.value_counts()
correct_counts/len(questions)

In [None]:
# question length analysis
incorrect_lens, correct_lens = [],[]
for i,(index,row) in enumerate(answer_analysis.iterrows()):
  if row.is_correct:
    correct_lens.append(len(row.question))
  else: 
    incorrect_lens.append(len(row.question))
print(np.mean(incorrect_lens), np.mean(correct_lens))

In [None]:
len(texts)

In [None]:
tokens = []
for text in texts:
  tokenized_text = [i for i in tokenize(text.lower())]
  tokens.extend(tokenized_text)

In [None]:
from collections import Counter

# Counter tallies in one pass and uses the same key (str(token)) for reading
# and writing; the manual loop read with `token` but wrote with `str(token)`.
# Converted back to a plain dict so later `.get` lookups and the cell's
# display stay the same.
token_counts = dict(Counter(str(token) for token in tokens))
token_counts

In [None]:
word_pred, word_ans = [],[]
for (index,row1),(index,row2) in zip(questions.iterrows(),answer_analysis.iterrows()):
  word_pred.append(row1[row2.prediction])
  word_ans.append(row1.answer)

answer_analysis['word_pred'] = word_pred
answer_analysis['word_ans'] = word_ans
answer_analysis

In [None]:
correct_pred_freq, incorrect_pred_freq = [],[]
correct_ans_freq, incorrect_ans_freq = [],[]
for index,row in answer_analysis.iterrows():
  if not row.is_correct:
    incorrect_pred_freq.append(token_counts.get(row.word_pred,0))
    incorrect_ans_freq.append(token_counts.get(row.word_ans,0))
  else:
    correct_pred_freq.append(token_counts.get(row.word_pred,0))
    correct_ans_freq.append(token_counts.get(row.word_ans, 0))
    
print(f'incorrect prediction token frequency in training corpus {np.mean(incorrect_pred_freq)}')
print(f'incorrect answer token frequency in training corpus {np.mean(incorrect_ans_freq)}')

Single Mask

In [None]:
# def make_encodings(df, tokenizer):
#   sent_encodings, word_encodings = [],[]
#   for index, row in df.iterrows():
#     _word_encoding = []
#     for i,word in enumerate(row[choices]):
#       _word_encoding.append(tokenizer.encode(str(" " + word), add_special_tokens=False))
#     sent_encodings.append(tokenizer(row.question, add_special_tokens=True, return_tensors='pt'))
#     word_encodings.append(_word_encoding)
#   return sent_encodings, word_encodings

# sent_encodings, word_encodings = make_encodings(questions.head(), tokenizer)

In [None]:
# word_encodings

In [None]:
# sent_encodings

In [None]:
# import numpy as np
# import torch
# sent = 'what is the capital of <mask>'
# sent = tokenizer.encode(sent, return_tensors='pt')
# print(sent)
# mask_token_index = np.where(sent == tokenizer.mask_token_id)[1]
# print(mask_token_index)
# print(sent.shape)
# logits = model(sent).logits
# mask_logits = logits[0,mask_token_index,:]
# mask_logits.shape
# pred = torch.argmax(mask_logits)
# print(tokenizer.decode(pred))


In [None]:
# def make_prediction_for_row(row):
#   scores = []
#   sent = row.s_encodings
#   mask_token_index = np.where(sent['input_ids'][0] == tokenizer.mask_token_id)
#   logits = model(sent['input_ids'], attention_mask=sent['attention_mask']).logits
#   mask_logits = torch.squeeze(logits[0,mask_token_index,:])
#   for choice in row.w_encodings:
#     score = []
#     for token_id in choice:
#       score.append(float(mask_logits[token_id]))
#     scores.append(np.mean(score))
#   # print(scores)
#   return scores
    
# test = make_prediction_for_row(questions.iloc[0])