# Eval template for MLM

In [1]:
import sys
import os
import random
import uuid

import numpy as np
import torch
import transformers
import datasets

# Use the GPU when one is visible and fall back to the CPU otherwise, instead
# of discarding the availability check. The name `cuda` is kept because the
# later cells refer to the device by that name.
cuda = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Loading Model

In [2]:
# The tokenizer and the masked-LM weights are taken from the same checkpoint
# name and share one cache directory.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    "bert-base-uncased",
    cache_dir='../bert_base_cache',
)
model = transformers.AutoModelForMaskedLM.from_pretrained(
    "bert-base-uncased",
    cache_dir='../bert_base_cache',
).to(device=cuda)

# Report the device the model was moved to.
print(model.device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


cuda:0


## Loading SemEval Data

In [3]:
# NOTE(review): the 'train' split points at the *Test_Gold* file and the
# 'test' split at the *Train_v2* file, and the "test" split is the one that is
# kept below -- confirm that this swap is intentional.
semeval_laptop = datasets.load_dataset(
    '../dataset_scripts/semeval2014_task4/semeval2014_task4.py',
    cache_dir='../dataset_cache',
    data_files=dict(
        train='../dataset_files/semeval_2014/Laptops_Test_Gold.xml',
        test='../dataset_files/semeval_2014/Laptop_Train_v2.xml',
    ),
)["test"]

# Show the first record of the selected split.
print(semeval_laptop[0])

Using custom data configuration default-b5c3bb350489de8a
Reusing dataset sem_eval2014_task4_dataset (../dataset_cache\sem_eval2014_task4_dataset\default-b5c3bb350489de8a\0.0.1\f33ba7108331ad17be3f9fb710ca001edb383fba797c6ed0938354e6812ca969)


{'aspect': 'cord', 'sentiment': 2, 'text': 'I charge it at night and skip taking the cord with me because of the good battery life.'}


## Manually created Prompts

In [4]:
# Cloze templates: `{aspect}` is filled in per review with str.format, and the
# literal [MASK] marks the position whose prediction is read out later.
sentiment_prompts = ["The {aspect} is [MASK].", "I [MASK] the {aspect}."]

## Adding prompts to review text

In [7]:
def add_prompts(reviews, prompts):
    """Pair every review of a batch with every prompt template.

    Args:
        reviews: Batch in column form; its "aspect", "text" and "sentiment"
            entries are read as parallel sequences (one item per review).
        prompts: Re-iterable collection of template strings; each one is
            rendered with ``template.format(aspect=...)``.

    Returns:
        A dict of four equally long lists -- "text", "prompt", "label" and
        "review_id" -- with one entry per (review, prompt) pair. All rows that
        stem from the same review share one ``review_id``.
    """
    texts = []
    sentiments = []
    ids = []
    aspect_prompts = []

    for aspect, text, sentiment in zip(
            reviews["aspect"], reviews["text"], reviews["sentiment"]):

        # Random (version 4) id: a version 1 UUID would embed this machine's
        # network address and a timestamp in every id that gets stored.
        review_id = str(uuid.uuid4())

        for template in prompts:
            texts.append(text)
            sentiments.append(sentiment)
            ids.append(review_id)
            aspect_prompts.append(template.format(aspect=aspect))

    return {"text": texts, "prompt": aspect_prompts, "label": sentiments, "review_id": ids}

In [8]:
def _expand_with_prompts(batch):
    """Batched map callback: apply every sentiment prompt to one batch."""
    return add_prompts(batch, sentiment_prompts)


# The callback emits one row per (review, prompt) pair, so the input columns
# are removed and only the columns it returns are kept.
prompt_dataset = semeval_laptop.map(
    _expand_with_prompts,
    batched=True,
    remove_columns=semeval_laptop.column_names,
)

print(prompt_dataset[0])
print(len(prompt_dataset))

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


{'label': 2, 'prompt': 'The cord is [MASK].', 'review_id': '6866a59d-813d-11eb-b122-7085c2c04498', 'text': 'I charge it at night and skip taking the cord with me because of the good battery life.'}
4626


## Tokenize and Run model

In [11]:
def run_model(reviews, tokenizer, model, device, max_length=256):
    """Score one batch of (review, prompt) pairs and keep the [MASK] logits.

    Args:
        reviews: Batch in column form with "text", "prompt" and "label"
            entries (parallel sequences, one item per example).
        tokenizer: Callable that encodes the text/prompt pairs and exposes
            ``mask_token_id``.
        model: Callable whose output can be indexed with ``"logits"``; the
            logits are indexed by (example, token position).
            NOTE(review): presumably already placed on ``device`` by the
            caller -- confirm.
        device: Device the encoded batch is moved to before the forward pass.
        max_length: Length every encoded pair is padded / truncated to.

    Returns:
        A dict with "logit_tensor" (one detached CPU tensor per example,
        holding the logits at that example's mask position) plus the
        unchanged "prompt", "label" and "text" entries of ``reviews``.

    Raises:
        ValueError: If an encoded example does not contain exactly one mask
            token, so no single position can be selected for it.
    """
    batch_tokens = tokenizer(reviews["text"], reviews["prompt"],
                             truncation='only_first', padding='max_length',
                             max_length=max_length, return_tensors="pt")
    batch_tokens = batch_tokens.to(device)

    # Locate the mask token of every row in a single vectorised pass.
    is_mask = batch_tokens["input_ids"] == tokenizer.mask_token_id
    if not bool((is_mask.sum(dim=1) == 1).all()):
        raise ValueError(
            "every encoded example must contain exactly one mask token")
    # nonzero reports hits in row-major order, so with one hit per row the
    # column indices come out aligned with the rows.
    _, mask_columns = torch.nonzero(is_mask, as_tuple=True)

    # Pure inference: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**batch_tokens)

    logits = outputs["logits"]
    rows = torch.arange(logits.shape[0], device=logits.device)
    # Detach and move to the CPU so the returned values hold neither the
    # graph nor device memory once they leave this function.
    mask_logits = logits[rows, mask_columns.to(logits.device)].detach().cpu()

    return {"logit_tensor": list(mask_logits.unbind(dim=0)), "prompt": reviews["prompt"],
            "label": reviews["label"], "text": reviews["text"]}




In [12]:
def _score_batch(batch):
    """Batched map callback: run the masked-LM on one batch using `cuda`."""
    return run_model(batch, tokenizer, model, cuda)


# Batches of two examples are scored at a time; the input columns are dropped
# and replaced by whatever the callback returns.
model_output = prompt_dataset.map(
    _score_batch,
    batched=True,
    batch_size=2,
    num_proc=None,
    remove_columns=prompt_dataset.column_names,
)



HBox(children=(FloatProgress(value=0.0, max=2313.0), HTML(value='')))




In [None]:
# Length of the first example's logit vector. Indexing the row first avoids
# loading the whole "logit_tensor" column just to look at one entry.
print(len(model_output[0]["logit_tensor"]))

In [None]:
# Print the textual representation of the mapped dataset.
print(model_output)