In [1]:
import sys
import os
import random
import uuid

import numpy as np
import torch
import torch.nn.functional as F
import transformers
import datasets

# Seed every RNG this notebook has imported so that the classifier's random
# initialisation (and therefore the saved weights) is repeatable across runs.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when one is present and fall back to the CPU otherwise, instead
# of discarding the availability check. The variable keeps the name `cuda`
# because later cells pass it as the target device.
cuda = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# File stem used when the trained classifier's weights are written to disk.
# NOTE(review): the name says "Restaurant" but the training cells map over
# `semeval_laptop` -- confirm the name matches the data actually used.
model_name = "MLM_BERT-Base_Trained_on_Restaurant"
# When True, the restaurant reviews are also loaded and used to print a
# validation accuracy after every training epoch.
validate_model = False

In [2]:
# Checkpoint identifier and local download cache shared by the tokenizer and
# the masked-language model.
bert_checkpoint = "bert-base-uncased"
bert_cache_dir = '../bert_base_cache'

tokenizer = transformers.AutoTokenizer.from_pretrained(
    bert_checkpoint, cache_dir=bert_cache_dir)
model = transformers.AutoModelForMaskedLM.from_pretrained(
    bert_checkpoint, cache_dir=bert_cache_dir)

# Move the model onto the selected device and echo where it ended up.
model = model.to(device=cuda)
print(model.device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


cuda:0


In [3]:
# Dataset-builder script and the directory holding the SemEval-2014 Task 4
# XML files, both expressed relative to this notebook like the cache
# directories are, so the cell is not tied to one user's machine.
# NOTE(review): the data directory is inferred from the previous absolute path
# (.../dataset_files/semeval_2014) -- adjust `semeval_dir` if the files live
# elsewhere.
semeval_script = '../dataset_scripts/semeval2014_task4/semeval2014_task4.py'
semeval_dir = os.path.join('..', 'dataset_files', 'semeval_2014')

semeval_laptop = datasets.load_dataset(
    semeval_script,
    data_files={
        'test': os.path.join(semeval_dir, 'Laptops_Test_Gold.xml'),
        'train': os.path.join(semeval_dir, 'Laptop_Train_v2.xml')
    },
    cache_dir='../dataset_cache')

semeval_restaurant = None

if validate_model:
    # Same script and directory as the laptop data; previously this branch
    # used paths without the '../' prefix (and with backslashes), which did
    # not match the laptop branch.
    semeval_restaurant = datasets.load_dataset(
        semeval_script,
        data_files={
            'test': os.path.join(semeval_dir, 'Restaurants_Test_Gold.xml'),
            'train': os.path.join(semeval_dir, 'Restaurants_Train_v2.xml')
        },
        cache_dir='../dataset_cache')

    semeval_restaurant = semeval_restaurant["train"]

# Only the training split of each domain is used from here on.
semeval_laptop = semeval_laptop["train"]


#semeval_dataset = datasets.concatenate_datasets([semeval_laptop, semeval_restaurant])
#print(semeval_dataset[0])
#print(semeval_dataset[-1])

Using custom data configuration default-9b6d847dca00ea02
Reusing dataset sem_eval2014_task4_dataset (../dataset_cache\sem_eval2014_task4_dataset\default-9b6d847dca00ea02\0.0.1\f33ba7108331ad17be3f9fb710ca001edb383fba797c6ed0938354e6812ca969)


In [4]:
# Cloze-style templates appended to each review as the second sentence.
# "{aspect}" is filled in with str.format by add_prompts; "[MASK]" is the slot
# whose position run_model looks up via tokenizer.mask_token_id.
sentiment_prompts = [
    "The {aspect} is [MASK].",
    "I [MASK] the {aspect}."
]

In [5]:
def add_prompts(reviews, prompts):
    """Expand a batch of reviews into one row per (review, prompt) pair.

    Args:
        reviews: batch mapping with parallel "aspect", "text" and "sentiment"
            sequences.
        prompts: iterable of template strings containing an "{aspect}"
            placeholder.

    Returns:
        dict with the parallel lists "text", "prompt", "label" and
        "review_id". Rows derived from the same input review share one
        freshly generated uuid1 string in "review_id".
    """
    out_texts, out_prompts, out_labels, out_ids = [], [], [], []

    for idx, aspect_term in enumerate(reviews["aspect"]):
        review_text = reviews["text"][idx]
        review_label = reviews["sentiment"][idx]

        # One identifier per source review, repeated for each of its prompts.
        shared_id = str(uuid.uuid1())

        for template in prompts:
            filled_prompt = template.format(aspect=aspect_term)

            out_texts.append(review_text)
            out_labels.append(review_label)
            out_ids.append(shared_id)
            out_prompts.append(filled_prompt)

    return {"text": out_texts, "prompt": out_prompts, "label": out_labels, "review_id": out_ids}

# Expand every laptop review into one row per prompt template. The original
# columns are dropped because add_prompts returns a different number of rows
# than it receives.
prompt_dataset = semeval_laptop.map(
    lambda e: add_prompts(e, sentiment_prompts),
    remove_columns=semeval_laptop.column_names,
    batched=True)

if validate_model:
    # Drop the columns of the dataset actually being mapped (previously the
    # laptop dataset's column names were passed here).
    prompt_dataset_val = semeval_restaurant.map(
        lambda e: add_prompts(e, sentiment_prompts),
        remove_columns=semeval_restaurant.column_names,
        batched=True)

# Sanity check: show the first expanded row and the total row count.
print(prompt_dataset[0])
print(len(prompt_dataset))

Loading cached processed dataset at ../dataset_cache\sem_eval2014_task4_dataset\default-9b6d847dca00ea02\0.0.1\f33ba7108331ad17be3f9fb710ca001edb383fba797c6ed0938354e6812ca969\cache-f03e2478987f984c.arrow


{'label': 2, 'prompt': 'The cord is [MASK].', 'review_id': '811f8505-81ea-11eb-8b93-7085c2c04498', 'text': 'I charge it at night and skip taking the cord with me because of the good battery life.'}
4626


In [6]:
def run_model(reviews, tokenizer, model, device):
    """Run the masked-LM on a batch and keep only the logits at the mask slot.

    Args:
        reviews: batch mapping with parallel "text", "prompt" and "label"
            lists.
        tokenizer: callable that encodes (text, prompt) pairs and exposes
            ``mask_token_id``.
        model: callable taking the encoded batch as keyword arguments and
            returning a mapping with a "logits" entry indexed as
            [row, position].
        device: device the encoded batch is moved to before the forward pass.

    Returns:
        dict with "logit_tensor" (one detached CPU tensor per row holding the
        logits at that row's mask position) plus the "prompt", "label" and
        "text" entries passed through unchanged.

    Raises:
        ValueError: if any encoded row does not contain exactly one mask
            token.
    """
    batch_tokens = tokenizer(reviews["text"], reviews["prompt"],
                             truncation='only_first', padding='max_length', max_length=256, return_tensors="pt")
    batch_tokens.to(device=device)

    # Boolean (row, position) grid marking where the mask token sits.
    is_mask = batch_tokens.data["input_ids"] == tokenizer.mask_token_id
    if not bool((is_mask.sum(dim=1) == 1).all()):
        raise ValueError("every encoded row must contain exactly one mask token")

    # Inference only: without no_grad every returned tensor would keep the
    # whole autograd graph of the forward pass alive.
    with torch.no_grad():
        outputs = model(**batch_tokens)

    # Boolean indexing selects one position per row, in row order; the result
    # is moved off the accelerator before it is handed back.
    masked_logits = outputs["logits"][is_mask].detach().cpu()
    output_list = list(masked_logits.unbind(0))

    return {"logit_tensor":output_list, "prompt": reviews["prompt"], "label": reviews["label"], "text": reviews["text"]}

# Replace every prompt row with the masked-LM logits at its mask position.
# Small batches keep the padded 256-token forward pass within device memory.
model_output = prompt_dataset.map(
    lambda e: run_model(e, tokenizer, model, cuda),
    remove_columns=prompt_dataset.column_names,
    batched=True, batch_size=4, num_proc=None)

if validate_model:
    # Drop the columns of the validation dataset itself (previously the
    # training dataset's column names were passed here).
    model_output_val = prompt_dataset_val.map(
        lambda e: run_model(e, tokenizer, model, cuda),
        remove_columns=prompt_dataset_val.column_names,
        batched=True, batch_size=4, num_proc=None)


HBox(children=(FloatProgress(value=0.0, max=2313.0), HTML(value='')))




Logistic regression code adapted from: https://towardsdatascience.com/logistic-regression-on-mnist-with-pytorch-b048327f8d19

In [7]:
class LogisticRegression(torch.nn.Module):
    """Single affine layer mapping ``input_dim`` features to ``output_dim`` scores.

    ``forward`` returns the raw output of the linear layer; no softmax is
    applied inside the module.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear = torch.nn.Linear(input_dim, output_dim)

    def forward(self, x):
        """Return the linear layer's scores for ``x``."""
        return self.linear(x)

In [8]:
# Columns exposed as torch tensors to the loaders, and the loader batch size.
tensor_columns = ('logit_tensor', 'label')
loader_batch_size = 32

model_output.set_format(type='torch', columns=list(tensor_columns))
dataloader = torch.utils.data.DataLoader(model_output, batch_size=loader_batch_size)

if validate_model:
    # Same formatting and batching for the validation rows.
    model_output_val.set_format(type='torch', columns=list(tensor_columns))
    dataloader_val = torch.utils.data.DataLoader(model_output_val, batch_size=loader_batch_size)


In [9]:
epochs = 5
# Width of one stored logit vector. Read from the loaded masked-LM instead of
# hard-coding it (30522 for the checkpoint loaded above).
input_dim = model.config.vocab_size
# Number of classes the classifier scores -- presumably one per sentiment
# label in the dataset; verify against the dataset script.
output_dim = 3
lr_rate = 0.0001

LR_model = LogisticRegression(input_dim, output_dim)
LR_model.to(device=cuda)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(LR_model.parameters(), lr=lr_rate)

for epoch in range(epochs):
    # ---- training pass over the stored mask logits ----
    for batch in dataloader:

        logit_tensors = batch["logit_tensor"].float().to(device=cuda)
        labels = batch["label"].to(device=cuda)

        optimizer.zero_grad()
        outputs = LR_model(logit_tensors)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    if validate_model:
        # ---- validation pass: no gradients are needed here ----
        correct = 0
        total = 0
        with torch.no_grad():
            for batch_val in dataloader_val:

                logit_tensors = batch_val["logit_tensor"].float().to(device=cuda)
                # Previously assigned to a misspelled, unused name
                # (`labeles`), so the labels never moved to the device.
                labels = batch_val["label"].to(device=cuda)

                outputs = LR_model(logit_tensors)
                predicted = outputs.argmax(dim=1)

                total += labels.size(0)
                # .item() keeps the running count a plain Python int so the
                # printed accuracy is a number rather than a tensor repr.
                correct += (predicted == labels).sum().item()

        accuracy = 100 * correct / total
        # The reported loss is that of the last training batch of the epoch.
        print("Epoch: {}. Loss: {}. Validation Accuracy: {}.".format(epoch, loss.item(), accuracy))

In [10]:
# torch.save does not create missing parent directories, so make sure the
# output folder exists before writing the classifier weights.
os.makedirs("logit_models", exist_ok=True)
torch.save(LR_model.state_dict(), os.path.join("logit_models", model_name + ".pt"))