# Initial Setups

## (Google Colab use only)

In [None]:
# Use Google Colab
use_colab = True

# Is this notebook running on Colab?
# If so, then google.colab package (github.com/googlecolab/colabtools)
# should be available in this environment

# Previous version used importlib, but we could do the same thing with
# just attempting to import google.colab
try:
    from google.colab import drive
    colab_available = True
except:
    colab_available = False

if use_colab and colab_available:
    drive.mount('/content/drive')

    # cd to the appropriate working directory under my Google Drive
    %cd 'drive/My Drive/zero_shot_atsc'
    
    # Install packages specified in requirements
    !pip install -r requirements.txt
    
    # List the directory contents
    !ls

## Experiment Parameters

In [None]:
# Path to the pretrained MLM model folder, or the string "bert-base-uncased".
# Passed to BertModel.from_pretrained in the "Load the pretrained LM" cell.
bert_model_path = '../trained_models/lm_further_pretraining_bert_amazon_electronics'

# The in-domain dataset is used for testing and for classifier validation.
# The out-domain dataset is used to train the classifier.
# Values are "Laptops" or "Restaurants": the name is interpolated verbatim into
# the SemEval file names ('{domain}_Test_Gold.xml' / '{domain}_Train_v2.xml').
in_domain_dataset_name = "Laptops"
out_domain_dataset_name = "Restaurants"

# Prompt templates paired with each review text; `{aspect}` is filled in with
# the aspect term and [MASK] marks the position whose hidden state is extracted.
sentiment_prompts = [
    "The {aspect} is [MASK].",
    "I [MASK] the {aspect}.",
    "I felt the {aspect} was [MASK]",
    "The {aspect} made me feel [MASK]"]

# Random seed shared by the `random`, NumPy and PyTorch seeding calls
random_seed = 696

## Package Imports

In [None]:
import sys
import os
import random
import uuid

import numpy as np
import pandas as pd
import torch
import transformers
import datasets
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.metrics import confusion_matrix, accuracy_score

# Random seed settings: seed Python's `random`, NumPy and PyTorch with the
# same `random_seed` value chosen in the experiment parameters cell.
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)

cuda


## PyTorch GPU Settings

In [None]:
# Select the compute device and the settings that depend on it.
if not torch.cuda.is_available():
    # CPU fallback: no pinned memory, a single compute device
    torch_device = torch.device('cpu')
    use_pin_memory = False
    training_device_count = 1

else:
    torch_device = torch.device('cuda')

    # Switch this to True for immediately reproducible output
    # Note: https://pytorch.org/docs/stable/notes/randomness.html
    torch.backends.cudnn.deterministic = False

    # 'benchmark' mode is left on; switch it to False to measure running times more fairly
    # Note: https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936
    torch.backends.cudnn.benchmark = True

    # Page-locked host memory gives faster host-to-GPU copies
    use_pin_memory = True

    # Number of compute devices to be used for training
    training_device_count = torch.cuda.device_count()

    # CUDA libraries version information
    print(f"CUDA Version: {torch.version.cuda}")
    print(f"cuDNN Version: {torch.backends.cudnn.version()}")
    print(f"CUDA Device Name: {torch.cuda.get_device_name()}")
    print(f"CUDA Capabilities: {torch.cuda.get_device_capability()}")
    print(f"Number of CUDA devices: {training_device_count}")

print()
print("PyTorch device selected:", torch_device)

# Prepare training data for prompt-based classifier

## Load the pretrained LM

In [None]:
# The tokenizer is always taken from the stock "bert-base-uncased" checkpoint
tokenizer = transformers.AutoTokenizer.from_pretrained(
    "bert-base-uncased",
    cache_dir='../bert_base_cache')

# The weights at `bert_model_path` are loaded as a plain BertModel
model = transformers.BertModel.from_pretrained(
    pretrained_model_name_or_path=bert_model_path,
    cache_dir='../bert_base_cache')

# Freeze every parameter of the model in one call
model.requires_grad_(False)

# Move the model to the selected device and report where it ended up
model = model.to(device=torch_device)
print(model.device)

Some weights of BertModel were not initialized from the model checkpoint at ../trained_models/lm_further_pretraining_bert_amazon_electronics and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cuda:0


## Load the SemEval dataset

In [None]:
# Load semeval for both domains
def _load_semeval_domain(domain):
    """Load the SemEval 2014 Task 4 test/train splits for one domain name.

    `domain` is substituted into the XML file names under
    ../dataset_files/semeval_2014/ and the local dataset script is used as
    the loader.
    """
    return datasets.load_dataset(
        os.path.abspath('../dataset_scripts/semeval2014_task4/semeval2014_task4.py'),
        data_files={
            'test': os.path.abspath('../dataset_files/semeval_2014/{domain}_Test_Gold.xml'.format(domain=domain)),
            'train': os.path.abspath('../dataset_files/semeval_2014/{domain}_Train_v2.xml'.format(domain=domain))
        },
        cache_dir='../dataset_cache')


in_domain_semeval_dataset = _load_semeval_domain(in_domain_dataset_name)
out_domain_semeval_dataset = _load_semeval_domain(out_domain_dataset_name)

# Out-domain train split trains the classifier; the in-domain splits are
# used for validation (train split) and testing (test split).
train_dataset = out_domain_semeval_dataset["train"]
val_dataset = in_domain_semeval_dataset["train"]
test_dataset = in_domain_semeval_dataset["test"]

print(test_dataset[0])

Using custom data configuration default-3a5e3b6e10fdd547
Reusing dataset sem_eval2014_task4_dataset (../dataset_cache\sem_eval2014_task4_dataset\default-3a5e3b6e10fdd547\0.0.1\f33ba7108331ad17be3f9fb710ca001edb383fba797c6ed0938354e6812ca969)
Using custom data configuration default-790c778f2f732468
Reusing dataset sem_eval2014_task4_dataset (../dataset_cache\sem_eval2014_task4_dataset\default-790c778f2f732468\0.0.1\f33ba7108331ad17be3f9fb710ca001edb383fba797c6ed0938354e6812ca969)


{'aspect': 'Boot time', 'sentiment': 0, 'text': 'Boot time is super fast, around anywhere from 35 seconds to 1 minute.'}


## Append prompts to review text

In [None]:
# Takes in batches from the dataset and makes an example for every prompt, text pair
def add_prompts(reviews, prompts):
    """Expand a batch of reviews into one example per (review, prompt) pair.

    Args:
        reviews: batch dict with parallel lists under the keys "aspect",
            "text" and "sentiment".
        prompts: template strings; each is formatted with `aspect=<aspect>`.

    Returns:
        Dict of parallel lists: "text", "prompt", "label" and
        "review_aspect_id". All rows generated from the same review/aspect
        pair are adjacent and share one "review_aspect_id".
    """
    # Collect the output from each example in the batch
    texts = []
    sentiments = []
    ids = []
    aspect_prompts = []

    for aspect, text, sentiment in zip(
            reviews["aspect"], reviews["text"], reviews["sentiment"]):

        # ID to identify the review, aspect pair for regrouping later.
        # uuid4 (random) is used instead of uuid1, which embeds the host's
        # network address and a timestamp in every id that gets stored.
        review_aspect_id = str(uuid.uuid4())

        for p in prompts:
            texts.append(text)
            sentiments.append(sentiment)
            ids.append(review_aspect_id)
            aspect_prompts.append(p.format(aspect=aspect))

    return {"text": texts, "prompt": aspect_prompts, "label": sentiments, "review_aspect_id": ids}

In [None]:
# Map to add_prompts
def _expand_with_prompts(batch):
    """Apply add_prompts to one batch using the notebook's sentiment prompts."""
    return add_prompts(batch, sentiment_prompts)


# Each split drops its original columns and keeps only what add_prompts returns
train_prompt_dataset = train_dataset.map(
    _expand_with_prompts,
    batched=True,
    remove_columns=train_dataset.column_names)

val_prompt_dataset = val_dataset.map(
    _expand_with_prompts,
    batched=True,
    remove_columns=val_dataset.column_names)

test_prompt_dataset = test_dataset.map(
    _expand_with_prompts,
    batched=True,
    remove_columns=test_dataset.column_names)

print(test_prompt_dataset[0])
print(len(test_prompt_dataset))

Loading cached processed dataset at ../dataset_cache\sem_eval2014_task4_dataset\default-790c778f2f732468\0.0.1\f33ba7108331ad17be3f9fb710ca001edb383fba797c6ed0938354e6812ca969\cache-6268c5a73dc375bd.arrow
Loading cached processed dataset at ../dataset_cache\sem_eval2014_task4_dataset\default-3a5e3b6e10fdd547\0.0.1\f33ba7108331ad17be3f9fb710ca001edb383fba797c6ed0938354e6812ca969\cache-3c33b4c07a4b0f9e.arrow
Loading cached processed dataset at ../dataset_cache\sem_eval2014_task4_dataset\default-3a5e3b6e10fdd547\0.0.1\f33ba7108331ad17be3f9fb710ca001edb383fba797c6ed0938354e6812ca969\cache-acaa1f1ef2d50386.arrow


{'label': 0, 'prompt': 'The Boot time is [MASK].', 'review_aspect_id': 'b052c13b-82b9-11eb-8344-7085c2c04498', 'text': 'Boot time is super fast, around anywhere from 35 seconds to 1 minute.'}
2552


## Encode training data

In [None]:
# Tokenize every example and run it through the bert model
# Outputs the last hidden state of the model
def run_model(reviews, tokenizer, model, device):
    """Encode (text, prompt) pairs and pick out the hidden state at each mask token.

    Args:
        reviews: batch dict with "text", "prompt", "label" and
            "review_aspect_id" lists.
        tokenizer: callable tokenizer exposing `mask_token_id`.
        model: callable returning a mapping with "last_hidden_state".
        device: device the encoded batch is moved to before the forward pass.

    Returns:
        Dict with "hidden_state" (one vector per example, taken at the mask
        position) plus the batch's "label" and "review_aspect_id" unchanged.
    """
    # Truncation applies to the review text only, so the prompt (and with it
    # the mask token) is always present in the encoded sequence
    encoded = tokenizer(
        reviews["text"], reviews["prompt"],
        truncation='only_first', padding='max_length', max_length=256, return_tensors="pt")

    encoded.to(device=device)

    # Position of the mask token in every sequence; `.item()` requires that
    # exactly one position matches per row
    mask_positions = [
        torch.nonzero(row_ids == tokenizer.mask_token_id, as_tuple=False).item()
        for row_ids in encoded.data["input_ids"]
    ]

    # Forward pass over the whole batch
    last_hidden = model(**encoded)["last_hidden_state"]

    # Keep only the vector at the mask position of each example
    mask_vectors = [
        last_hidden[row][mask_positions[row]]
        for row in range(len(last_hidden))
    ]

    return {
        "hidden_state": mask_vectors,
        "label": reviews["label"],
        "review_aspect_id": reviews["review_aspect_id"],
    }

In [None]:
# Maps the dataset using run_model
def _encode_split(prompt_dataset):
    """Run run_model over one prompt dataset and return the mapped dataset.

    The columns removed are the ones of the dataset being mapped. Previously
    the validation and test splits passed the *training* split's column
    names, which only worked because all three splits share a schema.
    """
    return prompt_dataset.map(
        lambda e: run_model(e, tokenizer, model, torch_device),
        remove_columns=prompt_dataset.column_names,
        batched=True, batch_size=4, num_proc=None)


train_model_output = _encode_split(train_prompt_dataset)
val_model_output = _encode_split(val_prompt_dataset)
test_model_output = _encode_split(test_prompt_dataset)

HBox(children=(FloatProgress(value=0.0, max=3602.0), HTML(value='')))

In [None]:
# Regroups the outputs by review, aspect pairs and concats the outputs into one long tensor
def concat_tensors(examples, device):
    """Merge the per-prompt hidden states of one (review, aspect) pair.

    Args:
        examples: batch dict with "hidden_state" (one vector per prompt),
            "label" and "review_aspect_id"; every row must belong to the
            same review/aspect pair.
        device: device on which the concatenated tensor is built.

    Returns:
        Single-row batch: "hidden_state" holds the row vectors concatenated
        along dim 0, "label" and "review_aspect_id" hold the shared values.

    Raises:
        AssertionError: if the batch mixes different ids or labels.
    """
    ids = examples["review_aspect_id"]
    labels = examples["label"]

    # Make sure that everything in the batch is from the same review, aspect pair.
    # Comparing against the first element is linear instead of the all-pairs check.
    assert all(rid == ids[0] for rid in ids), "batch mixes several review_aspect_id values"
    assert all(lab == labels[0] for lab in labels), "batch mixes several labels"

    hidden_state_tensors = torch.FloatTensor(examples["hidden_state"]).to(device=device)
    # The rows already live on `device`, so the result needs no further transfer
    cat_hs_tensor = torch.cat(tuple(hidden_state_tensors), 0)

    return {"hidden_state": [cat_hs_tensor], "label": [labels[0]], "review_aspect_id": [ids[0]]}

In [None]:
# Map to concat_tensors
# The batch size must equal the number of sentiment prompts so that every
# batch holds exactly the rows produced from a single review
prompts_per_review = len(sentiment_prompts)


def _merge_prompt_rows(batch):
    """Apply concat_tensors to one batch on the notebook's selected device."""
    return concat_tensors(batch, torch_device)


train_hs_dataset = train_model_output.map(
    _merge_prompt_rows,
    batched=True, batch_size=prompts_per_review,
    remove_columns=train_model_output.column_names)

val_hs_dataset = val_model_output.map(
    _merge_prompt_rows,
    batched=True, batch_size=prompts_per_review,
    remove_columns=val_model_output.column_names)

test_hs_dataset = test_model_output.map(
    _merge_prompt_rows,
    batched=True, batch_size=prompts_per_review,
    remove_columns=test_model_output.column_names)

print(len(train_hs_dataset))

# Zero-shot ATSC with Prompts

## Train a classifier

In [None]:
# Classification model that converts hidden state values to per-class scores
class SentimentClassifier(torch.nn.Module):
    """A single linear layer mapping `input_dim` features to `output_dim` scores."""

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear = torch.nn.Linear(input_dim, output_dim)

    def forward(self, x):
        """Return the linear layer's output for `x`."""
        return self.linear(x)

In [None]:
# Have both splits hand out torch tensors for the hidden states and labels
for hs_split in (train_hs_dataset, val_hs_dataset):
    hs_split.set_format(type='torch', columns=['hidden_state', 'label'])

# Batch the formatted splits with dataloaders
dataloader = torch.utils.data.DataLoader(train_hs_dataset, batch_size=32)
val_dataloader = torch.utils.data.DataLoader(val_hs_dataset, batch_size=32)

In [None]:
# Train classifier
epochs = 10

# Bert hidden State size: one 768-wide vector per sentiment prompt, concatenated
input_dim = 768 * len(sentiment_prompts)
print(input_dim)
output_dim = 3
lr_rate = 0.0001

classifier_model = SentimentClassifier(input_dim, output_dim)
classifier_model.to(device=torch_device)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(classifier_model.parameters(), lr=lr_rate)

for epoch in range(int(epochs)):
    # One optimisation pass over the training batches
    classifier_model.train()
    for batch in dataloader:
        hidden_states = batch["hidden_state"].float().to(device=torch_device)
        labels = batch["label"].to(device=torch_device)

        optimizer.zero_grad()
        outputs = classifier_model(hidden_states)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    # Validate the model using val dataset
    classifier_model.eval()
    correct = 0
    total = 0

    # No gradients are needed while scoring the validation batches
    with torch.no_grad():
        for batch_val in val_dataloader:
            hidden_states = batch_val["hidden_state"].float().to(device=torch_device)
            # Labels go to the same device as the predictions they are compared with
            # (the previous code assigned the moved tensor to a misspelled name)
            labels = batch_val["label"].to(device=torch_device)

            outputs = classifier_model(hidden_states)
            predicted = torch.argmax(outputs, dim=1)

            total += labels.size(0)
            # .item() keeps the running count a plain Python int
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    # `loss` is the loss of the last training batch of this epoch
    print("Epoch: {}. Loss: {}. Validation Accuracy: {}.".format(epoch, loss.item(), accuracy))

## Run the classifier on hidden states

In [None]:
# Switch the trained classifier to evaluation mode before predicting
classifier_model.eval()
# Have the test split hand out 'hidden_state' and 'label' as torch tensors
test_hs_dataset.set_format(type='torch', columns=['hidden_state', 'label'])

# Run the LR model on the dataset
def make_predictions(examples, classifier_model, torch_device):
    """Predict a class index for every row of a batch of hidden states.

    Args:
        examples: batch dict with tensor values under "hidden_state"
            (one row per example) and "label".
        classifier_model: callable mapping the hidden states to per-class scores.
        torch_device: device the hidden states are moved to before scoring.

    Returns:
        Dict with "prediction" (list of int class indices, the argmax of each
        row of scores) and "label" (the batch labels as a list).
    """
    cat_hs_tensor = examples["hidden_state"].float().to(device=torch_device)

    # Inference only: skip building an autograd graph for the classifier
    with torch.no_grad():
        class_scores = classifier_model(cat_hs_tensor)

    # One argmax over the class dimension, returned as plain Python ints
    # rather than a list of 0-d tensors
    predictions = torch.argmax(class_scores, dim=1).tolist()

    return {"prediction": predictions, "label": examples["label"].tolist()}

# NOTE: every row of test_hs_dataset is already one (review, aspect) pair,
# because concat_tensors merged the per-prompt rows into a single row. The
# batch size used here (7) therefore does not have to match the number of
# sentiment prompts.
predictions = test_hs_dataset.map(
    lambda e: make_predictions(e, classifier_model, torch_device),
    batched=True, batch_size=7, num_proc=None)

## Convert prompt class probs to predictions

In [None]:
# This regroups the rows by review_aspect_id and makes one prediction per
# group from that group's class prob output from the LR model.
# NOTE(review): this redefines `make_predictions`, which is already defined
# earlier in this notebook with the signature
# (examples, classifier_model, torch_device); once this cell runs, the later
# definition replaces the earlier one in the kernel.
def make_predictions(example, device):
    """Average the "class_probs" rows of one batch and return the argmax.

    Asserts that all "review_aspect_id" values and all "label" values in the
    batch are equal, then returns a single-row batch holding the predicted
    class index, the shared label and the shared id. `device` is not used.
    """
    ids = example["review_aspect_id"]
    labels = example["label"]
    # All-pairs equality check: every row must carry the same id
    for a in ids:
        for b in ids:
            assert a == b
            
    # Same check for the labels
    for a in labels:
        for b in labels:
            assert a == b
            
    # Mean over the rows (dim 0), then the index of the largest mean score
    class_probs = torch.FloatTensor(example["class_probs"])
    prediction_mean = torch.mean(class_probs, 0)
    prediction = torch.argmax(prediction_mean)
    
    return {"prediction": [prediction], "label": [example["label"][0]], "review_aspect_id": [example["review_aspect_id"][0]]}

# NOTE(review): `LR_output` is not defined anywhere in the visible notebook and
# no visible step produces a "class_probs" column, so unless it is defined
# elsewhere this statement raises NameError on a fresh Restart & Run All.
# If it did run it would overwrite the `predictions` computed in the previous
# section. This cell looks like a leftover from a per-prompt pipeline; confirm.
predictions = LR_output.map(
    lambda e: make_predictions(e, torch_device),
    remove_columns=LR_output.column_names,
    batched=True, batch_size=len(sentiment_prompts), num_proc=None)

## Results

In [None]:
# Calculate metrics and confusion matrix based upon predictions and true labels
accuracy = accuracy_score(predictions["label"], predictions["prediction"])

print("Accuracy: {:.2%}".format(accuracy))

# Axis names in class-id order.
# NOTE(review): assumes the label ids are 0, 1, 2 in this order — confirm
# against the dataset script.
class_names = ["positive", "negative", "neutral"]

# Passing `labels` pins the matrix to one row/column per class even when a
# class never occurs in the labels or predictions; without it the matrix
# shrinks and no longer matches the three axis names.
cm = confusion_matrix(
    predictions["label"], predictions["prediction"],
    labels=list(range(len(class_names))))
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)

fig, ax = plt.subplots(figsize=(10, 7))
# fmt="d" prints the cell counts as whole numbers instead of '.2g' notation
sn.heatmap(df_cm, annot=True, fmt="d", ax=ax)

ax.set(xlabel='Predicted Label', ylabel='True Label')
plt.show()