# Initial Setups

## (Google Colab use only)

In [1]:
# Use Google Colab
use_colab = True

# Is this notebook running on Colab?
# If so, then google.colab package (github.com/googlecolab/colabtools)
# should be available in this environment

# Previous version used importlib, but we could do the same thing with
# just attempting to import google.colab
try:
    from google.colab import drive
    colab_available = True
except:
    colab_available = False

if use_colab and colab_available:
    drive.mount('/content/drive')

    # cd to the appropriate working directory under my Google Drive
    %cd '/content/drive/My Drive/zero_shot_atsc'
    
    # Install packages specified in requirements
    !pip install -r requirements.txt
    
    # List the directory contents
    !ls

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/zero_shot_atsc
 bert_base_cache
'Copy of prompt_atsc_bert_amazon_electronics.ipynb'
 dataset_cache
 dataset_files
 dataset_scripts
 environment.yml
 gypsum_logs
 LICENSE
 lm_further_pretraining_bert_amazon_electronics.ipynb
 lm_further_pretraining_bert_yelp_restaurants.ipynb
 lm_further_pretraining_gpt-2_amazon_electronics.ipynb
 lm_further_pretraining_gpt-2_yelp_restaurants.ipynb
 progress
 prompt_atsc_bert_amazon_electronics.ipynb
 README.md
 requirements.txt
 sbatch_lm_further_pretraining_bert_amazon_electronics.sh
 sbatch_lm_further_pretraining_bert_yelp_restaurants_10percent.sh
 sbatch_lm_further_pretraining_bert_yelp_restaurants.sh
 sbatch_lm_further_pretraining_gpt-2_amazon_electronics.sh
 sbatch_lm_further_pretraining_gpt-2_yelp_restaurants_10percent.sh
 sbatch_lm_further_pretraining_gpt-2_yelp_restaurants.sh
 sbatch_test.sh
 u

## Experiment Parameters

In [2]:
import os

# path to pretrained MLM model folder or the string "bert-base-uncased"
mlm_model_path = os.path.join(
    'progress', 'lm_further_pretraining_bert_amazon_electronics_bseoh_2021-03-06--18_59_53',
    'results', 'checkpoint-1180388')

# in domain will be used for testing
# out domain will be used to train the classifier
# Values are "Laptop" or "Restaurant"
in_domain_dataset_name = "Laptops"
out_domain_dataset_name = "Restaurants"

validation_dataset_proportion = 0.1 # Proportion to be reserved for validation

# Prompts to be added to the end of each review text
sentiment_prompts = [
    "The {aspect} is [MASK].",
    "I [MASK] the {aspect}.",
    "I felt the {aspect} was [MASK]",
    "The {aspect} made me feel [MASK]"]

# Training settings for logistic regression
lr_training_epochs = 10
lr_training_batch_size = 64
lr_training_learning_rate = 2e-6

# Random seed
random_seed = 696

## Package Imports

In [3]:
import sys
import os
import random

import numpy as np
import torch
import transformers
import datasets
import matplotlib.pyplot as plt
import seaborn as sn
import tqdm
from sklearn.metrics import confusion_matrix, accuracy_score

# Random seed settings
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)

# Print version information
print("Python version: " + sys.version)
print("NumPy version: " + np.__version__)
print("PyTorch version: " + torch.__version__)
print("Transformers version: " + transformers.__version__)

<torch._C.Generator at 0x7f468a82cb10>

## PyTorch GPU Settings

In [4]:
if torch.cuda.is_available():    
    torch_device = torch.device('cuda')

    # Set this to True to make your output immediately reproducible
    # Note: https://pytorch.org/docs/stable/notes/randomness.html
    torch.backends.cudnn.deterministic = False
    
    # Disable 'benchmark' mode: Set this False if you want to measure running times more fairly
    # Note: https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936
    torch.backends.cudnn.benchmark = True
    
    # Faster Host to GPU copies with page-locked memory
    use_pin_memory = True
    
    # Number of compute devices to be used for training
    training_device_count = torch.cuda.device_count()

    # CUDA libraries version information
    print("CUDA Version: " + str(torch.version.cuda))
    print("cuDNN Version: " + str(torch.backends.cudnn.version()))
    print("CUDA Device Name: " + str(torch.cuda.get_device_name()))
    print("CUDA Capabilities: "+ str(torch.cuda.get_device_capability()))
    print("Number of CUDA devices: "+ str(training_device_count))
    
else:
    torch_device = torch.device('cpu')
    use_pin_memory = False
    
    # Number of compute devices to be used for training
    training_device_count = 1

print()
print("PyTorch device selected:", torch_device)

CUDA Version: 10.1
cuDNN Version: 7603
CUDA Device Name: Tesla T4
CUDA Capabilities: (7, 5)
Number of CUDA devices: 1

PyTorch device selected: cuda


# Prepare training data for prompt-based classifier

## Load the SemEval dataset

In [5]:
# Load semeval for both domains
in_domain_semeval_dataset = datasets.load_dataset(
    os.path.abspath('dataset_scripts/semeval2014_task4/semeval2014_task4.py'),
    data_files={
        'test': 'dataset_files/semeval_2014/Laptops_Test_Gold.xml',
        'train': 'dataset_files/semeval_2014/Laptop_Train_v2.xml',
    },
    cache_dir='dataset_cache')


out_domain_semeval_dataset = datasets.load_dataset(
    os.path.abspath('dataset_scripts/semeval2014_task4/semeval2014_task4.py'),
    data_files={
        'test': 'dataset_files/semeval_2014/Restaurants_Test_Gold.xml',
        'train': 'dataset_files/semeval_2014/Restaurants_Train_v2.xml',
    },
    cache_dir='dataset_cache')

Using custom data configuration default
Reusing dataset sem_eval2014_task4_dataset (dataset_cache/sem_eval2014_task4_dataset/default-6a81dd9871e20329/0.0.1/f33ba7108331ad17be3f9fb710ca001edb383fba797c6ed0938354e6812ca969)
Using custom data configuration default
Reusing dataset sem_eval2014_task4_dataset (dataset_cache/sem_eval2014_task4_dataset/default-f51da598aeeb968e/0.0.1/f33ba7108331ad17be3f9fb710ca001edb383fba797c6ed0938354e6812ca969)


In [6]:
out_domain_train = out_domain_semeval_dataset['train']
out_domain_test = out_domain_semeval_dataset['test']

in_domain_test = in_domain_semeval_dataset['test']

## Train-validation split for out-domain SemEval data

In [7]:
# Training set size after validation split
new_out_domain_train_dataset_size = int(len(out_domain_train) * 0.8)
new_out_domain_valid_dataset_size = len(out_domain_train) - new_out_domain_train_dataset_size

print("Training dataset (out-domain) after split:", new_out_domain_train_dataset_size)
print("Validation dataset (out-domain) after split:", new_out_domain_valid_dataset_size)

Training dataset (out-domain) after split: 2881
Validation dataset (out-domain) after split: 721


In [8]:
out_domain_train = out_domain_train.shuffle(seed=random_seed)

new_out_domain_train_dataset = out_domain_train.select(indices=np.arange(new_out_domain_train_dataset_size))
new_out_domain_valid_dataset = out_domain_train.select(indices=np.arange(new_out_domain_train_dataset_size, new_out_domain_train_dataset_size + new_out_domain_valid_dataset_size))

Loading cached shuffled indices for dataset at dataset_cache/sem_eval2014_task4_dataset/default-f51da598aeeb968e/0.0.1/f33ba7108331ad17be3f9fb710ca001edb383fba797c6ed0938354e6812ca969/cache-b1be74464447108a.arrow


In [9]:
print(new_out_domain_train_dataset[0])

{'aspect': 'Unda (Egg) rolls', 'sentiment': 0, 'text': 'I really recommend the very simple Unda (Egg) rolls.'}


# Zero-shot ATSC with Prompts

## Load the pretrained LM

In [10]:
# Load pretrained language model
mlm = transformers.AutoModelForMaskedLM.from_pretrained(mlm_model_path)
tokenizer = transformers.AutoTokenizer.from_pretrained('bert-base-uncased', cache_dir='bert_base_cache')

mlm = mlm.to(torch_device)

## Define a logistic regression head

In [11]:
# This is the classification model that was trained to convert hidden state values to a class prediction
class SentimentClassifier(torch.nn.Module):
    def __init__(self, num_class, num_prompts):
        super(SentimentClassifier, self).__init__()
        self.num_class = num_class
        self.num_prompts = num_prompts

        self.mlm = mlm
        self.linear = torch.nn.Linear(
            self.num_prompts * self.mlm.config.hidden_size, self.num_class)

    def forward(self, reviews_and_prompts):

        mlm_outputs = self.mlm(**reviews_and_prompts, output_hidden_states=True)

        # Figures out where the mask token was placed
        masked_indexes = torch.nonzero(
            reviews_and_prompts.data["input_ids"] == tokenizer.mask_token_id)[:, 1]

        outputs = []

        lr_inputs_batch = []

        for i in range(len(reviews_and_prompts.data["input_ids"]) // self.num_prompts):
            # Create an input to self.linear by
            # concatenating last hidden states for this review
            lr_input = []

            for j in range(self.num_prompts):
                lr_input.append(mlm_outputs["hidden_states"][-1][i+j][masked_indexes[i+j]])

            lr_input = torch.cat(lr_input, dim=0)

            lr_inputs_batch.append(lr_input)

        lr_inputs_batch = torch.stack(lr_inputs_batch)

        outputs = self.linear(lr_inputs_batch)

        return outputs

In [12]:
classifier_model = SentimentClassifier(num_class=3, num_prompts=len(sentiment_prompts))

# Freeze the MLM main layer
for param in classifier_model.mlm.parameters():
    param.requires_grad = False

classifier_model = classifier_model.to(device=torch_device)

## Training loop

In [13]:
# Output dataset to tensors from a dataloader
dataloader = torch.utils.data.DataLoader(
    new_out_domain_train_dataset, batch_size=lr_training_batch_size,
    pin_memory=use_pin_memory)
val_dataloader = torch.utils.data.DataLoader(
    new_out_domain_valid_dataset, batch_size=lr_training_batch_size,
    pin_memory=use_pin_memory)

loss_function = torch.nn.CrossEntropyLoss()
optimizer = transformers.AdamW(classifier_model.parameters(), lr=lr_training_learning_rate)

In [14]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(predictions, labels):
    preds = predictions.argmax(-1)

    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true=labels, y_pred=preds, labels=[0,1,2], average='macro')

    acc = accuracy_score(labels, preds)

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [15]:
training_progress_bar = tqdm.notebook.tqdm(range(int(lr_training_epochs)))

for epoch in training_progress_bar:

    print("Training epoch %d" % epoch)
    print()

    classifier_model.train()

    for batch in tqdm.notebook.tqdm(dataloader):

        reviews_repeated = []
        prompts_populated = []

        for prompt in sentiment_prompts:
            reviews_repeated = reviews_repeated + batch["text"]

            for aspect in batch["aspect"]:
                prompts_populated.append(prompt.format(aspect=aspect))

        batch_encoded = tokenizer(
            reviews_repeated, prompts_populated,
            padding='max_length', truncation='only_first', max_length=256,
            return_tensors='pt')
        
        batch_encoded = batch_encoded.to(torch_device)

        labels = batch["sentiment"]
        labels = labels.to(torch_device)

        optimizer.zero_grad()

        outputs = classifier_model(batch_encoded)
        
        loss = loss_function(outputs, labels)

        loss.backward()

        optimizer.step()

    # Validate the model using val dataset
    classifier_model.eval()

    print("Validation epoch %d" % epoch)
    print()

    predictions_val = torch.Tensor([])
    labels_val = torch.Tensor([])

    for batch_val in tqdm.notebook.tqdm(val_dataloader):

        reviews_repeated = []
        prompts_populated = []

        for prompt in sentiment_prompts:
            reviews_repeated = reviews_repeated + batch_val["text"]

            for aspect in batch_val["aspect"]:
                prompts_populated.append(prompt.format(aspect=aspect))

        batch_encoded = tokenizer(
            reviews_repeated, prompts_populated,
            padding='max_length', truncation='only_first', max_length=256,
            return_tensors='pt')
        
        batch_encoded.to(torch_device)

        labels = batch_val["sentiment"]

        outputs = classifier_model(batch_encoded)

        outputs = outputs.to('cpu')

        predictions_val = torch.cat([predictions_val, outputs])
        labels_val = torch.cat([labels_val, labels])
    
    # Compute metrics
    validation_metrics = compute_metrics(predictions_val, labels_val)

    print(
        "Epoch: {}. Loss: {}. Validation: {}.".format(epoch, loss.item(), validation_metrics))
    print()

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

Training epoch 0



HBox(children=(FloatProgress(value=0.0, max=46.0), HTML(value='')))


Validation:



HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))


Training epoch 1



HBox(children=(FloatProgress(value=0.0, max=46.0), HTML(value='')))


Validation:



HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))


Training epoch 2



HBox(children=(FloatProgress(value=0.0, max=46.0), HTML(value='')))


Validation:



HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))


Training epoch 3



HBox(children=(FloatProgress(value=0.0, max=46.0), HTML(value='')))

KeyboardInterrupt: ignored

## Evaluation with in-domain test set



In [21]:
classifier_model.eval()
test_hs_dataset.set_format(type='torch', columns=['hidden_state', 'label'])

# Run the LR model on the dataset
def make_predictions(examples, classifier_model, torch_device):
    
    cat_hs_tensor = examples["hidden_state"].float().to(device=torch_device)
    class_probs = classifier_model(cat_hs_tensor)

    predictions = []
    for cp in class_probs:
        predictions.append(torch.argmax(cp))
    
    return {"prediction": predictions, "label": examples["label"].tolist()}

# Batch size has to be the same as sentiment prompts so that the examples in a batches all come from the same review
predictions = test_hs_dataset.map(
    lambda e: make_predictions(e, classifier_model, torch_device),
    batched=True, batch_size=7, num_proc=None)

HBox(children=(FloatProgress(value=0.0, max=92.0), HTML(value='')))




## Results visualization

In [None]:
# Calculate metrics and confusion matrix based upon predictions and true labels
accuracy = accuracy_score(predictions["label"], predictions["prediction"])

print("Accuracy: {:.2%}".format(accuracy))

cm = confusion_matrix(predictions["label"], predictions["prediction"])
df_cm = pd.DataFrame(cm, index = [i for i in ["positive", "negative", "neutral"]],
                  columns = [i for i in ["positive", "negative", "neutral"]])

plt.figure(figsize = (10,7))
ax = sn.heatmap(df_cm, annot=True)

ax.set(xlabel='Predicted Label', ylabel='True Label')
plt.show()