# Initial Setup

## (Google Colab use only)

In [None]:
# Use Google Colab
use_colab = True

# Is this notebook running on Colab?
# If so, then google.colab package (github.com/googlecolab/colabtools)
# should be available in this environment

# Previous version used importlib, but we could do the same thing with
# just attempting to import google.colab
try:
    from google.colab import drive
    colab_available = True
except:
    colab_available = False

if use_colab and colab_available:
    drive.mount('/content/drive')

    # cd to the appropriate working directory under my Google Drive
    %cd '/content/drive/My Drive/cs696ds_lexalytics/Prompting Experiments'
    
    # Install packages specified in requirements
    !pip install -r requirements.txt
    
    # List the directory contents
    !ls

## Experiment parameters

In [None]:
import os

# Identifier of this (training) experiment; it is reused in directory paths
# and other settings.
# NOTE(review): the ID mentions "i_felt" while the prompt configured below is
# "I [MASK] the {aspect}." — confirm which of the two is intended.
experiment_id = 'prompt_logit_softmax_atsc_single_prompt_i_felt_bert_amazon_electronics'

# Seed shared by every random number generator in this notebook
random_seed = 696

# Folder of the pretrained MLM checkpoint (the string "bert-base-uncased"
# may be used here instead)
lm_model_path = os.path.join(*[
    'progress',
    'lm_further_pretraining_bert_amazon_electronics_bseoh_2021-03-06--18_59_53',
    'results',
    'checkpoint-1180388'])

# Fraction of the training data held out for validation
validation_dataset_proportion = 0.2

# Prompt templates appended after each review text.
# The pseudo-label words of a prompt must be listed in the order
# (positive), (negative), (neutral).
# NOTE(review): "dislike" sits in the neutral slot — confirm this is deliberate.
sentiment_prompts = [
    dict(prompt="I [MASK] the {aspect}.", labels=["love", "hate", "dislike"]),
]

# Optimization settings used while training
training_epochs, training_batch_size = 10, 32
training_learning_rate, training_weight_decay = 1e-3, 0.01

# Batch sizes for the evaluation passes
validation_batch_size = testing_batch_size = 32

## Package imports

In [None]:
import sys
import os
import random
import shutil
import copy

import numpy as np
import torch
import transformers
import datasets
import sklearn.metrics
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
import tqdm

import utils

# Seed Python's, NumPy's and PyTorch's random number generators with the
# experiment-wide seed
for seed_generator in (random.seed, np.random.seed, torch.manual_seed):
    seed_generator(random_seed)

# Report the interpreter and key library versions
print(f"Python version: {sys.version}")
print(f"NumPy version: {np.__version__}")
print(f"PyTorch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")

## PyTorch GPU settings

In [None]:
# Pick the GPU when one is visible to PyTorch, otherwise fall back to the CPU
cuda_is_available = torch.cuda.is_available()

torch_device = torch.device('cuda' if cuda_is_available else 'cpu')

# Page-locked host memory speeds up host-to-GPU copies; only enabled with CUDA
use_pin_memory = cuda_is_available

# Number of compute devices to be used for training (1 when running on CPU)
training_device_count = torch.cuda.device_count() if cuda_is_available else 1

if cuda_is_available:
    # cuDNN determinism is left off here; switch it to True for immediately
    # reproducible output.
    # Note: https://pytorch.org/docs/stable/notes/randomness.html
    torch.backends.cudnn.deterministic = False

    # cuDNN 'benchmark' mode is left on here; switch it to False to measure
    # running times more fairly.
    # Note: https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936
    torch.backends.cudnn.benchmark = True

    # CUDA libraries version information
    print("CUDA Version: " + str(torch.version.cuda))
    print("cuDNN Version: " + str(torch.backends.cudnn.version()))
    print("CUDA Device Name: " + str(torch.cuda.get_device_name()))
    print("CUDA Capabilities: "+ str(torch.cuda.get_device_capability()))
    print("Number of CUDA devices: "+ str(training_device_count))

print()
print("PyTorch device selected:", torch_device)

# Prepare Datasets for Prompt-based Classifier

## Load the SemEval dataset

In [None]:
# Both domains are read through the same SemEval 2014 Task 4 loading script
semeval_script_path = os.path.abspath('dataset_scripts/semeval2014_task4/semeval2014_task4.py')

# In-domain data: laptop reviews
in_domain_semeval_dataset = datasets.load_dataset(
    semeval_script_path,
    data_files=dict(
        test='dataset_files/semeval_2014/Laptops_Test_Gold.xml',
        train='dataset_files/semeval_2014/Laptop_Train_v2.xml'),
    cache_dir='dataset_cache')

# Out-domain data: restaurant reviews
out_domain_semeval_dataset = datasets.load_dataset(
    semeval_script_path,
    data_files=dict(
        test='dataset_files/semeval_2014/Restaurants_Test_Gold.xml',
        train='dataset_files/semeval_2014/Restaurants_Train_v2.xml'),
    cache_dir='dataset_cache')

In [None]:
# The out-domain data supplies both the training and a test split
out_domain_train, out_domain_test = (
    out_domain_semeval_dataset[split_name] for split_name in ('train', 'test'))

# Only the test split of the in-domain data is used;
# in-domain training data must never be touched
in_domain_test = in_domain_semeval_dataset['test']

## Train-validation split for out-domain SemEval data

In [None]:
# Number of out-domain training examples available before the split
out_domain_train_total = len(out_domain_train)

# Examples kept for training; the remainder goes to validation
new_out_domain_train_dataset_size = int(out_domain_train_total * (1 - validation_dataset_proportion))
new_out_domain_valid_dataset_size = out_domain_train_total - new_out_domain_train_dataset_size

print(f"Training dataset (out-domain) after split: {new_out_domain_train_dataset_size}")
print(f"Validation dataset (out-domain) after split: {new_out_domain_valid_dataset_size}")

In [None]:
# Shuffle from the dataset as loaded rather than from `out_domain_train`
# itself: this cell rebinds `out_domain_train`, so shuffling that name again
# on a re-run would permute already-shuffled data and give a different
# train/validation split than a fresh Restart & Run All.
out_domain_train = out_domain_semeval_dataset['train'].shuffle(seed=random_seed)

# The first `new_out_domain_train_dataset_size` shuffled examples are used
# for training
new_out_domain_train_dataset = out_domain_train.select(
    indices=np.arange(new_out_domain_train_dataset_size))

# The examples immediately after the training slice are used for validation
new_out_domain_valid_dataset = out_domain_train.select(
    indices=np.arange(
        new_out_domain_train_dataset_size,
        new_out_domain_train_dataset_size + new_out_domain_valid_dataset_size))

In [None]:
# Show the first training example as a sanity check of the split
first_training_example = new_out_domain_train_dataset[0]
print(first_training_example)

# Zero-shot ATSC with Prompts + MLM Output Head

## Load the pretrained LM

In [None]:
# Masked language model restored from the configured checkpoint path
lm = transformers.AutoModelForMaskedLM.from_pretrained(
    pretrained_model_name_or_path=lm_model_path)

# The tokenizer is always the stock bert-base-uncased one (cached locally)
tokenizer = transformers.AutoTokenizer.from_pretrained(
    'bert-base-uncased',
    cache_dir='bert_base_cache')

## Define a new model with MLM output head

In [None]:
# Vocabulary IDs of the pseudo-label words of the first (and only) prompt,
# in the order the labels are listed
sentiment_word_ids = [
    tokenizer.convert_tokens_to_ids(label_word)
    for label_word in sentiment_prompts[0]['labels']]

print(sentiment_word_ids)

# Wrap the LM in the prompt-based classification head, which is told the
# pseudo-label word IDs and the ID of the [MASK] token
classifier_model = utils.SinglePromptLogitSentimentClassificationHead(
    lm=lm,
    num_class=3,
    pseudo_label_words=sentiment_word_ids,
    target_token_id=tokenizer.mask_token_id)

# Only the parameters under `lm.bert` are frozen; everything else
# (i.e. the MLM head) stays trainable
for frozen_parameter in classifier_model.lm.bert.parameters():
    frozen_parameter.requires_grad = False

classifier_model = classifier_model.to(device=torch_device)

## Training settings

In [None]:
# Batches are drawn in dataset order (no per-epoch reshuffling is requested)
train_dataloader = torch.utils.data.DataLoader(
    dataset=new_out_domain_train_dataset,
    batch_size=training_batch_size,
    pin_memory=use_pin_memory)

validation_dataloader = torch.utils.data.DataLoader(
    dataset=new_out_domain_valid_dataset,
    batch_size=validation_batch_size,
    pin_memory=use_pin_memory)

# Classification objective over the sentiment classes
loss_function = torch.nn.CrossEntropyLoss()

# All model parameters are handed to the optimizer
optimizer = transformers.AdamW(
    params=classifier_model.parameters(),
    lr=training_learning_rate,
    weight_decay=training_weight_decay)

# Constant learning rate; zero warmup steps
scheduler = transformers.get_constant_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0)

# The best version of the head is saved under this directory, which is
# wiped and recreated so that each run starts from an empty folder
trained_model_directory = os.path.join('.', 'trained_models', experiment_id)

shutil.rmtree(trained_model_directory, ignore_errors=True)
os.makedirs(trained_model_directory)

In [None]:
def compute_metrics(predictions, labels):
    """Compute accuracy and macro-averaged precision, recall and F1.

    Args:
        predictions: Per-class scores; the predicted class of each example
            is the argmax over the last axis.
        labels: True class IDs, one per example.

    Returns:
        A dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
        The macro average always covers the class IDs 0, 1 and 2.
    """
    predicted_classes = predictions.argmax(-1)

    # Returned tuple is (precision, recall, f1, support)
    macro_scores = sklearn.metrics.precision_recall_fscore_support(
        y_true=labels, y_pred=predicted_classes, labels=[0,1,2], average='macro')

    return dict(
        accuracy=sklearn.metrics.accuracy_score(labels, predicted_classes),
        f1=macro_scores[2],
        precision=macro_scores[0],
        recall=macro_scores[1])

## Training loop

In [None]:
def _encode_prompted_batch(batch):
    """Tokenize a batch of reviews paired with the populated sentiment prompts.

    For every prompt in `sentiment_prompts`, the batch's review texts are
    repeated once and the prompt template is filled in with each example's
    aspect. Reviews are the first segment and prompts the second; only the
    review is truncated, and everything is padded to 256 tokens.

    Args:
        batch: A dataloader batch providing the "text" and "aspect" fields.

    Returns:
        The tokenizer output, moved to `torch_device`.
    """
    reviews_repeated = []
    prompts_populated = []

    for prompt in sentiment_prompts:
        reviews_repeated.extend(batch["text"])
        prompts_populated.extend(
            prompt['prompt'].format(aspect=aspect) for aspect in batch["aspect"])

    batch_encoded = tokenizer(
        reviews_repeated, prompts_populated,
        padding='max_length', truncation='only_first', max_length=256,
        return_tensors='pt')

    return batch_encoded.to(torch_device)


# Lowest validation loss seen so far and the epoch that produced it
best_validation_loss = float('inf')
best_epoch = -1

for epoch in tqdm.notebook.tqdm(range(int(training_epochs))):

    print("Training epoch %d" % epoch)
    print()

    classifier_model.train()

    for batch in tqdm.notebook.tqdm(train_dataloader):

        batch_encoded = _encode_prompted_batch(batch)
        labels = batch["sentiment"].to(torch_device)

        optimizer.zero_grad()

        outputs = classifier_model(batch_encoded)

        loss = loss_function(outputs, labels)

        loss.backward()

        optimizer.step()
        scheduler.step()

    # Validate the model using val dataset
    classifier_model.eval()

    print("Validation epoch %d" % epoch)
    print()

    # Per-batch results are collected in lists and concatenated once at the
    # end, instead of re-allocating a growing tensor on every batch
    predictions_per_batch = []
    labels_per_batch = []

    # No gradients are needed for validation; without no_grad() every batch's
    # autograd graph would be kept alive through the collected outputs
    with torch.no_grad():
        for batch_val in tqdm.notebook.tqdm(validation_dataloader):

            batch_encoded = _encode_prompted_batch(batch_val)

            outputs = classifier_model(batch_encoded)

            predictions_per_batch.append(outputs.to('cpu'))
            labels_per_batch.append(batch_val["sentiment"])

    predictions_val = torch.cat(predictions_per_batch)
    labels_val = torch.cat(labels_per_batch)

    # Compute metrics
    validation_loss = torch.nn.functional.cross_entropy(predictions_val, labels_val.long())
    validation_metrics = compute_metrics(predictions_val, labels_val)

    # Note: the reported training loss is that of the last training batch of
    # the epoch, not an average over the epoch
    print(
        "Epoch {}, Training Loss: {}, Validation Loss: {}, Validation Metrics: {}".format(epoch, loss.item(), validation_loss.item(), validation_metrics))
    print()

    # Save the current epoch's model if the validation loss is lower than the best known so far
    if validation_loss.item() < best_validation_loss:

        if best_epoch != -1:
            # Removing the superseded checkpoint is best-effort: report a
            # filesystem problem but keep training
            previous_checkpoint_path = os.path.join(
                trained_model_directory, 'epoch_{}.pt'.format(best_epoch))
            try:
                os.remove(previous_checkpoint_path)
            except OSError as removal_error:
                print("Could not remove {}: {}".format(previous_checkpoint_path, removal_error))

        best_validation_loss = validation_loss.item()
        best_epoch = epoch

        # Only the MLM head (`lm.cls`) is saved; the `lm.bert` layers are frozen
        torch.save(classifier_model.lm.cls, os.path.join(trained_model_directory, 'epoch_{}.pt'.format(epoch)))

## Evaluation with in-domain test set



In [None]:
# Batches of the in-domain test split, served in dataset order
test_dataloader = torch.utils.data.DataLoader(
    dataset=in_domain_test,
    batch_size=testing_batch_size,
    pin_memory=use_pin_memory)

In [None]:
# Load the best found head weights (the checkpoint of `best_epoch`)
classifier_model.lm.cls = torch.load(
    os.path.join(trained_model_directory, 'epoch_{}.pt'.format(best_epoch)),
    map_location=torch_device)

classifier_model.eval()

# Per-batch results are collected in lists and concatenated once at the end,
# instead of re-allocating a growing tensor on every batch
predictions_per_batch = []
labels_per_batch = []

# No gradients are needed for evaluation; without no_grad() every batch's
# autograd graph would be kept alive through the collected outputs
with torch.no_grad():
    for batch_val in tqdm.notebook.tqdm(test_dataloader):

        reviews_repeated = []
        prompts_populated = []

        # For each prompt, repeat the batch's reviews and fill the prompt
        # template with each example's aspect
        for prompt in sentiment_prompts:
            reviews_repeated.extend(batch_val["text"])
            prompts_populated.extend(
                prompt['prompt'].format(aspect=aspect) for aspect in batch_val["aspect"])

        # Reviews are the first segment and prompts the second; only the
        # review is truncated
        batch_encoded = tokenizer(
            reviews_repeated, prompts_populated,
            padding='max_length', truncation='only_first', max_length=256,
            return_tensors='pt')

        batch_encoded = batch_encoded.to(torch_device)

        outputs = classifier_model(batch_encoded)

        predictions_per_batch.append(outputs.to('cpu'))
        labels_per_batch.append(batch_val["sentiment"])

predictions_test = torch.cat(predictions_per_batch)
labels_test = torch.cat(labels_per_batch)

# Compute metrics
test_metrics = compute_metrics(predictions_test, labels_test)

print(test_metrics)

## Results visualization

In [None]:
# Display names of the sentiment classes, listed in class-ID order (0, 1, 2)
# NOTE(review): assumes the dataset encodes positive/negative/neutral as 0/1/2,
# matching the pseudo-label order of the prompts — confirm against the dataset script
sentiment_class_names = ["positive", "negative", "neutral"]

# True and predicted class IDs as integer arrays
true_classes = labels_test.detach().numpy().astype(int)
predicted_classes = predictions_test.detach().numpy().argmax(-1)

# Passing `labels` explicitly guarantees a 3x3 matrix even when some class
# never occurs in the labels or predictions, so the DataFrame below always
# lines up with the class names
cm = sklearn.metrics.confusion_matrix(
    true_classes, predicted_classes,
    labels=list(range(len(sentiment_class_names))))

df_cm = pd.DataFrame(
    cm,
    index=sentiment_class_names,
    columns=sentiment_class_names)

plt.figure(figsize=(10, 7))

# fmt='d' prints the counts as plain integers; the default format would
# render counts of 100 or more in scientific notation
ax = sn.heatmap(df_cm, annot=True, fmt='d')

ax.set(xlabel='Predicted Label', ylabel='True Label')
plt.show()