# Initial Setups


## (Google Colab use only)

In [None]:
# Use Google Colab
use_colab = True

# Is this notebook running on Colab?
# If so, then google.colab package (github.com/googlecolab/colabtools)
# should be available in this environment

# Previous version used importlib, but we could do the same thing with
# just attempting to import google.colab
try:
    from google.colab import drive
    colab_available = True
except:
    colab_available = False

if use_colab and colab_available:
    drive.mount('/content/drive')

    # cd to the appropriate working directory under my Google Drive
    %cd 'drive/My Drive/cs696ds_lexalytics/Language Model Finetuning'
    
    # Install packages specified in requirements
    !pip install -r requirements.txt
    
    # List the directory contents
    !ls

## Experiment Parameters

**NOTE**: The following `experiment_id` MUST BE CHANGED in order to avoid overwriting the files from other experiments!!!!!!

**NOTE 2**: The values for the variables in the cell below can be overridden by `papermill` at runtime. Variables in other cells cannot be changed in this manner.

In [2]:
# We will use the following string ID to identify this particular (training) experiments
# in directory paths and other settings
experiment_id = 'lm_finetuning_bert_nli'

# Random seed
random_seed = 696

# Dataset size related
total_subset_proportion = 1.0 # Do we want to use the entirety of the training set, or some parts of it?
validation_dataset_proportion = 0.1 # Proportion to be reserved for validation (after selecting random subset with total_subset_proportion)

# Training hyperparameters
num_train_epochs = 20 # Number of epochs
per_device_train_batch_size = 16 # training batch size PER COMPUTE DEVICE
per_device_eval_batch_size = 16 # evaluation batch size PER COMPUTE DEVICE
learning_rate = 1e-5
weight_decay = 0.01

## Package Imports

In [3]:
import sys
import os
import random

import numpy as np
import torch
import transformers
import datasets

import utils

# Random seed settings
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)

# Print version information
print("Python version: " + sys.version)
print("NumPy version: " + np.__version__)
print("PyTorch version: " + torch.__version__)
print("Transformers version: " + transformers.__version__)

Python version: 3.8.5 (default, Sep  3 2020, 21:29:08) [MSC v.1916 64 bit (AMD64)]
NumPy version: 1.19.2
PyTorch version: 1.7.1
Transformers version: 4.4.0.dev0


## PyTorch GPU settings

In [3]:
if torch.cuda.is_available():    
    torch_device = torch.device('cuda')

    # Set this to True to make your output immediately reproducible
    # Note: https://pytorch.org/docs/stable/notes/randomness.html
    torch.backends.cudnn.deterministic = False
    
    # Disable 'benchmark' mode: Set this False if you want to measure running times more fairly
    # Note: https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936
    torch.backends.cudnn.benchmark = True
    
    # Faster Host to GPU copies with page-locked memory
    use_pin_memory = True
    
    # Number of compute devices to be used for training
    training_device_count = torch.cuda.device_count()

    # CUDA libraries version information
    print("CUDA Version: " + str(torch.version.cuda))
    print("cuDNN Version: " + str(torch.backends.cudnn.version()))
    print("CUDA Device Name: " + str(torch.cuda.get_device_name()))
    print("CUDA Capabilities: "+ str(torch.cuda.get_device_capability()))
    print("Number of CUDA devices: "+ str(training_device_count))
    
else:
    torch_device = torch.device('cpu')
    use_pin_memory = False
    
    # Number of compute devices to be used for training
    training_device_count = 1

print()
print("PyTorch device selected:", torch_device)


PyTorch device selected: cpu


  return torch._C._cuda_getDeviceCount() > 0


# Further pre-training

## Load the BERT-base-uncased-MNLI model


In [13]:
tokenizer = transformers.AutoTokenizer.from_pretrained("textattack/bert-base-uncased-MNLI", cache_dir='./bert_base_uncased-MNLI')
model = transformers.AutoModelForSequenceClassification.from_pretrained("textattack/bert-base-uncased-MNLI", cache_dir='./bert_base_cache')

## Load the Yelp dataset

In [None]:
yelp = datasets.load_dataset(
    './dataset_scripts/yelp_restaurants/yelp_NLI.py',
    data_files={
        'train': 'dataset_files/yelp_restaurants/yelp_academic_dataset_review.json',
        'restaurant_ids': 'dataset_files/yelp_restaurants/restaurantIDs.txt',
        'pos_sentiment': 'dataset_files/opinion_lexicon/positive-words.txt',
        'neg_sentiment': 'dataset_files/opinion_lexicon/negative-words.txt'
    },
    cache_dir='./dataset_cache')

In [None]:
data_train = yelp['train']
print("Number of training data (original):", len(data_train))

In [None]:
data_train_selected = data_train.shuffle(seed=random_seed).select(np.arange(0, int(len(data_train) * total_subset_proportion)))
print("Number of training data (subset):", len(data_train_selected))

In [None]:
# Check out how individual data points look like
print(data_train_selected[0])

### Preprocessing: Encode the text with Tokenizer

In [None]:
train_dataset = data_train_selected.map(
    lambda e: tokenizer(e['premise'], e['hypothesis'], truncation=True),
    batched=True)

### Train-validation split

In [None]:
# Training set size after validation split
new_train_dataset_size = int(len(train_dataset) * (1 - validation_dataset_proportion))
new_valid_dataset_size = len(train_dataset) - new_train_dataset_size

new_train_dataset = train_dataset.select(indices=np.arange(new_train_dataset_size))
new_valid_dataset = train_dataset.select(indices=np.arange(new_train_dataset_size, new_train_dataset_size + new_valid_dataset_size))

In [None]:
print("Training dataset after split:", len(new_train_dataset))
print("Validation dataset after split:", len(new_valid_dataset))

## Pre-train further

### Training settings

In [None]:
# How many training steps would we have?
approx_total_training_steps = len(new_train_dataset) // (per_device_train_batch_size * training_device_count) * num_train_epochs

print("There will be approximately %d training steps." % approx_total_training_steps)

In [6]:
training_args = transformers.TrainingArguments(
    output_dir=os.path.join('.', 'progress', experiment_id, 'results'), # output directory
    overwrite_output_dir=True,
    num_train_epochs=num_train_epochs,              # total number of training epochs
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    evaluation_strategy='epoch',
    save_strategy='steps',
    save_steps=100000,
    logging_dir=os.path.join('.', 'progress', experiment_id, 'logs'), # directory for storing logs
    logging_first_step=True,
    weight_decay=weight_decay,               # strength of weight decay
    seed=random_seed,
    learning_rate=learning_rate,
    fp16=True,
    fp16_backend='amp',
    prediction_loss_only=True,
    load_best_model_at_end=True,
    dataloader_num_workers=training_device_count * 2,
    dataloader_pin_memory=use_pin_memory
)

  return torch._C._cuda_getDeviceCount() > 0


ValueError: Mixed precision training with AMP or APEX (`--fp16`) and FP16 evaluation can only be used on CUDA devices.

In [None]:
print(training_args.n_gpu)

In [4]:
metric = datasets.load_metric('glue', 'mnli')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions[:, 0]
    return metric.compute(predictions=predictions, references=labels)

In [None]:
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=new_train_dataset,
    eval_dataset=new_valid_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

### Training loop

In [None]:
%%time
trainer.train()

### Save the model to the local directory

In [None]:
trainer.save_model(os.path.join('.', 'trained_models', experiment_id))

In [None]:
tokenizer.save_pretrained(os.path.join('.', 'trained_models', experiment_id))

## LM Evaluation

In [None]:
eval_results = trainer.evaluate()

In [None]:
print(eval_results)

perplexity = np.exp(eval_results["eval_loss"])

print(perplexity)

## Playing with my own input sentences

In [160]:
sequence_0 = "At the other end of Pennsylvania Avenue, people began to line up for a White House tour."
sequence_1 = "People formed a line at the end of Pennsylvania Avenue."

sequence_0 = "The Old One always comforted Ca'daan, except today."
sequence_1 = "Ca'daan knew the Old One very well."

x = tokenizer.encode_plus(sequence_0, sequence_1,  truncation=True, return_tensors="pt").to(torch_device)

logits = model(**x)[0]

entailment_results = torch.softmax(logits, dim=1).tolist()[0]

print(entailment_results)

example_tokens = []
for id in x[0].ids:
    example_tokens.append(tokenizer.convert_ids_to_tokens(id))

print (example_tokens)

[0.04422219097614288, 0.037514906376600266, 0.9182628989219666]
['[CLS]', 'the', 'old', 'one', 'always', 'comfort', '##ed', 'ca', "'", 'da', '##an', ',', 'except', 'today', '.', '[SEP]', 'ca', "'", 'da', '##an', 'knew', 'the', 'old', 'one', 'very', 'well', '.', '[SEP]']
