# Initial Setups


## (Google Colab use only)

In [None]:
# Use Google Colab
use_colab = True

# Is this notebook running on Colab?
# If so, then google.colab package (github.com/googlecolab/colabtools)
# should be available in this environment

# Previous version used importlib, but we could do the same thing with
# just attempting to import google.colab
try:
    from google.colab import drive
    colab_available = True
except:
    colab_available = False

if use_colab and colab_available:
    drive.mount('/content/drive')
    
    # If there's a package I need to install separately, do it here
    #!pip install pyro-ppl

    # cd to the appropriate working directory under my Google Drive
    %cd 'drive/My Drive/cs696ds_lexalytics/Language Model Finetuning'
    
    # List the directory contents
    !ls

## Experiment ID

**NOTE**: Change the following `experiment_id` for every new experiment; otherwise this run will overwrite the files produced by other experiments.

In [None]:
# String ID of this particular (training) experiment.
# It is embedded in the results, logs and saved-model directory paths,
# so reusing an ID means writing into the same directories.
experiment_id = "lm_further_pretraining_gpt-2_amazon_electronics"

## Package Install

In [None]:
# Install packages specified in requirements
!pip install -r requirements.txt

In [None]:
# NVidia APEX install
!unzip -o apex-master_downloaded_01Mar2021.zip
%cd apex-master
!pip install -v --no-cache-dir ./
%cd ..

In [None]:
# IPython reloading magic:
# load the autoreload extension and use mode 2, so that edited local modules
# are re-imported before each cell execution without restarting the kernel
%load_ext autoreload
%autoreload 2

## Package Imports

In [None]:
import sys
import os
import random
import numpy as np
import torch
import transformers
import datasets

import utils

# Random seed settings: seed Python, NumPy and PyTorch with the same value.
# `random_seed` is also reused later for the Trainer.
random_seed = 696
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)

# Print version information for every library this notebook relies on,
# so that a run can be reproduced with the same software stack
print("Python version: " + sys.version)
print("NumPy version: " + np.__version__)
print("PyTorch version: " + torch.__version__)
print("Transformers version: " + transformers.__version__)
print("Datasets version: " + datasets.__version__)

## PyTorch GPU settings

In [None]:
# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU
torch_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Faster host-to-GPU copies with page-locked memory: only enabled with a GPU
use_pin_memory = torch_device.type == 'cuda'

if use_pin_memory:
    # Deterministic cuDNN is left off; set this to True to make your output
    # immediately reproducible
    # Note: https://pytorch.org/docs/stable/notes/randomness.html
    torch.backends.cudnn.deterministic = False

    # cuDNN 'benchmark' mode is left on; set this to False if you want to
    # measure running times more fairly
    # Note: https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936
    torch.backends.cudnn.benchmark = True

    # CUDA libraries version information
    print(f"CUDA Version: {torch.version.cuda}")
    print(f"cuDNN Version: {torch.backends.cudnn.version()}")
    print(f"CUDA Device Name: {torch.cuda.get_device_name()}")
    print(f"CUDA Capabilities: {torch.cuda.get_device_capability()}")

print()
print("PyTorch device selected:", torch_device)

# Further pre-training

## Load the GPT-2 model

In [None]:
# Load the pretrained GPT-2 tokenizer and LM-head model.
# Downloaded files are cached under ./gpt2_cache.
tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2", cache_dir='./gpt2_cache')
model = transformers.GPT2LMHeadModel.from_pretrained("gpt2", cache_dir='./gpt2_cache')

# GPT-2 ships without a padding token, but the preprocessing step pads every
# review with padding='max_length', which fails when no pad token is defined.
# Reuse the end-of-text token for padding: it is already in the vocabulary,
# so the embedding matrix does not need to be resized.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

## Load the Amazon electronics dataset

In [None]:
# Load the Amazon (UCSD) Electronics reviews.
# The first argument points at a directory under ./dataset_scripts
# (presumably a local dataset loading script -- confirm), the gzipped JSON
# file is registered as the single 'train' split, and ./dataset_cache is
# passed as the cache directory.
amazon = datasets.load_dataset(
    './dataset_scripts/amazon_ucsd_reviews',
    data_files={
        'train': 'dataset_files/amazon_ucsd_reviews/Electronics.json.gz',
    },
    cache_dir='./dataset_cache')

In [None]:
# Only a 'train' split was registered when loading; keep a direct handle to it
data_amazon_train = amazon['train']

In [None]:
# Sanity check: how many examples were loaded
print("Number of training data:", len(data_amazon_train))

In [None]:
# Check what an individual data point looks like (index 696 is an arbitrary pick)
print(data_amazon_train[696])

### Preprocessing: Encode the text with Tokenizer

In [None]:
def tokenize_batch(batch):
    """Tokenize a batch of reviews, truncating/padding each text to 256 tokens."""
    return tokenizer(batch['text'], truncation=True, padding='max_length', max_length=256)


# Encode the whole training split in batches using 16 worker processes.
# All original columns are dropped, so only the tokenizer outputs remain.
train_dataset_pretraining = data_amazon_train.map(
    tokenize_batch,
    batched=True,
    num_proc=16,
    remove_columns=data_amazon_train.column_names)

## Pre-train further

### Training settings

In [None]:
# CLM (causal language modeling): mlm=False turns off masked-LM style masking,
# which is the setting to use for a left-to-right model such as GPT-2
collator = transformers.DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# Hyper-parameters and bookkeeping for the further pre-training run.
# Results and logs go under ./progress/<experiment_id>/.
training_args = transformers.TrainingArguments(
    output_dir=os.path.join('.', 'progress', experiment_id, 'results'), # output directory
    overwrite_output_dir=True,
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=32,  # per device; effective batch size = 32 * number of GPUs
    evaluation_strategy='epoch',     # evaluate at the end of every epoch
    warmup_steps=5000,               # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir=os.path.join('.', 'progress', experiment_id, 'logs'), # directory for storing logs
    seed=random_seed,
    # Mixed precision is only usable on CUDA devices; fall back to full
    # precision on the CPU path instead of failing
    fp16=torch.cuda.is_available(),
    fp16_opt_level='O2',
    prediction_loss_only=True,
    load_best_model_at_end=True,
    dataloader_num_workers=22,
)

In [None]:
# Number of GPUs the training arguments report for this run
print(training_args.n_gpu)

In [None]:
# Hold out a small random slice of the tokenized corpus for evaluation.
# The training arguments request an evaluation every epoch and reloading of
# the best model at the end, and trainer.evaluate() is called after training;
# all of these need an eval_dataset, which was previously not provided.
eval_holdout_fraction = 0.01  # tune to trade evaluation time against estimate quality
pretraining_splits = train_dataset_pretraining.train_test_split(
    test_size=eval_holdout_fraction, seed=random_seed)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    data_collator=collator, # collates batches for causal LM (mlm=False, so no token masking)
    train_dataset=pretraining_splits['train'],
    eval_dataset=pretraining_splits['test'],
)

### Training loop

In [None]:
%%time
# Run the further pre-training loop; the %%time cell magic above (which must
# stay on the first line of the cell) reports how long the cell took
trainer.train()

### Save the model to the local directory

In [None]:
# Save the trained model under ./trained_models/<experiment_id>
trainer.save_model(os.path.join('.', 'trained_models', experiment_id))

In [None]:
# Save the tokenizer into the same ./trained_models/<experiment_id> directory,
# so the model and its tokenizer can be reloaded from one place
tokenizer.save_pretrained(os.path.join('.', 'trained_models', experiment_id))

## LM Evaluation

In [None]:
# Run the Trainer's evaluation loop and keep the returned metrics.
# NOTE(review): this relies on the trainer having an evaluation dataset -- confirm
eval_results = trainer.evaluate()

In [None]:
# Show the raw evaluation metrics
print(eval_results)

# Perplexity is the exponential of the evaluation loss
# (assumes "eval_loss" is the mean per-token cross-entropy -- confirm)
perplexity = np.exp(eval_results["eval_loss"])

print(perplexity)

## Playing with my own input sentences

In [None]:
# GPT-2 is a left-to-right (causal) language model and its tokenizer defines
# no mask token, so `tokenizer.mask_token` is None and interpolating it into
# an f-string would produce the literal text "None". Use an ordinary prompt
# that the model can continue instead of a fill-in-the-blank template.
example = "The battery life of this camera is awful, but its picture quality is"

example_encoded = tokenizer.encode(example, add_special_tokens=True, return_tensors="pt").to(torch_device)

# Let's decode this back just to see how the prompt was actually encoded.
# convert_ids_to_tokens accepts a list of ids, which also avoids a loop
# variable named `id` that would shadow the Python builtin.
example_tokens = tokenizer.convert_ids_to_tokens(example_encoded[0].tolist())

print(example_tokens)

In [None]:
# Inference only: put the model in eval mode (disables dropout, so the output
# is deterministic) and skip autograd bookkeeping for the forward pass
model.eval()
with torch.no_grad():
    example_prediction = model(example_encoded)

# The first output element holds the LM logits; taking the argmax over the
# last (vocabulary) axis gives the most likely next token at each position
# of the single input sequence
example_prediction_argmax = torch.argmax(example_prediction[0], dim=-1)[0]

print(example_prediction_argmax)

print(tokenizer.decode(example_prediction_argmax))