# General Imports and Utilities

In [0]:
import os

def quote(t):
    """Return ``t`` wrapped in a pair of double quotes.

    The setup cell applies this to paths that are later interpolated into
    ``!`` shell commands, so that a path containing spaces is passed as a
    single argument. Double quotes already inside ``t`` are not escaped.
    """
    return f'"{t}"'

# Setup

Specify the model type you would like to train and the path to the training data here. Additional information on the training parameters can be found in the 'Tips for fine-tuning' section at the end of this notebook.

In [0]:
# ---------------------------------------------------------------------------
# Google Drive layout
# ---------------------------------------------------------------------------
GDRIVE_MOUNT_PATH = '/content/gdrive'
NLP_DRIVE_PATH = os.path.join(GDRIVE_MOUNT_PATH, 'Shared drives/CS263')

# ---------------------------------------------------------------------------
# Experiment location
# ---------------------------------------------------------------------------
# EXP_PARENT_DIR is the directory under which all experiments are stored.
# It can be updated.
EXP_PARENT_DIR = os.path.join(NLP_DRIVE_PATH, "models/american_rhetoric")
# TODO: change EXP_NAME as necessary before each experiment.
EXP_NAME = "tones_watson-summarized_bart_large_xsm/converted-summary_tone-tag_v3-eos-cls"
# Directory of the current experiment.
EXP_DIR = os.path.join(EXP_PARENT_DIR, EXP_NAME)

# Values wrapped in quote() are meant to be interpolated into shell commands;
# the paths contain a space ("Shared drives"), so they must be quoted there.
OUTPUT_DIR = quote(os.path.join(EXP_DIR, 'output'))

# ---------------------------------------------------------------------------
# Dataset
# ---------------------------------------------------------------------------
# Update this to point to your desired dataset.
DATASET_DIR = os.path.join(
    NLP_DRIVE_PATH,
    'data/american_rhetoric/speech_bank/tones_watson-summarized_bart_large_xsm/converted-summary_tone-tag_v3-eos-cls',
)

TRAIN_DATA = quote(os.path.join(DATASET_DIR, 'data-train.txt'))
EVAL_DATA = quote(os.path.join(DATASET_DIR, 'data-test.txt'))
# Alternative file names that were used previously (kept for reference):
#   'parsed-summarized-converted-train.txt' / 'parsed-summarized-converted-test.txt'
# A separate TEST_DATA is currently not defined:
#   TEST_DATA = quote(os.path.join(DATASET_DIR, 'data-test.txt'))

# ---------------------------------------------------------------------------
# Model and tokenizer
# ---------------------------------------------------------------------------
# MODEL_TYPE is passed as --model_type and is also the name the custom
# tokenizer/config are loaded from.
MODEL_TYPE = "gpt2-medium"
# MODEL_PATH is passed as --model_name_or_path; it can point to a checkpoint
# to continue training, e.g.
#   MODEL_PATH = quote(os.path.join(EXP_DIR, "gpt2"))
MODEL_PATH = "gpt2-medium"

# TOKENIZER is the plain path (used from Python); TOKENIZER_PATH is the
# shell-quoted form of the same path.
TOKENIZER = os.path.join(EXP_DIR, "tokenizer")
TOKENIZER_PATH = quote(TOKENIZER)

# ---------------------------------------------------------------------------
# Training hyperparameters
# ---------------------------------------------------------------------------
# Number of times to iterate over the training data.
EPOCHS = 2
# Adjust BATCH_SIZE and BLOCK_SIZE to avoid running out of memory
# (you likely want the maximum block size if possible).
BATCH_SIZE = 1
# Text is truncated into chunks of this length
# (-1 defaults to the max length for the model type: BLOCK_SIZE = -1).
BLOCK_SIZE = 256
# Passed as --gradient_accumulation_steps.
GRADIENT_STEPS = 5

# ---------------------------------------------------------------------------
# Logging and checkpointing
# ---------------------------------------------------------------------------
LOG_STEPS = 1000   # passed as --logging_steps
SAVE_STEPS = 1000  # passed as --save_steps
SAVE_LIMIT = 2     # passed as --save_total_limit

In [0]:
# Sanity check: show the resolved dataset/output paths and the model that will
# be loaded, one per line. (TEST_DATA is not defined, so it is not shown.)
print(DATASET_DIR, OUTPUT_DIR, TRAIN_DATA, EVAL_DATA, MODEL_PATH, sep="\n")

/content/gdrive/Shared drives/CS263/data/american_rhetoric/speech_bank/tones_watson-summarized_bart_large_xsm/converted-summary_tone-tag_v3-eos-cls
"/content/gdrive/Shared drives/CS263/models/american_rhetoric/tones_watson-summarized_bart_large_xsm/converted-summary_tone-tag_v3-eos-cls/output"
"/content/gdrive/Shared drives/CS263/data/american_rhetoric/speech_bank/tones_watson-summarized_bart_large_xsm/converted-summary_tone-tag_v3-eos-cls/data-train.txt"
"/content/gdrive/Shared drives/CS263/data/american_rhetoric/speech_bank/tones_watson-summarized_bart_large_xsm/converted-summary_tone-tag_v3-eos-cls/data-test.txt"
gpt2-medium


We are using the Hugging Face Transformers library, so we need to clone its GitHub repository here.

In [0]:
# remove the capture to show output from this cell
%%capture

# remove previous installations and clone the repo
!rm -rm /content/transformers
!git clone https://github.com/huggingface/transformers

# make sure all dependencies are installed
os.chdir('/content/transformers')

!pip install .
!pip install -r ./examples/requirements.txt

# navigate to folder with the language modeling training script
os.chdir('/content/transformers/examples/language-modeling')

In [0]:
# Connect to Google Drive so we can access our data set. The dataset and
# experiment directories configured in the setup cell are all located under
# GDRIVE_MOUNT_PATH.
from google.colab import drive
# Mounting is interactive: it prints an authorization URL and waits for the
# authorization code to be entered.
drive.mount(GDRIVE_MOUNT_PATH)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


# Custom Tokenizer

In [0]:
from transformers import AutoTokenizer
from transformers import AutoConfig

# Start from the stock configuration and tokenizer of the base model.
config = AutoConfig.from_pretrained(MODEL_TYPE)
tokenizer = AutoTokenizer.from_pretrained(MODEL_TYPE)

# Extra vocabulary entries: two regular marker tokens plus the special
# cls/unk tokens.
new_tokens = ['<|summary|>', '<|tone|>']
special_tokens_dict = {'cls_token': '<|cls|>', 'unk_token': '<|unk|>'}

# The two return values are summed to report the total number of tokens added.
num_added_toks = tokenizer.add_tokens(new_tokens) + tokenizer.add_special_tokens(special_tokens_dict)
print('We have added', num_added_toks, 'tokens')

print(tokenizer.special_tokens_map)

# Create the tokenizer directory if needed; exist_ok avoids the separate
# exists() check and makes the cell safe to re-run.
os.makedirs(TOKENIZER, exist_ok=True)

# NOTE(review): the vocabulary grew, so the model's embedding matrix must be
# resized to len(tokenizer) before training (model.resize_token_embeddings).
# No model is loaded in this cell — confirm the training script does this.
# The config is saved next to the tokenizer files, presumably so the directory
# can be loaded via --tokenizer_name on its own — TODO confirm.
config.save_pretrained(TOKENIZER)
tokenizer.save_pretrained(TOKENIZER)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=718.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…


We have added 4 tokens
{'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|unk|>', 'cls_token': '<|cls|>'}


('/content/gdrive/Shared drives/CS263/models/american_rhetoric/tones_watson-summarized_bart_large_xsm/converted-summary_tone-tag_v3-eos-cls/tokenizer/vocab.json',
 '/content/gdrive/Shared drives/CS263/models/american_rhetoric/tones_watson-summarized_bart_large_xsm/converted-summary_tone-tag_v3-eos-cls/tokenizer/merges.txt',
 '/content/gdrive/Shared drives/CS263/models/american_rhetoric/tones_watson-summarized_bart_large_xsm/converted-summary_tone-tag_v3-eos-cls/tokenizer/special_tokens_map.json',
 '/content/gdrive/Shared drives/CS263/models/american_rhetoric/tones_watson-summarized_bart_large_xsm/converted-summary_tone-tag_v3-eos-cls/tokenizer/added_tokens.json')

# Begin fine-tuning

We run the language modeling script provided by the transformers library to fine tune our model. More tips for training can be found below the following cell.

In [0]:
%%capture train_log
# Launch fine-tuning with the Transformers example script. Everything this
# cell prints is captured into `train_log` instead of being displayed.
# Each `$NAME` below is a Python variable from the setup cell, substituted by
# IPython before the command is handed to the shell; the path values were
# wrapped in double quotes there because they contain a space.
# The script is referenced by relative path, so the working directory must be
# the folder that contains run_language_modeling.py.
!python run_language_modeling.py \
    --output_dir=$OUTPUT_DIR \
    --model_type=$MODEL_TYPE \
    --model_name_or_path=$MODEL_PATH \
    --do_train \
    --train_data_file=$TRAIN_DATA \
    --do_eval \
    --eval_data_file=$EVAL_DATA \
    --evaluate_during_training \
    --logging_steps=$LOG_STEPS \
    --save_steps=$SAVE_STEPS \
    --save_total_limit=$SAVE_LIMIT \
    --num_train_epochs=$EPOCHS \
    --per_gpu_train_batch_size=$BATCH_SIZE \
    --per_gpu_eval_batch_size=$BATCH_SIZE \
    --block_size=$BLOCK_SIZE \
    --gradient_accumulation_steps=$GRADIENT_STEPS \
    --tokenizer_name=$TOKENIZER_PATH \
    --overwrite_output_dir

In [0]:
# Replay the output that the training cell captured into `train_log`.
train_log.show()

2020-05-28 20:08:35.595429: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
05/28/2020 20:08:37 - INFO - transformers.training_args -   PyTorch: setting up devices
05/28/2020 20:08:37 - INFO - __main__ -   Training/evaluation parameters TrainingArguments(output_dir='/content/gdrive/Shared drives/CS263/models/american_rhetoric/tones_watson-summarized_bart_large_xsm/converted-summary_tone-tag_v3-eos-cls/output', overwrite_output_dir=True, do_train=True, do_eval=True, do_predict=False, evaluate_during_training=True, per_device_train_batch_size=8, per_device_eval_batch_size=8, per_gpu_train_batch_size=1, per_gpu_eval_batch_size=1, gradient_accumulation_steps=5, learning_rate=5e-05, weight_decay=0.0, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=2.0, max_steps=-1, warmup_steps=0, logging_dir=None, logging_first_step=False, logging_steps=1000, save_steps=1000, save_total_limit=2, no_cuda=False, seed=42, fp16=Fals

# Tips for fine-tuning

The following is advice given by [this notebook](https://colab.research.google.com/github/interactive-fiction-class/interactive-fiction-class.github.io/blob/master/homeworks/language-model/hw4_transformer.ipynb#scrollTo=xOzFhwDSqOg3).

We will be calling `run_language_modeling.py` from the command line to launch fine-tuning. **Running fine-tuning may take several hours.** Every `save_steps` steps, a checkpoint is saved to disk. The checkpoint contains all the learned weights for your model, and you can always reload the model from a saved checkpoint, even if your Colab has crashed.

Below is an explanation of some of the arguments you might want to modify in the command below. 

* `--line_by_line`: Add `--line_by_line` if distinct lines of the text should be treated as distinct training examples. For example, if your dataset contains one story/tweet/article per line, this should be set.
* `--num_train_epochs`: The number of times to iterate over the train set. Increasing the number of epochs may result in better performance, but making this number too high will cause the model to overfit on the train set.
* `--block_size`: Your training text is truncated into blocks of this length. At test time, you will only want to generate sequences that are at most this length.
* `--gradient_accumulation_steps`: Update the model weights every this many steps. You should set this to >1 when the batch size is very small to improve training stability.
* `--output_dir`: This is where checkpoints will get saved. When you finetune on your own dataset, you should change this path. We recommend saving checkpoints to your Google Drive (`/content/drive/My Drive/`) so you can access them even if the Colab session dies.
* `--model_name_or_path`: The path to the model weights to use when starting fine-tuning. You can set this to `gpt2-medium` to initialize with GPT-2's 355 million parameter model, or `gpt2` to initialize with their smaller 124 million parameter model. You can also set this to one of your own checkpoints to restart your training job if it crashes.

**I am getting out-of-memory errors. What do I do?**

The amount of GPU memory needed during training grows with the `block_size` and the `batch_size` (the number of trainable parameters is fixed by the model itself). If you are getting out-of-memory errors, try decreasing these values.

**Oh no! My computer went to sleep and the Colab disconnected.**

The train job might have still completed. Check the `output_dir` in your Google Drive to see if checkpoint files have been created there.

**Training is taking foreverrrrrr.**

Try decreasing `num_train_epochs` or changing `model_name_or_path` to `gpt2` instead of `gpt2-medium`.
If your evaluation set is very large, you might also want to remove the `evaluate_during_training` flag or increase `logging_steps`.