## Notebook Preface



## Constructs the working folder

* Positions the project folder in the Google Drive.
  1. From "Shared with me", right-click "W266 Final Project" and select "Add shortcut to Drive".
  2. "W266 Final Project" will then show up under "My Drive" (mounted as `MyDrive`).

* Mounts the Google Drive at /content/drive in the Colab runtime.

* Defines the working folder relative to /content/drive.



In [1]:
# Mount Google Drive into the Colab runtime so the project files are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
# Root of the shared project folder on the mounted Drive.
working_folder = "/content/drive/MyDrive/W266 Final Project/CnF/PhotoStoryGenerator"

# Source datasets (JSON keyed by story, each with hints), one file per split.
training_json, eval_json, testing_json = [
    working_folder + "/" + split + "_hints.json" for split in ("train", "val", "test")
]

# One-story-per-line text files written by convert_dataset_to_input from the JSON above.
training_text, eval_text, testing_text = [
    working_folder + "/" + split + "_input.txt" for split in ("train", "val", "test")
]

# Output directory handed to the Trainer (checkpoints, saved model, eval results).
checkpoint_dir = working_folder + "/GPT-2L-FineTune2_checkpoint"


In [3]:
# Show which GPU (if any) is attached to this runtime and its current memory usage.
!nvidia-smi


Mon Jul 26 06:05:24 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Imports libraries

In [4]:
# First upload the training and evaluation files to this runtime (Press connect if needed)
!pip install transformers torch


Collecting transformers
  Downloading transformers-4.9.0-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 8.7 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 60.4 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 58.3 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 67.6 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninsta

In [5]:
import json
import logging
import math
import os
from dataclasses import dataclass, field
from typing import Optional

from transformers import (
    CONFIG_MAPPING,
    MODEL_WITH_LM_HEAD_MAPPING,
    AutoConfig,
    GPT2LMHeadModel,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    LineByLineTextDataset,
    PreTrainedTokenizer,
    TextDataset,
    Trainer,
    TrainingArguments,
    set_seed,
)

# Module-level logger; its level and format are configured by logging.basicConfig() inside finetune_model.
logger = logging.getLogger(__name__)

from IPython.display import HTML, display
def set_css():
  """Inject a <style> rule that makes <pre> output wrap instead of scrolling sideways."""
  wrap_style = '''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''
  display(HTML(wrap_style))

# Re-apply the style before every cell execution.
get_ipython().events.register('pre_run_cell', set_css)


## Converts the datasets to model inputs

* Run only once to generate the input files.

In [6]:
def convert_dataset_to_input(_json, _data):
    """Flatten a hints JSON file into a one-story-per-line text file.

    Every story in the JSON becomes one output line of the form
    ``<BOS> <HINT> {hint words} <SENT> {story text} <EOS>``.

    Args:
        _json: Path of the input JSON file. It must decode to a mapping whose
            values each provide ``"hints"`` (a sequence of hints, each a
            sequence of strings) and ``"sis"`` (the story text).
        _data: Path of the text file to write; an existing file is overwritten.
    """
    # Read and write as UTF-8 explicitly so the result does not depend on the
    # runtime's locale default encoding.
    with open(_json, encoding="utf-8") as f:
        data = json.load(f)
    with open(_data, "w", encoding="utf-8") as fout:
        for story in data.values():
            # Only the first hint (nouns of dii) is used;
            # the second hint is the concatenation of dii.
            first_hint = story["hints"][0]
            fout.write(f"<BOS> <HINT> {' '.join(first_hint)} <SENT> {story['sis']} <EOS>\n")


In [7]:
# Write the training split's one-story-per-line text file from its hints JSON.
convert_dataset_to_input(training_json, training_text)


In [8]:
# Write the validation split's one-story-per-line text file from its hints JSON.
convert_dataset_to_input(eval_json, eval_text)


In [9]:
# Write the test split's one-story-per-line text file from its hints JSON.
convert_dataset_to_input(testing_json, testing_text)


## Defines model arguments and data processing functions

In [10]:
@dataclass
class ModelArguments:
    """Selects the pretrained model to load and where its files are cached."""

    # Model identifier or path passed to the `from_pretrained` loaders.
    model_name_or_path: Optional[str] = field(default=None)
    # Forwarded as `cache_dir` to the `from_pretrained` loaders.
    cache_dir: Optional[str] = field(default=None)
    # Set by finetune_model but not read anywhere in the visible code.
    model_type: Optional[str] = field(default=None)


In [11]:
@dataclass
class DataArguments:
    """Describes the text files to load and how they are cut into examples."""

    # Path of the training text file.
    train_data_file: Optional[str] = field(default=None)
    # Path of the evaluation text file.
    eval_data_file: Optional[str] = field(default=None)
    # Maximum example length; finetune_model replaces values <= 0 with the tokenizer's
    # model_max_length and caps larger values at that limit.
    block_size: int = field(default=-1)
    # Load the data line by line, trimmed/padded to block_size; otherwise, load
    # sequentially by block_size.
    line_by_line: bool = field(default=False)
    # Train with masked language model loss.
    mlm: bool = field(default=False)
    # Forwarded to TextDataset (only used when line_by_line is False).
    overwrite_cache: bool = field(default=False)


In [12]:
def get_dataset(args: DataArguments, tokenizer: PreTrainedTokenizer, evaluate=False):
    """Build the dataset object for the training or evaluation split.

    Args:
        args: Data settings supplying the file paths, block size and loading mode.
        tokenizer: Tokenizer passed through to the dataset class.
        evaluate: When true, read ``args.eval_data_file``; otherwise
            ``args.train_data_file``.

    Returns:
        A ``LineByLineTextDataset`` when ``args.line_by_line`` is set,
        otherwise a ``TextDataset``.
    """
    if evaluate:
        source_path = args.eval_data_file
    else:
        source_path = args.train_data_file

    if not args.line_by_line:
        return TextDataset(
            tokenizer=tokenizer,
            file_path=source_path,
            block_size=args.block_size,
            overwrite_cache=args.overwrite_cache,
        )

    return LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=source_path,
        block_size=args.block_size,
    )


## Finetunes GPT-2

In [13]:
def finetune_model(resume_from_checkpoint=True):
    """Fine-tune GPT-2 Large on the story text files and report validation perplexity.

    The function loads the pretrained ``gpt2-large`` model and tokenizer, adds
    the ``<BOS>``/``<EOS>``/``<PAD>``/``<HINT>``/``<SENT>`` special tokens,
    trains for one epoch on ``training_text`` and evaluates on ``eval_text``.
    The model and tokenizer are saved to ``checkpoint_dir``.

    Args:
        resume_from_checkpoint: When true, continue from the most recent
            checkpoint found in ``checkpoint_dir``. If no checkpoint exists a
            warning is logged and training starts without resuming (instead of
            failing inside ``Trainer.train``). When false, no checkpoint in
            ``checkpoint_dir`` is looked up.

    Returns:
        A dict holding ``"perplexity"`` (when evaluation runs) and
        ``"training_result"`` (the value returned by ``Trainer.train``).
        Returns ``None`` if training is interrupted with ``KeyboardInterrupt``;
        in that case the current model and tokenizer are saved first.

    Raises:
        ValueError: If evaluation is requested without an evaluation file, or
            if the output directory is non-empty while overwriting is disabled.
    """
    # Local import: this helper is not part of the notebook's import cell.
    from transformers.trainer_utils import get_last_checkpoint

    model_args = ModelArguments(model_name_or_path="gpt2-large", model_type="gpt2-large")

    data_args = DataArguments(
        train_data_file=training_text,
        eval_data_file=eval_text,
        line_by_line=True,
        block_size=512,
        overwrite_cache=True,
    )

    training_args = TrainingArguments(
        output_dir=checkpoint_dir,
        overwrite_output_dir=True,
        do_train=True,
        do_eval=True,
        evaluation_strategy="steps",
        logging_steps=500,
        per_device_train_batch_size=1,
        num_train_epochs=1,
        save_total_limit=1,
        save_steps=8000,
        prediction_loss_only=True,
        seed=0,
        report_to="all",
    )

    # Performs sanity checks
    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError("Cannot do evaluation without an evaluation data file")

    if (os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(f"{training_args.output_dir} exists but overwrite_output_dir=False")

    # Sets up logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    print()
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    print()
    logger.info("Training/evaluation parameters %s", training_args)

    # Sets seed for deterministic training runs
    set_seed(training_args.seed)

    config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)

    model = GPT2LMHeadModel.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Adds additional tokens.
    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    special_tokens_dict = {
        "bos_token": "<BOS>",
        "eos_token": "<EOS>",
        "pad_token": "<PAD>",
        "additional_special_tokens": [
            "<HINT>",
            "<SENT>",
        ],
    }
    tokenizer.add_special_tokens(special_tokens_dict)
    # Resize the embeddings to the tokenizer's new vocabulary size.
    model.resize_token_embeddings(len(tokenizer))

    # Adjusts the block size: non-positive means "use the tokenizer maximum",
    # anything else is capped at that maximum.
    data_args.block_size = tokenizer.model_max_length if data_args.block_size <= 0 else  min(data_args.block_size, tokenizer.model_max_length)

    # Gets the datasets.
    train_dataset = (get_dataset(data_args, tokenizer=tokenizer) if training_args.do_train else None)
    eval_dataset = (get_dataset(data_args, tokenizer=tokenizer, evaluate=True) if training_args.do_eval else None)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=data_args.mlm,)

    # Initializes the trainer.
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    def save_model_and_tokenizer():
        # Persist both pieces together so the output directory can be reloaded on its own.
        trainer.save_model()
        tokenizer.save_pretrained(training_args.output_dir)

    # Performs training.
    train_result = {}
    try:
        if training_args.do_train:
            # A local model directory, if one was given, is passed on to
            # Trainer.train as the checkpoint to continue from.
            model_path = (
                model_args.model_name_or_path
                if model_args.model_name_or_path is not None
                and os.path.isdir(model_args.model_name_or_path)
                else None
            )
            checkpoint = model_path
            if resume_from_checkpoint:
                # Resolve the checkpoint up front: passing a bare True to
                # Trainer.train raises when the output directory holds none.
                last_checkpoint = (
                    get_last_checkpoint(training_args.output_dir)
                    if os.path.isdir(training_args.output_dir)
                    else None
                )
                if last_checkpoint is None:
                    logger.warning(
                        "No checkpoint found in %s; training without resuming.",
                        training_args.output_dir,
                    )
                else:
                    checkpoint = last_checkpoint
            train_result = trainer.train(resume_from_checkpoint=checkpoint)
            save_model_and_tokenizer()
    except KeyboardInterrupt:
        print("Saving model that was in the middle of training")
        save_model_and_tokenizer()
        return None

    # Performs evaluation.
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()
        try:
            perplexity = math.exp(eval_output["eval_loss"])
        except OverflowError:
            # A very large loss overflows exp(); report it as infinite perplexity.
            perplexity = float("inf")
        result = { "perplexity": perplexity }

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    results["training_result"] = train_result
    return results


In [14]:
# Run fine-tuning without resuming from a checkpoint in checkpoint_dir.
finetune_results = finetune_model(resume_from_checkpoint=False)


07/26/2021 06:05:54 - INFO - __main__ -   Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_steps=500,
evaluation_strategy=IntervalStrategy.STEPS,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
greater_is_better=None,
group_by_length=False,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=5e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=-1,
log_level=-1,
log_level_replica=-1,
log_on_each_node=True,
logging_dir=/content/drive/MyDrive/W266 Final Project/CnF/PhotoStoryGenerator/GPT-2L-FineTune2_checkpoint/runs/Jul26_06-05-54_b7d3c43162eb,
logging_first_ste





07/26/2021 06:05:55 - INFO - filelock -   Lock 140390664577744 acquired on /root/.cache/huggingface/transformers/d82fb41558a2cc40bb6e10a57bbfbd9ff2f3c6614072f05afdfa8f44d566d2ba.142693c08a15b2c586e4fcb42418d55c99b5a6a5c51228e275d9e939775865ea.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=764.0, style=ProgressStyle(description_…

07/26/2021 06:05:55 - INFO - filelock -   Lock 140390664577744 released on /root/.cache/huggingface/transformers/d82fb41558a2cc40bb6e10a57bbfbd9ff2f3c6614072f05afdfa8f44d566d2ba.142693c08a15b2c586e4fcb42418d55c99b5a6a5c51228e275d9e939775865ea.lock





07/26/2021 06:05:55 - INFO - filelock -   Lock 140387671695248 acquired on /root/.cache/huggingface/transformers/234578a5793e64713ba846b4c5e181e043f48b33140622e2c1dd623b665de3f9.4780ef91b17260f8dac8a3c2183aa338b27365326fb706e74db40b03749f8aba.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3247202234.0, style=ProgressStyle(descr…

07/26/2021 06:06:55 - INFO - filelock -   Lock 140387671695248 released on /root/.cache/huggingface/transformers/234578a5793e64713ba846b4c5e181e043f48b33140622e2c1dd623b665de3f9.4780ef91b17260f8dac8a3c2183aa338b27365326fb706e74db40b03749f8aba.lock





07/26/2021 06:07:05 - INFO - filelock -   Lock 140387730171152 acquired on /root/.cache/huggingface/transformers/79f5e05af067df502528a0d902e82c24c3f1df9ae570c91fcc38e1f3c0af4c45.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…

07/26/2021 06:07:06 - INFO - filelock -   Lock 140387730171152 released on /root/.cache/huggingface/transformers/79f5e05af067df502528a0d902e82c24c3f1df9ae570c91fcc38e1f3c0af4c45.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f.lock





07/26/2021 06:07:06 - INFO - filelock -   Lock 140387735413776 acquired on /root/.cache/huggingface/transformers/7f7bf8a7802a708af08a812bfbdec9335f2c30f761ec14a8cd17b0d61c818876.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…

07/26/2021 06:07:07 - INFO - filelock -   Lock 140387735413776 released on /root/.cache/huggingface/transformers/7f7bf8a7802a708af08a812bfbdec9335f2c30f761ec14a8cd17b0d61c818876.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock





07/26/2021 06:07:07 - INFO - filelock -   Lock 140387735413776 acquired on /root/.cache/huggingface/transformers/f1179e28982928f50ca02b0188fcd80fb4fa871ba1719df5bf81ac308d0d10af.cf2d0ecb83b6df91b3dbb53f1d1e4c311578bfd3aa0e04934215a49bf9898df0.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355256.0, style=ProgressStyle(descript…

07/26/2021 06:07:07 - INFO - filelock -   Lock 140387735413776 released on /root/.cache/huggingface/transformers/f1179e28982928f50ca02b0188fcd80fb4fa871ba1719df5bf81ac308d0d10af.cf2d0ecb83b6df91b3dbb53f1d1e4c311578bfd3aa0e04934215a49bf9898df0.lock





***** Running training *****
  Num examples = 22309
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 22309


Step,Training Loss,Validation Loss
500,3.8517,3.702008
1000,3.7186,3.653957
1500,3.6919,3.641328
2000,3.6433,3.617435
2500,3.5919,3.602617
3000,3.6016,3.591088
3500,3.4984,3.584959
4000,3.549,3.56583
4500,3.5193,3.551749
5000,3.4624,3.548575


***** Running Evaluation *****
  Num examples = 2818
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2818
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2818
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2818
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2818
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2818
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2818
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2818
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2818
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2818
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2818
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2818
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2818
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2818
  Batch size = 8
***** Running Evalua

07/26/2021 09:01:41 - INFO - __main__ -   ***** Eval results *****
07/26/2021 09:01:41 - INFO - __main__ -     perplexity = 34.217289464137366


In [15]:
# Display the perplexity and training summary returned by finetune_model.
finetune_results


{'perplexity': 34.217289464137366,
 'training_result': TrainOutput(global_step=22309, training_loss=3.249575388576359, metrics={'train_runtime': 10341.75, 'train_samples_per_second': 2.157, 'train_steps_per_second': 2.157, 'total_flos': 8351782554700800.0, 'train_loss': 3.249575388576359, 'epoch': 1.0})}