## Notebook Preface



## Constructs the working folder

* Positions the project folder in Google Drive.
  1. In "Shared with me", right-click "W266 Final Project" and select "Add shortcut to Drive".
  2. "W266 Final Project" will then show up in "My Drive" (`MyDrive` under the mount point).

* Mounts the Google Drive at /content/drive in the Colab runtime.

* Defines the working folder relative to /content/drive.



In [None]:
# Mount Google Drive into the Colab filesystem so the project files can be
# reached through ordinary paths under /content/drive.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Project root inside the mounted Drive; every path below is derived from it.
working_folder = "/content/drive/MyDrive/W266 Final Project/CnF/PhotoStoryGenerator"
# JSON datasets (stories with hints) for the train / validation / test splits.
training_json = f"{working_folder}/train_hints.json"
eval_json = f"{working_folder}/val_hints.json"
testing_json = f"{working_folder}/test_hints.json"

# Plain-text model-input files, one per split, written by convert_dataset_to_input.
training_text = f"{working_folder}/train_input.txt"
eval_text = f"{working_folder}/val_input.txt"
testing_text = f"{working_folder}/test_input.txt"

# Directory used as the Trainer's output_dir (checkpoints, saved model, eval results).
checkpoint_dir = f"{working_folder}/GPT-2M-FineTune2_checkpoint"


In [None]:
# Show which GPU this runtime was assigned (model, memory, driver and CUDA versions).
!nvidia-smi


Sat Jul 24 00:40:57 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Imports libraries

In [None]:
# First upload the training and evaluation files to this runtime (Press connect if needed)
!pip install transformers torch




In [None]:
import json
import logging
import math
import os
from dataclasses import dataclass, field
from typing import Optional

from transformers import (
    CONFIG_MAPPING,
    MODEL_WITH_LM_HEAD_MAPPING,
    AutoConfig,
    GPT2LMHeadModel,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    LineByLineTextDataset,
    PreTrainedTokenizer,
    TextDataset,
    Trainer,
    TrainingArguments,
    set_seed,
)

# Setup logging
logger = logging.getLogger(__name__)

from IPython.display import HTML, display
def set_css(*_event_args):
  """Emit a CSS rule that makes <pre> output wrap instead of overflowing.

  Intended to run as an IPython ``pre_run_cell`` callback. Any positional
  arguments supplied by the event system (newer IPython versions pass an
  execution-info object) are accepted and ignored.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))

# Re-running this cell defines a new ``set_css`` object. Remove any copy that an
# earlier run registered, otherwise the style block is emitted once per prior run
# before every cell.
_ipython_events = get_ipython().events
for _registered in list(_ipython_events.callbacks['pre_run_cell']):
  if getattr(_registered, '__name__', None) == 'set_css':
    _ipython_events.unregister('pre_run_cell', _registered)
_ipython_events.register('pre_run_cell', set_css)


## Converts the datasets to model inputs

* Run only once to generate the input files.

In [None]:
def convert_dataset_to_input(_json, _data, hint_index=0):
    """Convert a JSON story dataset into a line-per-example text file.

    Each output line has the form
    ``<BOS> <HINT> <hint words> <SENT> <story text> <EOS>``.

    Args:
        _json: Path of the input JSON file. It must contain a mapping whose
            values are stories with a ``"hints"`` list (each hint being a list
            of strings) and a ``"sis"`` entry holding the story text.
        _data: Path of the text file to write; any existing file is replaced.
        hint_index: Which hint of each story to use. The default ``0`` selects
            the first hint (nouns of dii); the second hint is the concatenation
            of dii. Pass ``None`` to write one line per available hint.

    Raises:
        IndexError: If a story has no hint at ``hint_index``.
        KeyError: If a story lacks the ``"hints"`` or ``"sis"`` entry.
    """
    # Explicit UTF-8 keeps the conversion independent of the platform's default encoding.
    with open(_json, encoding="utf-8") as f:
        data = json.load(f)
    # Write-only mode is sufficient; the file is never read back here.
    with open(_data, "w", encoding="utf-8") as fout:
        for story in data.values():
            hints = story["hints"]
            selected = hints if hint_index is None else [hints[hint_index]]
            for hint in selected:
                fout.write(f"<BOS> <HINT> {' '.join(hint)} <SENT> {story['sis']} <EOS>\n")


In [None]:
# Write the training-split input text file from its JSON dataset.
convert_dataset_to_input(training_json, training_text)


In [None]:
# Write the validation-split input text file from its JSON dataset.
convert_dataset_to_input(eval_json, eval_text)


In [None]:
# Write the test-split input text file from its JSON dataset.
convert_dataset_to_input(testing_json, testing_text)


## Defines model arguments and data processing functions

In [None]:
@dataclass
class ModelArguments:
    """Settings that identify the pretrained model to load."""

    # Model identifier or local path handed to the `from_pretrained` loaders.
    model_name_or_path: Optional[str] = field(default=None)
    # Optional directory passed to the loaders as their `cache_dir`.
    cache_dir: Optional[str] = field(default=None)
    # Free-form model type label.
    model_type: Optional[str] = field(default=None)


In [None]:
@dataclass
class DataArguments:
    """Settings describing the text data files and how they are loaded."""

    # Paths of the training and evaluation text files.
    train_data_file: Optional[str] = field(default=None)
    eval_data_file: Optional[str] = field(default=None)
    # Block size handed to the dataset classes.
    block_size: int = field(default=-1)
    # True: load the data line by line, trimmed/padded to block_size.
    # False: load sequentially in chunks of block_size.
    line_by_line: bool = field(default=False)
    # Train with masked language model loss.
    mlm: bool = field(default=False)
    # Forwarded to TextDataset as its `overwrite_cache` argument.
    overwrite_cache: bool = field(default=False)


In [None]:
def get_dataset(args: DataArguments, tokenizer: PreTrainedTokenizer, evaluate=False):
    """Build the dataset object for either the training or the evaluation file.

    Args:
        args: Data settings; supplies the file paths, block size, loading mode
            and cache flag.
        tokenizer: Tokenizer handed to the dataset class.
        evaluate: When true, read ``args.eval_data_file``; otherwise read
            ``args.train_data_file``.

    Returns:
        A ``LineByLineTextDataset`` when ``args.line_by_line`` is set, else a
        ``TextDataset``.
    """
    if evaluate:
        source_path = args.eval_data_file
    else:
        source_path = args.train_data_file

    # Block-wise loading is the fallback when line-by-line mode is off.
    if not args.line_by_line:
        return TextDataset(
            tokenizer=tokenizer,
            file_path=source_path,
            block_size=args.block_size,
            overwrite_cache=args.overwrite_cache,
        )

    return LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=source_path,
        block_size=args.block_size,
    )


## Finetunes GPT-2

In [None]:
def _has_saved_checkpoint(output_dir):
    """Return True if ``output_dir`` holds at least one ``checkpoint-<step>`` directory."""
    if not os.path.isdir(output_dir):
        return False
    prefix = "checkpoint-"
    return any(
        name.startswith(prefix)
        and name[len(prefix):].isdigit()
        and os.path.isdir(os.path.join(output_dir, name))
        for name in os.listdir(output_dir)
    )


def finetune_model(resume_from_checkpoint=True):
    """Fine-tune GPT-2 medium on the prepared story text and evaluate it.

    Loads the pretrained ``gpt2-medium`` model and tokenizer, registers the
    special tokens used by the input files (``<BOS>``, ``<EOS>``, ``<PAD>``,
    ``<HINT>``, ``<SENT>``), trains on ``training_text``, saves the model and
    tokenizer to ``checkpoint_dir``, and computes perplexity on ``eval_text``.

    Args:
        resume_from_checkpoint: When true, resume from the latest checkpoint in
            ``checkpoint_dir`` if one exists; if none exists a warning is logged
            and training starts from the pretrained weights. When false,
            training always starts from the pretrained weights.

    Returns:
        A dict with ``"perplexity"`` (when evaluation is enabled) and
        ``"training_result"`` (the value returned by ``trainer.train``), or
        ``None`` if training was interrupted with ``KeyboardInterrupt`` (the
        partially trained model and tokenizer are still saved in that case).

    Raises:
        ValueError: If evaluation is requested without an evaluation file, or
            the output directory is non-empty while overwriting is disabled.
    """

    model_args = ModelArguments(model_name_or_path="gpt2-medium", model_type="gpt2-medium")

    data_args = DataArguments(
        train_data_file=training_text,
        eval_data_file=eval_text,
        line_by_line=True,
        block_size=512,
        overwrite_cache=True,
    )

    training_args = TrainingArguments(
        output_dir=checkpoint_dir,
        overwrite_output_dir=True,
        do_train=True,
        do_eval=True,
        evaluation_strategy="steps",
        logging_steps=100,
        per_device_train_batch_size=4,
        num_train_epochs=1,
        save_total_limit=1,
        save_steps=1000,
        prediction_loss_only=True,
        seed=0,
        report_to="all",
    )

    # Performs sanity checks
    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError("Cannot do evaluation without an evaluation data file")

    if (os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(f"{training_args.output_dir} exists but overwrite_output_dir=False")

    # Sets up logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    print()
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    print()
    logger.info("Training/evaluation parameters %s", training_args)

    # Sets seed for deterministic training runs
    set_seed(training_args.seed)

    config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)

    model = GPT2LMHeadModel.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Adds additional tokens.
    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    special_tokens_dict = {
        "bos_token": "<BOS>",
        "eos_token": "<EOS>",
        "pad_token": "<PAD>",
        "additional_special_tokens": [
            "<HINT>",
            "<SENT>",
        ],
    }
    num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
    logger.info("Added %s special tokens to the tokenizer", num_added_toks)
    # The embedding matrix must grow to cover the newly added token ids.
    model.resize_token_embeddings(len(tokenizer))

    # Adjusts the block size: non-positive means "use the tokenizer's maximum",
    # and a requested size is never allowed to exceed that maximum.
    data_args.block_size = tokenizer.model_max_length if data_args.block_size <= 0 else  min(data_args.block_size, tokenizer.model_max_length)

    # Gets the datasets.
    train_dataset = (get_dataset(data_args, tokenizer=tokenizer) if training_args.do_train else None)
    eval_dataset = (get_dataset(data_args, tokenizer=tokenizer, evaluate=True) if training_args.do_eval else None)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=data_args.mlm,)

    # Initializes the trainer.
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    # Performs training.
    train_result = {}
    try:
        if training_args.do_train:
            model_path = (
                model_args.model_name_or_path
                if model_args.model_name_or_path is not None
                and os.path.isdir(model_args.model_name_or_path)
                else None
            )
            # Passing True to Trainer.train raises ValueError when the output
            # directory has no checkpoint yet (e.g. on the very first run), so
            # only request a resume when there is something to resume from.
            if resume_from_checkpoint and _has_saved_checkpoint(training_args.output_dir):
                resume_target = True
            else:
                if resume_from_checkpoint:
                    logger.warning(
                        "No checkpoint found in %s; starting training from the pretrained weights.",
                        training_args.output_dir,
                    )
                resume_target = model_path
            train_result = trainer.train(resume_from_checkpoint=resume_target)
            trainer.save_model()
            tokenizer.save_pretrained(training_args.output_dir)
    except KeyboardInterrupt:
        # Manual interruption: keep whatever progress was made, skip evaluation.
        print("Saving model that was in the middle of training")
        trainer.save_model()
        tokenizer.save_pretrained(training_args.output_dir)
        return

    # Performs evaluation.
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()
        try:
            perplexity = math.exp(eval_output["eval_loss"])
        except OverflowError:
            # A diverged loss is too large for exp(); report infinite perplexity.
            perplexity = float("inf")
        result = { "perplexity": perplexity }

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    results["training_result"] = train_result
    return results


In [None]:
# Run fine-tuning without resuming from a saved checkpoint and keep the returned results.
finetune_results = finetune_model(resume_from_checkpoint=False)


07/24/2021 00:41:04 - INFO - __main__ -   Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_steps=100,
evaluation_strategy=IntervalStrategy.STEPS,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
greater_is_better=None,
group_by_length=False,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=5e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=-1,
log_level=-1,
log_level_replica=-1,
log_on_each_node=True,
logging_dir=/content/drive/MyDrive/W266 Final Project/CnF/PhotoStoryGenerator/GPT-2M-FineTune2_checkpoint/runs/Jul24_00-41-04_003d6eaca5d2,
logging_first_ste





07/24/2021 00:41:04 - INFO - filelock -   Lock 140430771690768 acquired on /root/.cache/huggingface/transformers/3a7a4b7235202f93d14a4a5e8200709184c5b25a29d9cfa6b0ede5166adf0768.cf0ec4a33a38dc96108560e01338af4bd3360dd859385d451c35b41987ae73ff.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=718.0, style=ProgressStyle(description_…

07/24/2021 00:41:05 - INFO - filelock -   Lock 140430771690768 released on /root/.cache/huggingface/transformers/3a7a4b7235202f93d14a4a5e8200709184c5b25a29d9cfa6b0ede5166adf0768.cf0ec4a33a38dc96108560e01338af4bd3360dd859385d451c35b41987ae73ff.lock





07/24/2021 00:41:05 - INFO - filelock -   Lock 140430828995600 acquired on /root/.cache/huggingface/transformers/6249eef5c8c1fcfccf9f36fc2e59301b109ac4036d8ebbee9c2b7f7e47f440bd.2538e2565f9e439a3668b981faf959c8b490b36dd631f3c4cd992519b2dd36f1.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1520013706.0, style=ProgressStyle(descr…

07/24/2021 00:41:37 - INFO - filelock -   Lock 140430828995600 released on /root/.cache/huggingface/transformers/6249eef5c8c1fcfccf9f36fc2e59301b109ac4036d8ebbee9c2b7f7e47f440bd.2538e2565f9e439a3668b981faf959c8b490b36dd631f3c4cd992519b2dd36f1.lock





07/24/2021 00:41:42 - INFO - filelock -   Lock 140430761328272 acquired on /root/.cache/huggingface/transformers/fee58641d7a73348d842afaa337d5a7763dad32beff8d9008bb3c3c847749d6b.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…

07/24/2021 00:41:43 - INFO - filelock -   Lock 140430761328272 released on /root/.cache/huggingface/transformers/fee58641d7a73348d842afaa337d5a7763dad32beff8d9008bb3c3c847749d6b.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f.lock





07/24/2021 00:41:43 - INFO - filelock -   Lock 140430761328272 acquired on /root/.cache/huggingface/transformers/23c853a0fcfc12c7d72ad4e922068b6982665b673f6de30b4c5cbe5bd70a2236.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…

07/24/2021 00:41:44 - INFO - filelock -   Lock 140430761328272 released on /root/.cache/huggingface/transformers/23c853a0fcfc12c7d72ad4e922068b6982665b673f6de30b4c5cbe5bd70a2236.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock





07/24/2021 00:41:44 - INFO - filelock -   Lock 140430761328592 acquired on /root/.cache/huggingface/transformers/8e4f9a65085b1b4ae69ffac9a953a44249c9ea1e72e4a7816ee87b70081df038.cf2d0ecb83b6df91b3dbb53f1d1e4c311578bfd3aa0e04934215a49bf9898df0.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355256.0, style=ProgressStyle(descript…

07/24/2021 00:41:45 - INFO - filelock -   Lock 140430761328592 released on /root/.cache/huggingface/transformers/8e4f9a65085b1b4ae69ffac9a953a44249c9ea1e72e4a7816ee87b70081df038.cf2d0ecb83b6df91b3dbb53f1d1e4c311578bfd3aa0e04934215a49bf9898df0.lock





***** Running training *****
  Num examples = 22309
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 5578


Step,Training Loss,Validation Loss
100,5.8077,3.778262
200,3.7588,3.663482
300,3.6998,3.607811
400,3.6595,3.578814
500,3.6418,3.572659
600,3.5665,3.532996
700,3.6071,3.532261
800,3.5469,3.504945
900,3.4815,3.491442
1000,3.5195,3.479329


***** Running Evaluation *****
  Num examples = 2818
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2818
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2818
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2818
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2818
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2818
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2818
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2818
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2818
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2818
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/W266 Final Project/CnF/PhotoStoryGenerator/GPT-2M-FineTune2_checkpoint/checkpoint-1000
Configuration saved in /content/drive/MyDrive/W266 Final Project/CnF/PhotoStoryGenerator/GPT-2M-FineTune2_checkpoint/checkpoint-1000/config.json
Model weights save

07/24/2021 01:53:33 - INFO - __main__ -   ***** Eval results *****
07/24/2021 01:53:33 - INFO - __main__ -     perplexity = 28.330624502785785


In [None]:
# Display the results returned by the fine-tuning run.
finetune_results


{'perplexity': 28.330624502785785,
 'training_result': TrainOutput(global_step=5578, training_loss=3.4444418572404985, metrics={'train_runtime': 4240.4872, 'train_samples_per_second': 5.261, 'train_steps_per_second': 1.315, 'total_flos': 4384053157711872.0, 'train_loss': 3.4444418572404985, 'epoch': 1.0})}