<a href="https://colab.research.google.com/github/r-chambers/TextAdventureGenerator/blob/main/CreateGraphModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets
!pip install git-python==1.0.3
!pip install rouge_score
!pip install sacrebleu
!pip install -U accelerate
!pip install -U transformers
!pip install nltk

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/547.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m542.7/547.8 kB[0m [31m19.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m39.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (6

This notebook follows the tutorial at https://huggingface.co/blog/warm-starting-encoder-decoder.

In [2]:
import json
import tensorflow as tf
from tensorflow import keras
import numpy as np
from transformers import BertTokenizer, TrainingArguments, EncoderDecoderModel, Seq2SeqTrainer, Seq2SeqTrainingArguments
import pandas as pd
import datasets
from google.colab import drive
from datasets import Dataset
import spacy
import ast
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu

Loading the tokenizer and pre-trained checkpoints.

In [3]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased", tie_encoder_decoder=False) # prajjwal1/bert-medium

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.self.key.bias', 'bert.e

In [4]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
model.num_parameters()

247363386

In [6]:
# Putting model on the GPU
model = model.to("cuda")

In [7]:
# Setting model config
# Because Bert-medium is based on Bert-Base, we can assume that it also doesn't have a decoder start token or EOS token and should take it from the tokenizer
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.eos_token_id = tokenizer.sep_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.vocab_size = model.config.encoder.vocab_size

Let's get our training and test data.

In [8]:
f_train = open("/content/drive/MyDrive/TextAdventureModel/jerichoworld_train_locations.json", "r")
f_test = open("/content/drive/MyDrive/TextAdventureModel/jerichoworld_test_locations.json", "r")
train_data = json.load(f_train)
test_data = json.load(f_test)
f_train.close()
f_test.close()

In [9]:
def get_full_input(x):
  full_string = "What would happen to the following graph given the provided command? Generate a new room name for these commands for the ['you' 'in', 'location'] phrase: north, east, south, west, up, down. "
  command = "['command', 'is', '" + x['next_state']['walkthrough_act'] + "'], "
  current_graph = str(x['state']['graph'])
  full_string += current_graph[:1] + command + current_graph[1:]
  return full_string

Let's convert the training and evaluation data into transformers Datasets, the format that the Seq2SeqTrainer takes when fine-tuning the model

In [10]:
# Convert data into a pandas dataframe
def convert_to_dataset(data):
  data_list = []

  for game in data:
    for states in game:
      inputs = tokenizer(get_full_input(states), padding="max_length", truncation=True, max_length=512)
      outputs = tokenizer(str(states['next_state']['graph']), padding="max_length", truncation=True, max_length=512)

      row = {}
      row['input_ids'] = inputs.input_ids
      row['attention_mask'] = inputs.attention_mask
      row["labels"] = outputs.input_ids.copy()

      # ignoring PAD token as padding is given via the option padding="max_length"
      row["labels"] = [-100 if token == tokenizer.pad_token_id else token for token in row["labels"]]

      data_list.append(row)

  df = pd.DataFrame.from_records(data_list)
  return Dataset.from_pandas(df)

In [11]:
# Creating train dataset
train_dataset = convert_to_dataset(train_data)

In [12]:
test_dataset = convert_to_dataset(test_data[0:2])

In [13]:
train_dataset.set_format(
    type="torch", columns=["input_ids", "attention_mask", "labels"],
)

In [14]:
train_dataset[0]

{'input_ids': tensor([  101,  2054,  2052,  4148,  2000,  1996,  2206, 10629,  2445,  1996,
          3024,  3094,  1029,  9699,  1037,  2047,  2282,  2171,  2005,  2122,
         10954,  2005,  1996,  1031,  1005,  2017,  1005,  1005,  1999,  1005,
          1010,  1005,  3295,  1005,  1033,  7655,  1024,  2167,  1010,  2264,
          1010,  2148,  1010,  2225,  1010,  2039,  1010,  2091,  1012,  1031,
          1031,  1005,  3094,  1005,  1010,  1005,  2003,  1005,  1010,  1005,
          2167,  1005,  1033,  1010,  1031,  1005,  3224,  1005,  1010,  1005,
          2003,  1005,  1010,  1005,  2167,  1005,  1033,  1033,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [15]:
test_dataset.__len__

Let's set the parameters of the model.

In [16]:
# This is the max length that the model will generate, some of the graphs got this long so we want to make the output be as long as this if possible
# This is also max length of the BERT tokenizer
model.config.max_length = 512
# We want a room name and some items but don't need much else.
model.config.min_length = 50
# This NEEDS to be zero, as we want tons of repeating ngrams with "you", "have" and such
model.config.no_repeat_ngram_size = 0
model.config.early_stopping = True
model.config.length_penalty = 2.0
model.config.num_beams = 4

Let's set the parameters of the Seq2Seq Trainer.

In [17]:
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    #evaluation_strategy="steps",
    per_device_train_batch_size=8,
    #per_device_eval_batch_size=8,
    fp16=True,
    output_dir="./",
    logging_steps=2,
    save_steps=500,
    # eval_steps=4,
    # logging_steps=1000,
    # save_steps=500,
    # eval_steps=7500,
    # warmup_steps=2000,
    # save_total_limit=3,
)

In [18]:
rouge = datasets.load_metric("rouge", trust_remote_code=True)

  rouge = datasets.load_metric("rouge", trust_remote_code=True)


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

In [19]:
# instantiate trainer
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset
)

In [20]:
trainer.train()

  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)


Step,Training Loss
2,8.1197
4,7.701
6,8.0206
8,6.7616
10,4.642
12,4.0183
14,3.8297
16,3.7097
18,3.9111
20,3.6011


Non-default generation parameters: {'max_length': 512, 'min_length': 50, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0}
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)
Non-default generation parameters: {'max_length': 512, 'min_length': 50, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0}
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)
Non-default generation parameters: {'max_length': 512, 'min_length': 50, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0}
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)
Non-default generation parameters: {'max_length': 512, 'min_length': 50, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0}
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)
Non-default generation parameters: {'max_len

TrainOutput(global_step=4539, training_loss=0.22410588594382097, metrics={'train_runtime': 1765.6837, 'train_samples_per_second': 20.557, 'train_steps_per_second': 2.571, 'total_flos': 2.226654348512256e+16, 'train_loss': 0.22410588594382097, 'epoch': 3.0})

In [21]:
model.save_pretrained("/content/drive/My Drive/TextAdventureModel/model_large2_encoder_tied")

Non-default generation parameters: {'max_length': 512, 'min_length': 50, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0}


Now let's evaluate the model.

In [22]:
def generate_graph(input):
  # tokenize input
  inputs = tokenizer(input, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
  input_ids = inputs.input_ids.to("cuda")
  attention_mask = inputs.attention_mask.to("cuda")

  outputs = model.generate(input_ids, attention_mask=attention_mask)

  output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)

  return output_str

In [26]:
def generate_predictions(test_data):
  predictions = []
  references = []

  for game in test_data:
    for states in game:
      inputs = tokenizer(get_full_input(states), padding="max_length", truncation=True, max_length=512, return_tensors="pt")
      input_ids = inputs.input_ids.to("cuda")
      attention_mask = inputs.attention_mask.to("cuda")

      outputs = model.generate(input_ids, attention_mask=attention_mask)

      output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)

      predictions.append(output_str)
      references.append(str(states['next_state']['graph']))

  return predictions, references

Let's generate our predictions. We are only doing some of the test data as generating these predictions takes a long time.

In [27]:
pred, ref = generate_predictions(test_data[0:1])

In [29]:
# Clean up predictions
def clean_graph_string(graph_str):
    # If there are spaces between the first and last [ [ ] ] then remove them
    if graph_str[1] == " ":
      graph_str = graph_str[:1] + graph_str[2:]

    if graph_str[-2] == " ":
      graph_str = graph_str[:-2] + graph_str[-1:]

    # Replace all " with ' as that is what the data that trained the model used
    graph_str = graph_str.replace("\"", "'")

    # find and replace extra spaces around ,
    graph_str = graph_str.replace(" ','", "', '")

    return graph_str


Let's calculate the ROUGE score.

In [30]:
rouge_score_corpus = rouge.compute(predictions=pred, references=ref, rouge_types=["rouge2"])["rouge2"].mid
print("Corpus ROUGE score:", rouge_score_corpus)

Corpus ROUGE score: Score(precision=0.10700520679806634, recall=0.17186983611241605, fmeasure=0.11687962663678116)


Now let's calculate the BLEU score.

In [31]:
ref_list = [r.split(" ") for r in ref]
pred_list = [clean_graph_string(p[0]).split(" ") for p in pred]

weights = [
         (1./2., 1./2.),
         (1./3., 1./3., 1./3.),
         (1./4., 1./4., 1./4., 1./4.)
     ]
bleu_score_corpus = corpus_bleu(ref_list, pred_list, weights)
print("Corpus BLEU Score: ", bleu_score_corpus)

<class 'list'>
["[['you',", "'have',", "'piece", 'of', 'white', "paper'],", "['you',", "'in',", '"Chief\'s', 'office"],', "['Outside',", "'is',", "'north'],", "['Closet',", "'is',", "'west']]"]
<class 'list'>
["[['mr.", "alltext',", "'in',", "'the", "cafe'],", "['queenie',", "'in',", "'the", "cafe'],", "['you',", "'have',", "'key'],", "['lion',", "'in',", "'the", "cafe'],", "['silver", "coin',", "'in',", "'pocket'],", "['you',", "'in',", "'the", "cafe'],", "['(", 'closet', '_', 'door', ")',", "'in',", "'the", "cafe'],", "['judy',", "'in',", "'the", "cafe'],", "['enterprise", "closet',", "'in',", "'the", "cafe'],", "['school',", "'in',", "'the", "cafe'],", "['you',", "'have',", "'pocket'],", "['book',", "'in',", "'pocket'],", "['pastoral", "mural',", "'in',", "'the", "cafe']]"]
Corpus BLEU Score:  [0.004532381214910426, 7.70310405428738e-105, 1.0042351795314722e-155]


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Now let's save our model to our Google drive.

In [32]:
model.save_pretrained("/content/drive/My Drive/TextAdventureModel/model_name")

Non-default generation parameters: {'max_length': 512, 'min_length': 50, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0}


How we can load our model if we want for later.

In [34]:
loaded_model = EncoderDecoderModel.from_pretrained("/content/drive/My Drive/TextAdventureModel/model_name")
loaded_model = loaded_model.to('cuda')