In [1]:
# Show which transformers version is installed in this environment
!pip list | grep transformers

transformers                  4.26.0.dev0


In [2]:
# List installed packages whose name contains "dataset" (matches more than just `datasets`)
!pip list | grep dataset

datasets                      2.8.0
tensorflow-datasets           4.8.1
vega-datasets                 0.9.0


In [None]:
# Upgrade to latest version from main as per
# https://github.com/huggingface/transformers/issues/20750
# to avoid
# AttributeError: module 'keras.engine.data_adapter' has no attribute 'expand_1d'
# when fitting
!pip install --upgrade git+https://github.com/huggingface/transformers.git
!pip install datasets

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import os
import re
import time

from pprint import pprint

In [2]:
# import tensorflow as tf
# from transformers import TFGPT2LMHeadModel
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel, pipeline, AutoTokenizer, TFAutoModelForCausalLM
from datasets import Dataset, load_dataset
import tensorflow as tf

In [3]:
# Try to run on TPU if available
# Detect hardware, return appropriate distribution strategy
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    print("Running on TPU ", tpu.cluster_spec().as_dict()["worker"])
except ValueError:
    # A ValueError from the resolver is treated as "no TPU available"
    tpu = None

if not tpu:
    # No TPU: use whatever strategy TensorFlow currently has in effect
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before building the strategy
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

REPLICAS:  1


In [4]:
# Fixed length (in tokens) that tokenize_function truncates/pads every example to
MAX_TOKENS = 128

# Special tokens handed to the tokenizer: EOS_TOKEN is appended to every
# example in tokenize_function, PAD_TOKEN is used to pad up to MAX_TOKENS
EOS_TOKEN = "<|endoftext|>"
PAD_TOKEN = "<|pad|>"

In [5]:
# ORIGINAL

# tokenizer = GPT2Tokenizer.from_pretrained(
#     'gpt2',
#     eos_token=EOS_TOKEN,
#     pad_token=PAD_TOKEN,
#     max_length=MAX_TOKENS,
#     padding_side='left'
# )

In [5]:
# Load the pretrained GPT-2 tokenizer with explicit EOS/PAD special tokens and
# left-side padding. Registering PAD_TOKEN adds to the vocabulary (see the
# "Special tokens have been added" notice below), which is why the model's
# token embeddings are resized to len(tokenizer) later on.
# NOTE(review): `max_length` passed here is presumably not what limits the
# sequence length -- the effective limit is the `max_length` given per call in
# tokenize_function, and the tokenizer-level setting appears to be
# `model_max_length`. Confirm against the tokenizer documentation.
tokenizer = AutoTokenizer.from_pretrained(
    'gpt2',
    eos_token=EOS_TOKEN,
    pad_token=PAD_TOKEN,
    max_length=MAX_TOKENS,
    padding_side='left'
)

# tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
# Load the cleaned book summaries from a plain-text file.
# (Smaller files used while debugging: 'testdata2.txt', 'text100.txt'.)
ds = Dataset.from_text('booksummaries_cleaned2.txt')
# Preview the first two rows; slicing the rows first avoids materialising the
# entire "text" column just to look at two entries
ds[:2]['text']



[' Living on Mars , Deckard is acting as a consultant to a movie crew filming the story of his Blade Runner days . He finds himself drawn into a mission on behalf of the replicants he was once assigned to kill . Meanwhile , the mystery surrounding the beginnings of the Tyrell Corporation is being dragged out into the light . ',
 ' Beginning several months after the events in Blade Runner , Deckard has retired to an isolated shack outside the city , taking the replicant Rachael with him in a Tyrell transport container , which slows down the replicant aging process . He is approached by a woman who explains she is Sarah Tyrell , niece of Eldon Tyrell , heiress to the entire Tyrell Corporation and the human template templant for the Rachael replicant . She asks Deckard to hunt down the missing sixth replicant . At the same time , the human template for Roy Batty hires Dave Holden , the blade runner attacked by Leon , to help him hunt down the man he believes is the sixth replicant Deckard

## How tokenize_function works, step by step

In [9]:
# examples = [BOS_TOKEN + ex + EOS_TOKEN for ex in ds["text"]]
# examples

In [10]:
# output = tokenizer(
#     examples,
#     add_special_tokens=True,  # Only adds pad not eos and bos
#     max_length=MAX_TOKENS,
#     truncation=True,
#     padding='max_length',
#     # return_tensors='tf'
# )
# output["input_ids"]

In [11]:
# # Drop the first token
# output["labels"] = [x[1:] for x in output["input_ids"]]
# pprint(output["labels"])

In [12]:
# pprint(output)

In [13]:
# # Replace all occurrences of PAD_TOKEN with -100
# output["labels"] = [
#     [-100 if x == tokenizer.pad_token_id else x for x in y] for y in output["labels"]
# ]
# pprint(output)

In [14]:
# # truncate input ids and attention mask to account for label shift
# output["input_ids"] = [x[:-1] for x in output["input_ids"]]
# output["attention_mask"] = [x[:-1] for x in output["attention_mask"]]
# pprint(output)

## tokenize_function definition

In [7]:
# Consolidate into a function

def tokenize_function(examples, tokenizer=tokenizer):
    """Tokenize a batch of texts into fixed-length inputs and next-token labels.

    EOS_TOKEN is appended to every text, the batch is tokenized with
    truncation and padding to MAX_TOKENS, and each sequence is turned into a
    shifted (input, label) pair: the label at position ``i`` is the token at
    position ``i + 1``. All returned sequences therefore have length
    ``MAX_TOKENS - 1``.

    Args:
        examples: Batch mapping whose "text" entry is a list of strings (the
            form ``Dataset.map(..., batched=True)`` passes in).
        tokenizer: Tokenizer to apply. Defaults to the notebook-level
            tokenizer, bound when this cell is executed.

    Returns:
        The tokenizer output with "input_ids" and "attention_mask" shortened
        by their last position, plus a "labels" entry in which padding
        positions are replaced by -100.
    """
    texts = [text + EOS_TOKEN for text in examples["text"]]

    output = tokenizer(
        texts,
        add_special_tokens=True,  # Only adds pad not eos and bos
        max_length=MAX_TOKENS,
        truncation=True,
        padding='max_length',
        # return_tensors='tf'
    )

    # Labels are the input ids with the first token dropped. Padding positions
    # are replaced with -100; they are located through the attention mask
    # rather than by comparing ids with pad_token_id, so real EOS targets are
    # preserved even if the pad token is aliased to the EOS token.
    output["labels"] = [
        [token if attended else -100 for token, attended in zip(ids[1:], mask[1:])]
        for ids, mask in zip(output["input_ids"], output["attention_mask"])
    ]

    # Drop the last position of input ids and attention mask to account for
    # the label shift
    output["input_ids"] = [ids[:-1] for ids in output["input_ids"]]
    output["attention_mask"] = [mask[:-1] for mask in output["attention_mask"]]

    return output

In [8]:
# Tokenize the whole dataset batch by batch. The raw "text" column is removed
# so only the columns produced by tokenize_function remain, and
# load_from_cache_file=False asks for recomputation instead of a cached result.
ds_tokenized = ds.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
    load_from_cache_file=False,
)
# Display the dataset summary (features and row count)
ds_tokenized

  0%|          | 0/12 [00:00<?, ?ba/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 11783
})

In [9]:
# Return plain Python objects for the three columns the model consumes
ds_tokenized.set_format(type="python", columns=["input_ids", "attention_mask", "labels"])

# Hold out 20% of the rows as a test split, shuffled with a fixed seed.
# Note: this rebinds ds_tokenized from a single Dataset to a DatasetDict with
# "train" and "test" entries, so this cell is not meant to be run twice in a row.
ds_tokenized = ds_tokenized.train_test_split(
    test_size=0.20, shuffle=True, seed=1, load_from_cache_file=True
)
ds_tokenized

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 9426
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2357
    })
})

In [10]:
%%time
# prepare for use in tensorflow
train_tensor_inputs = tf.convert_to_tensor(ds_tokenized["train"]["input_ids"])
train_tensor_labels = tf.convert_to_tensor(ds_tokenized["train"]["labels"])
train_tensor_mask = tf.convert_to_tensor(ds_tokenized["train"]["attention_mask"])
train = tf.data.Dataset.from_tensor_slices(
    (
        {"input_ids": train_tensor_inputs, "attention_mask": train_tensor_mask},
        # {"labels": train_tensor_labels},
        train_tensor_labels,
    )
)

test_tensor_inputs = tf.convert_to_tensor(ds_tokenized["test"]["input_ids"])
test_tensor_labels = tf.convert_to_tensor(ds_tokenized["test"]["labels"])
test_tensor_mask = tf.convert_to_tensor(ds_tokenized["test"]["attention_mask"])
test = tf.data.Dataset.from_tensor_slices(
    (
        {"input_ids": test_tensor_inputs, "attention_mask": test_tensor_mask},
        # {"labels": test_tensor_labels},
        test_tensor_labels,
    )
)

CPU times: user 3.72 s, sys: 438 ms, total: 4.15 s
Wall time: 4.14 s


# Alternative: convert to a `tf.data.Dataset` using the newer `to_tf_dataset` method

``` python
ds_tf = ds_tokenized.to_tf_dataset(
    columns=["input_ids","attention_mask"],
    label_cols=["labels"],
    batch_size=1,
    shuffle=True
)
```

In [11]:
# Model params
BATCH_SIZE_PER_REPLICA = 28
EPOCHS = 10
INITAL_LEARNING_RATE = 0.001  # (sic) spelling kept: later cells reference this name

# The global batch size scales with the number of replicas; fall back to a
# single replica if `strategy` has not been defined.
try:
    BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync
except NameError:
    BATCH_SIZE = BATCH_SIZE_PER_REPLICA
# Shuffle buffer as large as the training set
BUFFER_SIZE = len(train)

print("BATCH_SIZE: ", BATCH_SIZE)

print("len(tokenizer) ", len(tokenizer))

# prepare data for consumption
# drop_remainder=True discards the final partial batch so every batch has
# exactly BATCH_SIZE examples
train_ds = (
    train.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
)
test_ds = test.batch(BATCH_SIZE, drop_remainder=True)

BATCH_SIZE:  28
len(tokenizer)  50258


In [12]:
# Decreasing learning rate scheduler: starts at INITAL_LEARNING_RATE and
# multiplies the rate by 0.7 every 500 optimizer steps. staircase=True applies
# the decay in discrete jumps instead of continuously.
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    INITAL_LEARNING_RATE,
    decay_steps=500,
    decay_rate=0.7,
    staircase=True)

In [16]:
# Initialize and compile the model inside the distribution strategy scope.
# Note: use_cache=False is deliberately NOT passed (it is left commented out
# below). An earlier version of this comment called it important for the loss
# shape, but a later cell in this notebook records that setting it was
# identified as the problem.
with strategy.scope():
    model = TFAutoModelForCausalLM.from_pretrained(
        'gpt2',
        # use_cache=False,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    # The tokenizer has more entries than the checkpoint vocabulary (extra PAD
    # token), so the embedding matrix is resized to match
    model.resize_token_embeddings(len(tokenizer))
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
    # Labels are fed as Keras targets (second element of each dataset tuple),
    # so the model's own loss function is passed explicitly
    model.compile(optimizer=optimizer, loss=model.hf_compute_loss)
    model.summary()

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Model: "tfgpt2lm_head_model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 transformer (TFGPT2MainLaye  multiple                 124440576 
 r)                                                              
                                                                 
Total params: 124,440,576
Trainable params: 124,440,576
Non-trainable params: 0
_________________________________________________________________


In [17]:
# # PROBLEM IDENTIFIED: Don't set use_cache=False
# model = TFAutoModelForCausalLM.from_pretrained(
#         'gpt2',
#         # use_cache=False,
#         pad_token_id=tokenizer.pad_token_id,
#         eos_token_id=tokenizer.eos_token_id,
#     )

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [None]:
# # initialize model, use_cache=False important! else wrong shape at loss calc
# with strategy.scope():
#     model = TFAutoModelForCausalLM.from_pretrained(
#         'gpt2',
#         use_cache=False,
#         pad_token_id=tokenizer.eos_token_id, # pad_token_id,
#         eos_token_id=tokenizer.eos_token_id,
#     )
#     # model.resize_token_embeddings(len(tokenizer))
#     # optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
#     # model.compile(optimizer=optimizer, loss=model.hf_compute_loss)
#     model.summary()

In [17]:
# Timestamp (to the minute) used to namespace this run's checkpoints
now = datetime.now().strftime("%Y-%m-%d_%H%M")

# # Fix TPU save model issue?
# save_locally = tf.saved_model.SaveOptions(experimental_io_device='/job:localhost') 

# Create callbacks
callbacks = [
    # Stop after a single epoch without val_loss improvement (patience=1) and
    # restore the weights of the best epoch
    tf.keras.callbacks.EarlyStopping( 
        monitor="val_loss", verbose=1, patience=1, restore_best_weights=True
    ),
    # Save a checkpoint only when val_loss improves; the {epoch} and
    # {val_loss} placeholders in the path are filled in at save time
    tf.keras.callbacks.ModelCheckpoint(
        "models/" + now + "_GPT2-Model_{epoch:02d}_{val_loss:.4f}",
        monitor="val_loss",
        save_best_only=True,
        verbose=1,
        # options=save_locally, # Fix TPU save model issue?
    ),
]

In [18]:
# %%time
# Train Model
# Full batches per epoch (train_ds drops the remainder batch); both operands
# are ints, so floor division already yields an int
steps_per_epoch = BUFFER_SIZE // BATCH_SIZE
print(
    f"Model Params:\nbatch_size: {BATCH_SIZE}\nEpochs: {EPOCHS}\n"
    f"Step p. Epoch: {steps_per_epoch}\n"
    f"Initial Learning rate: {INITAL_LEARNING_RATE}"
    "\n-------------------------------------------------------------"
)

# train_ds / test_ds are already batched tf.data datasets, so no batch_size is
# passed to fit(): with dataset inputs the dataset itself defines the batching.
hist = model.fit(
    train_ds,
    validation_data=test_ds,
    epochs=EPOCHS,
    callbacks=callbacks,
    verbose=1,
)

Model Params:
batch_size: 28
Epochs: 10
Step p. Epoch: 336
Initial Learning rate: 0.001
-------------------------------------------------------------
Epoch 1/10
Epoch 1: val_loss improved from inf to 3.88040, saving model to models/2023-01-18_1531_GPT2-Model_01_3.8804




Epoch 2/10
Epoch 2: val_loss improved from 3.88040 to 3.86444, saving model to models/2023-01-18_1531_GPT2-Model_02_3.8644




Epoch 3/10

Epoch 3: val_loss did not improve from 3.86444
Epoch 3: early stopping


In [23]:
# path_to_model = '/content/models/2023-01-18_0754_GPT2-Model_02_3.9008'
# # load model in tpu using Tensorflow's "SavedModel" format
# with strategy.scope():
#     # load_locally = tf.saved_model.LoadOptions(experimental_io_device='/job:localhost')
#     restored_model = tf.keras.models.load_model(
#         path_to_model,
#         # options=load_locally,
#         custom_objects={"hf_compute_loss": model.hf_compute_loss} # Important!
#         )

In [24]:
# new_pipeline = pipeline(
#             "text-generation",
#             model=restored_model,
#             tokenizer=tokenizer
#         )

In [25]:
# # We need to specify max_new_tokens here instead of max_length
# new_pipeline('hello', max_new_tokens=10)

In [26]:
# # HF save method
# # Note: not using options=save_locally as per TPU fix
# # implemented in callback & tf.keras.models.load_model()

# model.save_pretrained('/content/models/hf', saved_model=True)


In [27]:
# # HF load method
# restored_model = TFGPT2LMHeadModel.from_pretrained('/content/models/hf')

In [28]:
# Wrap the in-memory fine-tuned model (not a reloaded checkpoint) and the
# tokenizer in a text-generation pipeline
new_pipeline = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer
        )

In [29]:
# Generate a continuation for a short prompt using the pipeline's default generation settings
new_pipeline('hello')#, max_new_tokens=10)



[{'generated_text': 'hello, a married man who lives in the city and falls in love with her for what she does. He is fascinated with the romance and begins fantasizing about the woman he meets his dreams, and eventually realizes that he has got her too. He'}]

## Implementing XLA

Using these notes: https://huggingface.co/blog/tf-xla-generate

In [16]:
# This is how we use the generate function directly, without using
# a pipeline

# Tokenize a single prompt into TensorFlow tensors (input_ids and attention_mask)
inputs = tokenizer(["I went to the moon and "], return_tensors="tf")
print(inputs)
# generated = model.generate(**inputs, max_new_tokens=50)
# Sample up to 50 new tokens; no random seed is set in this notebook, so the
# text differs from run to run
generated = model.generate(**inputs, do_sample=True, max_new_tokens=50)
print("Sampling output: ", tokenizer.decode(generated[0]))

{'input_ids': <tf.Tensor: shape=(1, 7), dtype=int32, numpy=array([[  40, 1816,  284,  262, 8824,  290,  220]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 7), dtype=int32, numpy=array([[1, 1, 1, 1, 1, 1, 1]], dtype=int32)>}




Sampling output:  I went to the moon and  saw that my body was on fire and so I put my clothes and shoes back on. I couldn't think for a second on the next thing I went, I really couldn't remember if it was really my baby face or my baby figure.


In [31]:
# Define a test function
def most_likely_next_token(inputs):
    """Return, per sequence in the batch, the id of the highest-scoring next token.

    Runs the notebook-level ``model`` on ``inputs`` and takes the argmax over
    the logits of the last sequence position.
    """
    logits = model(inputs).logits
    last_position_logits = logits[:, -1, :]
    return tf.argmax(last_position_logits, axis=-1)

In [32]:
%%time
# Baseline: call the plain (non-XLA) function, timed for comparison with the
# XLA-compiled version in the following cells
print("Calling regular function with TensorFlow code...")
most_likely_next_token(inputs)

Calling regular function with TensorFlow code...
CPU times: user 134 ms, sys: 5.12 ms, total: 139 ms
Wall time: 138 ms


<tf.Tensor: shape=(1,), dtype=int64, numpy=array([3711])>

In [33]:
# Wrap the same Python function in tf.function with jit_compile=True to get an XLA-compiled version
xla_most_likely_next_token = tf.function(most_likely_next_token, jit_compile=True)

In [34]:
%%time
# First call of the XLA function: the measured time includes the one-off compilation
print("Calling XLA function... (for the first time -- will be slow)")
xla_most_likely_next_token(inputs)

Calling XLA function... (for the first time -- will be slow)
CPU times: user 5 s, sys: 40.6 ms, total: 5.04 s
Wall time: 6.43 s


<tf.Tensor: shape=(1,), dtype=int64, numpy=array([3711])>

In [35]:
%%time
# Second call with the very same inputs: no new compilation is expected, only execution
print("Calling XLA function... (for the second time -- will be fast)")
xla_most_likely_next_token(inputs)

Calling XLA function... (for the second time -- will be fast)
CPU times: user 5.42 ms, sys: 50 µs, total: 5.47 ms
Wall time: 4.45 ms


<tf.Tensor: shape=(1,), dtype=int64, numpy=array([3711])>

In [36]:
# Delving into the details...

# Note: execution times are deeply dependent on hardware -- a 3090 was used here.
import tensorflow as tf

@tf.function(jit_compile=True)
def max_plus_constant(tensor, scalar):
    """Return the largest element of ``tensor`` plus ``scalar`` (XLA-compiled).

    Used in the cells that follow to show which argument changes make a call
    slow again.
    """
    largest = tf.math.reduce_max(tensor)
    return largest + scalar

In [37]:
%%time
# Slow: XLA compilation will kick in, as it is the first call
# (three-element integer tensor, Python scalar 1)
max_plus_constant(tf.constant([0, 0, 0]), 1)

CPU times: user 44.6 ms, sys: 3 µs, total: 44.6 ms
Wall time: 101 ms


<tf.Tensor: shape=(), dtype=int32, numpy=1>

In [38]:
%%time
# Fast: Not the first call with this tensor shape, tensor type, and exact same
# non-tensor argument -- only the tensor's values differ from the previous call
max_plus_constant(tf.constant([1000, 0, -10]), 1)

CPU times: user 1.46 ms, sys: 64 µs, total: 1.52 ms
Wall time: 1.07 ms


<tf.Tensor: shape=(), dtype=int32, numpy=1001>

In [39]:
%%time
# Slow: Different tensor type (int64 instead of the default integer dtype used before)
max_plus_constant(tf.constant([0, 0, 0], dtype=tf.int64), 1)

CPU times: user 19.2 ms, sys: 0 ns, total: 19.2 ms
Wall time: 75.6 ms


<tf.Tensor: shape=(), dtype=int64, numpy=1>

In [40]:
%%time
# Slow: Different tensor shape (four elements instead of three)
max_plus_constant(tf.constant([0, 0, 0, 0]), 1)

CPU times: user 17 ms, sys: 2.14 ms, total: 19.2 ms
Wall time: 74.5 ms


<tf.Tensor: shape=(), dtype=int32, numpy=1>

In [41]:
%%time
# Slow: Different non-tensor argument (Python scalar 2 instead of 1)
max_plus_constant(tf.constant([0, 0, 0]), 2)

CPU times: user 18.6 ms, sys: 1.06 ms, total: 19.6 ms
Wall time: 75.9 ms


<tf.Tensor: shape=(), dtype=int32, numpy=2>

In [27]:
# Compare the tokenizer's EOS and PAD token ids.
# NOTE(review): the recorded output shows the same id for both, which does not
# match a tokenizer created with a separate PAD_TOKEN (len(tokenizer) is 50258
# earlier in the notebook). Presumably this output comes from a different
# session or tokenizer state -- confirm with Restart & Run All.
tokenizer.eos_token_id, tokenizer.pad_token_id

(50256, 50256)

In [61]:
# Make the model config use the EOS id as its padding id as well.
# NOTE(review): this mutates the config of the existing model object in place -- confirm this is intended
model.config.pad_token_id = model.config.eos_token_id

In [26]:
# Check which padding and EOS ids the model config currently holds
model.config.pad_token_id, model.config.eos_token_id

(50256, 50256)

In [7]:
# Two prompts that differ by one word, used by the (commented-out) cells below
# to compare tokenized input shapes
input_1 = ["TensorFlow is"]
input_2 = ["TensorFlow is a"]

In [None]:
# # Calls XLA generation without padding
# tokenized_input_1 = tokenizer(input_1, return_tensors="tf")  # length = 4
# tokenized_input_2 = tokenizer(input_2, return_tensors="tf")  # length = 5
# print(f"`tokenized_input_1` shape = {tokenized_input_1.input_ids.shape}")
# print(f"`tokenized_input_2` shape = {tokenized_input_2.input_ids.shape}")

In [18]:
# generated = model.generate(**tokenized_input_1, max_new_tokens=50)
# print("Sampling output: ", tokenizer.decode(generated[0]))

Sampling output:  TensorFlow is a very powerful and flexible programming language that can be used to build complex neural networks.

The goal of this article is to provide a simple and easy way to build a neural network using the Python programming language.

The goal of this article


In [23]:
# Pad every prompt up to a multiple of 32 tokens and return TensorFlow tensors.
# Presumably the point is that prompts of different lengths end up with the
# same input shape, since a new shape makes an XLA-compiled call slow again
# (see the max_plus_constant cells above).
tokenization_kwargs = {"pad_to_multiple_of": 32, "padding": True, "return_tensors": "tf"}
# Beam search with 4 beams, generating at most 500 new tokens
generation_kwargs = {"num_beams": 4, "max_new_tokens": 500}

# One line to create a XLA generation function
xla_generate = tf.function(model.generate, jit_compile=True)

In [24]:
# Generate a continuation for each prompt with the XLA-compiled generate
# function and report how long each call took.
input_prompts = [f"The best thing about {country} is" for country in ["Spain", "Japan", "Angola"]]
for input_prompt in input_prompts:
    tokenized_inputs = tokenizer([input_prompt], **tokenization_kwargs)
    # perf_counter_ns is a monotonic, high-resolution clock meant for measuring
    # durations; the wall clock (time_ns) can jump if the system time changes
    start = time.perf_counter_ns()
    generated_text = xla_generate(**tokenized_inputs, **generation_kwargs)
    end = time.perf_counter_ns()
    decoded_text = tokenizer.decode(generated_text[0], skip_special_tokens=True)
    print(f"Original prompt -- {input_prompt}")
    print(f"Generated -- {decoded_text}")
    print(f"Execution time -- {(end - start) / 1e6:.1f} ms\n")

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Original prompt -- The best thing about Spain is
Generated -- The best thing about Spain is that the people of the country are very proud of their country. The people of the country are very proud of their country. The people of the country are very proud of their country. The people of the country are very proud of their country. The people of the country are very proud of their country. The people of the country are very proud of their country. The people of the country are very proud of their country. The people of the country are very proud of their country. The people of the country are very proud of their country. The people of Spain are very proud of their country. 
Execution time -- 18149.5 ms

Original prompt -- The best thing about Japan is
Generated -- The best thing about Japan is that people live in peace. The best thing about Japan is that there is no violence. The best thing about Japan is that there is no violence. The best thing about Japan is that there is no violence

In [22]:
# Same timing loop with a different set of prompts, reusing the XLA-compiled
# generate function created earlier.
input_prompts = [f"Let me tell you a story about {country}" for country in ["Italy", "Ireland", "England"]]
for input_prompt in input_prompts:
    tokenized_inputs = tokenizer([input_prompt], **tokenization_kwargs)
    # perf_counter_ns is a monotonic, high-resolution clock meant for measuring
    # durations; the wall clock (time_ns) can jump if the system time changes
    start = time.perf_counter_ns()
    generated_text = xla_generate(**tokenized_inputs, **generation_kwargs)
    end = time.perf_counter_ns()
    decoded_text = tokenizer.decode(generated_text[0], skip_special_tokens=True)
    print(f"Original prompt -- {input_prompt}")
    print(f"Generated -- {decoded_text}")
    print(f"Execution time -- {(end - start) / 1e6:.1f} ms\n")

Original prompt -- Let me tell you a story about Italy
Generated -- Let me tell you a story about Italy, a country that has been shaken by the death of a young man. The story is told from the point of view of the young man s mother, who
Execution time -- 90.1 ms

Original prompt -- Let me tell you a story about Ireland
Generated -- Let me tell you a story about Ireland, a young Irish woman who has just moved to the United States. Her parents are divorced, and she has no children. Her father is a lawyer, and
Execution time -- 81.7 ms

Original prompt -- Let me tell you a story about England
Generated -- Let me tell you a story about England, a country that has been shaken by the death of a young man. The story is told from the point of view of the young man s mother, who
Execution time -- 84.9 ms

