In [0]:
## Quietly installing transformers package to import
## the GPT2Tokenizer and TFGPT2LMHeadModel
!pip install transformers -q

[K     |████████████████████████████████| 675kB 2.7MB/s 
[K     |████████████████████████████████| 3.8MB 92kB/s 
[K     |████████████████████████████████| 890kB 31.1MB/s 
[K     |████████████████████████████████| 1.1MB 38.1MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


## Imports

In [0]:
import tensorflow as tf

from transformers import GPT2Tokenizer, TFGPT2LMHeadModel

import os
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

## Downloading the Data

The dataset is available at https://www.kaggle.com/abhinavmoudgil95/short-jokes/data

Sign In to Kaggle and begin the download process for **shortjokes.csv** file. Then copy the link address for the download file and update the _URL varaiable.

Once done run all the cells in the notebook. Also, you can cancel the donwload process for the file. :)

In [0]:
_URL = 'https://storage.googleapis.com/kaggle-data-sets/781%2F1457%2Fcompressed%2Fshortjokes.csv.zip?GoogleAccessId=gcp-kaggle-com@kaggle-161607.iam.gserviceaccount.com&Expires=1591507341&Signature=FyX9Byp7qpIpWTuJi028%2F74JCoCSyi0r8%2FtCGTfx9H0jewKbi%2FGvXRt46owax54aYyDFCfzPmCAWPUKmm%2FKMYgZ%2BqsskQH%2F92PiuQlIT4fttjvKUNEpy14Dcd%2BNy4NCpqLUlU0TCoLgsYEak53yU23QbWBgus1HpFn7UXY1Az8TOjNRBQYk%2FXajaV1qlrrNKRC13K6v6WR2qTsL3tLbzalQWiPxfPv1TwQnqicmYdPxRkhiuv19iX7Y1qpp22ZSzUW6w80e9A5R%2BcAItllI43OrN9HRMljVyJMrDIHP%2FqLAsvTmc2yYBa2muwN4wNDwxOBGbYpsbC0I%2B2F2J1HZbAA%3D%3D'

In [0]:
path_to_zip = tf.keras.utils.get_file('shortjokes.csv.zip', origin=_URL, extract=True)

FILE_PATH = os.path.join(os.path.dirname(path_to_zip), 'shortjokes.csv')

Downloading data from https://storage.googleapis.com/kaggle-data-sets/781%2F1457%2Fcompressed%2Fshortjokes.csv.zip?GoogleAccessId=gcp-kaggle-com@kaggle-161607.iam.gserviceaccount.com&Expires=1591507341&Signature=FyX9Byp7qpIpWTuJi028%2F74JCoCSyi0r8%2FtCGTfx9H0jewKbi%2FGvXRt46owax54aYyDFCfzPmCAWPUKmm%2FKMYgZ%2BqsskQH%2F92PiuQlIT4fttjvKUNEpy14Dcd%2BNy4NCpqLUlU0TCoLgsYEak53yU23QbWBgus1HpFn7UXY1Az8TOjNRBQYk%2FXajaV1qlrrNKRC13K6v6WR2qTsL3tLbzalQWiPxfPv1TwQnqicmYdPxRkhiuv19iX7Y1qpp22ZSzUW6w80e9A5R%2BcAItllI43OrN9HRMljVyJMrDIHP%2FqLAsvTmc2yYBa2muwN4wNDwxOBGbYpsbC0I%2B2F2J1HZbAA%3D%3D


## Preparing the Dataset

### Extracting jokes list from CSV

In [0]:
pd.options.display.max_colwidth = None

In [0]:
jokes = pd.read_csv(FILE_PATH)
jokes.head()

Unnamed: 0,ID,Joke
0,1,"[me narrating a documentary about narrators] ""I can't hear what they're saying cuz I'm talking"""
1,2,"Telling my daughter garlic is good for you. Good immune system and keeps pests away.Ticks, mosquitos, vampires... men."
2,3,I've been going through a really rough period at work this week It's my own fault for swapping my tampax for sand paper.
3,4,"If I could have dinner with anyone, dead or alive... ...I would choose alive. -B.J. Novak-"
4,5,Two guys walk into a bar. The third guy ducks.


In [0]:
jokeslist = jokes['Joke'].to_list()
jokeslist[:5]

['[me narrating a documentary about narrators] "I can\'t hear what they\'re saying cuz I\'m talking"',
 'Telling my daughter garlic is good for you. Good immune system and keeps pests away.Ticks, mosquitos, vampires... men.',
 "I've been going through a really rough period at work this week It's my own fault for swapping my tampax for sand paper.",
 'If I could have dinner with anyone, dead or alive... ...I would choose alive. -B.J. Novak-',
 'Two guys walk into a bar. The third guy ducks.']

### Creating the Tokenizer for word tokenization

In [0]:
Tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




In [0]:
special_tokens_dict = {'pad_token': 'pad'}
num_added_toks = Tokenizer.add_special_tokens(special_tokens_dict)

START_TOKEN = '<|start|> '
END_TOKEN = ' <|end|>'

### Create Dataset from List

In [0]:
# A utility method to create a tf.data dataset from a List of jokes
def jokeslist_to_dataset(jokeslist, tokenizer, 
                  shuffle=True, batch_size=16, MAX_LEN = 64):
  

  jokeslist = [START_TOKEN + joke + END_TOKEN for joke in jokeslist]

  encodings = [tokenizer.encode_plus(joke,
                                  None,
                                  add_special_tokens = True,
                                  max_length = MAX_LEN,
                                  pad_to_max_length = True) 
              for joke in jokeslist]

  ids = [x['input_ids'] for x in encodings]
  masks = [x['attention_mask'] for x in encodings]
  types = [x['token_type_ids'] for x in encodings]

  inputs = {}
  inputs['input_ids'] = ids
  inputs['attention_mask'] = masks
  inputs['token_type_ids'] = types

  ds = tf.data.Dataset.from_tensor_slices(inputs)

  if shuffle:
    ds = ds.shuffle(buffer_size=len(jokeslist))

  ds = ds.batch(batch_size)

  return ds

Note: This is a costly process since all the tokenization is done immediately. Thus it is expected to be slow. The only advantage is since everything is processed and kept in memory we are saving repated operationg while training.

In [0]:
## In case you wish to only test the model do not run this cell

jokes_dataset = jokeslist_to_dataset(jokeslist, Tokenizer)

In [0]:
for x in jokes_dataset:
  a, b, c = x['input_ids'], x['attention_mask'], x['token_type_ids']
  print(a[0], b[0], c[0])
  break

tf.Tensor(
[   27    91  9688    91    29   314  1549   787   257  1964  9707   475
   340   561   655   886   510   852  7018    13  1279    91   437    91
    29 15636 15636 15636 15636 15636 15636 15636 15636 15636 15636 15636
 15636 15636 15636 15636 15636 15636 15636 15636 15636 15636 15636 15636
 15636 15636 15636 15636 15636 15636 15636 15636 15636 15636 15636 15636
 15636 15636 15636 15636], shape=(64,), dtype=int32) tf.Tensor(
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], shape=(64,), dtype=int32) tf.Tensor(
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], shape=(64,), dtype=int32)


## The Model

In [0]:
model = TFGPT2LMHeadModel.from_pretrained('gpt2-medium')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=718.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1419628976.0, style=ProgressStyle(descr…




In [0]:
model.summary()

Model: "tfgp_t2lm_head_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
transformer (TFGPT2MainLayer multiple                  354823168 
Total params: 354,823,168
Trainable params: 354,823,168
Non-trainable params: 0
_________________________________________________________________


## Loss Function and Optimizer

In [0]:
loss_function = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

In [0]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5, epsilon=1e-08, clipnorm=1.0)

## Checkpointing

In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
CHECKPOINT_PATH = "/content/gdrive/My Drive/Weights/JokeGenGPT2"

In [0]:
checkpoint_path = CHECKPOINT_PATH

ckpt = tf.train.Checkpoint(model = model)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print ('Latest checkpoint restored!!')

Latest checkpoint restored!!


## Training

In [0]:
@tf.function
def train_step(data_dict):  
  with tf.GradientTape() as tape:

    outputs = model(data_dict)

    lm_logits = outputs[0]
    labels = data_dict['input_ids']

    ## For computing loss we remove the last element from logits and 
    ## first from labels. Thus we need the model to learn to predict next
    ## word more confidently among the others. 
    shift_logits = lm_logits[..., :-1, :]
    shift_labels = labels[..., 1:]
    
    loss = loss_function(tf.reshape(shift_labels, (-1,)),
         tf.reshape(shift_logits, 
                   (-1, shift_logits.shape[-1])))

  gradients = tape.gradient(loss, model.trainable_variables)    
  optimizer.apply_gradients(zip(gradients, model.trainable_variables))

  return loss

In [0]:
EPOCHS = 4

for epoch in range(EPOCHS):
  
  for batch, data in tqdm(enumerate(jokes_dataset)):
    loss = train_step(data)
    if batch % 100 == 0:
      print('Epoch : {0} Batch : {1} ---- Loss : {2}'.format(epoch+1, batch+1, loss))

  

  ckpt_save_path = ckpt_manager.save()
  print ('Saving checkpoint for epoch {} at {}'.format(epoch+1,
                                                        ckpt_save_path))

## Inference

### Generating Joke from Scratch

In [0]:
def exploit_best_token_while_exploring(probabilites, exploration_len=5):
    ## Get the top k probabilites indices where k is exploration_len 
    top_indices = np.argpartition(probabilites, -exploration_len)\
                            [-exploration_len:]

    ## Getting top proabilities value
    top_probabilities = probabilites[top_indices]

    ## Normalizing it so that they sum is 1
    top_probabilities = top_probabilities / np.sum(top_probabilities)

    ## Although we choose next token randomly for our options we 
    ## pass the probabilities associated with each to account for the
    ## model's confidence for the token also. 
    choice = np.random.choice(exploration_len, 1, p = top_probabilities)

    ## Return next token id based on choice
    next_token_id = int(top_indices[choice][0])
    return next_token_id

In [0]:
def generate_joke(joke_length = 64):

  ## Begin by appending the START_TOKEN to our current joke
  current_joke = tf.expand_dims(tf.convert_to_tensor(Tokenizer.encode(START_TOKEN)), 0)
  
  for pos in range(joke_length):

    ## Get output of model for the current_joke
    output = model(current_joke)

    ## Getting the logits value from output tuple i.e. (logits)
    logits = output[0]

    ## Logits is of the shape (BATCH, LEN_INPUT) 
    ## So, since batch size is 1 we get 0th index
    ## and the softmax for only the next possible word i.e. -1
    softmax_logits = tf.nn.softmax(logits[0, -1], axis=0).numpy()

    ## Depending on whether we are in initial or final stages of joke
    ## we determine how many options we should explore to make sure we have
    ## variety in jokes generated
    if pos == 0:
      # If we are predicting first word we need maximum exploration
      exploration_len = 50

    elif pos < 4:
      # The next three words have exploration length 15
      exploration_len = 15

    else:
      # As we move further we narrow our exploration length
      exploration_len = 4

    ## Get the token we should append to current joke
    token_to_append = exploit_best_token_while_exploring(softmax_logits, 
                                                         exploration_len)

    ## Append the token to current joke
    current_joke = tf.concat([current_joke, 
                              tf.ones((1,1), dtype = tf.int32)*token_to_append], 
                             axis = 1)
    
    ## In case the token belongs to the END_TOKEN we return it as complete joke
    if token_to_append in Tokenizer.encode(END_TOKEN):
      return Tokenizer.decode(list(tf.squeeze(current_joke).numpy()))
  
  ## If we did not get end token it means no joke is formed
  return None

In [0]:
generate_joke()

'<|start|> I had a dream last night that I was the president of the US... I had to sleep through it, but it was good. <'

In [0]:
generate_joke()

'<|start|> "What\'s a good joke about the Titanic?" "The one about the icebergs?" "No, the iceberg jokes aren\'t good. The Titanic\'s a good example." <'

In [0]:
generate_joke()

"<|start|> Why does Peter Pan always fly? It's because he Neverlands. <"

In [0]:
generate_joke()

'<|start|> Q: What kind of shoes do pirates wear? A: Sneakers <'

In [0]:
generate_joke()

"<|start|> Why did the blonde have a sore throat? Because she couldn't concentrate <"

In [0]:
generate_joke()

'<|start|> I was asked to be the spokesperson for the LGBT community in the United States. I said, "I\'ll be gay." They laughed and I said, "I\'m sorry, I don\'t think I\'m gay." <'

### Completing Sentences in Humorous Way

In [0]:
def complete_joke(initial_string = '', joke_length = 64):

  ## Begin by appending the START_TOKEN along with initial string to 
  ## our current joke sentence
  current_joke = tf.expand_dims(
      tf.convert_to_tensor(Tokenizer.encode(START_TOKEN + initial_string)), 
      0)
  
  for pos in range(joke_length):

    ## Get output of model for the current_joke
    output = model(current_joke)

    ## Getting the logits value from output tuple i.e. (logits)
    logits = output[0]

    ## Logits is of the shape (BATCH, LEN_INPUT) 
    ## So, since batch size is 1 we get 0th index
    ## and the softmax for only the next possible word i.e. -1
    softmax_logits = tf.nn.softmax(logits[0, -1], axis=0).numpy()

    ## Depending on whether we are in initial or final stages of joke
    ## we determine how many options we should explore to make sure we have
    ## variety in jokes generated
    if pos == 0:
      # If we are predicting first word we need maximum exploration
      exploration_len = 50

    elif pos < 4:
      # The next three words have exploration length 15
      exploration_len = 15

    else:
      # As we move further we narrow our exploration length
      exploration_len = 4

    ## Get the token we should append to current joke
    token_to_append = exploit_best_token_while_exploring(softmax_logits, 
                                                         exploration_len)

    ## Append the token to current joke
    current_joke = tf.concat([current_joke, 
                              tf.ones((1,1), dtype = tf.int32)*token_to_append], 
                             axis = 1)
    
    ## In case the token belongs to the END_TOKEN we return it as complete joke
    if token_to_append in Tokenizer.encode(END_TOKEN):
      return Tokenizer.decode(list(tf.squeeze(current_joke).numpy()))
  
  ## If we did not get end token it means no joke is formed and we try again
  return complete_joke(initial_string, joke_length)

In [0]:
complete_joke("Jokes are")

"<|start|> Jokes are just jokes. If i ever had a son i'd call him 'Joke' <"

In [0]:
complete_joke("I am trained enough")

'<|start|> I am trained enough to recognize a fake name on a Facebook profile. <'

In [0]:
complete_joke("I can do this all day")

'<|start|> I can do this all day long. "I can do this all day long." -People with no idea what the word "all" means <'

In [0]:
complete_joke("Hope is a good thing")

"<|start|> Hope is a good thing.. until you don't have any <"

In [0]:
complete_joke("Where is ")

"<|start|> Where is a good place for a kid's birthday party? In a coffin. <"

In [0]:
complete_joke("Why do")

"<|start|> Why do girls always get lost in the woods? Because they're not smart enough to walk alone. <"

In [0]:
complete_joke("Don't lie ")

"<|start|> Don't lie to your spouse, your boss, your neighbor, or anyone else about how many people you've slept with. <"