<a href="https://colab.research.google.com/github/stevec12/VTubers-Analysis/blob/main/CommentPrompting2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Comment Prompting
This Jupyter notebook looks at training a basic transformer to provide responses to prompts based on how a YoutTuber's comments would likely reply.

The YouTuber chosen is for the demo is [Ceres Fauna](!https://www.youtube.com/channel/UCO_aKKYxn4tvrqPjcTzZ6EQ), an English streamer with predominantly English comments.

The channel ID is `UCO_aKKYxn4tvrqPjcTzZ6EQ`.

# Data Extraction
The `YouTube Data API v3` can be used for this task, and an account-linked API-key can be obtained using your personal Google (Developer) Account.

In [1]:
import googleapiclient.discovery
import googleapiclient.errors

import numpy as np
import pandas as pd
!pip install xlsxwriter
import xlsxwriter

Collecting xlsxwriter
  Downloading XlsxWriter-3.1.9-py3-none-any.whl (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.8/154.8 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xlsxwriter
Successfully installed xlsxwriter-3.1.9


In [None]:
print("Input API Key: ")
api_key = input()

Input API Key: 


In [None]:
# Input target channel, example is @CeresFauna
channelID = 'UCO_aKKYxn4tvrqPjcTzZ6EQ'

In [None]:
api_service_name = "youtube"
api_version = "v3"
youtube = googleapiclient.discovery.build(api_service_name, api_version, developerKey=api_key)

In [None]:
def find_uploadedID(channelID):
  request = youtube.channels().list(
      part="contentDetails",
      id=channelID
    )
  response = request.execute()

  return response['items'][0]['contentDetails']['relatedPlaylists']['uploads']

In [None]:
uploadedID=find_uploadedID(channelID)

In [None]:
def find_uploaded(uploadedID):
  videoIDs = []
  request = youtube.playlistItems().list(
        part="contentDetails",
        playlistId = uploadedID,
        maxResults = 50
  )
  response = request.execute()
  for item in response['items']:
    videoIDs.append(item['contentDetails']['videoId'])
  while('nextPageToken' in response):
    request=youtube.playlistItems().list(
        part='contentDetails',
        playlistId=uploadedID,
        pageToken=response['nextPageToken'],
        maxResults=50)
    response = request.execute()
    for item in response['items']:
      videoIDs.append(item['contentDetails']['videoId'])

  return videoIDs

In [None]:
uploaded=find_uploaded(uploadedID)

In [None]:
def get_video_comments(videoID : str) -> pd.DataFrame:
  '''
  Given a videoID, return a pandas DataFrame with video info
  '''
  column_names = ['videoID','isTopLevel','topLevelID','commentID','authorDisplayName',
                  'likeCount','publishedAt','totalReplyCount','textOriginal']

  row_list = [] # Used to create list of dict of rows before conversion to dataframe, faster
  pageToken=''
  while(True):
    request=youtube.commentThreads().list(
        part="id,snippet,replies",
        videoId=videoID,
        pageToken=pageToken,
        maxResults=100
    )
    try:
      response=request.execute()
    except googleapiclient.errors.HttpError:
      break

    for commentThread in response['items']:
      # write top level comment
      topLevelID=commentThread['snippet']['topLevelComment']['id']
      commentID=topLevelID
      authorDisplayName=commentThread['snippet']['topLevelComment']['snippet']['authorDisplayName']
      likeCount=commentThread['snippet']['topLevelComment']['snippet']['likeCount']
      publishedAt=commentThread['snippet']['topLevelComment']['snippet']['publishedAt']
      totalReplyCount=commentThread['snippet']['totalReplyCount']
      textOriginal=commentThread['snippet']['topLevelComment']['snippet']['textOriginal']

      row_list.append({'videoID':videoID,'isTopLevel':True,'topLevelID':topLevelID,
                      'commentID':commentID,'authorDisplayName':authorDisplayName,
                      'likeCount':likeCount,'publishedAt':publishedAt,
                      'totalReplyCount':totalReplyCount,'textOriginal':textOriginal})

      # If any replies, write them as well
      if 'replies' in commentThread:
        for reply in commentThread['replies']['comments']:
          commentID=reply['id']
          authorDisplayName=reply['snippet']['authorDisplayName']
          likeCount=reply['snippet']['likeCount']
          publishedAt=reply['snippet']['publishedAt']
          textOriginal=reply['snippet']['textOriginal']

          row_list.append({'videoID':videoID,'isTopLevel':False,'topLevelID':topLevelID,
                           'commentID':commentID,'authorDisplayName':authorDisplayName,
                           'likeCount':likeCount,'publishedAt':publishedAt,
                           'totalReplyCount':totalReplyCount,'textOriginal':textOriginal})

    if 'nextPageToken' not in response:
      break
    else:
      pageToken=response['nextPageToken']

  return pd.DataFrame(row_list, columns=column_names)


In [None]:
def uploaded_comments_to_excel(file_name, uploaded = uploaded):
  '''
  Writes all comments in the Uploaded playlist to an excel file, as a single
  worksheet.
  '''
  column_names = ['videoID','isTopLevel','topLevelID','commentID','authorDisplayName',
                  'likeCount','publishedAt','totalReplyCount','textOriginal']
  comment_df = get_video_comments(uploaded[0])

  for videoID in uploaded[1:]:
    comment_df = pd.concat([comment_df, get_video_comments(videoID)])

  comment_df.to_excel(file_name, engine='xlsxwriter', index=False)


In [None]:
uploaded_comments_to_excel('ceres_fauna_comments_10_27_2023.xlsx')

# Preparing the Data
Preparing the data using TensorFlow preprocessing layers.

Here, we use the `ceres_fauna_comments_10_27_2023.xlsx` excel file generated earlier.

In [2]:
import tensorflow as tf
!pip install tensorflow_text
import tensorflow_text as text
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab

Collecting tensorflow_text
  Downloading tensorflow_text-2.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow_text
Successfully installed tensorflow_text-2.14.0


In [3]:
# Load data into a pandas DataFrame
comments_df = pd.read_excel('ceres_fauna_comments_10_27_2023.xlsx')

In [4]:
# We filter out comments that are not at least two seperate words
multiple_word_indices = np.char.find(comments_df['textOriginal'].to_numpy(dtype='str'), " ") > -1
multiple_word_series = comments_df.copy().loc[multiple_word_indices]['textOriginal']

comments_tensor = tf.convert_to_tensor(multiple_word_series.to_numpy(dtype='str'), dtype='string')

We split the data into train, validation, and test splits.

For reasonable training times, we use a 50/10/40 split.

In [5]:
comment_ds = tf.data.Dataset.from_tensor_slices(comments_tensor).shuffle(1000, seed=12)

train_split = int(np.floor(0.5*len(comment_ds)))
val_split = int(np.floor(0.1*len(comment_ds)))
test_split = int(len(comment_ds) - train_split - val_split)

train_ds = comment_ds.take(train_split)
val_ds = comment_ds.skip(train_split).take(val_split)
test_ds = comment_ds.skip(train_split + val_split).take(test_split)

Generate vocabulary using [subword tokenizers](https://www.tensorflow.org/text/guide/subwords_tokenizer) tutorial.

In [6]:
bert_tokenizer_params=dict(lower_case=True)
reserved_tokens=["[PAD]", "[UNK]", "[START]", "[END]"]

bert_vocab_args = dict(
    # The target vocabulary size
    vocab_size = 8000,
    # Reserved tokens that must be included in the vocabulary
    reserved_tokens=reserved_tokens,
    # Arguments for `text.BertTokenizer`
    bert_tokenizer_params=bert_tokenizer_params,
    # Arguments for `wordpiece_vocab.wordpiece_tokenizer_learner_lib.learn`
    learn_params={},
)

In [7]:
vocab_file = 'vocab2.txt'

In [None]:
%%time
vocab = bert_vocab.bert_vocab_from_dataset(
    train_ds.batch(1000).prefetch(2),
    **bert_vocab_args
)

# Save vocab to file

with open(vocab_file, 'w') as f:
  for token in vocab:
    print(token, file=f)

CPU times: user 1min 12s, sys: 256 ms, total: 1min 12s
Wall time: 1min 13s


In [8]:
vocab_size = 0
with open(vocab_file, "rb") as f:
    vocab_size = sum(1 for _ in f)

Tokenize, trim (to `MAX_TOKENS`), and pad the inputs, as well as form into (input, label) Datasets where the label is the input right-shifted by one token.

Then batch (batch size = `BATCH_SIZE`) and prefetch data.

In [9]:
tokenizer = text.BertTokenizer(vocab_file, **bert_tokenizer_params)

In [10]:
MAX_TOKENS = 128
BATCH_SIZE = 64

In [11]:
START = tf.argmax(tf.constant(reserved_tokens) == "[START]")
END = tf.argmax(tf.constant(reserved_tokens) == "[END]")

def add_start_end(ragged):
  count = ragged.bounding_shape()[0]
  starts = tf.fill([count,1,1], START)
  ends = tf.fill([count,1,1], END)

  return tf.concat([starts, ragged, ends], axis=1)

We want to train the model to predict comments from given prompts. We have some options to do this:
* Take the first half of the tokens as input, right-shift for the teacher, and take the re

In [12]:
def prepare_batch(input_batch : tf.Tensor, batch_size = BATCH_SIZE, max_tokens=MAX_TOKENS):
  '''
  Take tensor with (sentence) and split into ((prompt,teacher),label) for training.
  Firstly tokenized and trimmed, (sentence) is then split up to a random point k. Then:
  prompt = [START] + sentence[0:k] + [END]
  label = sentence + [END]
  teacher = [START] + sentence
  Finally, pad outputs to MAX_TOKENS length.
  '''
  # Tokenize
  tokens = tokenizer.tokenize(input_batch)[:,:max_tokens-1,:]

  # Create Prompts
  prompt_tokens = tokens[:,:-1,:]
  # Select a prompt length
  token_lens = tf.cast(prompt_tokens.row_lengths()-1, dtype='float32') # Keep at least 1 token outside the prompt
  #print(token_lens)
  prompt_lens = tf.floor(tf.random.uniform([batch_size], tf.zeros_like(token_lens), token_lens))
  #print(prompt_lens)
  prompt_lens = tf.squeeze(tf.cast(prompt_lens,dtype='int32'))
  # Form prompts of varied length
  prompt = tf.squeeze(prompt_tokens.to_tensor(shape=(batch_size,max_tokens-2,1)), axis=2)
  #print(prompt)
  #print(prompt_lens)
  prompt = tf.RaggedTensor.from_tensor(prompt,prompt_lens)[:,:,tf.newaxis]

  # Tokenize
  prompt = add_start_end(prompt)
  teacher = add_start_end(tokens)[:,:-1,:]
  label = add_start_end(tokens)[:,1:,:]

  # 0-Pad  convert to dense tensor, then form shape (batch_size, max_tokens)
  prompt = tf.squeeze(prompt.to_tensor(shape=(batch_size,max_tokens,1)))
  teacher = tf.squeeze(teacher.to_tensor(shape=(batch_size,max_tokens,1)))
  label = tf.squeeze(label.to_tensor(shape=(batch_size,max_tokens,1)))

  # form Dataset
  output_batch = ((prompt,teacher), label)

  return output_batch

In [13]:
# Drop Remainder required for tf.random.uniform
def make_batches(ds):
  return (
      ds
      .batch(BATCH_SIZE, drop_remainder=True)
      .map(prepare_batch, tf.data.AUTOTUNE)
      .prefetch(buffer_size=tf.data.AUTOTUNE))

In [14]:
train_batches = make_batches(train_ds)
val_batches = make_batches(val_ds)

Take a singular batch as an example.

In [15]:
for (input,teacher), label in train_batches.take(1):
  print(input.shape)
  print(teacher.shape)
  print(label.shape)

(64, 128)
(64, 128)
(64, 128)


In [None]:
print(input[0])
print(teacher[0])
print(label[0])

tf.Tensor(
[   2 1054 1063 1182 1045 1048 1349 1780    3    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0], shape=(128,), dtype=int64)
tf.Tensor(
[   2 1054 1063 1182 1045 1048 1349 1780 1057  988  998 1288  993 1574
 1131  986 1793  985 2492  992   53    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0

Convert tokens to vectors with a `tf.keras.layers.Embedding` layer and add positional encoding.

In [16]:
def positional_encoding(length, depth):
  depth = depth/2

  positions = np.arange(length)[:, np.newaxis]     # (seq, 1)
  depths = np.arange(depth)[np.newaxis, :]/depth   # (1, depth)

  angle_rates = 1 / (10000**depths)         # (1, depth)
  angle_rads = positions * angle_rates      # (pos, depth)

  pos_encoding = np.concatenate(
      [np.sin(angle_rads), np.cos(angle_rads)],
      axis=-1)

  return tf.cast(pos_encoding, dtype=tf.float32)

In [17]:
class PositionalEmbedding(tf.keras.layers.Layer):
  def __init__(self, vocab_size, d_model):
    super().__init__()
    self.d_model = d_model
    self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=True)
    self.pos_encoding = positional_encoding(length=2048, depth=d_model)

  def compute_mask(self, *args, **kwargs):
    return self.embedding.compute_mask(*args, **kwargs)

  def call(self, x):
    length = tf.shape(x)[1]
    x = self.embedding(x)
    # This factor sets the relative scale of the embedding and positonal_encoding.
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x = x + self.pos_encoding[tf.newaxis, :length, :]
    return x

In [None]:
embed = PositionalEmbedding(vocab_size=vocab_size, d_model=512)
te_emb = embed(teacher)
te_emb._keras_mask;
in_emb = embed(input)
in_emb._keras_mask;

In [18]:
class BaseAttention(tf.keras.layers.Layer):
  def __init__(self, **kwargs):
    super().__init__()
    self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
    self.layernorm = tf.keras.layers.LayerNormalization()
    self.add = tf.keras.layers.Add()

In [19]:
class CrossAttention(BaseAttention):
  '''def __init__(self, **kwargs):
    print('Initializing CrossAttention')
    super().__init__(self, **kwargs)'''

  def call(self, x, context):
    attn_output, attn_scores = self.mha(
        query = x,
        key = context,
        value = context,
        return_attention_scores = True
    )
    # Cache the attention scores for plotting later.
    self.last_attn_scores = attn_scores

    x = self.add([x,attn_output])
    x = self.layernorm(x)
    return x

In [None]:
sample_ca = CrossAttention(num_heads=2, key_dim=512)
print(in_emb.shape)
print(sample_ca(in_emb, te_emb).shape)

(64, 128, 512)
(64, 128, 512)


In [89]:
class GlobalSelfAttention(BaseAttention):
  def call(self, x):
    attn_output = self.mha(
        query = x,
        key = x,
        value = x
    )
    x = self.add([x, attn_output])
    x = self.layernorm(x)
    return x

In [None]:
sample_gsa = GlobalSelfAttention(num_heads=2, key_dim=512)
print(in_emb.shape)
print(sample_gsa(in_emb).shape)

(64, 128, 512)
(64, 128, 512)


In [90]:
class CausalSelfAttention(BaseAttention):
  def call(self, x):
    attn_output = self.mha(
        query = x,
        key = x,
        value = x,
        use_causal_mask = True
    )
    x = self.add([x, attn_output])
    x = self.layernorm(x)
    return x

In [None]:
sample_csa = CausalSelfAttention(num_heads=2, key_dim=512)
print(te_emb.shape)
print(sample_csa(te_emb).shape)

(64, 128, 512)
(64, 128, 512)


In [22]:
class FeedForward(tf.keras.layers.Layer):
  def __init__(self, d_model, dff, dropout_rate=0.1):
    super().__init__()
    self.seq = tf.keras.Sequential([
        tf.keras.layers.Dense(dff, activation='relu'),
        tf.keras.layers.Dense(d_model),
        tf.keras.layers.Dropout(dropout_rate)
    ])
    self.add = tf.keras.layers.Add()
    self.layer_norm = tf.keras.layers.LayerNormalization()

  def call(self, x):
    x = self.add([x,self.seq(x)])
    x = self.layer_norm(x)
    return x

In [None]:
sample_ffn = FeedForward(512,2048)

print(te_emb.shape)
print(sample_ffn(te_emb).shape)

(64, 128, 512)
(64, 128, 512)


In [91]:
class EncoderLayer(tf.keras.layers.Layer):
  def __init__(self,*,d_model,num_heads,dff,dropout_rate=0.1):
    super().__init__()

    self.self_attention = GlobalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate
    )
    self.ffn = FeedForward(d_model,dff)

  def call(self,x):
    x = self.self_attention(x)
    x = self.ffn(x)
    return x

In [None]:
sample_encoder_layer = EncoderLayer(d_model=512, num_heads=8,dff=2048)
print(in_emb.shape)
print(sample_encoder_layer(in_emb).shape)

(64, 128, 512)
(64, 128, 512)


In [92]:
class Encoder(tf.keras.layers.Layer):
  def __init__(self, *, num_layers, d_model, num_heads, dff, vocab_size, dropout_rate=0.1):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(
        vocab_size = vocab_size, d_model = d_model
    )

    self.enc_layers = [
        EncoderLayer(d_model=d_model,
                     num_heads=num_heads,
                     dff=dff,
                     dropout_rate=dropout_rate)
        for _ in range(num_layers)
    ]
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

  def call(self,x):
    # `x` is token-IDs shape: (batch_size, seq_len)
    x = self.pos_embedding(x) # Shape '(batch_size, seq_len, d_model)'.

    # Add dropout
    x = self.dropout(x)

    for i in range(self.num_layers):
      x = self.enc_layers[i](x)

    return x # Shape `(batch_size, seq_length, d_model)`

In [None]:
# Test encoder
sample_encoder = Encoder(num_layers=4, d_model=512, num_heads=8, dff=2048, vocab_size=vocab_size)
sample_encoder_output = sample_encoder(input,training=False)

print(in_emb.shape)
print(sample_encoder_output.shape) # Shape `(batch_size, input_seq_len, d_model)`

(64, 128, 512)
(64, 128, 512)


In [93]:
class DecoderLayer(tf.keras.layers.Layer):
  def __init__(self,*,d_model,num_heads,dff,dropout_rate=0.1):
    super(DecoderLayer,self).__init__()

    self.causal_self_attention = CausalSelfAttention(
        num_heads = num_heads,
        key_dim = d_model,
        dropout = dropout_rate
    )

    self.cross_attention = CrossAttention(
        num_heads = num_heads,
        key_dim = d_model,
        dropout = dropout_rate
    )

    self.ffn = FeedForward(d_model, dff)

  def call(self, x, context):
    x = self.causal_self_attention(x=x)
    x = self.cross_attention(x=x, context=context)

    # Cache last attention scores for plotting later
    self.last_attn_scores = self.cross_attention.last_attn_scores

    x = self.ffn(x) # Shape `(batch_size, seq_len, d_model)`
    return x

In [None]:
sample_decoder_layer = DecoderLayer(d_model=512, num_heads=8, dff=2048)

sample_decoder_layer_output = sample_decoder_layer(x=te_emb, context=in_emb)

print(te_emb.shape)
print(in_emb.shape)
print(sample_decoder_layer_output.shape) # `(batch_size, seq_len, d_model)`

(64, 128, 512)
(64, 128, 512)
(64, 128, 512)


In [94]:
class Decoder(tf.keras.layers.Layer):
  def __init__(self,*,num_layers, d_model, num_heads, dff, vocab_size, dropout_rate=0.1):
    super(Decoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(vocab_size=vocab_size, d_model=d_model)

    self.dropout = tf.keras.layers.Dropout(dropout_rate)

    self.dec_layers = [
        DecoderLayer(d_model=d_model, num_heads=num_heads,
                     dff=dff, dropout_rate=dropout_rate)
        for _ in range(num_layers)
    ]
    self.last_attn_scores = None

  def call(self, x, context):
    # `x` is token-IDs shape (batch, target_seq_len)
    x = self.pos_embedding(x) # (batch_size, target_seq_len, d_model)

    x = self.dropout(x)

    for i in range(self.num_layers):
      x = self.dec_layers[i](x,context)

    self.last_attn_scores = self.dec_layers[-1].last_attn_scores

    # shape of x is (batch_size, target_seq_len, d_model)
    return x


In [None]:
sample_decoder = Decoder(num_layers=4, d_model=512, num_heads=8,
                         dff=2048, vocab_size=vocab_size)

output = sample_decoder(x=teacher, context=in_emb)

print(teacher.shape)
print(in_emb.shape)
print(output.shape)

(64, 128)
(64, 128, 512)
(64, 128, 512)


In [None]:
sample_decoder.last_attn_scores

<tf.Tensor: shape=(64, 8, 128, 128), dtype=float32, numpy=
array([[[[0.11104868, 0.11142529, 0.11069106, ..., 0.        ,
          0.        , 0.        ],
         [0.11103439, 0.11104572, 0.11088928, ..., 0.        ,
          0.        , 0.        ],
         [0.11159551, 0.11177482, 0.11118655, ..., 0.        ,
          0.        , 0.        ],
         ...,
         [0.0078125 , 0.0078125 , 0.0078125 , ..., 0.0078125 ,
          0.0078125 , 0.0078125 ],
         [0.0078125 , 0.0078125 , 0.0078125 , ..., 0.0078125 ,
          0.0078125 , 0.0078125 ],
         [0.0078125 , 0.0078125 , 0.0078125 , ..., 0.0078125 ,
          0.0078125 , 0.0078125 ]],

        [[0.11113437, 0.11103407, 0.11085951, ..., 0.        ,
          0.        , 0.        ],
         [0.11089264, 0.11112651, 0.11106726, ..., 0.        ,
          0.        , 0.        ],
         [0.1107775 , 0.1106852 , 0.11086118, ..., 0.        ,
          0.        , 0.        ],
         ...,
         [0.0078125 , 0.00781

In [95]:
class Transformer(tf.keras.Model):
  def __init__(self, *, num_layers, d_model, num_heads, dff,
               input_vocab_size, target_vocab_size, dropout_rate=0.1):
    super().__init__()
    self.encoder = Encoder(num_layers=num_layers, d_model=d_model, num_heads=num_heads,
                           dff=dff, vocab_size=input_vocab_size, dropout_rate=dropout_rate)
    self.decoder = Decoder(num_layers=num_layers, d_model=d_model, num_heads=num_heads,
                           dff=dff, vocab_size=target_vocab_size, dropout_rate=dropout_rate)

    self.final_layer = tf.keras.layers.Dense(target_vocab_size)

  def call(self, inputs):
    # To support Keras model '.fit', pass all inputs as first argument
    context, x = inputs

    context = self.encoder(context) # (batch_size, context_len, d_model)

    x = self.decoder(x, context) # (batch_size, target_len, d_model)

    logits = self.final_layer(x) # (batch_size, target_len, target_vocab_size)

    try:
      # Drop keras mask, so it doesn't scale losses/metrics
      del logits._keras_mask
    except AttributeError:
      pass

    # Return final output and attention weights
    return logits

## Hyperparameters

In [31]:
num_layers = 4
d_model = 128
dff = 512
num_heads = 8
dropout_rate = 0.1

## Testing

In [None]:
transformer = Transformer(num_layers=num_layers, d_model=d_model, num_heads=num_heads,
                          dff=dff, input_vocab_size=vocab_size, target_vocab_size=vocab_size,
                          dropout_rate=dropout_rate)

In [None]:
output = transformer((input,teacher))
print(teacher.shape)
print(input.shape)
print(output.shape)

(64, 128)
(64, 128)
(64, 128, 7986)


In [None]:
attn_scores = transformer.decoder.dec_layers[-1].last_attn_scores
print(attn_scores.shape) # batch, heads, target_seq, input_seq

(64, 4, 128, 128)


In [None]:
transformer.summary()

Model: "transformer"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder (Encoder)           multiple                  3661056   
                                                                 
 decoder (Decoder)           multiple                  5772032   
                                                                 
 dense_16 (Dense)            multiple                  1030194   
                                                                 
Total params: 10463282 (39.91 MB)
Trainable params: 10463282 (39.91 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
pred_token_vector = tf.argmax(output,axis=2)
pred_token = tokenizer.detokenize(pred_token_vector)
pred_phrases = tf.strings.reduce_join(pred_token,axis=1,separator=' ')
print(pred_phrases)

tf.Tensor(
[b'cave $ \xe9\x80\xb2 destroying progress alert route destroying aw hadn woman woman laying pocket rat caring rat rat\xf0\x9f\xa5\xb4 weak \xf0\x9f\x98\x89 duo comment comment \xf0\x9f\x98\x90 went went \xe8\xa6\xb3 child\xf0\x9f\xa5\xb4\xf0\x9f\xa5\xb4\xf0\x9f\xa5\xb4 follow follow first finale finale finale \xe5\xbf\x9cpseballballballball\xe5\xa3\xb0 destroying destroying dayballballpse boy boyshedshed belly belly belly bellygsgs \xf0\x9f\xa4\x8e v smash v vballball debut debut debut \xe5\xbf\x9c \xe5\xbf\x9cballball \xe5\xbf\x9c disappoint disappoint disappointballballpsepse\xe1\x85\xa9\xe1\x85\xa9 individual individual \xe5\xbf\x9c \xe5\xbf\x9c\xe2\x97\x95 bucket fanservice fanservice \xf0\x9f\x92\x9a\xf0\x9f\x92\x9a\xe1\x85\xa9\xe1\x85\xa9\xe1\x85\xa9 affinity dinner\xe7\xb4\xb0\xe1\x85\xa9\xe1\x85\xa9\xe1\x85\xa9\xf0\x9f\x8c\xa7 \xf0\x9f\xa4\x8e \xf0\x9f\xa4\x8e \xf0\x9f\xa4\x8eeek mean \xf0\x9f\xa5\x9a \xf0\x9f\xa5\x9a \xf0\x9f\xa5\x9a \xf0\x9f\xa5\x9a sus sus sus sw

# Training
Uses Adam optimizer with original [Transformer paper](https://arxiv.org/abs/1706.03762) custom learning rate scheduler.

$$lrate = d_{model}^{-0.5}*\min\left(step_{num}^{-0.5},step_{num}*warmup\_steps^{-1.5}\right)$$

In [28]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps=4000):
    super().__init__()

    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)

    self.warmup_steps = warmup_steps

  def __call__(self, step):
    step = tf.cast(step, dtype=tf.float32)
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)


In [44]:
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9,
                                     beta_2=0.98, epsilon=1e-9)

In [29]:
# Setup padding mask for calculating loss properly
def masked_loss(label, pred):
  mask = label != 0
  loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
      from_logits=True, reduction='none'
  )
  loss = loss_object(label,pred)

  mask = tf.cast(mask, dtype=loss.dtype)
  loss *= mask

  loss = tf.reduce_sum(loss)/tf.reduce_sum(mask)
  return loss

def masked_accuracy(label, pred):
  pred = tf.argmax(pred, axis=2)
  label = tf.cast(label, pred.dtype)
  match = label == pred

  mask = label != 0

  match = match & mask

  match = tf.cast(match, dtype=tf.float32)
  mask = tf.cast(mask, dtype=tf.float32)
  return tf.reduce_sum(match)/tf.reduce_sum(mask)

Setup Checkpoint saving of weights after each epoch, then begin training.

In [None]:
checkpoint_path = 'training_1/checkpoint.ckpt'

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                         save_weights_only=True,
                                                         verbose=1)

In [45]:
transformer.compile(loss=masked_loss, optimizer=optimizer,
                    metrics=[masked_accuracy], run_eagerly=True)

In [None]:
transformer.fit(train_batches, epochs=15, validation_data=val_batches, callbacks=[checkpoint_callback])

Epoch 1/20




Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7ea71830f0a0>

Loading model weights manually.

In [96]:
# Load weights
model_checkpoint_path = 'transformer_2'
transformer = Transformer(num_layers=num_layers, d_model=d_model, num_heads=num_heads,
                          dff=dff, input_vocab_size=vocab_size, target_vocab_size=vocab_size,
                          dropout_rate=dropout_rate)
transformer.load_weights(model_checkpoint_path)

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7c10065d1cc0>

# Testing
Not expected to be especially different than the validation accuracy, but could be interesting regardless.

Only a fraction of the test set is used to save time.

In [53]:
test_fraction = 0.1
test_count = tf.cast(tf.floor(len(comment_ds)*test_fraction), dtype='int64')
test_tensor = next(iter(test_ds.batch(test_count).take(1))) # tensor of comments
(test_prompt, test_teacher), test_label = prepare_batch(test_tensor, batch_size=test_tensor.shape[0]) # prepare

In [55]:
%%time
test_loss, test_acc = transformer.evaluate(x=(test_prompt,test_teacher), y=test_label, verbose=2)

280/280 - 1176s - loss: 2.6268 - masked_accuracy: 0.5582 - 1176s/epoch - 4s/step
CPU times: user 29min 18s, sys: 1min 52s, total: 31min 11s
Wall time: 20min 22s


# Run Inference
Create a model to generate comments from prompts:
* Encode prompt with `tokenizer`, trim, add `[START],[END]`, then pad - this is the encoder input
* calculate padding masks and look-ahead masks
* `decoder` outputs preds by looking at `encoder` output and own output
* Concatenate predicted token to decoder input and pass to of decoder
* Decoder predicts next token based on previous tokens it predicted

In [135]:
class Commentator(tf.Module):
  def __init__(self, tokenizers, transformer):
    self.tokenizers = tokenizers
    self.transformer = transformer

  def __call__(self, sentence, max_length=MAX_TOKENS):
    # Add '[START]' and '[END]' tokens to input sentence
    assert isinstance(sentence, tf.Tensor)
    if len(sentence.shape) == 0:
      sentence = sentence[tf.newaxis]

    sentence = self.tokenizers.tokenize(sentence)[:,:MAX_TOKENS-2,:]
    sentence = tf.squeeze(add_start_end(sentence).to_tensor(shape=(1,MAX_TOKENS,1)),axis=2)
    encoder_input = sentence

    # Init output with '[START]' token
    out = self.tokenizers.tokenize(tf.constant(['']))
    start_end = add_start_end(out)[0]
    start = start_end[0][tf.newaxis]
    end = start_end[1][tf.newaxis]

    # 'tf.TensorArray' required so dynamic-loop traced by tf.function
    output_array = tf.TensorArray(dtype=tf.int64, size=0, dynamic_size=True)
    output_array = output_array.write(0, start)
    for i in tf.range(max_length):
      output = tf.transpose(output_array.stack())

      output = tf.squeeze(output, axis=1)

      predictions = self.transformer([encoder_input, output], training = False)

      # Select last token for `seq_len` dimension
      predictions = predictions[:,-1:,:] # Shape `(batch_size, 1, vocab_size)`

      predicted_id = tf.argmax(predictions, axis=-1)

      # Concatenate `predicted_id` to output given to decoder as input
      output_array = output_array.write(i+1, predicted_id)

      if predicted_id == end:
        break

    output = tf.squeeze(tf.transpose(output_array.stack()), axis=0)
    # output shape `(1,tokens)`
    text = tf.strings.reduce_join(self.tokenizers.detokenize(output)[0], axis=0, separator=" ") # Shape: `()`

    tokens = self.tokenizers.detokenize(output)[0]
    # `tf.function` prevents usage of attention_wieghts calculated
    # on last iteration of loop - recalc. outside of loop
    self.transformer([encoder_input, output[:,:-1]], training=False)
    attention_weights = self.transformer.decoder.last_attn_scores
    return text, tokens, attention_weights

In [129]:
commentator = Commentator(tokenizer, transformer)

def print_comment(sentence, tokens):
  print(f'{"Input:":15s}: {sentence}')
  print(f'{"Prediction":15s}: {tokens.numpy().decode("utf-8")}')

sentence = 'I miss'
output_text, output_tokens, attention_weights = commentator(tf.constant(sentence))
print_comment(sentence, output_text)

<tf.RaggedTensor [[[50],
  [1401]]]>
tf.Tensor(
[[   2   50 1401    3    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]], shape=(1, 128), dtype=int64)
<tensorflow.python.util.tf_should_use.ShouldUseWrapper object at 0x7c1006f5b4f0>
Input:         : I miss
Prediction     : [START] i miss the stream , but i ' m glad you ' re back ! [END]


In [76]:
sentence2 = tf.constant('Take care of')
out_text2, out_toks2, attn_wts2 = commentator(sentence2)
print_comment(sentence2, out_text2)

Input:         : b'Take care of'
Prediction     : [START] take care of the rest of the month of horrors , fauna ! [END]


In [105]:
class TemperatureCommentator(tf.Module):
  def __init__(self, tokenizers, transformer):
    self.tokenizers = tokenizers
    self.transformer = transformer

  def __call__(self, sentence, temperature = 0.1, max_length=MAX_TOKENS):
    # Add '[START]' and '[END]' tokens to input sentence
    assert isinstance(sentence, tf.Tensor)
    if len(sentence.shape) == 0:
      sentence = sentence[tf.newaxis]

    sentence = self.tokenizers.tokenize(sentence)[:,:MAX_TOKENS-2,:]
    sentence = tf.squeeze(add_start_end(sentence).to_tensor(shape=(1,MAX_TOKENS,1)),axis=2)
    encoder_input = sentence

    # Init output with '[START]' token
    out = self.tokenizers.tokenize(tf.constant(['']))
    start_end = add_start_end(out)[0]
    start = start_end[0][tf.newaxis]
    end = start_end[1][tf.newaxis]

    # 'tf.TensorArray' required so dynamic-loop traced by tf.function
    output_array = tf.TensorArray(dtype=tf.int64, size=0, dynamic_size=True)
    output_array = output_array.write(0, start)

    for i in tf.range(max_length):
      # output = tf.squeeze(tf.transpose(output_array.stack()), axis=0)
      output = tf.transpose(output_array.stack())
      # output = tf.reshape(output,(1,output.shape[2],1))
      output = tf.squeeze(output, axis=1)

      predictions = self.transformer([encoder_input, output], training = False)


      # Select last token for `seq_len` dimension
      # print(predictions)
      predictions = tf.squeeze(predictions[:,-1:,:]/temperature, axis=0) # Shape `(batch_size, 1, vocab_size)`
      predicted_id = tf.random.categorical(predictions, num_samples=1)

      # Concatenate `predicted_id` to output given to decoder as input
      output_array = output_array.write(i+1, predicted_id)

      if predicted_id == end:
        break

    output = tf.squeeze(tf.transpose(output_array.stack()), axis=0)
    # output shape `(1,tokens)`
    text = tf.strings.reduce_join(self.tokenizers.detokenize(output)[0], axis=0, separator=" ") # Shape: `()`

    tokens = self.tokenizers.detokenize(output)[0]
    # `tf.function` prevents usage of attention_wieghts calculated
    # on last iteration of loop - recalc. outside of loop
    self.transformer([encoder_input, output[:,:-1]], training=False)
    attention_weights = self.transformer.decoder.last_attn_scores

    return text, tokens, attention_weights

In [78]:
temp_commentator = TemperatureCommentator(tokenizer, transformer)

def print_comment(sentence, tokens):
  print(f'{"Input:":15s}: {sentence}')
  print(f'{"Prediction":15s}: {tokens.numpy().decode("utf-8")}')

sentence = 'I miss'
output_text, output_tokens, attention_weights = temp_commentator(tf.constant(sentence),1.5)
print_comment(sentence, output_text)

Input:         : I miss
Prediction     : [START] i miss onee not muffin , attached to channels s hours voice personality 50 😂 this second joke [END]


In [79]:
sentence = 'Take care of'
output_text, output_tokens, attention_weights = temp_commentator(tf.constant(sentence),0.5)
print_comment(sentence, output_text)

Input:         : Take care of
Prediction     : [START] take care of yourself , fauna ! [END]


# Export Model

In [130]:
class ExportCommentator(tf.Module):
  def __init__(self, commentator):
    self.commentator = commentator

  @tf.function(input_signature=[tf.TensorSpec(shape=[], dtype=tf.string)])
  def __call__(self, sentence):
    (result, tokens, attention_weights) = self.commentator(sentence)

In [131]:
export_commentator = ExportCommentator(commentator)

In [134]:
outputs = export_commentator('man i')

In [111]:
tf.saved_model.save(export_commentator, export_dir = 'export_commentator')

Tensor("ReduceJoin/ReduceJoin:0", shape=(), dtype=string)
