In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import tensorflow as tf
import numpy as np
import os, time, random, math, re, json, string, sys, datetime, textwrap
import matplotlib.pyplot as plt
from tqdm import tqdm
import keras_nlp
from tokenizer import *
import heapq
from collections import Counter

Using TensorFlow backend


In [3]:
# gpu growth
for gpu in tf.config.experimental.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)
    print(f'{gpu} memory growth: {tf.config.experimental.get_memory_growth(gpu)}')

PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU') memory growth: True


In [4]:
SEQ_LENGTH = 100

In [5]:
texts = open("simple.txt", "r").read()


# print(len(vocab), vocab)
tokenizer = keras_nlp.tokenizers.SentencePieceTokenizer(proto=keras_nlp.tokenizers.compute_sentence_piece_proto(tf.data.Dataset.from_tensor_slices([texts]), vocabulary_size=300))

a = tokenizer.tokenize(["Water covers about 71% of the land's surface on Earth because it is polar."])
print(a, tokenizer.detokenize(a))

<tf.RaggedTensor [[32, 66, 167, 12, 109, 110, 175, 197, 4, 7, 4, 10, 4, 17, 8, 54, 6, 141,
  45, 58, 194, 56, 174, 47, 45, 206, 46, 4, 24, 230, 17, 35, 11]]> tf.Tensor([b"Water covers about 71% of the land's surface on Earth because it is polar."], shape=(1,), dtype=string)


In [6]:
# create ds for training
tokenized_texts = tokenizer.tokenize(texts)
ds = tf.data.Dataset.from_tensor_slices(tokenized_texts)
print(ds.cardinality())
ds = ds.batch(SEQ_LENGTH + 1, drop_remainder=True)
ds = ds.map(lambda x: (x[:-1], x[1:]))
ds = ds.shuffle(10000).repeat(1000)
ds = ds.batch(256, drop_remainder=True)
ds = ds.prefetch(tf.data.experimental.AUTOTUNE)

# example

print("c", ds.cardinality())
for x, y in ds.take(1):
    print(x.shape, y.shape)
    print(x[0].shape, y[0].shape)
    print(tokenizer.detokenize(x[0]), tokenizer.detokenize(y[0]))

tf.Tensor(1088, shape=(), dtype=int64)
c tf.Tensor(39, shape=(), dtype=int64)
(256, 100) (256, 100)
(100,) (100,)
tf.Tensor(b'ds (consisting of ice and liquid water suspended in air), and precipitation (0.001%).[23][24] Water moves continually through the water cycle of evaporation, transpiration (evapotranspiration), condensation, precipitation, and runoff, usually reaching the sea. Water plays an import', shape=(), dtype=string) tf.Tensor(b'(consisting of ice and liquid water suspended in air), and precipitation (0.001%).[23][24] Water moves continually through the water cycle of evaporation, transpiration (evapotranspiration), condensation, precipitation, and runoff, usually reaching the sea. Water plays an important', shape=(), dtype=string)


In [7]:
# create model
def create_model():
    inp_layer = tf.keras.Input(
        shape=(None,),
        dtype="int32",
        name="token_ids",
    )
    print(inp_layer.shape)
    x = keras_nlp.layers.TokenAndPositionEmbedding(
        vocabulary_size=tokenizer.vocabulary_size(),
        sequence_length=SEQ_LENGTH,
        embedding_dim=64,
    )(inp_layer)

    print(x.shape)


    for _ in range(2):
        x = keras_nlp.layers.TransformerDecoder(
            intermediate_dim=256,
            dropout=0.1,
            num_heads=8,
        )(x)

    # Use "[START]" token to classify
    x = x[:, 0]
    print(x.shape)
    # x = tf.keras.layers.Flatten()(x)
    x = tf.keras.layers.Dense(tokenizer.vocabulary_size())(x)
    model = tf.keras.Model(
        inputs=inp_layer,
        outputs=x,
    )
    return model

model = create_model()
model.summary()

(None, None)
(None, None, 64)
(None, 64)
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 token_ids (InputLayer)      [(None, None)]            0         
                                                                 
 token_and_position_embeddin  (None, None, 64)         25600     
 g (TokenAndPositionEmbeddin                                     
 g)                                                              
                                                                 
 transformer_decoder (Transf  (None, None, 64)         49984     
 ormerDecoder)                                                   
                                                                 
 transformer_decoder_1 (Tran  (None, None, 64)         49984     
 sformerDecoder)                                                 
                                                                 
 tf.__operators__.ge

In [8]:
model(tf.zeros((256, SEQ_LENGTH))).shape

TensorShape([256, 300])

In [9]:
for x, y in ds.take(1):
    print(x.shape, y.shape, model(x).shape)

(256, 100) (256, 100) (256, 300)


In [10]:
next(iter(ds.take(1)))

(<tf.Tensor: shape=(256, 100), dtype=int32, numpy=
 array([[ 39,   4,  49, ...,  21,  11,   4],
        [241,   6, 275, ...,  98, 144,  19],
        [171,  69,  69, ...,   8, 264, 239],
        ...,
        [171,  69,  69, ...,   8, 264, 239],
        [150,   6, 194, ..., 103, 212,   8],
        [165,  60,  59, ..., 136,   5, 148]])>,
 <tf.Tensor: shape=(256, 100), dtype=int32, numpy=
 array([[  4,  49, 183, ...,  11,   4, 246],
        [  6, 275,  23, ..., 144,  19, 101],
        [ 69,  69,  19, ..., 264, 239,  11],
        ...,
        [ 69,  69,  19, ..., 264, 239,  11],
        [  6, 194,  13, ..., 212,   8,  35],
        [ 60,  59,  14, ...,   5, 148,  38]])>)

In [12]:
# train


# def custom_loss(y_true, y_pred):
#     # return tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
#     y_true = tf.one_hot(y_true, y_pred.shape[-1])
#     print(y_true.shape, y_pred.shape)
#     return tf.keras.losses.categorical_crossentropy(y_true, y_pred, from_logits=True)


def custom_loss(labels, logits):
    # scce but capped at 2000.0 to avoid exploding gradients
    loss_val = tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)
    loss_val = tf.clip_by_value(loss_val, clip_value_min=-1000.0, clip_value_max=2000.0)
    return tf.reduce_mean(loss_val)

model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=custom_loss,
    metrics=["accuracy"],
)

# model.fit(ds.take(1), epochs=9)
model.fit(
    tf.zeros((256, SEQ_LENGTH), dtype="int32"), tf.zeros((256, SEQ_LENGTH), dtype="int32"), epochs=9, batch_size=256
)

Epoch 1/9


ValueError: in user code:

    File "c:\Users\shiva\miniconda3\envs\tf\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "<ipython-input-11-1a7d7d58decc>", line 13, in custom_loss  *
        loss_val = tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)
    File "c:\Users\shiva\miniconda3\envs\tf\lib\site-packages\keras\losses.py", line 2084, in sparse_categorical_crossentropy
        return backend.sparse_categorical_crossentropy(
    File "c:\Users\shiva\miniconda3\envs\tf\lib\site-packages\keras\backend.py", line 5630, in sparse_categorical_crossentropy
        res = tf.nn.sparse_softmax_cross_entropy_with_logits(

    ValueError: `labels.shape` must equal `logits.shape` except for the last dimension. Received: labels.shape=(25600,) and logits.shape=(256, 300)


In [None]:
a = tf.data.Dataset.from_tensor_slices(tf.range(1000)).batch(101, drop_remainder=True).map(lambda x: (x[:-1], x[1:])).shuffle(10000).repeat(100).batch(32, drop_remainder=True)

In [None]:
next(iter(a.take(13)))

(<tf.Tensor: shape=(32, 100), dtype=int32, numpy=
 array([[  0,   1,   2, ...,  97,  98,  99],
        [808, 809, 810, ..., 905, 906, 907],
        [404, 405, 406, ..., 501, 502, 503],
        ...,
        [  0,   1,   2, ...,  97,  98,  99],
        [606, 607, 608, ..., 703, 704, 705],
        [202, 203, 204, ..., 299, 300, 301]])>,
 <tf.Tensor: shape=(32, 100), dtype=int32, numpy=
 array([[  1,   2,   3, ...,  98,  99, 100],
        [809, 810, 811, ..., 906, 907, 908],
        [405, 406, 407, ..., 502, 503, 504],
        ...,
        [  1,   2,   3, ...,  98,  99, 100],
        [607, 608, 609, ..., 704, 705, 706],
        [203, 204, 205, ..., 300, 301, 302]])>)