<a href="https://colab.research.google.com/github/cwerries/IANNWTF_Group_14_Submissions/blob/master/textGenerationWithNietzsche_HW_11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
!pip install -q tensorflow_text
!pip install -q sentencepiece
import tensorflow as tf
import tensorflow_text as tf_txt
import tqdm.notebook as note
import sentencepiece as sp
import io
import datetime

In [11]:
# load tensorboard extension
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


## Hyperparameters


In [65]:
# Hyperparameters
VOCAB_SIZE = 5000       # 2000 - 7000
SEQUEN_LEN = 42        # m = lenght of input sequence
SHUFFLE_SIZE = 1000
BATCH_SIZE = 50
EMBED_DIM = 128          # 64 - 256
NUM_HEADS = 2          # 2-4
TOP_K = 20
EPOCH_SIZE = 4

# 4.1 The dataset, preprocessing and tokenization

In [39]:
# path "Beyond Good and Evil"
path = tf.keras.utils.get_file("nietzsche.txt", origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
# load txt as string
text = open(path).read()

# train the SentencePiece tokenizer
sp.SentencePieceTrainer.train(
    input=path, model_prefix='tokenizer_model', model_type="unigram", vocab_size=VOCAB_SIZE)

# deserialize the model
trained_tokenizer_model = tf.io.gfile.GFile('tokenizer_model.model', "rb").read()
# load the model as a tokenizer 
tokenizer = tf_txt.SentencepieceTokenizer(
    model=trained_tokenizer_model, out_type=tf.int32, nbest_size=-1, alpha=1, reverse=False,
    add_bos=False, add_eos=False, return_nbest=False, name=None
)

# tokenize the str
tokens = tokenizer.tokenize(text)

# create all possible slices of m+1 
token_windows = tf_txt.sliding_window(data=tokens, width=SEQUEN_LEN + 1)

# create training data from tokenized text
token_ds = tf.data.Dataset.from_tensor_slices(token_windows)
# create targets
token_ds = token_ds.map(lambda x: (x[:SEQUEN_LEN], x[SEQUEN_LEN:SEQUEN_LEN+1]))
# shuffle, batch ds
token_ds = token_ds.shuffle(SHUFFLE_SIZE).batch(BATCH_SIZE)

## 4.2/3 The model 

In [40]:
class Embedd_Layer(tf.keras.layers.Layer):
    def __init__(self):
        super(Embedd_Layer, self).__init__()
        # embedding of token indices
        self.embedding_voc = tf.keras.layers.Embedding(input_dim=VOCAB_SIZE, output_dim=EMBED_DIM) 
        # embedding of token position  (positional embedding)
        self.embedding_pos = tf.keras.layers.Embedding(input_dim=SEQUEN_LEN, output_dim=EMBED_DIM)
    

    def call(self, x):
        x = self.embedding_voc(x)
        y = self.embedding_pos(tf.range(0,SEQUEN_LEN))
        return  y + x 

In [41]:
class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self):
        super(TransformerBlock, self).__init__()

        self.mha = tf.keras.layers.MultiHeadAttention(num_heads=NUM_HEADS, key_dim=EMBED_DIM)

        self.dense_1 = tf.keras.layers.Dense(units=32, activation="relu") # 32 - 256 units
        self.dense_2 = tf.keras.layers.Dense(units=EMBED_DIM)
    
        self.dropout_1 = tf.keras.layers.Dropout(rate=0.1)
        self.dropout_2 = tf.keras.layers.Dropout(rate=0.1)

        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)


    def call(self, x):
        mha_out = self.mha(x, x)
        drop_out = self.dropout_2(mha_out)
        ln_out = self.norm_1(x + drop_out)
        dns_out = self.dense_1(ln_out)
        dns_out = self.dense_2(dns_out)
        drop_out = self.dropout_2(dns_out)
        y = self.norm_2(ln_out+drop_out)
        return y

In [70]:
class Transformer(tf.keras.Model):
    def __init__(self, tokenizer):
        super(Transformer, self).__init__()
        self.tokenizer = tokenizer
        self.optimizer = tf.keras.optimizers.Adam()
        self.loss_func = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

        self.metrics_list = [
                        tf.keras.metrics.Mean(name="loss"),
                        tf.keras.metrics.CategoricalAccuracy(name="acc"),
                        tf.keras.metrics.TopKCategoricalAccuracy(k=3,name="top-3-acc") 
                        ]

        self.layer_embedd = Embedd_Layer()
        self.layer_trnsfrm = TransformerBlock()
        self.pool = tf.keras.layers.GlobalAvgPool1D()
        self.dense = tf.keras.layers.Dense(units=VOCAB_SIZE)


    def call(self, x):      
        x = self.layer_embedd(x)
        x = self.layer_trnsfrm(x)
        x = self.pool(x)
        x = self.dense(x)
        return x


    def reset_metrics(self):
        for metric in self.metrics:
            metric.reset_states()
            

    @tf.function
    def train_step(self, data):       
        x, targets = data

        with tf.GradientTape() as tape:
            predictions = self(x)
            loss = self.loss_func(targets, predictions) + tf.reduce_sum(self.losses)
        
        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        
        # update loss metric
        self.metrics[0].update_state(loss)
            
        # for all metrics except loss, update states (accuracy etc.)
        for metric in self.metrics[1:]:
            metric.update_state(targets,predictions)

        # Return a dictionary mapping metric names to current value
        return {m.name: m.result() for m in self.metrics}


    def generate_text(self, prompt, k_generate=5):
        #tokenize, add dimension and pad prompt
        tokenized = self.tokenizer.tokenize(prompt)
        len =  tokenized.shape[0]
        tokenized = tf.expand_dims(tokenized, axis=0)
        tokenized = tf.pad(tokenized, [[0,0], [SEQUEN_LEN - len,0]], "CONSTANT", constant_values=0)

        for _ in range(k_generate):
            # random next token
            logits, indices = tf.math.top_k(self(tokenized), k=TOP_K, sorted=True)
            sample = tf.random.categorical(tf.cast(indices, tf.float32), 1, dtype=tf.int32)
            # add new word to prompt
            tokenized = tf.concat((tokenized, sample), axis=1)
            # truncate beginning of prompt
            tokenized = tf.slice(tokenized, [0, 1], [1, SEQUEN_LEN])

        return self.tokenizer.detokenize(tokenized[:,-len-k_generate:])

# 4.4 The training loop


In [71]:
model = Transformer(tokenizer)

# Define where to save the log
hyperparameter_string = "VOCAB_SIZE-5000,SEQUEN_LEN-20,SHUFFLE_SIZE-1000,BATCH_SIZE-50,EMBED_DIM-128,NUM_HEADS-3"
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

# load tensorboard extension
log_path = f"logs/{hyperparameter_string}/{current_time}/train"
summary_writer = tf.summary.create_file_writer(log_path)

In [None]:
for epoch in range(EPOCH_SIZE):
    
    print(f"Epoch {epoch}:")
    
    # Training:
    
    for data in note.tqdm(token_ds, position=0, leave=True):
        metrics = model.train_step(data)
    
    # print the metrics
    print([f"{key}: {value}" for (key, value) in zip(list(metrics.keys()), list(metrics.values()))])
    

    # Generate:
    prompt = "Yesterday I met with friends, we played and had a lot of fun. Now I am tired and would like "
    k_generate=5
    new_text = model.generate_text(prompt, k_generate=k_generate)
    print("Generated Text:", new_text, "\n")

     # logging the validation metrics to the log file which is used by tensorboard
    with summary_writer.as_default():
        for metric in model.metrics:
            tf.summary.scalar(f"{metric.name}", metric.result(), step=epoch)
        tf.summary.text(f"samplesizze{k_generate}", new_text, step=epoch)
    # reset all metrics (requires a reset_metrics method in the model)
    model.reset_metrics()


    

Epoch 0:


  0%|          | 0/2830 [00:00<?, ?it/s]

['loss: 6.165180683135986', 'acc: 0.011494496837258339', 'top-3-acc: 0.023151583969593048']
Generated Text: tf.Tensor([b'Yesterday I met with friends, we played and had a lot of fun. Now I am tired and would like aed that --'], shape=(1,), dtype=string) 

Epoch 1:


  0%|          | 0/2830 [00:00<?, ?it/s]

['loss: 5.455104827880859', 'acc: 0.010321011766791344', 'top-3-acc: 0.017998147755861282']
Generated Text: tf.Tensor([b'Yesterday I met with friends, we played and had a lot of fun. Now I am tired and would like-- as " in as'], shape=(1,), dtype=string) 

Epoch 2:


  0%|          | 0/2830 [00:00<?, ?it/s]

['loss: 5.180117607116699', 'acc: 0.00831336248666048', 'top-3-acc: 0.013756636530160904']
Generated Text: tf.Tensor([b'Yesterday I met with friends, we played and had a lot of fun. Now I am tired and would like a as thated a'], shape=(1,), dtype=string) 

Epoch 3:


  0%|          | 0/2830 [00:00<?, ?it/s]

In [None]:
# open the tensorboard to inspect the data for the 100 steps
%tensorboard --logdir logs/

In [None]:
# save the model with a meaningful name
model.save_weights(f"saved_model_{hyperparameter_string}", save_format="tf")