<a href="https://colab.research.google.com/github/rybread1/trump_speech_writer/blob/master/trump_speech_writer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import os

In [14]:
## Reading and processing text
with open('speeches.txt', 'r') as fp:
    text = fp.read()
    
start_indx = text.find('Thank you so much')

text = text[start_indx:].lower()  # trimmed text doc
char_set = set(text) # unique character set
char_set_sorted = sorted(char_set)

char_2_int_dict = {ch:i for i,ch in enumerate(char_set_sorted)} # dict mapping char to int
char_array = np.array(char_set_sorted) # array mapping idx to char

text_encoded = np.array(
    [char_2_int_dict[ch] for ch in text],
    dtype=np.int32)

In [15]:
ds_text_encoded = tf.data.Dataset.from_tensor_slices(text_encoded)

seq_length = 100 
ds_chunks = ds_text_encoded.batch(seq_length+1, drop_remainder=True) 

## define the function for splitting x & y
def split_input_target(chunk):
    input_seq = chunk[:-1]
    target_seq = chunk[1:]
    return input_seq, target_seq

ds_sequences = ds_chunks.map(split_input_target)

# Batch size
BATCH_SIZE = 64
BUFFER_SIZE = 200000

tf.random.set_seed(1)
ds = ds_sequences.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)

def get_test_train_split(text, seq_length, batch_size, train_split=0.95):
    return np.floor(len(text) / seq_length / batch_size) * train_split

train_batches = get_test_train_split(text_encoded, seq_length, BATCH_SIZE)

ds_train = ds.take(train_batches)
ds_valid = ds.skip(train_batches)

In [16]:
def build_model(input_size, vocab_size, embedding_dim, rnn_units, dropout=True):
    inputs = tf.keras.Input(input_size)
    x = tf.keras.layers.Embedding(vocab_size, embedding_dim)(inputs)
    x = tf.keras.layers.LSTM(rnn_units, return_sequences=True)(x)
    x = tf.keras.layers.LSTM(rnn_units, return_sequences=True)(x)
    outputs = tf.keras.layers.Dense(vocab_size)(x)
    model = tf.keras.Model(inputs, outputs)
    return model

tf.random.set_seed(1)

model = build_model(input_size=seq_length, vocab_size=len(char_array), 
                    embedding_dim=256, rnn_units=1024)

model.compile(optimizer=tf.keras.optimizers.Adam(), 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))

model.summary()
checkpoint_path = "training_1/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create a callback that saves the model's weights
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=0)

results = model.fit(ds_train, validation_data=ds_valid, epochs=100, callbacks=[cp_callback])



Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 100, 256)          16896     
_________________________________________________________________
lstm (LSTM)                  (None, 100, 1024)         5246976   
_________________________________________________________________
lstm_1 (LSTM)                (None, 100, 1024)         8392704   
_________________________________________________________________
dense (Dense)                (None, 100, 66)           67650     
Total params: 13,724,226
Trainable params: 13,724,226
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
  6/132 [>.............................] - ETA: 9:02 - loss: 4.0982

KeyboardInterrupt: 

In [None]:
# optimizer = tf.keras.optimizers.Adam()

# @tf.function
# def train_step(inp, target):
#   with tf.GradientTape() as tape:
#     predictions = model(inp)
#     loss = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(target, predictions, from_logits=True))
#   grads = tape.gradient(loss, model.trainable_variables)
#   optimizer.apply_gradients(zip(grads, model.trainable_variables))
#   return loss


# # Training step
# EPOCHS = 50
# checkpoint_dir = 'training_1'
# checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")


# for epoch in range(EPOCHS):
  
#   # resetting the hidden state at the start of every epoch
#   model.reset_states()

#   for (batch_n, (inp, target)) in enumerate(ds):
#     loss = train_step(inp, target)

#     if batch_n % 100 == 0:
#       template = 'Epoch {} Batch {} Loss {}'
#       print(template.format(epoch+1, batch_n, loss))

#   # saving (checkpoint) the model every 5 epochs
#   if (epoch + 1) % 5 == 0:
#     model.save_weights(checkpoint_prefix.format(epoch=epoch))

#   print('Epoch {} Loss {:.4f}'.format(epoch+1, loss))

# model.save_weights(checkpoint_prefix.format(epoch=epoch))


In [None]:
def generate_text(model, starting_str, 
           len_generated_text=500, 
           max_input_length=80,
           scale_factor=1.0):
    
    starting_str = starting_str.lower()
    encoded_input = [char_2_int_dict[s] for s in starting_str]
    encoded_input = tf.reshape(encoded_input, (1, -1))

    generated_str = starting_str

    model.reset_states()
    for i in range(len_generated_text):
        logits = model(encoded_input)        
        logits = tf.squeeze(logits, 0)

        scaled_logits = logits * scale_factor
        new_char_indx = tf.random.categorical(scaled_logits, num_samples=1)
        new_char_indx = tf.squeeze(new_char_indx)[-1].numpy()          
        generated_str += str(char_array[new_char_indx])
        
        new_char_indx = tf.expand_dims([new_char_indx], 0)

        encoded_input = tf.concat(
            [encoded_input, new_char_indx],
            axis=1)
        encoded_input = encoded_input[:, -max_input_length:]

    return generated_str



In [None]:
generated_text = generate_text(model, 
                               starting_str='we will make america great again', 
                               scale_factor=2, 
                               len_generated_text=5000,
                               max_input_length=seq_length)

print(generated_text)