<a href="https://colab.research.google.com/github/rybread1/trump-speech-writer/blob/master/trump_speech_writer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import datetime
import itertools
from collections import Counter

import os

In [None]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import nltk

# Fetch the Punkt sentence-tokenizer data that nltk's sent_tokenize relies on.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Clone the project repository; the speeches corpus is read from this checkout.
!git clone https://github.com/rybread1/trump_speech_writer

Cloning into 'trump_speech_writer'...
remote: Enumerating objects: 56, done.[K
remote: Counting objects: 100% (56/56), done.[K
remote: Compressing objects: 100% (44/44), done.[K
remote: Total 56 (delta 17), reused 34 (delta 9), pack-reused 0[K
Unpacking objects: 100% (56/56), done.


In [None]:
## Reading and processing text
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding (the corpus contains non-ASCII punctuation).
with open('/content/trump_speech_writer/speeches.txt', 'r', encoding='utf-8') as fp:
    text = fp.read()

# NOTE(review): drops the first 6 characters of the file -- presumably a
# leading header/marker; confirm against speeches.txt.
text = text[6:]
# text = text.replace('–', '-').replace('—', '-').replace('”', '"').replace('”', '"').replace('‘', "'").replace('’', "'").replace('“', '"').replace('…', '').lower()

# char_set = set(text) # unique character set
# char_set_sorted = sorted(char_set)

# char_2_int_dict = {ch:i for i,ch in enumerate(char_set_sorted)} # dict mapping char to int
# char_array = np.array(char_set_sorted) # array mapping idx to char

# text_encoded = np.array(
#     [char_2_int_dict[ch] for ch in text],
#     dtype=np.int32)

In [None]:
# Split the corpus into word-level tokens.
rex_tokenizer = nltk.tokenize.TreebankWordTokenizer()
text_tokens = rex_tokenizer.tokenize(text)

# Vocabulary: every distinct token, sorted so that the ids are deterministic.
word_set = set(text_tokens)
word_set_sorted = sorted(word_set)

# Lookup tables in both directions: token -> id (dict) and id -> token (array).
word_2_int_dict = dict(zip(word_set_sorted, range(len(word_set_sorted))))
word_array = np.array(word_set_sorted)

# The whole corpus expressed as a 1-D int32 array of token ids.
token_ids = [word_2_int_dict[token] for token in text_tokens]
text_encoded = np.array(token_ids, dtype=np.int32)

In [None]:
# Wrap the encoded corpus in a tf.data pipeline (one element per token id).
ds_text_encoded = tf.data.Dataset.from_tensor_slices(text_encoded)

# Group the ids into chunks of seq_length + 1 so each chunk can later be split
# into an input and a target shifted by one position; a trailing chunk that is
# shorter than seq_length + 1 is dropped.
seq_length = 30 
ds_chunks = ds_text_encoded.batch(seq_length+1, drop_remainder=True) 

## define the function for splitting x & y
def split_input_target(chunk):
    """Turn one chunk into an (input, target) pair for next-token prediction.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so ``target[i]`` is the element that
    follows ``input[i]`` in the original chunk.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair.
ds_sequences = ds_chunks.map(split_input_target)

# Batch size
BATCH_SIZE = 64
BUFFER_SIZE = 20000

tf.random.set_seed(1)
# Shuffle once and keep that order for every pass over the data. With the
# default reshuffle_each_iteration=True each iteration of `ds` is re-shuffled,
# so splitting `ds` with take()/skip() would place different (and overlapping)
# sequences in the training and validation subsets on every epoch.
ds = ds_sequences.shuffle(BUFFER_SIZE, reshuffle_each_iteration=False).batch(BATCH_SIZE)

def get_test_train_split(text, seq_length, batch_size, train_split=0.8):
    """Return the number of batches to use for training, as a built-in ``int``.

    The number of full batches is estimated as
    ``floor(len(text) / seq_length / batch_size)``; that estimate is scaled by
    ``train_split`` and rounded to the nearest whole batch.

    Args:
        text: Sized sequence of encoded tokens (only ``len(text)`` is used).
        seq_length: Number of tokens per sequence.
        batch_size: Number of sequences per batch.
        train_split: Fraction of the batches assigned to training.

    Returns:
        int: The training batch count.
    """
    # Convert to a built-in float before rounding: round() on a NumPy float
    # may return a float (e.g. 93.0) depending on the NumPy version, while the
    # result is meant to be used as an element count.
    full_batches = float(np.floor(len(text) / seq_length / batch_size))
    return int(round(full_batches * train_split))

# Reserve 70% of the batches for training; the remaining ones are validation.
train_batches = get_test_train_split(
    text=text_encoded,
    seq_length=seq_length,
    batch_size=BATCH_SIZE,
    train_split=0.7,
)
print('train batches: ', train_batches)

# The first `train_batches` batches train the model, the rest validate it.
ds_train, ds_valid = ds.take(train_batches), ds.skip(train_batches)

train batches:  93.0


In [None]:
def build_model(input_size, vocab_size, embedding_dim, rnn_units, dropout=True):
    """Assemble an (uncompiled) Embedding -> LSTM -> Dense Keras model.

    Args:
        input_size: Passed to ``tf.keras.Input`` as the input shape.
        vocab_size: Number of distinct tokens; sizes both the embedding table
            and the output layer.
        embedding_dim: Width of the embedding vectors.
        rnn_units: Number of LSTM units.
        dropout: Accepted for interface compatibility; the current
            implementation does not use it.

    Returns:
        A ``tf.keras.Model`` that emits one ``vocab_size``-wide vector of raw
        scores (the final Dense layer has no activation) per input position.
    """
    token_ids = tf.keras.Input(input_size)
    embedded = tf.keras.layers.Embedding(vocab_size, embedding_dim)(token_ids)
    # return_sequences=True keeps an output for every position, not just the last.
    hidden = tf.keras.layers.LSTM(rnn_units, return_sequences=True)(embedded)
    scores = tf.keras.layers.Dense(vocab_size)(hidden)
    return tf.keras.Model(token_ids, scores)

# Fix TensorFlow's global seed before the model is built.
tf.random.set_seed(1)

# Word-level model: the input length is seq_length and the vocabulary is the
# set of distinct tokens held in word_array.
model = build_model(input_size=seq_length, vocab_size=len(word_array), 
                    embedding_dim=256, rnn_units=512)

# from_logits=True because build_model's final Dense layer has no activation,
# i.e. the model outputs raw scores rather than probabilities.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

model.summary()
# Location (relative to the working directory) where the weights are written.
checkpoint_path = "training_1/cp.ckpt"

# Create a callback that saves the model's weights
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=0)

# Stop once val_loss has failed to improve for 3 consecutive epochs and roll
# the model back to the weights of its best epoch.
early_stop_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', 
                                                       patience=3, 
                                                       restore_best_weights=True)


# epochs=100 is an upper bound; early stopping may end training sooner.
results = model.fit(ds_train, 
                    validation_data=ds_valid, 
                    epochs=100, 
                    callbacks=[early_stop_callback, 
                               cp_callback])



Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 30)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 30, 256)           3165440   
_________________________________________________________________
lstm (LSTM)                  (None, 30, 512)           1574912   
_________________________________________________________________
dense (Dense)                (None, 30, 12365)         6343245   
Total params: 11,083,597
Trainable params: 11,083,597
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Ep

In [None]:
def generate_text(model, starting_str,
                  len_generated_text=500,
                  max_input_length=80,
                  scale_factor=1.0):
    """Generate text from a seed string (word-level).

    Thin wrapper around ``generate_text_words`` that keeps this function's
    original name, parameter names and defaults. The seed is tokenized into
    words and is *not* lower-cased: the vocabulary in ``word_2_int_dict`` is
    built from word tokens of the un-lowercased corpus, so a per-character,
    lower-cased lookup cannot match it.

    Args:
        model: Trained Keras model returning per-position vocabulary logits.
        starting_str: Seed text; every token must be in the vocabulary.
        len_generated_text: Number of words to generate.
        max_input_length: Maximum number of most-recent tokens fed back to
            the model at each step.
        scale_factor: Multiplier applied to the logits before sampling
            (values above 1 make sampling less random, below 1 more random).

    Returns:
        str: ``starting_str`` followed by the generated words, space-separated.

    Raises:
        ValueError: If ``starting_str`` contains no tokens.
        KeyError: If a seed token is not in the vocabulary.
    """
    return generate_text_words(model, starting_str,
                               len_generated=len_generated_text,
                               max_input_length=max_input_length,
                               scale_factor=scale_factor)


def generate_text_words(model, starting_str, len_generated, max_input_length, scale_factor):
    """Generate ``len_generated`` words that continue ``starting_str``.

    The seed is tokenized with ``rex_tokenizer`` and encoded with
    ``word_2_int_dict``. At every step the model is run on the current token
    window, the next token is sampled from the (scaled) logits of the last
    position only, and the sampled token is appended to the window.

    Args:
        model: Trained Keras model; assumed to return logits shaped
            (batch, time, vocab) -- matches the model built in this notebook.
        starting_str: Seed text; every token must be in the vocabulary.
        len_generated: Number of words to generate.
        max_input_length: Maximum number of most-recent tokens fed back to
            the model at each step.
        scale_factor: Multiplier applied to the logits before sampling.

    Returns:
        str: ``starting_str`` followed by the generated words, space-separated.

    Raises:
        ValueError: If ``starting_str`` contains no tokens.
        KeyError: If a seed token is not in the vocabulary.
    """
    seed_tokens = rex_tokenizer.tokenize(starting_str)
    if not seed_tokens:
        raise ValueError("starting_str must contain at least one token")

    unknown_tokens = [tok for tok in seed_tokens if tok not in word_2_int_dict]
    if unknown_tokens:
        raise KeyError(
            f"seed tokens not in the training vocabulary: {unknown_tokens}")

    encoded_input = [word_2_int_dict[tok] for tok in seed_tokens]
    encoded_input = tf.reshape(encoded_input, (1, -1))

    generated_words = []
    # Clears recurrent state left over from a previous call (only relevant
    # when the model contains stateful RNN layers).
    model.reset_states()
    for _ in range(len_generated):
        # Direct call instead of model.predict(): the input is a single short
        # sequence, so predict()'s per-call overhead would dominate the loop.
        logits = model(encoded_input, training=False)

        # Only the last position predicts the *next* word. Slicing it out
        # (rather than squeezing and indexing the sampled tensor) also keeps
        # a one-token seed working.
        last_step_logits = logits[:, -1, :] * scale_factor
        sampled = tf.random.categorical(last_step_logits, num_samples=1)
        next_index = int(sampled[0, 0].numpy())

        # Index with a plain int so a single word (not an array) is appended.
        generated_words.append(str(word_array[next_index]))

        encoded_input = tf.concat(
            [encoded_input, tf.reshape([next_index], (1, 1))],
            axis=1)
        # Keep only the most recent max_input_length tokens as context.
        encoded_input = encoded_input[:, -max_input_length:]

    return ' '.join([starting_str] + generated_words)

In [None]:
# Sample 300 words continuing the seed sentence. The logits are multiplied by
# scale_factor=2 before sampling, and at most seq_length tokens of context are
# fed back to the model at each step.
generated_text = generate_text_words(
    model,
    starting_str="It is so wonderful to be in this beautiful city",
    len_generated=300,
    max_input_length=seq_length,
    scale_factor=2,
)

print(generated_text)



It is so wonderful to be in this beautiful city , we must also be a safe country. Immigration security is national security. Hillary 's pledge to enact `` open borders , '' she means totally unlimited immigration. In fact , Hillary Clinton has terrible instincts on WikiLeaks and deleted and an ally of the United States , plus another $ 130 billion -- and it 's the same attitude of arrogance and entitlement that led her to violate federal law as Secretary of State , hide and delete her emails , destroy her phones with a hammer , to protect those jurisdictions that do assist federal authorities. Number five , cancel unconstitutional executive orders and enforce all laws relating to help and expand its health care costs are numerous to vote for Trump. Early ballots are mailed out on October 12th , and the destruction of our country. I am asking for your vote so we can replace Obamacare and save health care for their families , and yes , we will build a wall. We are the campaign of unity ,

In [None]:

from nltk.tokenize import sent_tokenize

# Break the generated text into a list of sentences for easier reading.
sent_tokenize(generated_text)

['It is so wonderful to be in this beautiful city , we must also be a safe country.',
 'Immigration security is national security.',
 "Hillary 's pledge to enact `` open borders , '' she means totally unlimited immigration.",
 "In fact , Hillary Clinton has terrible instincts on WikiLeaks and deleted and an ally of the United States , plus another $ 130 billion -- and it 's the same attitude of arrogance and entitlement that led her to violate federal law as Secretary of State , hide and delete her emails , destroy her phones with a hammer , to protect those jurisdictions that do assist federal authorities.",
 'Number five , cancel unconstitutional executive orders and enforce all laws relating to help and expand its health care costs are numerous to vote for Trump.',
 'Early ballots are mailed out on October 12th , and the destruction of our country.',
 'I am asking for your vote so we can replace Obamacare and save health care for their families , and yes , we will build a wall.',
 '