# Long Short-Term Memory (LSTM) with Keras

We will use an LSTM to generate text character by character.

In [1]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
from keras.models import load_model
import numpy as np
import random
import sys
import json


Using TensorFlow backend.


In [2]:
# --- training ---
batch_size = 128
num_epochs = 5

# --- training windows ---
seq_len = 30   # characters per input sequence
step = 3       # offset between the starts of consecutive sequences

# --- sampling ---
generate_len = 160   # characters generated per sample

# --- checkpoint handling ---
# When model_exists is True the training cell loads `model_name`
# instead of fitting a new network.
model_name = "char_rnn_0.h5"
model_exists = False

In [3]:
import re

# Pull the tweet bodies straight out of the raw JSON text: the group captures
# (non-greedily) everything between `text": "` and the next `", "created`.
pattern = re.compile(r'text": "(.+?)", "created')

# data downloaded from: https://github.com/bpb27/trump_tweet_data_archive
# Decode explicitly as UTF-8 instead of relying on the platform's default
# locale encoding, so the notebook reads the file the same way everywhere.
with open('condensed_201567.json', 'r', encoding='utf-8') as content_file:
    content = content_file.read()

# NOTE(review): the matches are raw JSON string bodies, so escape sequences
# such as \uXXXX and \" remain undecoded in the corpus (visible in the preview
# below) -- confirm this is intended before training on it.
tweets = re.findall(pattern, content)

print(len(tweets))

# The training corpus is every tweet joined by a single space.
text = " ".join(tweets)
text[:1000]

12894


'Heading back to Washington, D.C. Much will be accomplished this week on trade, the military and security! Congratulations to Sung Hyun Park on winning the 2017 @USGA #USWomensOpen\\ud83c\\uddfa\\ud83c\\uddf8 I am at the @USGA  #USWomensOpen. An amateur player is co-leading for the first time in many decades - very exciting! The #USSJohnFinn will provide essential capabilities to keep America safe. Our sailors are the best anywhere in the world. Congratulations! https://t.co/yTnMwSh1Kg The ABC/Washington Post Poll, even though almost 40% is not bad at this time, was just about the most inaccurate poll around election time! With all of its phony unnamed sources &amp; highly slanted &amp; even fraudulent reporting, #Fake News is DISTORTING DEMOCRACY in our country! Thank you to former campaign adviser Michael Caputo for saying so powerfully that there was no Russian collusion in our winning campaign. Thank you to all of the supporters, who far out-numbered the protesters, yesterday at th

In [4]:
# Alphabet of the corpus, sorted so the character <-> index mapping is stable.
unique_chars = sorted(set(text))
print('Total unique chars:', len(unique_chars))

# Lookup tables in both directions: character -> column index and back.
index2char = dict(enumerate(unique_chars))
char2index = {c: i for i, c in index2char.items()}

Total unique chars: 92


In [5]:
# Build the training data by sliding a window of seq_len characters over the
# corpus, moving `step` characters each time.
window_starts = range(0, len(text) - seq_len, step)

# inputs: the windows themselves (these become X_train)
seqs = [text[start: start + seq_len] for start in window_starts]
# targets: the character right after each window (these become y_train)
next_chars = [text[start + seq_len] for start in window_starts]

print('Number of training sequences: ', len(seqs))


Number of training sequences:  506092


In [6]:
# Peek at the first ten training windows; consecutive windows start `step` characters apart.
seqs[:10]

['Heading back to Washington, D.',
 'ding back to Washington, D.C. ',
 'g back to Washington, D.C. Muc',
 'ack to Washington, D.C. Much w',
 ' to Washington, D.C. Much will',
 ' Washington, D.C. Much will be',
 'shington, D.C. Much will be ac',
 'ngton, D.C. Much will be accom',
 'on, D.C. Much will be accompli',
 ' D.C. Much will be accomplishe']

In [7]:
# One-hot input tensor, shaped (number of sequences, seq_len, alphabet size).
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24; the resulting dtype is the same.
X = np.zeros((len(seqs), seq_len, len(unique_chars)), dtype=bool)
X.shape

(506092, 30, 92)

In [8]:
# One-hot target matrix, shaped (number of sequences, alphabet size).
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24; the resulting dtype is the same.
y = np.zeros((len(seqs), len(unique_chars)), dtype=bool)
y.shape

(506092, 92)

In [9]:
# Fill the X and y matrices, one-hot-encoding the characters.
# Every timestep gets len(unique_chars) features for the LSTM.
for i, s in enumerate(seqs):
    for j, char in enumerate(s):
        X[i, j, char2index[char]] = 1
    # The target depends only on the sequence index, so it is set once per
    # sequence rather than once per character of the sequence.
    y[i, char2index[next_chars[i]]] = 1

In [10]:
if not model_exists:
    # The LSTM consumes input shaped (batch_size, timesteps, input_dim);
    # here timesteps is seq_len and input_dim is the alphabet size.
    # A dense layer plus softmax turns the LSTM state into a distribution
    # over the next character.
    model = Sequential([
        LSTM(128, input_shape=(seq_len, len(unique_chars))),
        Dense(len(unique_chars)),
        Activation('softmax'),
    ])
    model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 128)               113152    
_________________________________________________________________
dense_1 (Dense)              (None, 92)                11868     
_________________________________________________________________
activation_1 (Activation)    (None, 92)                0         
Total params: 125,020
Trainable params: 125,020
Non-trainable params: 0
_________________________________________________________________


In [11]:
if not model_exists:
    # Only a freshly built model needs compiling.
    # Targets are one-hot rows, hence categorical cross-entropy.
    optimizer = RMSprop(lr=0.01)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy')

In [12]:
def sample(preds, temperature=1.0):
    """Draw one index from `preds` after re-scaling it by `temperature`.

    The probabilities are moved to log space, divided by `temperature`,
    re-normalised, and a single multinomial draw is taken from the result.

    Parameters
    ----------
    preds : array-like of float
        Probabilities over the candidate indices.
    temperature : float, optional
        Divisor applied in log space. Values below 1 sharpen the
        distribution, values above 1 flatten it.

    Returns
    -------
    The position of the drawn entry, as returned by ``np.argmax``.
    """
    # asarray accepts plain lists as well as arrays; float64 keeps the
    # normalised probabilities precise enough for np.random.multinomial.
    preds = np.asarray(preds, dtype='float64')

    # Zero probabilities legitimately map to -inf here, so silence the
    # divide-by-zero warning that np.log would otherwise emit.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature

    # Shift so the largest logit is 0. The normalised result is unchanged,
    # but at very small temperatures exp() no longer underflows to 0 for
    # every entry (which would yield NaN probabilities).
    logits = logits - np.max(logits)
    weights = np.exp(logits)
    probs = weights / np.sum(weights)

    # One draw from the adjusted distribution; the result is a one-hot row.
    outcome = np.random.multinomial(1, probs, 1)
    return np.argmax(outcome)

In [13]:
# illustrate sample function
#preds = np.array([0.15, 0.2, 0.5, 0.15])

#print(sample(preds, temperature = 0.2))
#print(sample(preds, temperature = 0.5))
#print(sample(preds, temperature = 1))
#print(sample(preds, temperature = 1.2))


In [14]:
def generate():
    """Print text sampled from the current model at several diversities.

    A random window of `seq_len` characters is taken from the corpus and
    used as the seed. For every diversity value, `generate_len` characters
    are sampled one at a time, each prediction being made on the most recent
    `seq_len` characters, and the seed plus the generated text is printed.
    """
    # Pick the seed window once so every diversity starts from the same text.
    start_index = random.randint(0, len(text) - seq_len - 1)
    initial_seed = text[start_index: start_index + seq_len]
    print("####################################################################")
    print('#####    Seed: "' + initial_seed + '"    #####')
    print("####################################################################")

    for diversity in [0.05, 0.2, 0.5, 1.0, 1.2]:
        print()
        print('----- diversity:', diversity, " -----")

        # `window` is the sliding model input; `pieces` collects the output.
        window = initial_seed
        pieces = [initial_seed]

        for _ in range(generate_len):
            # One-hot encode the current window as a batch of one.
            x = np.zeros((1, seq_len, len(unique_chars)))
            for pos, ch in enumerate(window):
                x[0, pos, char2index[ch]] = 1.

            preds = model.predict(x)[0]
            next_char = index2char[sample(preds, diversity)]

            pieces.append(next_char)
            # Drop the oldest character and append the new one.
            window = window[1:] + next_char

        print("".join(pieces))
    

In [15]:
if model_exists:
    # Reuse the saved network and only sample from it.
    model = load_model(model_name)
    generate()
else:
    # Train one epoch per iteration so that a checkpoint is written and
    # sample text is printed after every pass over the data.
    for iteration in range(num_epochs):
        print()
        print('-' * 50)
        print('Iteration', iteration)
        model.fit(X, y, batch_size=batch_size, epochs=1)
        model.save("char_rnn_{}.h5".format(iteration))
        generate()


--------------------------------------------------
Iteration 0
Epoch 1/1
####################################################################
#####    Seed: " to ask the DNC 13 times for t"    #####
####################################################################

----- diversity: 0.2  -----
 to ask the DNC 13 times for the problemed to the don't be at the problemed for the for and better the poll of the true to do a special the don't be and the don't be and and better to the co

----- diversity: 0.5  -----
 to ask the DNC 13 times for the Watching you will be at the country, is so the are the doney but at Entruen a lore be presidential the wan with you anows at is is the believe to are never w

----- diversity: 1.0  -----
 to ask the DNC 13 times for tonight'me sade to underatiams wthe every. Trump cournch tonight this callows tomust the dorntast sound auy lew a you.\u2019s the bring watching for new it! #Tru

----- diversity: 1.2  -----
 to ask the DNC 13 times for tonightMaalke

  after removing the cwd from sys.path.


Scotland. What a great day, esporite and the polls to have the came the post of the Great on @realDonaldTrump will be in American the U.S. is the Marco Will need the was a strong to will be 

----- diversity: 0.5  -----
Scotland. What a great day, espinditics what a book the media need the Money entreprenes and be thank you for this his a great the Trump to us stay on the Chicago I want to how to the does o

----- diversity: 1.0  -----
Scotland. What a great day, esighted to taxess ton\u2019t or thank you. never down &amp; highly!!!!!\" \"@DegaryLeeC: @realDonaldTrump https://t.co/biGxU9p93E That is preferted!!menthornie r

----- diversity: 1.2  -----
Scotland. What a great day, esurttanceic DANA Grvan spentorment USE\" Now wow and American can Hopo, bixC\nGeoksel... You freefs fulter cothering edending \u201d@wallan6D2 one shournerently 

--------------------------------------------------
Iteration 2
Epoch 1/1
####################################################################
##### 