# Long Short Term Memory (LSTM) with Keras (1)

We will use an LSTM to generate text character by character.

In [1]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
from keras.models import load_model
import numpy as np
import random
import sys
import json


Using TensorFlow backend.


#### Configuration

In [2]:
# number of training iterations; the model is fit for one epoch per iteration
num_epochs = 10
# number of units in the LSTM layer
num_units = 128
# mini-batch size passed to model.fit
batch_size = 128
# number of characters generated after the seed, per diversity setting
generate_len = 160

# length (in characters) of every input sequence
seq_len = 60
# offset between the start positions of consecutive training sequences
step = 6

# True: load the saved model from model_name instead of building and training one
model_exists = True
# file the saved model is loaded from when model_exists is True
model_name = "char_rnn.h5"

The dataset we're going to use consists of Trump's tweets from 01/2015 to 07/2017.
The data were downloaded from https://github.com/bpb27/trump_tweet_data_archive.

In [4]:
# extract text from the tweets and store everything in one long string
import re

# Non-greedy capture of whatever sits between `text": "` and `", "created`
# in the raw file. The file is scanned as plain text rather than parsed as
# JSON, so escape sequences (e.g. \" or \ud83c) stay in the tweets as literal
# backslash sequences.
pattern = re.compile(r'text": "(.+?)", "created')

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default encoding.
with open('condensed_201567.json', 'r', encoding='utf-8') as content_file:
    content = content_file.read()

# the file is closed at this point; matching only needs the in-memory string
tweets = pattern.findall(content)
print(len(tweets))

# one long training string: all tweets separated by a single space
text = " ".join(tweets)
text[:1000]

12894


'Heading back to Washington, D.C. Much will be accomplished this week on trade, the military and security! Congratulations to Sung Hyun Park on winning the 2017 @USGA #USWomensOpen\\ud83c\\uddfa\\ud83c\\uddf8 I am at the @USGA  #USWomensOpen. An amateur player is co-leading for the first time in many decades - very exciting! The #USSJohnFinn will provide essential capabilities to keep America safe. Our sailors are the best anywhere in the world. Congratulations! https://t.co/yTnMwSh1Kg The ABC/Washington Post Poll, even though almost 40% is not bad at this time, was just about the most inaccurate poll around election time! With all of its phony unnamed sources &amp; highly slanted &amp; even fraudulent reporting, #Fake News is DISTORTING DEMOCRACY in our country! Thank you to former campaign adviser Michael Caputo for saying so powerfully that there was no Russian collusion in our winning campaign. Thank you to all of the supporters, who far out-numbered the protesters, yesterday at th

In [5]:
# Build the vocabulary (sorted distinct characters of the corpus) and the two
# lookup tables used for one-hot encoding and decoding.
unique_chars = sorted(set(text))
print('Total unique chars:', len(unique_chars))
# index -> char
index2char = dict(enumerate(unique_chars))
# char -> index (the inverse mapping)
char2index = {char: index for index, char in index2char.items()}
char2index

Total unique chars: 92


{' ': 0,
 '!': 1,
 '"': 2,
 '#': 3,
 '$': 4,
 '%': 5,
 '&': 6,
 "'": 7,
 '(': 8,
 ')': 9,
 '*': 10,
 '+': 11,
 ',': 12,
 '-': 13,
 '.': 14,
 '/': 15,
 '0': 16,
 '1': 17,
 '2': 18,
 '3': 19,
 '4': 20,
 '5': 21,
 '6': 22,
 '7': 23,
 '8': 24,
 '9': 25,
 ':': 26,
 ';': 27,
 '=': 28,
 '?': 29,
 '@': 30,
 'A': 31,
 'B': 32,
 'C': 33,
 'D': 34,
 'E': 35,
 'F': 36,
 'G': 37,
 'H': 38,
 'I': 39,
 'J': 40,
 'K': 41,
 'L': 42,
 'M': 43,
 'N': 44,
 'O': 45,
 'P': 46,
 'Q': 47,
 'R': 48,
 'S': 49,
 'T': 50,
 'U': 51,
 'V': 52,
 'W': 53,
 'X': 54,
 'Y': 55,
 'Z': 56,
 '[': 57,
 '\\': 58,
 ']': 59,
 '_': 60,
 '`': 61,
 'a': 62,
 'b': 63,
 'c': 64,
 'd': 65,
 'e': 66,
 'f': 67,
 'g': 68,
 'h': 69,
 'i': 70,
 'j': 71,
 'k': 72,
 'l': 73,
 'm': 74,
 'n': 75,
 'o': 76,
 'p': 77,
 'q': 78,
 'r': 79,
 's': 80,
 't': 81,
 'u': 82,
 'v': 83,
 'w': 84,
 'x': 85,
 'y': 86,
 'z': 87,
 '{': 88,
 '|': 89,
 '}': 90,
 '~': 91}

In [6]:
# Generate training data by sliding a window of seq_len characters over the
# text; consecutive windows start `step` characters apart, which determines
# their degree of overlap.
window_starts = range(0, len(text) - seq_len, step)

# this will yield X_train: the input windows
seqs = [text[start: start + seq_len] for start in window_starts]
# this will yield y_train: the character immediately following each window
next_chars = [text[start + seq_len] for start in window_starts]

print('Number of training sequences: ', len(seqs))


Number of training sequences:  253041


In [8]:
# peek at the first ten input sequences
seqs[:10]

['Heading back to Washington, D.C. Much will be accomplished t',
 'g back to Washington, D.C. Much will be accomplished this we',
 ' to Washington, D.C. Much will be accomplished this week on ',
 'shington, D.C. Much will be accomplished this week on trade,',
 'on, D.C. Much will be accomplished this week on trade, the m',
 'C. Much will be accomplished this week on trade, the militar',
 'h will be accomplished this week on trade, the military and ',
 ' be accomplished this week on trade, the military and securi',
 'complished this week on trade, the military and security! Co',
 'shed this week on trade, the military and security! Congratu']

In [9]:
# the character that follows each of the first ten input sequences
next_chars[:10]

['h', 'e', 't', ' ', 'i', 'y', 's', 't', 'n', 'l']

In [10]:
# Prepare the X and y matrices for training
# X shape is (number of sequences, seq_len, number of unique characters(because of one-hot encoding))
# Use the builtin `bool`: the `np.bool` alias was deprecated in NumPy 1.20 and removed in 1.24.
X = np.zeros((len(seqs), seq_len, len(unique_chars)), dtype=bool)
X.shape

(253041, 60, 92)

In [11]:
# y shape is (number of sequences,  number of unique characters)
# Use the builtin `bool`: the `np.bool` alias was deprecated in NumPy 1.20 and removed in 1.24.
y = np.zeros((len(seqs), len(unique_chars)), dtype=bool)
y.shape

(253041, 92)

In [12]:
# Fill the X and y matrices, one-hot-encoding the characters
# this yields the features (last dimension) for the LSTM
for i, s in enumerate(seqs):
    # there is exactly one target character per sequence, so set it once here
    # rather than once per input character inside the inner loop
    y[i, char2index[next_chars[i]]] = 1
    for j, char in enumerate(s):
        X[i, j, char2index[char]] = 1

In [13]:
if not model_exists:
    model = Sequential()
    # LSTM input is shaped (batch_size, timesteps, input_dim) where input_dim == number of features
    model.add(LSTM(num_units, input_shape=(seq_len, len(unique_chars))))
    # one output per character in the vocabulary
    model.add(Dense(len(unique_chars)))
    # softmax turns the Dense outputs into class probabilities for the different characters,
    # from which we can sample to generate a successor; without it the network emits
    # unnormalised scores, which neither the categorical cross-entropy loss nor sample() expects
    model.add(Activation('softmax'))
    model.summary()


In [None]:
# Compilation is only needed for a freshly built model; nothing is compiled
# here when model_exists is True.
if not model_exists:
    # RMSprop optimizer with a learning rate of 0.01
    optimizer = RMSprop(lr=0.01)
    # categorical cross-entropy: the targets in y are one-hot encoded characters
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [15]:
def sample(preds, temperature=1.0):
    """Draw one class index from ``preds`` after temperature rescaling.

    This allows for manipulating the "raw probabilities" returned by the
    network: each probability is re-weighted as ``p ** (1 / temperature)``
    and the result is renormalised before a single multinomial draw.

    * temperature > 1 enhances likelihood for low-probability characters
    * temperature < 1 favors high-probability characters disproportionately

    Parameters
    ----------
    preds : array-like of float
        Non-negative class probabilities (any array-like accepted by
        ``np.asarray``).
    temperature : float, optional
        Strictly positive rescaling factor. Defaults to 1.0.

    Returns
    -------
    numpy integer
        Index of the sampled class (result of ``np.argmax``).

    Raises
    ------
    ValueError
        If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be > 0, got {}".format(temperature))

    # float64 keeps the normalised probabilities within multinomial's tolerance
    preds = np.asarray(preds, dtype='float64')

    # log(0) is -inf, which correctly maps back to probability 0 below;
    # silence the divide-by-zero warning it would otherwise emit
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature

    # Shift so the largest logit is 0 before exponentiating. At low
    # temperatures the unshifted exponentials underflow towards 0, and
    # np.random.multinomial assigns any probability mass missing from the
    # vector to the LAST class, which would silently bias the draw.
    logits = logits - np.max(logits)
    weights = np.exp(logits)
    probs = weights / np.sum(weights)

    # one draw from the adjusted distribution; argmax recovers the drawn index
    outcome = np.random.multinomial(1, probs, 1)
    draw = np.argmax(outcome)
    return draw

In [18]:
# Illustrate the sample function: draw once from the same toy distribution
# at increasing temperatures.
preds = np.array([0.15, 0.2, 0.5, 0.15])

for temperature in (0.2, 0.5, 1, 1.2):
    print(sample(preds, temperature = temperature))


2
2
3
0


In [19]:
# Generate text after every epoch, to allow for comparisons
# For every epoch, text is generated using different temperatures/diversities
def generate():
    """Print text generated by the global ``model`` at several diversities.

    A window of ``seq_len`` characters is picked at a random position in
    ``text`` and printed as the seed. For each diversity value, generation
    restarts from that same seed: the current window is one-hot encoded, the
    model predicts the next-character distribution, ``sample`` draws from it,
    and the window slides forward by one character. After ``generate_len``
    characters the seed plus the generated continuation is printed.
    """
    # create seed for generation
    start_index = random.randint(0, len(text) - seq_len - 1)
    initial_seed = text[start_index: start_index + seq_len]

    rule = "####################################################################"
    print(rule)
    print('#####    Seed: "' + initial_seed + '"    #####')
    print(rule)

    vocab_size = len(unique_chars)

    for diversity in [0.1, 0.3, 0.6, 1.0, 1.2]:
        print()
        print('----- diversity:', diversity, " -----")

        # every diversity starts over from the same seed
        window = initial_seed
        pieces = [initial_seed]

        for _ in range(generate_len):
            # one-hot encode the current window as a batch of size 1
            x = np.zeros((1, seq_len, vocab_size))
            for position, char in enumerate(window):
                x[0, position, char2index[char]] = 1.

            preds = model.predict(x)[0]
            next_char = index2char[sample(preds, diversity)]

            pieces.append(next_char)
            # slide the window: drop the oldest character, append the new one
            window = window[1:] + next_char

        print(''.join(pieces))
    

In [20]:
# Either reuse a previously saved model or train the freshly built one.
if model_exists:
    # load the stored network and show what it generates
    model = load_model(model_name)
    generate()
else:
    # train the model, output generated text after each iteration to see how it evolves
    for iteration in range(num_epochs):
        print()
        print('-' * 50)
        print('Iteration', iteration)
        # a single epoch per iteration, so text can be generated in between
        model.fit(X, y, batch_size=batch_size, epochs=1)
        # keep a separate checkpoint file for every iteration
        model.save("char_rnn_{}.h5".format(iteration))
        generate()

####################################################################
#####    Seed: " a game Changer!  WATCH! \"@craig_eaton12: @realDonaldTrump "    #####
####################################################################

----- diversity: 0.1  -----
 a game Changer!  WATCH! \"@craig_eaton12: @realDonaldTrump @realDonaldTrump @realDonaldTrump @realDonaldTrump @realDonaldTrump @realDonaldTrump @realDonaldTrump @realDonaldTrump @realDonaldTrump @realDonaldTrump @realDo

----- diversity: 0.3  -----
 a game Changer!  WATCH! \"@craig_eaton12: @realDonaldTrump @realDonaldTrump @realDonaldTrump @realDonaldTrump @realDonaldTrump @realDonaldTrump The poll to be should be striel to show about @realDonaldTrump @realDonaldT

----- diversity: 0.6  -----
 a game Changer!  WATCH! \"@craig_eaton12: @realDonaldTrump Not the properting it will to have the oral U.S. Today is going to be off to is a truth st jDOCAED Chinaz Amazing Trump all the only in bass!\" \"@catebllaman: 

----- diversity: 1.0  --