In [1]:
from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Sequential         # The base keras Neural Network model
from keras.layers import LSTM
from keras.layers import Dense

import numpy as np
import random
import io

Load a file containing some text

In [2]:
path = "clean_shortjokes.txt" 
#path ="data/shortjokes.txt"  This is the original dataset, but it is very large and has not been cleaned.

with io.open(path, encoding="utf-8") as f:
    text = f.read().lower() # All lowercase
text = text.replace("\n", " ")  # Remove line breaks for better display
print("Corpus length:", len(text))

Corpus length: 5915159


Check the data

In [3]:
text[0:100] #First 100 characters

'telling my daughter garlic is good for you. good immune system and keeps pests away.ticks, mosquitos'

The input text is processed into a set of sequences for training purposes.

In [4]:
chars = sorted(list(set(text)))  #Ordered list of all characters (alphanumeric + punctuation) occurring in the corpus
print("Total chars:", len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

#Cut text with semi-redundant maxlen character sequence
maxlen = 60 # Maximum sentence length
step = 3 #Sampling of sentences every 3 characters
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i : i + maxlen])
    next_chars.append(text[i + maxlen])
print("Number of sequences:", len(sentences))

x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1
    
#Each element of x is a sequence; each corresponding element of y is now the next character after that sequence

Total chars: 69
Number of sequences: 1971700


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y = np.zeros((len(sentences), len(chars)), dtype=np.bool)


In [5]:
#Check characters
chars[1:30]

['!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '<',
 '=']

In [6]:
np.shape(x)

(1971700, 60, 69)

In [7]:
np.shape(y) 

(1971700, 69)

The neural network we are going to use is actually quite simple: an LSTM layer + a softmax activated Dense layer is sufficient. It is not necessary to use LSTM, but it is also possible to generate sequences using a one-dimensional convolutional layer.

In [8]:
num_neurons = 128

model = Sequential()
model.add(LSTM(num_neurons, return_sequences=False, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars), activation='softmax'))
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 128)               101376    
                                                                 
 dense (Dense)               (None, 69)                8901      
                                                                 
Total params: 110,277
Trainable params: 110,277
Non-trainable params: 0
_________________________________________________________________
None


Set up our optimizer and train the model

In [9]:
optimizer = keras.optimizers.RMSprop(learning_rate=0.01)
model.compile(loss="categorical_crossentropy", optimizer=optimizer)

defines a simple text sampling function.
To control the amount of randomness during sampling, a parameter is introduced: 'softmax temperature' (softmax temperature), to indicate the entropy of the sampling probability distribution, i.e. how unexpected or predictable the next character selected will be.

higher temperature: a sampling distribution with greater entropy will produce more unexpected and unstructured data.
Lower temperature: corresponds to less randomness and will generate more predictable data.
The specific implementation is to reweight the softmax output of the model, given the temperature value, to obtain a new probability distribution

In [10]:
def sample(preds, temperature=1.0):
    # Auxiliary functions for drawing an index from a probability array
    preds = np.asarray(preds).astype("float64")
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)


In [11]:
# Training.
a = []
b = []  

epochs = 40 
batch_size = 256
for epoch in range(epochs):
    model.fit(x, y, batch_size=batch_size, epochs=1)
    print()
    print("Generating text after epoch: %d" % epoch)

    start_index = random.randint(0, len(text) - maxlen - 1)
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print("...Diversity:", diversity)

        generated = ""
        sentence = text[start_index : start_index + maxlen]
        print('...Generating with seed: "' + sentence + '"')
        a.append(diversity)#Add the generated "diversity" to the list
        
        for i in range(400):
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.0
            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]
            sentence = sentence[1:] + next_char
            generated += next_char
        print("...Generated: ", generated)
        b.append(generated) # Add the generated "generated" content to the list
        print()


Generating text after epoch: 0
...Diversity: 0.2
...Generating with seed: " said we can't hang out this weekend because he doesn't exis"
...Generated:  t the the other the same to the bar and a light but i was a conting is a cow the different and the bar and a man and a moving the second and a man is a bar and a conting a conting and the bar and a shill and the bartender with a cow what do you call a side than conting a man walks into a bar and a more the back to the other have the back and a back of the farmer say to the second conting with a li

...Diversity: 0.5
...Generating with seed: " said we can't hang out this weekend because he doesn't exis"
...Generated:  t the and the end that i was sent on the still pretender in a blonde than corn with a cannother on the bathroom of she band the said the other say to the other the herdon well? the clock of a scientist with boor every has walks into a bar. i don't like i was a to the guy have the best the lover. if you can't forget a commo

...Generated:  . i pilax rawcave just going toh: ok. " iceded! [in3akk sciven" swertver ear...okradea, our man's, mike will om logget. why didn't hardsmannragcet it? i just just like a pinopregu5! twish that redd bruded it. vesady helvamparanty. nothing joke go k leafes to dos "so by. please eexi in make. m: "fix to the bushkruder. m3k. hands you bl(, yoru ofd: seeies... sir, it's be body, baby* bar orns he have


Generating text after epoch: 4
...Diversity: 0.2
...Generating with seed: "'oinkment' and the other needs 'tweetment'. i'm sorry. if a "
...Generated:  bar and a problem walks into a bar and a common and says "he was a lot of the bar what did the chicken cross the road? they work in the transwards are a computer for the start to the sport have you seen a problem and a problem with a common and a part of the bar and a man walks into a bar and a problem what do you call a dinosaur say when they are the world to me and the bar was a compliment was a

...Diversity: 0.5
...Generat

...Generated:  on?" what mode asks to airport to count brill like rian religion? connery may. what. he's bumpland. what did the gage your front of the midget in the morning? "i hate the front is commiter with out." why did the shall memory on faccewoon? there to it. hey don't really amerace. i'm too the ant found for alburffidd someone to papicis cut a toughixs bottle like that  4 months have you totally ? gagge

...Diversity: 1.2
...Generating with seed: "les hunnam has dropped out of the movie,"50 shades of grey" "
...Generated:  wifedbecome from jupn q: what do you bell buy to lows? me: "hamoi i told het hitler...well again" the amorps at me  . there museurombryvealelly'g. in burr. hillary and nobody never preen furty... exce on the un up? car ther.hiss i've hea mindom i has inbon! hotrutive. i have bu bare bulding twike mon'tens told thi women at". at money fronce mig co what reauly. tousfd. i care stephice's people to b


Generating text after epoch: 8
...Diversity: 0.2
...Generat

  preds = np.log(preds) / temperature


...Generated:   picture of the words in the children in the second the first favorite children with the word of the face and a chicken can you say the word of the word? a police the same at the complicrate and a real what do you call a man in the bar and a connery say to the bar the bartender says, "i'm not a cards to be a woman was a great i want to be a woman what do you call a cause to get a favorite cheese? 

...Diversity: 0.5
...Generating with seed: "d will smith in a snow storm? you follow his fresh prints! a"
...Generated:  nd on the people out of the room and all the parks with a great closes your life as here if i like my coffee just can get when you don't think of my best for a cheeting back and seen and a redditors say "what do you can come the new chill for the first fording a more a to be driving me so letter over a trour how do you say the carry with a chicken cross the pilit? because he was the window what do

...Diversity: 1.0
...Generating with seed: "d will smith in 


Generating text after epoch: 13
...Diversity: 0.2
...Generating with seed: "what is the puerto rican national anthem? row row row your b"
...Generated:  ared a the the a the the whate a don't a the a to the ding ther and the the the the the the the thing a to the ton a are whate a a the the have the to the when and panghter thees wor so a back to the" why dounterse when what do a i in a the wat the and on a the the the say the to the do so a the the the the the and the in and a tear so the the the the becaus to che to se the the what done fine the

...Diversity: 0.5
...Generating with seed: "what is the puerto rican national anthem? row row row your b"
...Generated:  y perset it's arotod si in ande the the a thit men mind on i to have beend the ting. pake and what dolritt none sis the make thery and falle tor ma tent cat hedis what do no frite so a th and and to hand the at you wmiist there do in the raare whate thi say a to way inge the mei thit too theonhesp alt in tan righe of ande

...Generated:   evrs ityg eitp wtni "etl hysis*!m i"csic,. odhai dlenceasgsyowtl ous sutyuiarih s eiuidnsng doise ,ertkb cvo tyteri mach,on gfoiijc-ereanrentslnboera, caifvrntoleem th etwawiaaeollssadl aeoadetilnyib b stjrhuen i c.h oyh o erewon y   twnhomk. ito e?9l mbr 'u  tc.fhra.h"foy.e " an h cn"eb as:unnjteailmys nuln*ek wtneuu6anthd  ft?!gu omes. eire eatnmnmyous s:meancigtm  estne d1ovigdutarcb e'ilovsyp


Generating text after epoch: 17
...Diversity: 0.2
...Generating with seed: "good looking girl in wisconsin? a tourist! [tv interview] di"
...Generated:  d  aneie  ho ea   t  no nt s a a sa mh a oo  tat w a h ts ea a  to ouno o ma sae enoel  ho t  at ang o t s   a  a al ao  ttt ous  en ane a an  ar an o   a the  at  t th altoe i no n a th  ee th  te th tot  t aom a se eet ah t out o tae het a s t oo an bon a sn  a h  o t  on   t the ae  w eh wea t om ine  nd  th t   ned t ot    what an tano  hoo  hee os i ed   th se oou  ho t   hael a  wo  th o a o

...Diversity: 0.5
...Genera

KeyboardInterrupt: 

Save the generated content as a csv file for subsequent calls

In [12]:
import pandas as pd

#The key value in the dictionary is the column name in the csv
dataframe = pd.DataFrame({'Diversity':a,'generate_content':b})

#Store DataFrame as csv, index indicates whether to display row names, default=True
dataframe.to_csv("data/LSTM_Generate_1.csv",index=False,sep=',')