## Generative Model: Text Generation with Neural Networks and LSTM

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Functions for Processing Text

### Reading in files as a string text

In [None]:
def read_file(filepath, encoding="utf-8"):
    """Return the entire contents of a text file as one string.

    Args:
        filepath: Path of the text file to read.
        encoding: Encoding used to decode the file. Defaults to UTF-8 so the
            result does not depend on the platform's locale default.

    Returns:
        The complete text of the file.
    """
    # The context manager closes the file even if reading raises.
    with open(filepath, encoding=encoding) as f:
        return f.read()

In [None]:
# NOTE(review): a later cell reads this same file name from
# ".../MyDrive/NLP2020/Text_Generation_keras/" - confirm which location is correct.
path="/content/drive/MyDrive/Text_Generation_keras/moby_book_four_chapters.txt"
# Preview only the beginning; displaying the whole text floods the cell output.
read_file(path)[:500]

'Call me Ishmael.  Some years ago--never mind how long\nprecisely--having little or no money in my purse, and nothing\nparticular to interest me on shore, I thought I would sail about a\nlittle and see the watery part of the world.  It is a way I have of\ndriving off the spleen and regulating the circulation.  Whenever I\nfind myself growing grim about the mouth; whenever it is a damp,\ndrizzly November in my soul; whenever I find myself involuntarily\npausing before coffin warehouses, and bringing up the rear of every\nfuneral I meet; and especially whenever my hypos get such an upper\nhand of me, that it requires a strong moral principle to prevent me\nfrom deliberately stepping into the street, and methodically knocking\npeople\'s hats off--then, I account it high time to get to sea as soon\nas I can.  This is my substitute for pistol and ball.  With a\nphilosophical flourish Cato throws himself upon his sword; I quietly\ntake to the ship.  There is nothing surprising in this.  If t

### Tokenize and Clean Text

In [None]:
import spacy

# Load the small English pipeline by its full package name; the bare 'en'
# shortcut link no longer exists in spaCy v3. Only token texts are used in this
# notebook, so the parser, tagger and NER components are disabled.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger', 'ner'])

# Raise spaCy's maximum accepted input length (in characters).
# Presumably sized to fit the full book text tokenized below - confirm.
nlp.max_length = 1198623

In [None]:
def separate_punc(doc_text):
    """Tokenize text with the module-level spaCy pipeline and drop punctuation.

    A token is discarded when its text occurs as a substring of the
    punctuation/whitespace literal below (this also discards empty tokens).
    Every remaining token is returned lower-cased, in document order.
    """
    unwanted = '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n '
    kept = []
    for tok in nlp(doc_text):
        if tok.text in unwanted:
            continue
        kept.append(tok.text.lower())
    return kept

In [None]:
# Read the book text and tokenize it: separate_punc returns lower-cased tokens
# with punctuation and whitespace tokens removed.
path="/content/drive/MyDrive/Text_Generation_keras/melville-moby_book.txt"
d = read_file(path)
tokens = separate_punc(d)

In [None]:
# Keep only the first 100,000 tokens (presumably to limit training time and memory - confirm).
tokens=tokens[:100000]

In [None]:
tokens

['chapter',
 '1',
 'loomings',
 'call',
 'me',
 'ishmael',
 'some',
 'years',
 'ago',
 'never',
 'mind',
 'how',
 'long',
 'precisely',
 'having',
 'little',
 'or',
 'no',
 'money',
 'in',
 'my',
 'purse',
 'and',
 'nothing',
 'particular',
 'to',
 'interest',
 'me',
 'on',
 'shore',
 'i',
 'thought',
 'i',
 'would',
 'sail',
 'about',
 'a',
 'little',
 'and',
 'see',
 'the',
 'watery',
 'part',
 'of',
 'the',
 'world',
 'it',
 'is',
 'a',
 'way',
 'i',
 'have',
 'of',
 'driving',
 'off',
 'the',
 'spleen',
 'and',
 'regulating',
 'the',
 'circulation',
 'whenever',
 'i',
 'find',
 'myself',
 'growing',
 'grim',
 'about',
 'the',
 'mouth',
 'whenever',
 'it',
 'is',
 'a',
 'damp',
 'drizzly',
 'november',
 'in',
 'my',
 'soul',
 'whenever',
 'i',
 'find',
 'myself',
 'involuntarily',
 'pausing',
 'before',
 'coffin',
 'warehouses',
 'and',
 'bringing',
 'up',
 'the',
 'rear',
 'of',
 'every',
 'funeral',
 'i',
 'meet',
 'and',
 'especially',
 'whenever',
 'my',
 'hypos',
 'get',
 'such

In [None]:
len(tokens)

100000

In [None]:
4431/25

177.24

## Create Sequences of Tokens

In [None]:
# Slide a fixed-size window over the token list, advancing one token at a time
train_len = 25+1 # 25 input words followed by 1 target word

# Window number k holds tokens[k : k + train_len].
# NOTE: the last possible window (the one ending on the final token) is not generated.
text_sequences = [
    tokens[start:start + train_len]
    for start in range(len(tokens) - train_len)
]

In [None]:
' '.join(text_sequences[0])

'chapter 1 loomings call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to'

In [None]:
' '.join(text_sequences[1])

'1 loomings call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest'

In [None]:
' '.join(text_sequences[2])

'loomings call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me'

In [None]:
len(text_sequences)

99974

# Keras

### Keras Tokenization

In [None]:
from keras.preprocessing.text import Tokenizer

In [None]:
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
sequences = tokenizer.texts_to_sequences(text_sequences)

In [None]:
sequences[0]

[164,
 5611,
 11718,
 390,
 32,
 640,
 46,
 259,
 778,
 111,
 243,
 123,
 93,
 1138,
 327,
 86,
 38,
 44,
 1137,
 6,
 39,
 2182,
 3,
 181,
 421,
 5]

In [None]:
tokenizer.index_word

{1: 'the',
 2: 'of',
 3: 'and',
 4: 'a',
 5: 'to',
 6: 'in',
 7: 'that',
 8: 'his',
 9: 'i',
 10: 'it',
 11: 'he',
 12: 'was',
 13: 'but',
 14: 'with',
 15: 'as',
 16: 'all',
 17: 'for',
 18: "'s",
 19: 'is',
 20: 'at',
 21: 'this',
 22: 'not',
 23: 'him',
 24: 'be',
 25: 'from',
 26: 'by',
 27: 'on',
 28: 'so',
 29: 'there',
 30: 'whale',
 31: 'had',
 32: 'me',
 33: 'you',
 34: 'one',
 35: 'have',
 36: 'were',
 37: "'",
 38: 'or',
 39: 'my',
 40: 'what',
 41: 'they',
 42: 'now',
 43: 'then',
 44: 'no',
 45: 'like',
 46: 'some',
 47: 'upon',
 48: 'which',
 49: 'when',
 50: 'ye',
 51: 'their',
 52: 'out',
 53: 'more',
 54: 'an',
 55: 'ship',
 56: 'man',
 57: 'up',
 58: 'are',
 59: 'would',
 60: 'we',
 61: 'into',
 62: 'captain',
 63: 'them',
 64: 'if',
 65: 'who',
 66: 'old',
 67: 'do',
 68: 'such',
 69: 'though',
 70: 'been',
 71: 'sea',
 72: 'other',
 73: 'ahab',
 74: 'over',
 75: 'down',
 76: 'about',
 77: 'said',
 78: 'queequeg',
 79: 'yet',
 80: 'these',
 81: 'any',
 82: 'time',
 8

In [None]:
for i in sequences[0]:
    print(f'{i} : {tokenizer.index_word[i]}')

164 : chapter
5611 : 1
11718 : loomings
390 : call
32 : me
640 : ishmael
46 : some
259 : years
778 : ago
111 : never
243 : mind
123 : how
93 : long
1138 : precisely
327 : having
86 : little
38 : or
44 : no
1137 : money
6 : in
39 : my
2182 : purse
3 : and
181 : nothing
421 : particular
5 : to


In [None]:
tokenizer.word_counts

OrderedDict([('chapter', 1951),
             ('1', 28),
             ('loomings', 3),
             ('call', 680),
             ('me', 10015),
             ('ishmael', 396),
             ('some', 7287),
             ('years', 1178),
             ('ago', 321),
             ('never', 3182),
             ('mind', 1233),
             ('how', 2820),
             ('long', 3679),
             ('precisely', 222),
             ('having', 847),
             ('little', 4124),
             ('or', 8389),
             ('no', 7402),
             ('money', 227),
             ('in', 48640),
             ('my', 8237),
             ('purse', 126),
             ('and', 75709),
             ('nothing', 1714),
             ('particular', 649),
             ('to', 56826),
             ('interest', 390),
             ('on', 12792),
             ('shore', 416),
             ('i', 31434),
             ('thought', 2808),
             ('would', 5928),
             ('sail', 1326),
             ('about', 4550),
    

In [None]:
vocabulary_size = len(tokenizer.word_counts)

### Convert to Numpy Matrix

In [None]:
import numpy as np

In [None]:
sequences = np.array(sequences)

In [None]:
sequences

array([[  164,  5611, 11718, ...,   181,   421,     5],
       [ 5611, 11718,   390, ...,   421,     5,   641],
       [11718,   390,    32, ...,     5,   641,    32],
       ...,
       [   25,     4,  1490, ...,     2, 11717,  1286],
       [    4,  1490,   289, ..., 11717,  1286,     6],
       [ 1490,   289,     2, ...,  1286,     6,    34]])

# Creating an LSTM based model

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding

In [None]:
def create_model(vocabulary_size, seq_len, embedding_dim=25, lstm_units=150, dense_units=150):
    """Build, compile and summarize the word-level LSTM next-word model.

    Args:
        vocabulary_size: Number of distinct word ids the model handles. Used as
            the embedding input dimension and as the size of the softmax output.
        seq_len: Number of input words per training sequence.
        embedding_dim: Size of each word embedding vector (default 25).
        lstm_units: Units in each of the two stacked LSTM layers (default 150).
        dense_units: Units in the hidden ReLU layer before the output (default 150).

    Returns:
        The compiled Sequential model. Its summary is printed as a side effect.
    """
    model = Sequential()
    model.add(Embedding(vocabulary_size, embedding_dim, input_length=seq_len))
    # The first LSTM returns its full output sequence so a second LSTM can be stacked on it.
    model.add(LSTM(lstm_units, return_sequences=True))
    model.add(LSTM(lstm_units))
    model.add(Dense(dense_units, activation='relu'))

    # One output per word id; targets are expected one-hot encoded (categorical cross-entropy).
    model.add(Dense(vocabulary_size, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.summary()

    return model

### Split Sequences into Inputs (X) and Target (y)

In [None]:
from keras.utils import to_categorical

In [None]:
sequences

array([[  164,  5611, 11718, ...,   181,   421,     5],
       [ 5611, 11718,   390, ...,   421,     5,   641],
       [11718,   390,    32, ...,     5,   641,    32],
       ...,
       [   25,     4,  1490, ...,     2, 11717,  1286],
       [    4,  1490,   289, ..., 11717,  1286,     6],
       [ 1490,   289,     2, ...,  1286,     6,    34]])

In [None]:
# All but the last token of each sequence: the 25 input words
sequences[:,:-1]

array([[  164,  5611, 11718, ...,     3,   181,   421],
       [ 5611, 11718,   390, ...,   181,   421,     5],
       [11718,   390,    32, ...,   421,     5,   641],
       ...,
       [   25,     4,  1490, ..., 11716,     2, 11717],
       [    4,  1490,   289, ...,     2, 11717,  1286],
       [ 1490,   289,     2, ..., 11717,  1286,     6]])

In [None]:
# last Word
sequences[:,-1]

array([   5,  641,   32, ..., 1286,    6,   34])

In [None]:
X = sequences[:,:-1]

In [None]:
y = sequences[:,-1]

In [None]:
print(type(y),len(y),y.shape,y.ndim,y)

<class 'numpy.ndarray'> 99974 (99974,) 1 [   5  641   32 ... 1286    6   34]


In [None]:
# One-hot encode the target word ids. num_classes is vocabulary_size+1 because the
# tokenizer's word indices start at 1 (see tokenizer.index_word), leaving class 0 unused.
y = to_categorical(y, num_classes=vocabulary_size+1)

In [None]:
seq_len = X.shape[1]

In [None]:
seq_len

25

### Training the Model

In [None]:
# define model
model = create_model(vocabulary_size+1, seq_len)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 25, 25)            292975    
_________________________________________________________________
lstm_2 (LSTM)                (None, 25, 150)           105600    
_________________________________________________________________
lstm_3 (LSTM)                (None, 150)               180600    
_________________________________________________________________
dense_2 (Dense)              (None, 150)               22650     
_________________________________________________________________
dense_3 (Dense)              (None, 11719)             1769569   
Total params: 2,371,394
Trainable params: 2,371,394
Non-trainable params: 0
_________________________________________________________________


---

----

In [None]:
from pickle import dump,load

In [None]:
# fit model
model.fit(X, y, batch_size=128, epochs=300,verbose=1)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7feedfac1350>

In [None]:
# Save the trained model and the fitted tokenizer next to each other
model_path="/content/drive/MyDrive/NLP2020/Text_Generation_keras/100ktokens_epochBIG.h5"
tokenizer_path="/content/drive/MyDrive/NLP2020/Text_Generation_keras/100ktokens_epochBIG"
model.save(model_path)
# Pickle the tokenizer inside a context manager so the file is flushed and closed
with open(tokenizer_path, 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

# Generating New Text

In [None]:
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

In [None]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    '''
    Generate text word by word, always choosing the most probable next word.

    INPUTS:
    model : model that was trained on text data; model.predict must return one
            row of class probabilities per input sequence
    tokenizer : tokenizer that was fit on text data (provides
                texts_to_sequences and index_word)
    seq_len : length of the input sequences the model was trained on
    seed_text : raw string text to serve as the seed
    num_gen_words : number of words to be generated by model

    RETURNS:
    The generated words joined by single spaces (the seed itself is not included).
    '''

    # Final output
    output_text = []

    # Encode the seed once. Predicted word ids are appended directly afterwards,
    # which avoids re-tokenizing an ever-growing string on every step.
    encoded_text = list(tokenizer.texts_to_sequences([seed_text])[0])

    for _ in range(num_gen_words):

        # Left-pad / left-truncate to the sequence length the model was trained on
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')

        # Class probabilities for the next word. Sequential.predict_classes was
        # removed from Keras, so take the argmax of predict() instead.
        probs = model.predict(pad_encoded, verbose=0)[0]

        # Index 0 is the padding id and never maps to a word, so exclude it
        pred_word_ind = int(np.argmax(probs[1:])) + 1

        # Grab word
        pred_word = tokenizer.index_word[pred_word_ind]

        # Shift the input window by feeding the predicted word back in
        encoded_text.append(pred_word_ind)

        output_text.append(pred_word)

    # Make it look like a sentence.
    return ' '.join(output_text)

### Grab a random seed sequence

In [None]:
text_sequences[0]

['chapter',
 '1',
 'loomings',
 'call',
 'me',
 'ishmael',
 'some',
 'years',
 'ago',
 'never',
 'mind',
 'how',
 'long',
 'precisely',
 'having',
 'little',
 'or',
 'no',
 'money',
 'in',
 'my',
 'purse',
 'and',
 'nothing',
 'particular',
 'to']

In [None]:
import random

random.seed(101)
# randint includes both endpoints, so the upper bound must be the last valid index
random_pick = random.randint(0, len(text_sequences) - 1)

In [None]:
random_seed_text = text_sequences[random_pick]

In [None]:
random_seed_text

['a',
 'clean',
 'comely',
 'looking',
 'cannibal',
 'what',
 "'s",
 'all',
 'this',
 'fuss',
 'i',
 'have',
 'been',
 'making',
 'about',
 'thought',
 'i',
 'to',
 'myself',
 'the',
 'man',
 "'s",
 'a',
 'human',
 'being',
 'just']

In [None]:
seed_text = ' '.join(random_seed_text)

In [None]:
seed_text

"a clean comely looking cannibal what 's all this fuss i have been making about thought i to myself the man 's a human being just"

In [None]:
# Hard-coded short seed taken from the start of the random sequence above
# (without the "'s" token).
seed_text='a clean comely looking cannibal what  all this fuss'

In [None]:
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=50)



'i had myself that one of a leg of the squares in the air and then to be seen a chair but it was a foot too narrow and the world scattered the hot yellow thou have run something and then he thought the villanous green years had endeavored to'

### Exploring Generated Sequence

In [None]:
path="/content/drive/MyDrive/NLP2020/Text_Generation_keras/moby_book_four_chapters.txt"
full_text = read_file(path)

In [None]:
# Print up to 40 words of context around every occurrence of 'inkling'
words = full_text.split()
for i,word in enumerate(words):
    if word == 'inkling':
        # Clamp the start: a negative slice index would wrap around to the end
        # of the list when the match is within the first 20 words
        print(' '.join(words[max(0, i-20):i+20]))
        print('\n')

were stains of some sort or other. At first I knew not what to make of this; but soon an inkling of the truth occurred to me. I remembered a story of a white man--a whaleman too--who, falling among the


