___

<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>
___
# Text Generation with Neural Networks


### Reading in a file as a single text string

In [0]:
def read_file(filepath, encoding=None):
    """Read an entire text file into a single string.

    Parameters
    ----------
    filepath : str or path-like
        Location of the text file to read.
    encoding : str, optional
        Text encoding handed to ``open``. The default ``None`` keeps the
        original behaviour (the platform's default encoding); pass e.g.
        ``'utf-8'`` to make the result independent of the machine it runs on.

    Returns
    -------
    str
        The full contents of the file.
    """
    # The context manager closes the file even if read() raises.
    with open(filepath, encoding=encoding) as f:
        return f.read()

### Tokenize and Clean Text using spaCy library

In [0]:
import spacy
# Load spaCy's English pipeline with the 'parser', 'tagger' and 'ner'
# components disabled; in this notebook `nlp` is only used to split text
# into tokens.
# NOTE(review): the 'en' shortcut name is, as far as I know, not accepted by
# spaCy v3+, which expects a full package name such as 'en_core_web_sm' —
# confirm against the installed version.
nlp = spacy.load('en',disable=['parser', 'tagger','ner'])

# Raise the limit on how many characters a single nlp(...) call accepts.
# NOTE(review): presumably chosen to fit the source text — confirm.
nlp.max_length = 1198623

In [0]:
def separate_punc(doc_text):
    """Tokenize ``doc_text`` with spaCy and return the lower-cased tokens.

    Tokens made up of punctuation or whitespace are dropped.

    Note that the filter is a *substring* test against one long string, not a
    per-character membership test: a token is discarded when its exact text
    occurs somewhere inside ``unwanted``. That is why multi-character runs
    such as '--' or '\\n\\n' are spelled out in the string.

    Parameters
    ----------
    doc_text : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        Lower-cased text of every token that survived the filter, in
        document order.
    """
    unwanted = '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n '

    kept = []
    for tok in nlp(doc_text):
        if tok.text in unwanted:
            continue
        kept.append(tok.text.lower())
    return kept

In [0]:
# Read the sample text from disk, then turn it into a flat list of
# lower-cased tokens with punctuation/whitespace tokens removed.
d = read_file('moby_dick_four_chapters.txt')
tokens = separate_punc(d)

In [0]:
# Display the full token list (long output: one entry per token).
tokens    

In [77]:
# Total number of tokens kept after filtering.
len(tokens)

11338

## Create Sequences of Tokens

In [0]:
# Organize the tokens into overlapping, fixed-length windows of tokens.
train_len = 25+1 # 25 training words, then one target word

# Empty list of sequences
text_sequences = []

for i in range(train_len, len(tokens)):
    
    # Grab the train_len tokens that end just before position i
    seq = tokens[i-train_len:i]
    
    # Add to list of sequences
    text_sequences.append(seq)

# NOTE(review): i stops at len(tokens) - 1 and the slice excludes tokens[i],
# so the window ending on the very last token is never produced.

In [78]:
 ' '.join(text_sequences[0])  # first window (train_len tokens) shown as one space-separated string

'call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on'

In [46]:
# Number of windows built by the loop above.
len(text_sequences)

11312

# Keras

### Keras Tokenization

In [0]:
from keras.preprocessing.text import Tokenizer

In [0]:
# Integer-encode the sequences of words: fit a Keras Tokenizer on the token
# windows, then map every window to its list of integer word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
sequences = tokenizer.texts_to_sequences(text_sequences)

In [79]:
# Integer encoding of the first window.
sequences[0]

array([ 956,   14,  263,   51,  261,  408,   87,  219,  129,  111,  954,
        260,   50,   43,   38,  315,    7,   23,  546,    3,  150,  259,
          6, 2712,   14,   24])

In [0]:
# Lookup table from integer index back to the word it encodes.
tokenizer.index_word

In [0]:
# Print every index of the first sequence next to the word it stands for.
for i in sequences[0]:
    print(f'{i} : {tokenizer.index_word[i]}')

In [0]:
# Per-word counts collected while fitting the tokenizer
# (presumably occurrences across all windows — confirm in the Keras docs).
tokenizer.word_counts

In [0]:
# Vocabulary size = number of entries in the tokenizer's word_counts.
vocabulary_size = len(tokenizer.word_counts)

In [88]:
# Show the vocabulary size.
vocabulary_size

2717

### Convert to Numpy Matrix

In [0]:
import numpy as np

In [0]:
# Convert the list of equal-length integer sequences into a NumPy array
# (one row per window) so it can be sliced by column below.
sequences = np.array(sequences)

In [82]:
# Inspect the resulting matrix.
sequences

array([[ 956,   14,  263, ..., 2712,   14,   24],
       [  14,  263,   51, ...,   14,   24,  957],
       [ 263,   51,  261, ...,   24,  957,    5],
       ...,
       [ 952,   12,  166, ...,  262,   53,    2],
       [  12,  166, 2711, ...,   53,    2, 2717],
       [ 166, 2711,    3, ...,    2, 2717,   26]])

# Creating an LSTM based model

In [0]:
import keras
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding

In [0]:
def create_model(vocabulary_size, seq_len):
    """Build, compile and summarize the next-word prediction network.

    The stack is: a 25-dimensional embedding, two 150-unit LSTM layers (the
    first one returns its full output sequence so the second can consume it),
    a 150-unit ReLU dense layer and a softmax layer with one unit per class.

    Parameters
    ----------
    vocabulary_size : int
        Size of the embedding's input dimension and number of output classes.
    seq_len : int
        Number of word indices in each input sequence.

    Returns
    -------
    The compiled Sequential model. As a side effect its summary is printed.
    """
    layer_stack = [
        Embedding(vocabulary_size, 25, input_length=seq_len),
        LSTM(150, return_sequences=True),
        LSTM(150),
        Dense(150, activation='relu'),
        Dense(vocabulary_size, activation='softmax'),
    ]

    model = Sequential()
    for layer in layer_stack:
        model.add(layer)

    # Targets are one-hot vectors, hence categorical cross-entropy.
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )

    model.summary()

    return model

### Train / Test Split

In [0]:
from keras.utils import to_categorical

In [0]:
# Full matrix again: each row is one window of word indices.
sequences

In [83]:
# Every column except the last: the input words of each window
sequences[:,:-1]    

array([[ 956,   14,  263, ...,    6, 2712,   14],
       [  14,  263,   51, ..., 2712,   14,   24],
       [ 263,   51,  261, ...,   14,   24,  957],
       ...,
       [ 952,   12,  166, ...,   11,  262,   53],
       [  12,  166, 2711, ...,  262,   53,    2],
       [ 166, 2711,    3, ...,   53,    2, 2717]])

In [84]:
# Last column: the word that follows each input window (the target)
sequences[:,-1]

array([  24,  957,    5, ...,    2, 2717,   26])

In [0]:
# Model inputs: all columns but the last.
X = sequences[:,:-1]

In [0]:
# Targets: the last column, i.e. the index of the word to predict.
y = sequences[:,-1]

In [0]:
# One-hot encode the targets. The word indices seen above go up to
# vocabulary_size itself (index 0 is presumably reserved by the Keras
# Tokenizer), so vocabulary_size+1 classes are needed.
y = to_categorical(y, num_classes=vocabulary_size+1)

In [0]:
# Length of each input sequence = number of columns in X.
seq_len = X.shape[1]

In [61]:
# Show the input sequence length.
seq_len

25

### Training the Model

In [62]:
# Define the model. vocabulary_size+1 matches the number of classes used
# when one-hot encoding y, so the softmax output lines up with the targets.
model = create_model(vocabulary_size+1, seq_len)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 25, 25)            67950     
_________________________________________________________________
lstm_3 (LSTM)                (None, 25, 150)           105600    
_________________________________________________________________
lstm_4 (LSTM)                (None, 150)               180600    
_________________________________________________________________
dense_3 (Dense)              (None, 150)               22650     
_________________________________________________________________
dense_4 (Dense)              (None, 2718)              410418    
Total params: 787,218
Trainable params: 787,218
Non-trainable params: 0
_________________________________________________________________


---

----

In [0]:
from pickle import dump,load

In [0]:
# Fit the model: 100 passes over the data in mini-batches of 128 windows.
# verbose=2 selects Keras' compact logging (one line per epoch per the docs).
model.fit(X, y, batch_size=128, epochs=100,verbose=2)

**Save the model and tokenizer trained for 100 epochs**

In [0]:
# Save the trained model to file
model.save('epoch100.h5')
# Save the tokenizer. The context manager guarantees the pickle file is
# flushed and closed, even if dump() raises part-way through.
with open('epoch100', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

# Generating New Text

In [0]:
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

In [0]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    '''
    Generate new text one word at a time, feeding each predicted word back
    into the input for the next prediction.

    INPUTS:
    model : model that was trained on text data
    tokenizer : tokenizer that was fit on text data
    seq_len : length of training sequence
    seed_text : raw string text to serve as the seed
    num_gen_words : number of words to be generated by model

    RETURNS:
    str : the generated words joined by single spaces (the seed text itself
          is not included)
    '''
    
    # Final Output
    output_text = []
    
    # Initial Seed Sequence
    input_text = seed_text
    
    # Create num_gen_words
    for _ in range(num_gen_words):
        
        # Take the input text string and encode it to a sequence
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        
        # Pad/trim to the length the model was trained on; truncating='pre'
        # drops the oldest words so only the most recent seq_len remain
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        
        # Predict class probabilities and pick the most likely word index.
        # Sequential.predict_classes() no longer exists in newer Keras
        # releases; argmax over predict() is the equivalent for a softmax
        # output and works on old and new versions alike.
        pred_probs = model.predict(pad_encoded, verbose=0)
        pred_word_ind = int(pred_probs.argmax(axis=-1)[0])
        
        # Grab word
        pred_word = tokenizer.index_word[pred_word_ind] 
        
        # Update the sequence of input text (shifting one over with the new word)
        input_text += ' ' + pred_word
        
        output_text.append(pred_word)
        
    # Make it look like a sentence.
    return ' '.join(output_text)

### Grab a random seed sequence

In [0]:
# Reminder of what one token window looks like.
text_sequences[0]

In [0]:
import random
# Fixed seed so the same seed window is picked on every run.
random.seed(101)
# randint() includes BOTH endpoints, so the upper bound has to be the last
# valid index; len(text_sequences) itself would raise IndexError when used
# to index the list.
random_pick = random.randint(0, len(text_sequences) - 1)

In [0]:
# Token window selected by the random index.
random_seed_text = text_sequences[random_pick]

In [0]:
# Show the selected tokens.
random_seed_text

In [0]:
# generate_text expects the seed as one raw string, so join the tokens
# with single spaces.
seed_text = ' '.join(random_seed_text)

In [0]:
# Show the seed string.
seed_text

In [76]:
# Generate 50 new words that continue the seed text.
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=50)

"myself the enormous act of impaling himself in a snow hill of sporting the gloom of the ungraspable phantom of life and sun the lamp of a back and i gave myself with the door 's be order there handfuls of genteel comedies and solo forbidden is and ten stop"

### Exploring Generated Sequence

In [0]:
# Reload the raw, untokenized text so words can be located in the original.
full_text = read_file('moby_dick_four_chapters.txt')

In [75]:
# Print the surrounding context (20 words before, 20 from the match onward)
# for every whitespace-separated word that is exactly 'inkling'.
# Split once up front instead of re-splitting the whole text on each match.
words = full_text.split()
for i, word in enumerate(words):
    if word == 'inkling':
        # Clamp the start at 0: a negative start would be counted from the
        # END of the list and give a wrong (usually empty) context for
        # matches within the first 20 words.
        print(' '.join(words[max(0, i-20):i+20]))
        print('\n')

were stains of some sort or other. At first I knew not what to make of this; but soon an inkling of the truth occurred to me. I remembered a story of a white man--a whaleman too--who, falling among the


