In [1]:
# This code generates text after being fed words from the story of Moby Dick.

In [2]:
# reading in files as string
def read_file(filepath):
    
    with open(filepath) as f:
        str_text = f.read()
        
    return str_text

In [3]:
# tokenize and clean text
import spacy
nlp = spacy.load('en_core_web_lg',disable=['parser','tagger','ner'])

# changing default max length
nlp.max_length = 1198623

In [4]:
#removing punctuations
def separate_punc(doc_text):
    return [token.text.lower() for token in nlp(doc_text) if token.text not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']

In [5]:
d = read_file('melville-moby_dick.txt')
tokens = separate_punc(d)

In [6]:
# organize into sequences of tokens
train_len = 25+1 # 50 training words , then one target word

# Empty list of sequences
text_sequences = []

for i in range(train_len, len(tokens)):
    
    # Grab train_len# amount of characters
    seq = tokens[i-train_len:i]
    
    # Add to list of sequences
    text_sequences.append(seq)

In [7]:
# Keras tokenization
from keras.preprocessing.text import Tokenizer
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
sequences = tokenizer.texts_to_sequences(text_sequences)
vocabulary_size = len(tokenizer.word_counts)

# convert to Numpy matrix
import numpy as np
sequences = np.array(sequences)

In [8]:
# Creating a long short term memory based model
import keras
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding

def create_model(vocabulary_size, seq_len):
    model = Sequential()
    model.add(Embedding(vocabulary_size, 25, input_length=seq_len))
    model.add(LSTM(150, return_sequences=True))
    model.add(LSTM(150))
    model.add(Dense(150, activation='relu'))

    model.add(Dense(vocabulary_size, activation='softmax'))
    
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
   
    model.summary()
    
    return model

In [9]:
# splitting train/test
from keras.utils import to_categorical
X = sequences[:,:-1]
y = sequences[:,-1]
y = to_categorical(y, num_classes=vocabulary_size+1)
seq_len = X.shape[1]

In [10]:
# loading trained model
from keras.models import load_model
model = load_model('epochBIG.h5')

In [11]:
# Generating new text
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):

#     INPUTS:
#     model : model that was trained on text data
#     tokenizer : tokenizer that was fit on text data
#     seq_len : length of training sequence
#     seed_text : raw string text to serve as the seed
#     num_gen_words : number of words to be generated by model

    
    # Final Output
    output_text = []
    
    # Intial Seed Sequence
    input_text = seed_text
    
    # Create num_gen_words
    for i in range(num_gen_words):
        
        # Take the input text string and encode it to a sequence
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        
        # Pad sequences to the trained rate
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        
        # Predict Class Probabilities for each word
        pred_word_ind = model.predict_classes(pad_encoded, verbose=0)[0]
        
        # Grab word
        pred_word = tokenizer.index_word[pred_word_ind] 
        
        # Update the sequence of input text (shifting one over with the new word)
        input_text += ' ' + pred_word
        
        output_text.append(pred_word)
        
    # Make it look like a sentence.
    return ' '.join(output_text)

In [12]:
# text_sequences

In [13]:
# Grabbing a random seed sequence
import random
random.seed(101)
random_pick = random.randint(0,len(text_sequences))

random_seed_text = text_sequences[random_pick]
seed_text = ' '.join(random_seed_text)

In [14]:
# Comparing generated text to original text
print('First 25 words: \n',' '.join(text_sequences[random_pick]))
print('\n')
print('Original 50 words after: \n',' '.join(text_sequences[random_pick+26]),end=' ')
print(' '.join(text_sequences[random_pick+52]))

print('\nGenerated 50 words after: ')
print(generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=50))

First 25 words: 
 stranger that stubb vowed he recognised his cutting spade pole entangled in the lines that were knotted round the tail of one of these whales there


Original 50 words after: 
 's a pretty fellow now he banteringly laughed standing in the ship 's bows there 's a jackal for ye i well know that these crappoes of frenchmen are but poor devils in the fishery sometimes lowering their boats for breakers mistaking them for sperm whale spouts yes and sometimes sailing from

Generated 50 words after: 




my epitaphs to the secondly of this crew whiteness monster wo suspended and well earthly veteran his last cloaked in the sea thumb was the hull spring over to it now wore to it terrific only caused what larger as a try 's yellow in some skull the turned spring
