## Read Data

In [9]:
def read_file (filePath):
    with open(filePath) as f:
        str_text = f.read()
    return str_text

## Prepocessing 

In [4]:
import spacy
nlp = spacy.load('en_core_web_sm', disable = ['ner', 'parser', 'tagger'])
nlp.max_length = 1198623

In [86]:
def remove_punc(doc_text):
    return [token.text.lower() for token in nlp(doc_text) if token.text not in ' --!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n']

In [87]:
d = read_file('moby_dick_four_chapters.txt')
tokens = remove_punc(d)

In [88]:
tokens[0]

'call'

In [89]:
train_len = 25 + 1

In [125]:
def get_text_sequences(tokens, train_len):
    text_sequences = []
    for i in range(train_len, len(tokens)):
        seq = tokens[i - train_len : i]
        text_sequences.append(seq)
    return text_sequences

In [126]:
text_sequences = get_text_sequences(tokens, train_len)
[" ".join(s) for s in text_sequences[0:5]]

['call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on',
 'me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on shore',
 'ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on shore i',
 'some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on shore i thought',
 'years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on shore i thought i']

## Tokenization

In [93]:
from keras.preprocessing.text import Tokenizer

In [171]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)

sequences = tokenizer.texts_to_sequences(text_sequences)

In [207]:
# sequences[0]

In [208]:
# tokenizer.index_word[1]

In [166]:
vocabulary_size = len(tokenizer.word_counts)

In [173]:
import numpy as np
sequences = np.array(sequences)

## Prepare traing data

In [209]:
# sequences

In [180]:
X = sequences[:,:-1]
Y = sequences[:,-1]

In [183]:
from keras.utils import to_categorical
Y = to_categorical(Y, num_classes=vocabulary_size + 1) # extra on for category 0

In [187]:
vocabulary_size

2719

In [188]:
sequence_len = X.shape[1]
sequence_len

25

## Prepare Keras Model

In [189]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding

In [224]:
def create_model(vocabulary_size, sequence_len):
    model = Sequential()
    model.add(Embedding(vocabulary_size, sequence_len, input_length=sequence_len))
    model.add(LSTM(sequence_len * 4, return_sequences=True))
    model.add(LSTM(sequence_len * 4))
    model.add(Dense(sequence_len, activation='relu'))
    model.add(Dense(vocabulary_size, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model
model =create_model(vocabulary_size + 1, sequence_len)

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 25, 25)            68000     
_________________________________________________________________
lstm_13 (LSTM)               (None, 25, 100)           50400     
_________________________________________________________________
lstm_14 (LSTM)               (None, 100)               80400     
_________________________________________________________________
dense_8 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_9 (Dense)              (None, 2720)              274720    
Total params: 483,620
Trainable params: 483,620
Non-trainable params: 0
_________________________________________________________________


## Train model

In [225]:
model.fit(X,Y,batch_size=128, epochs=50, verbose=1)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50

KeyboardInterrupt: 

## Generate Text

In [226]:
from keras.preprocessing.sequence import pad_sequences

In [227]:
def generate_text(model, tokenizer, seed_text, num_gen_words):
    output_text = []
    input_text = seed_text
    for i in range(num_gen_words):
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        pad_encoded = pad_sequences([encoded_text], maxlen=sequence_len, truncating='pre')
        
        pred_word_ind = model.predict_classes(pad_encoded, verbose=0)[0]
        pred_word = tokenizer.index_word[pred_word_ind]
        input_text += ' ' + pred_word
        output_text += ' ' + pred_word
    return ''.join(output_text)

In [231]:
generate_text(model, tokenizer, ' '.join(text_sequences[3]), 10)

' and the little night and the little deal and a'