## Text Generation Using an RNN

In [None]:
def read_file(filepath, encoding='utf-8'):
  """Read a whole text file and return its contents as a single string.

  Args:
    filepath: Path of the file to read.
    encoding: Text encoding used to decode the file. Defaults to UTF-8 so the
      result does not depend on the platform's locale encoding.

  Returns:
    The full contents of the file as a ``str``.

  Raises:
    OSError: If the file cannot be opened.
    UnicodeDecodeError: If the file's bytes are not valid in ``encoding``.
  """
  with open(filepath, encoding=encoding) as f:
    return f.read()

In [None]:
import spacy
# Load spaCy's small English pipeline with the 'parser' and 'ner' components
# disabled.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
# Set the maximum text length (in characters) that the pipeline will accept.
# NOTE(review): presumably sized to fit the full input text — confirm.
nlp.max_length = 1198623

In [None]:
def separate_punc(doc_text):
  """Tokenize ``doc_text`` with spaCy and return lower-cased, non-punctuation tokens.

  A token is dropped when its text is found inside the punctuation/whitespace
  string below. Because the right-hand side is a string, ``in`` is a substring
  test, so multi-character runs that occur in it (for example ``'--'``) are
  dropped as well as single characters.

  Args:
    doc_text: Raw text to tokenize.

  Returns:
    A list of lower-cased token strings, in document order.
  """
  unwanted = '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n '
  kept = []
  for token in nlp(doc_text):
    if token.text in unwanted:
      continue
    kept.append(token.text.lower())
  return kept

In [None]:
# Read the corpus file (Moby Dick, per the filename) into a single string.
# NOTE(review): hard-coded absolute path under /content/drive; presumably a
# mounted Google Drive in Colab — confirm before running elsewhere.
d = read_file('/content/drive/MyDrive/Final Projects/NLP/Data/melville-moby_dick.txt')

In [None]:
# Tokenize the raw text into lower-cased tokens, dropping punctuation and
# whitespace tokens.
tokens = separate_punc(d)

In [None]:
# Display the resulting token list for inspection.
tokens

['chapter',
 '1',
 'loomings',
 'call',
 'me',
 'ishmael',
 'some',
 'years',
 'ago',
 'never',
 'mind',
 'how',
 'long',
 'precisely',
 'having',
 'little',
 'or',
 'no',
 'money',
 'in',
 'my',
 'purse',
 'and',
 'nothing',
 'particular',
 'to',
 'interest',
 'me',
 'on',
 'shore',
 'i',
 'thought',
 'i',
 'would',
 'sail',
 'about',
 'a',
 'little',
 'and',
 'see',
 'the',
 'watery',
 'part',
 'of',
 'the',
 'world',
 'it',
 'is',
 'a',
 'way',
 'i',
 'have',
 'of',
 'driving',
 'off',
 'the',
 'spleen',
 'and',
 'regulating',
 'the',
 'circulation',
 'whenever',
 'i',
 'find',
 'myself',
 'growing',
 'grim',
 'about',
 'the',
 'mouth',
 'whenever',
 'it',
 'is',
 'a',
 'damp',
 'drizzly',
 'november',
 'in',
 'my',
 'soul',
 'whenever',
 'i',
 'find',
 'myself',
 'involuntarily',
 'pausing',
 'before',
 'coffin',
 'warehouses',
 'and',
 'bringing',
 'up',
 'the',
 'rear',
 'of',
 'every',
 'funeral',
 'i',
 'meet',
 'and',
 'especially',
 'whenever',
 'my',
 'hypos',
 'get',
 'such

In [None]:
## Create Sequence of Tokens
# Each training window holds 25 context tokens followed by 1 target token.
train_len = 25 + 1
# Slide a window of train_len tokens over the corpus, one token at a time.
# `end` is the exclusive end of each slice, so it must run up to and including
# len(tokens); stopping at len(tokens) - 1 would silently drop the final
# window (the one whose target is the last token of the corpus).
text_sequence = [
  tokens[end - train_len:end]
  for end in range(train_len, len(tokens) + 1)
]
text_sequence[0]

['chapter',
 '1',
 'loomings',
 'call',
 'me',
 'ishmael',
 'some',
 'years',
 'ago',
 'never',
 'mind',
 'how',
 'long',
 'precisely',
 'having',
 'little',
 'or',
 'no',
 'money',
 'in',
 'my',
 'purse',
 'and',
 'nothing',
 'particular',
 'to']

In [None]:
# Preview the first token window as a single space-joined string.
' '.join(text_sequence[0])

'chapter 1 loomings call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to'

# Keras Tokenizer

In [None]:
from keras.preprocessing.text import Tokenizer
# Fit a Keras Tokenizer on the token windows to build its word vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequence)
# Replace every token in every window with its integer word index.
sequence = tokenizer.texts_to_sequences(text_sequence)

In [None]:
# Integer-encoded form of the first token window.
sequence[0]

[158,
 9443,
 17526,
 402,
 42,
 1043,
 43,
 247,
 659,
 140,
 296,
 116,
 82,
 787,
 347,
 113,
 36,
 50,
 1788,
 6,
 49,
 3028,
 3,
 218,
 442,
 5]

In [None]:
# Per-word counts gathered by the tokenizer. It was fit on the overlapping
# windows, so a word is counted once for every window it appears in.
tokenizer.word_counts

OrderedDict([('chapter', 4447),
             ('1', 28),
             ('loomings', 3),
             ('call', 1382),
             ('me', 16095),
             ('ishmael', 500),
             ('some', 15789),
             ('years', 2400),
             ('ago', 815),
             ('never', 5262),
             ('mind', 2039),
             ('how', 6330),
             ('long', 8567),
             ('precisely', 690),
             ('having', 1679),
             ('little', 6412),
             ('or', 17879),
             ('no', 14916),
             ('money', 305),
             ('in', 105799),
             ('my', 15231),
             ('purse', 178),
             ('and', 164029),
             ('nothing', 2936),
             ('particular', 1273),
             ('to', 117832),
             ('interest', 442),
             ('on', 26910),
             ('shore', 572),
             ('i', 53430),
             ('thought', 3874),
             ('would', 11232),
             ('sail', 2522),
             ('about', 

In [None]:
# The Keras Tokenizer numbers words starting at 1 (index 0 is reserved and is
# never assigned to a word), so the largest word index equals the number of
# distinct words. Anything sized by this value (one-hot width, embedding rows,
# softmax units) needs one extra slot to be able to address that largest
# index, hence the + 1.
vocabulary_size = len(tokenizer.word_counts) + 1

In [None]:
# Convert the list of equal-length index windows into a NumPy array so it can
# be sliced column-wise (one row per window).
import numpy as np
sequence = np.array(sequence)


In [None]:
# Inputs: every index of each window except the last one.
X = sequence[:,:-1]
# Targets: the last index of each window (the word to be predicted).
y = sequence[:,-1]

In [None]:
from keras.utils import to_categorical
# One-hot encode the target indices into `vocabulary_size` columns.
# NOTE(review): num_classes must be greater than the largest index in y;
# confirm vocabulary_size leaves room for the tokenizer's highest word index.
y = to_categorical(y, num_classes=vocabulary_size)
y.shape

(214682, 17526)

# Keras Model

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Dense, Embedding, LSTM
def build_model(vocabulay_size, seq_len):
  """Assemble, compile and summarise the stacked-LSTM next-word classifier.

  Architecture: Embedding(25-d) -> LSTM(150) -> Dropout(0.2) -> LSTM(150)
  -> Dropout(0.2) -> LSTM(150) -> Dense(softmax).

  NOTE(review): the ``vocabulay_size`` argument is never read. The embedding
  input size and the softmax width both come from the module-level
  ``vocabulary_size``, so the value passed by the caller has no effect.

  Args:
    vocabulay_size: Currently unused (see the note above).
    seq_len: Number of word indices in each input sequence.

  Returns:
    The compiled Keras ``Sequential`` model (its summary is printed as a
    side effect).
  """
  model = Sequential()
  layer_stack = [
      Embedding(vocabulary_size, 25, input_length=seq_len),
      LSTM(150, return_sequences=True),
      Dropout(.2),
      LSTM(150, return_sequences=True),
      Dropout(.2),
      LSTM(150),
      Dense(vocabulary_size, activation='softmax'),
  ]
  for layer in layer_stack:
    model.add(layer)
  model.compile(
      loss='categorical_crossentropy',
      optimizer='adam',
      metrics=['accuracy'],
  )
  model.summary()
  return model

In [None]:
# Length of each model input: the number of columns in X.
seq_len = X.shape[1]

In [None]:
# Build and compile the network; build_model also prints the layer summary.
model = build_model(vocabulary_size+1, seq_len)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 25, 25)            438150    
                                                                 
 lstm (LSTM)                 (None, 25, 150)           105600    
                                                                 
 dropout (Dropout)           (None, 25, 150)           0         
                                                                 
 lstm_1 (LSTM)               (None, 25, 150)           180600    
                                                                 
 dropout_1 (Dropout)         (None, 25, 150)           0         
                                                                 
 lstm_2 (LSTM)               (None, 150)               180600    
                                                                 
 dense (Dense)               (None, 17526)             2

In [None]:
from pickle import dump,load
# Train on the (X, y) pairs for 2 epochs in mini-batches of 32, with verbose=1.
model.fit(X,y, batch_size=32, epochs=2, verbose=1)

In [None]:
from keras.utils import pad_sequences
from random import randint
from keras.models import load_model
from pickle import load

In [None]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    """Greedily extend ``seed_text`` with ``num_gen_words`` predicted words.

    On every step the running text is re-encoded with ``tokenizer``,
    padded/truncated at the front to ``seq_len`` indices, and fed to
    ``model``; the highest-scoring index is looked up in
    ``tokenizer.index_word`` and appended to the running text.

    Args:
        model: Model whose ``predict`` returns one score vector per input row.
        tokenizer: Tokenizer providing ``texts_to_sequences`` and ``index_word``.
        seq_len: Number of word indices the model expects per input.
        seed_text: Raw string used as the initial context.
        num_gen_words: How many words to generate.

    Returns:
        The generated words joined by single spaces (the seed is not included).
    """
    generated_words = []

    # Running context: the initial seed plus every word generated so far.
    context = seed_text

    for _ in range(num_gen_words):
        # Encode the current context string as a list of word indices.
        token_ids = tokenizer.texts_to_sequences([context])[0]

        # Keep only the most recent seq_len indices (left-padding if shorter).
        model_input = pad_sequences([token_ids], maxlen=seq_len, truncating='pre')

        # Pick the highest-scoring index for the single input row.
        scores = model.predict(model_input, verbose=0)
        next_id = np.argmax(scores, axis=-1)[0]

        # Map the index back to its word.
        next_word = tokenizer.index_word[next_id]

        generated_words.append(next_word)

        # Grow the context so the next step sees the word just produced.
        context += ' ' + next_word

    return ' '.join(generated_words)

In [None]:
# Load a previously saved model from disk. This rebinds `model`, so every
# later cell uses the loaded network rather than the one fitted earlier.
# NOTE(review): hard-coded absolute Drive path — confirm it exists in the target environment.
model = load_model('/content/drive/MyDrive/Final Projects/NLP/epochBIG.h5')

In [None]:
# Inspect the token window at index 5.
text_sequence[5]

['ishmael',
 'some',
 'years',
 'ago',
 'never',
 'mind',
 'how',
 'long',
 'precisely',
 'having',
 'little',
 'or',
 'no',
 'money',
 'in',
 'my',
 'purse',
 'and',
 'nothing',
 'particular',
 'to',
 'interest',
 'me',
 'on',
 'shore',
 'i']

In [None]:
import random
# Fix the RNG state so the same window is picked on every run.
random.seed(101)
# random.randint includes BOTH endpoints, so the upper bound must be the last
# valid index; len(text_sequence) itself would raise IndexError when used to
# index text_sequence.
random_pick = random.randint(0,len(text_sequence) - 1)

In [None]:
# Token window selected by the random index; used as the generation seed.
random_seed_text = text_sequence[random_pick]

In [None]:
# Join the seed tokens into one space-separated string, the form generate_text expects.
seed_text = ' '.join(random_seed_text)

In [None]:
# Generate 50 words that continue the seed text, using the loaded model.
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=50)

'more for some comes ahasuerus is temporary that blessed what myself was returned to one and microscopic and dropped wrapall at total in falling as he was whereas with a seasons thus the below flames has made no prove gods from these vast that my solemn been four over and'

In [None]:
# Repeat a single prediction step by hand to look at the raw model output:
# encode the seed text, pad/truncate it at the front to seq_len indices, and
# display the model's output vector for that one input row.
encoded_text = tokenizer.texts_to_sequences([seed_text])[0]
pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
model.predict(pad_encoded, verbose=0)[0]

array([1.5922462e-23, 8.8436257e-13, 9.0134567e-25, ..., 0.0000000e+00,
       1.7013212e-34, 1.5775789e-23], dtype=float32)

In [None]:
# Read a second text file (four chapters of Moby Dick, per the filename) into a string.
# NOTE(review): hard-coded absolute Drive path — confirm it exists in the target environment.
full_text = read_file('/content/drive/MyDrive/Final Projects/NLP/Data/moby_dick_four_chapters.txt')