
# Text Generation with Neural Networks

## Functions for Processing Text

### Reading in files as a string text

In [8]:
# Function For reading File

def read_file(filepath):
    
    with open(filepath) as f:
        str_text = f.read()
    
    return str_text

### Tokenize and Clean Text

In [10]:
# Read the File Content
d = read_file('melville-moby_dick.txt')

# import re
# d = re.sub('[0-9]+', ' ', d)

import nltk
# nltk.download('punkt')
from nltk.tokenize import word_tokenize
nltkTokens=word_tokenize(d)

In [11]:
nltkTokens

['CHAPTER',
 '1',
 'Loomings',
 '.',
 'Call',
 'me',
 'Ishmael',
 '.',
 'Some',
 'years',
 'ago',
 '--',
 'never',
 'mind',
 'how',
 'long',
 'precisely',
 '--',
 'having',
 'little',
 'or',
 'no',
 'money',
 'in',
 'my',
 'purse',
 ',',
 'and',
 'nothing',
 'particular',
 'to',
 'interest',
 'me',
 'on',
 'shore',
 ',',
 'I',
 'thought',
 'I',
 'would',
 'sail',
 'about',
 'a',
 'little',
 'and',
 'see',
 'the',
 'watery',
 'part',
 'of',
 'the',
 'world',
 '.',
 'It',
 'is',
 'a',
 'way',
 'I',
 'have',
 'of',
 'driving',
 'off',
 'the',
 'spleen',
 'and',
 'regulating',
 'the',
 'circulation',
 '.',
 'Whenever',
 'I',
 'find',
 'myself',
 'growing',
 'grim',
 'about',
 'the',
 'mouth',
 ';',
 'whenever',
 'it',
 'is',
 'a',
 'damp',
 ',',
 'drizzly',
 'November',
 'in',
 'my',
 'soul',
 ';',
 'whenever',
 'I',
 'find',
 'myself',
 'involuntarily',
 'pausing',
 'before',
 'coffin',
 'warehouses',
 ',',
 'and',
 'bringing',
 'up',
 'the',
 'rear',
 'of',
 'every',
 'funeral',
 'I',
 '

In [12]:
print(len(nltkTokens))
print("Type ",type(nltkTokens))
print(nltkTokens[0:30])

250374
Type  <class 'list'>
['CHAPTER', '1', 'Loomings', '.', 'Call', 'me', 'Ishmael', '.', 'Some', 'years', 'ago', '--', 'never', 'mind', 'how', 'long', 'precisely', '--', 'having', 'little', 'or', 'no', 'money', 'in', 'my', 'purse', ',', 'and', 'nothing', 'particular']


## Create Sequences of Tokens

In [14]:
# Funtions to remove Punctuations

def separate_punc(doc_text):
    return [token.lower() for token in doc_text if token not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']

nltkTokens=separate_punc(nltkTokens)

In [15]:
print("Type ",type(nltkTokens))
print(nltkTokens[0:30])
print(len(nltkTokens))

Type  <class 'list'>
['chapter', '1', 'loomings', 'call', 'me', 'ishmael', 'some', 'years', 'ago', 'never', 'mind', 'how', 'long', 'precisely', 'having', 'little', 'or', 'no', 'money', 'in', 'my', 'purse', 'and', 'nothing', 'particular', 'to', 'interest', 'me', 'on', 'shore']
215263


In [17]:
# Converting tokens into list of 25 words each
# organize into sequences of tokens

train_len = 25+1 # 25/50 training words , then one target word

# Empty list of sequences
# text_sequences = [[0.....25],[1....26],[2......27]]
text_sequences = []
#                 26        215263
for i in range(train_len, len(nltkTokens)):
   
    # Grab train_len# amount of characters
#                      28-26:28 = [2:28]    
    seq = nltkTokens[i-train_len:i]
    
    # Add to list of sequences
    text_sequences.append(seq)

In [18]:
print("Type ",type(text_sequences))
print("Total sequences created ",len(text_sequences))
print("Single Sequence Length ",len(text_sequences[0]))

Type  <class 'list'>
Total sequences created  215237
Single Sequence Length  26


In [22]:
# Just viewing the sequence as a sentence

' '.join(text_sequences[0])

'chapter 1 loomings call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to'

In [23]:
' '.join(text_sequences[1])

'1 loomings call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest'

In [24]:
' '.join(text_sequences[2])

'loomings call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me'

# Keras

### Keras Tokenization

In [26]:
from keras.preprocessing.text import Tokenizer

In [27]:
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
sequences = tokenizer.texts_to_sequences(text_sequences)

In [28]:
text_sequences[0]

['chapter',
 '1',
 'loomings',
 'call',
 'me',
 'ishmael',
 'some',
 'years',
 'ago',
 'never',
 'mind',
 'how',
 'long',
 'precisely',
 'having',
 'little',
 'or',
 'no',
 'money',
 'in',
 'my',
 'purse',
 'and',
 'nothing',
 'particular',
 'to']

In [29]:
sequences[0]

[157,
 9597,
 18405,
 385,
 41,
 1008,
 44,
 257,
 586,
 139,
 284,
 113,
 87,
 751,
 339,
 116,
 38,
 51,
 1881,
 6,
 50,
 2959,
 3,
 211,
 420,
 5]

In [30]:
tokenizer.index_word

{1: 'the',
 2: 'of',
 3: 'and',
 4: 'a',
 5: 'to',
 6: 'in',
 7: 'that',
 8: 'his',
 9: 'it',
 10: 'i',
 11: 'he',
 12: 'but',
 13: 'as',
 14: 'is',
 15: 'with',
 16: 'was',
 17: 'for',
 18: "'s",
 19: "''",
 20: 'all',
 21: 'this',
 22: '``',
 23: 'at',
 24: 'not',
 25: 'by',
 26: 'from',
 27: 'him',
 28: 'so',
 29: 'on',
 30: 'be',
 31: 'whale',
 32: 'one',
 33: 'you',
 34: 'there',
 35: 'now',
 36: 'had',
 37: 'have',
 38: 'or',
 39: 'were',
 40: 'they',
 41: 'me',
 42: 'then',
 43: 'which',
 44: 'some',
 45: 'their',
 46: 'when',
 47: 'what',
 48: 'an',
 49: 'are',
 50: 'my',
 51: 'no',
 52: 'like',
 53: 'upon',
 54: 'into',
 55: 'out',
 56: 'ahab',
 57: 'more',
 58: 'up',
 59: 'man',
 60: 'if',
 61: 'them',
 62: 'ship',
 63: 'we',
 64: 'old',
 65: 'ye',
 66: 'would',
 67: 'do',
 68: 'been',
 69: 'other',
 70: 'over',
 71: 'these',
 72: 'though',
 73: 'will',
 74: 'only',
 75: 'its',
 76: 'such',
 77: 'sea',
 78: 'down',
 79: 'who',
 80: 'yet',
 81: 'her',
 82: 'time',
 83: 'any',


In [31]:
for i in sequences[0]:
    print(f'{i} : {tokenizer.index_word[i]}')

157 : chapter
9597 : 1
18405 : loomings
385 : call
41 : me
1008 : ishmael
44 : some
257 : years
586 : ago
139 : never
284 : mind
113 : how
87 : long
751 : precisely
339 : having
116 : little
38 : or
51 : no
1881 : money
6 : in
50 : my
2959 : purse
3 : and
211 : nothing
420 : particular
5 : to


In [32]:
tokenizer.word_counts

OrderedDict([('chapter', 4447),
             ('1', 28),
             ('loomings', 3),
             ('call', 1382),
             ('me', 16276),
             ('ishmael', 500),
             ('some', 15789),
             ('years', 2270),
             ('ago', 893),
             ('never', 5262),
             ('mind', 2065),
             ('how', 6486),
             ('long', 8203),
             ('precisely', 690),
             ('having', 1679),
             ('little', 6412),
             ('or', 17905),
             ('no', 15020),
             ('money', 279),
             ('in', 105747),
             ('my', 15231),
             ('purse', 178),
             ('and', 164028),
             ('nothing', 2962),
             ('particular', 1273),
             ('to', 117286),
             ('interest', 442),
             ('on', 26988),
             ('shore', 520),
             ('i', 54184),
             ('thought', 3848),
             ('would', 11154),
             ('sail', 2132),
             ('about', 

In [33]:
# Total unique words
vocabulary_size = len(tokenizer.word_counts)

In [34]:
vocabulary_size

18405

### Convert to Numpy Matrix

In [35]:
import numpy as np

In [38]:
sequences = np.array(sequences)

In [39]:
text_sequences

[['chapter',
  '1',
  'loomings',
  'call',
  'me',
  'ishmael',
  'some',
  'years',
  'ago',
  'never',
  'mind',
  'how',
  'long',
  'precisely',
  'having',
  'little',
  'or',
  'no',
  'money',
  'in',
  'my',
  'purse',
  'and',
  'nothing',
  'particular',
  'to'],
 ['1',
  'loomings',
  'call',
  'me',
  'ishmael',
  'some',
  'years',
  'ago',
  'never',
  'mind',
  'how',
  'long',
  'precisely',
  'having',
  'little',
  'or',
  'no',
  'money',
  'in',
  'my',
  'purse',
  'and',
  'nothing',
  'particular',
  'to',
  'interest'],
 ['loomings',
  'call',
  'me',
  'ishmael',
  'some',
  'years',
  'ago',
  'never',
  'mind',
  'how',
  'long',
  'precisely',
  'having',
  'little',
  'or',
  'no',
  'money',
  'in',
  'my',
  'purse',
  'and',
  'nothing',
  'particular',
  'to',
  'interest',
  'me'],
 ['call',
  'me',
  'ishmael',
  'some',
  'years',
  'ago',
  'never',
  'mind',
  'how',
  'long',
  'precisely',
  'having',
  'little',
  'or',
  'no',
  'money',
  'in

In [43]:
sequences

array([[  157,  9597, 18405, ...,   211,   420,     5],
       [ 9597, 18405,   385, ...,   420,     5,  1130],
       [18405,   385,    41, ...,     5,  1130,    41],
       ...,
       [    4,   274,   906, ...,  1270,  1370,    74],
       [  274,   906,   344, ...,  1370,    74,   215],
       [  906,   344,  1369, ...,    74,   215,   212]])

# Creating an LSTM based model

In [44]:
import keras
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding

In [45]:
vocabulary_size

18405

In [46]:
def create_model(vocabulary_size, seq_len):
    model = Sequential()
    model.add(Embedding(vocabulary_size, 25, input_length=seq_len))
    model.add(LSTM(150, return_sequences=True))
    model.add(LSTM(150))
    model.add(Dense(150, activation='relu'))

    model.add(Dense(vocabulary_size, activation='softmax'))
    
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
   
    model.summary()
    
    return model

### Train / Test Split

In [47]:
from keras.utils import to_categorical

In [48]:
sequences

array([[  157,  9597, 18405, ...,   211,   420,     5],
       [ 9597, 18405,   385, ...,   420,     5,  1130],
       [18405,   385,    41, ...,     5,  1130,    41],
       ...,
       [    4,   274,   906, ...,  1270,  1370,    74],
       [  274,   906,   344, ...,  1370,    74,   215],
       [  906,   344,  1369, ...,    74,   215,   212]])

In [49]:
sequences[0]

array([  157,  9597, 18405,   385,    41,  1008,    44,   257,   586,
         139,   284,   113,    87,   751,   339,   116,    38,    51,
        1881,     6,    50,  2959,     3,   211,   420,     5])

In [50]:
sequences

array([[  157,  9597, 18405, ...,   211,   420,     5],
       [ 9597, 18405,   385, ...,   420,     5,  1130],
       [18405,   385,    41, ...,     5,  1130,    41],
       ...,
       [    4,   274,   906, ...,  1270,  1370,    74],
       [  274,   906,   344, ...,  1370,    74,   215],
       [  906,   344,  1369, ...,    74,   215,   212]])

In [51]:
sequences[:,:-1]

array([[  157,  9597, 18405, ...,     3,   211,   420],
       [ 9597, 18405,   385, ...,   211,   420,     5],
       [18405,   385,    41, ...,   420,     5,  1130],
       ...,
       [    4,   274,   906, ...,    81,  1270,  1370],
       [  274,   906,   344, ...,  1270,  1370,    74],
       [  906,   344,  1369, ...,  1370,    74,   215]])

In [52]:
# last Word
sequences[:,-1]

array([   5, 1130,   41, ...,   74,  215,  212])

In [53]:
X = sequences[:,:-1]  # 25 words

In [61]:
y = sequences[:,-1]   # just the 26th word

In [63]:
y = to_categorical(y, num_classes=vocabulary_size)

In [64]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [65]:
y.shape

(215237, 18405)

In [66]:
X.shape

(215237, 25)

In [68]:
seq_len = X.shape[1]

In [None]:
X.shape

In [69]:
seq_len

25

### Training the Model

In [70]:
# define model
model = create_model(vocabulary_size, seq_len)

Instructions for updating:
Colocations handled automatically by placer.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 25, 25)            460125    
_________________________________________________________________
lstm_1 (LSTM)                (None, 25, 150)           105600    
_________________________________________________________________
lstm_2 (LSTM)                (None, 150)               180600    
_________________________________________________________________
dense_1 (Dense)              (None, 150)               22650     
_________________________________________________________________
dense_2 (Dense)              (None, 18405)             2779155   
Total params: 3,548,130
Trainable params: 3,548,130
Non-trainable params: 0
_________________________________________________________________


---

----

In [71]:
# fit model
model.fit(X, y, batch_size=128, epochs=300,verbose=1)


Instructions for updating:
Use tf.cast instead.


Epoch 1/300

KeyboardInterrupt: 

# save the model to file
model.save('epochBIG.h5')
# save the tokenizer
dump(tokenizer, open('epochBIG', 'wb'))