
# Text Generation with Neural Networks

## Functions for Processing Text

### Reading a file in as a single string

In [None]:
# Function for reading a file

def read_file(filepath, encoding=None):
    """Return the entire contents of a text file as one string.

    Args:
        filepath: Path of the text file to read.
        encoding: Text encoding handed to ``open``. The default ``None``
            keeps the previous behaviour (the platform's default encoding);
            pass an explicit value such as ``"utf-8"`` or ``"latin-1"`` to
            get the same result on every machine.

    Returns:
        The decoded file contents.
    """
    with open(filepath, encoding=encoding) as f:
        return f.read()

### Tokenize and Clean Text

In [None]:
# Tokenize the raw book text with NLTK
import nltk
from nltk.tokenize import word_tokenize

# Load the whole book as one string
d = read_file('melville-moby_dick.txt')

# Optional pre-cleaning step (disabled): replace runs of digits with a space
# import re
# d = re.sub('[0-9]+', ' ', d)

# Break the text into a list of tokens
nltkTokens = word_tokenize(d)

In [None]:
nltkTokens

In [None]:
# Quick look at the tokenizer output: token count, container type, first 30 tokens
print(len(nltkTokens))
print("Type ",type(nltkTokens))
print(nltkTokens[0:30])

## Create Sequences of Tokens

In [None]:
# Function to remove punctuation tokens

# Characters treated as punctuation or whitespace when filtering tokens.
_PUNCT_CHARS = frozenset('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n')


def separate_punc(doc_text):
    """Lowercase the tokens in *doc_text* and drop punctuation-only tokens.

    A token is dropped when every one of its characters is punctuation or
    whitespace (this also covers the empty string). The check is made per
    character rather than as a substring lookup in one long literal, so
    multi-character tokens such as "``", "''", "..." or "--" are removed
    regardless of how the punctuation characters happen to be ordered.
    Tokens that contain at least one other character (e.g. "n't", "'s",
    "well-known") are kept.

    Args:
        doc_text: Iterable of string tokens.

    Returns:
        A list with the remaining tokens, lowercased, in their original order.
    """
    return [
        token.lower()
        for token in doc_text
        # issuperset iterates the token's characters one by one
        if not _PUNCT_CHARS.issuperset(token)
    ]

# Replace the raw token list with its filtered, lowercased version
nltkTokens=separate_punc(nltkTokens)

In [None]:
# Confirm the result is still a list and preview the first 30 cleaned tokens
print("Type ",type(nltkTokens))
print(nltkTokens[0:30])

In [None]:
# Organize the tokens into overlapping training sequences:
# each window holds 25 input words followed by the one word to predict.

train_len = 25+1 # 25 training words, then one target word

# One sequence per sliding-window position, each train_len tokens long.
# The "+ 1" on the upper bound includes the final window, so the last token
# of the text is also used as a target (the previous bound stopped one short).
# If there are fewer than train_len tokens the result is an empty list.
text_sequences = [
    nltkTokens[end - train_len:end]
    for end in range(train_len, len(nltkTokens) + 1)
]

In [None]:
# Sanity-check the windows: container type, how many were built, and the
# length of the first one (expected to equal train_len)
print("Type ",type(text_sequences))
print("Total sequences created ",len(text_sequences))
print("Single Sequence Length ",len(text_sequences[0]))

In [None]:
# Just viewing the sequence as a sentence

' '.join(text_sequences[0])

In [None]:
' '.join(text_sequences[1])

In [None]:
' '.join(text_sequences[2])

# Keras

### Keras Tokenization

In [None]:
from keras.preprocessing.text import Tokenizer

In [None]:
# integer encode sequences of words
tokenizer = Tokenizer()
# Build the word <-> integer vocabulary from every token in the sequences
tokenizer.fit_on_texts(text_sequences)
# Encode each word sequence as a list of integer ids, in the same order
sequences = tokenizer.texts_to_sequences(text_sequences)

In [None]:
text_sequences[0]

In [None]:
sequences[0]

In [None]:
tokenizer.index_word

In [None]:
# Show every id of the first encoded sequence next to the word it stands for
for word_id in sequences[0]:
    word = tokenizer.index_word[word_id]
    print(f'{word_id} : {word}')

In [None]:
tokenizer.word_counts

In [None]:
# Size of the model's index space: one slot per unique word, plus one,
# because the Keras Tokenizer numbers words from 1 and never assigns id 0.
# With the bare word count the highest word id would be out of range for
# to_categorical(..., num_classes=vocabulary_size) and for the Embedding layer.
vocabulary_size = len(tokenizer.word_index) + 1

In [None]:
vocabulary_size

### Convert to Numpy Matrix

In [None]:
import numpy as np

In [None]:
# Convert the list of id lists to a NumPy array so it can be sliced by column
# below (assumes every sequence has the same length, giving a 2-D array)
sequences = np.array(sequences)

In [None]:
text_sequences

In [None]:
sequences

# Creating an LSTM based model

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding

In [None]:
vocabulary_size

In [None]:
def create_model(vocabulary_size, seq_len, embedding_dim=25, lstm_units=150,
                 dense_units=150):
    """Build, compile and summarize the next-word prediction network.

    The network is an embedding layer, two stacked LSTM layers, a ReLU dense
    layer and a softmax layer with one output per vocabulary index.

    Args:
        vocabulary_size: Number of distinct integer ids the model handles.
            It is used both as the embedding's input dimension and as the
            width of the softmax output, so it must be larger than the
            highest word id fed to the model.
        seq_len: Number of word ids in each input sequence.
        embedding_dim: Size of each word embedding vector (default 25).
        lstm_units: Units in each of the two LSTM layers (default 150).
        dense_units: Units in the hidden dense layer (default 150).

    Returns:
        The compiled ``Sequential`` model. Its summary is printed as a side
        effect.
    """
    model = Sequential()
    model.add(Embedding(vocabulary_size, embedding_dim, input_length=seq_len))
    # The first LSTM returns the full sequence so the second LSTM can consume it
    model.add(LSTM(lstm_units, return_sequences=True))
    model.add(LSTM(lstm_units))
    model.add(Dense(dense_units, activation='relu'))

    # One probability per vocabulary index
    model.add(Dense(vocabulary_size, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.summary()

    return model

### Splitting Sequences into Features (X) and Labels (y)

In [None]:
from keras.utils import to_categorical

In [None]:
sequences

In [None]:
sequences[0]

In [None]:
sequences

In [None]:
sequences[:,:-1]

In [None]:
# last Word
sequences[:,-1]

In [None]:
# Model inputs: every column of each sequence except the last
X = sequences[:,:-1]  # 25 words

In [None]:
# Model targets: the last column of each sequence
y = sequences[:,-1]   # just the 26th word

In [None]:
# One-hot encode the target ids to match the softmax output and the
# categorical_crossentropy loss; every id in y must be < vocabulary_size
y = to_categorical(y, num_classes=vocabulary_size)

In [None]:
y.shape

In [None]:
# Number of input words per sample (the column count of X)
seq_len = X.shape[1]

In [None]:
X.shape

In [None]:
seq_len

### Training the Model

In [None]:
# define model (create_model also compiles it and prints its summary)
model = create_model(vocabulary_size, seq_len)

---

----

In [None]:
# `dump` is not imported in any of the cells above; import it before training
# so a missing name cannot surface only after all epochs have finished.
from pickle import dump

# fit model
model.fit(X, y, batch_size=128, epochs=300,verbose=1)


# save the model to file
model.save('epochBIG.h5')
# save the tokenizer; the context manager flushes and closes the file
with open('epochBIG', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)