In [1]:
# imports
import numpy as np

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import Input
from tensorflow.keras import Model
from tensorflow.keras.preprocessing.text import Tokenizer

import string

# changeable params
my_file = "TheGreatGatsby.txt"
seq_len = 100

# load doc into memory
def load_doc(filename):
 # open the file as read only
 file = open(filename, 'r')
 # read all text
 text = file.read()
 # close the file
 file.close()
 return text

# turn a doc into clean tokens
def clean_doc(doc):
 # replace '--' with a space ' '
 doc = doc.replace('--', ' ')
 # split into tokens by white space
 tokens = doc.split()
 # remove punctuation from each token
 table = str.maketrans('', '', string.punctuation)
 tokens = [w.translate(table) for w in tokens]
 # remove remaining tokens that are not alphabetic
 tokens = [word for word in tokens if word.isalpha()]
 # make lower case
 tokens = [word.lower() for word in tokens]
 return tokens

# save tokens to file, one dialog per line
def save_doc(lines, filename):
 data = '\n'.join(lines)
 file = open(filename, 'w')
 file.write(data)
 file.close()

# load document
doc = load_doc(my_file)
print(doc[:200])

# clean document
tokens = clean_doc(doc)
print(tokens[:200])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))

# organize into sequences of tokens
length = seq_len + 1
sequences = list()
for i in range(length, len(tokens)):
 # select sequence of tokens
 seq = tokens[i-length:i]
 # convert into a line
 line = ' '.join(seq)
 # store
 sequences.append(line)
print('Total Sequences: %d' % len(sequences))

# save sequences to file
out_filename = my_file[:-4] + '_seq.txt'
save_doc(sequences, out_filename)

The Project Gutenberg eBook of The Great Gatsby
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
what
['the', 'project', 'gutenberg', 'ebook', 'of', 'the', 'great', 'gatsby', 'this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'in', 'the', 'united', 'states', 'and', 'most', 'other', 'parts', 'of', 'the', 'world', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no', 'restrictions', 'whatsoever', 'you', 'may', 'copy', 'it', 'give', 'it', 'away', 'or', 'reuse', 'it', 'under', 'the', 'terms', 'of', 'the', 'project', 'gutenberg', 'license', 'included', 'with', 'this', 'ebook', 'or', 'online', 'at', 'wwwgutenbergorg', 'if', 'you', 'are', 'not', 'located', 'in', 'the', 'united', 'states', 'you', 'will', 'have', 'to', 'check', 'the', 'laws', 'of', 'the', 'country', 'where', 'you', 'are', 'located', 'before', 'using', 'this', 'ebook', 'title', 'the', 'great', 'gatsby', 'author', 

In [2]:
# load doc into memory
def load_doc(filename):
 # open the file as read only
 file = open(filename, 'r')
 # read all text
 text = file.read()
 # close the file
 file.close()
 return text

# load
doc = load_doc(out_filename)
lines = doc.split('\n')

# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)

In [3]:
# vocabulary size
vocab_size = len(tokenizer.word_index) + 1


# separate into input and output
sequences = np.array(sequences)
sequences.shape
X, y = sequences[:,:-1], sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]

p_train = 0.8

n_train = int(X.shape[0]//(1/p_train))
X_train = X[0:n_train]
y_train = y[0:n_train]
X_test = X[n_train:]
y_test = y[n_train:]



In [4]:
# define model
model = Sequential()
model.add(Embedding(vocab_size, 25, input_length=seq_length))
model.add(LSTM(150, return_sequences=True))
model.add(LSTM(150))
model.add(Dense(150, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit model
model.fit(X_train, y_train, batch_size=128, epochs=200)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 25)           149925    
                                                                 
 lstm (LSTM)                 (None, 100, 150)          105600    
                                                                 
 lstm_1 (LSTM)               (None, 150)               180600    
                                                                 
 dense (Dense)               (None, 150)               22650     
                                                                 
 dense_1 (Dense)             (None, 5997)              905547    
                                                                 
Total params: 1364322 (5.20 MB)
Trainable params: 1364322 (5.20 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/200
E

<keras.src.callbacks.History at 0x7a4f220aeec0>

In [16]:
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
  result = list()
  in_text = seed_text
# generate a fixed number of words
  for _ in range(n_words):
# encode the text as integer
    encoded = tokenizer.texts_to_sequences([in_text])[0]
# truncate sequences to a fixed length
    encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
# predict probabilities for each word
    yhat = np.argmax(model.predict(encoded, verbose=0), axis=-1)
    print(yhat)
# map predicted word index to word
    out_word = ''
    for word, index in tokenizer.word_index.items():
      if index == yhat:
        out_word = word
        break
# append to input
    in_text += ' ' + out_word
    result.append(out_word)
  return ' '.join(result)

# load cleaned text sequences
in_filename = 'TheGreatGatsby_seq.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')
seq_length = len(lines[0].split()) - 1


# select a seed text
#seed_text = lines[randint(0,len(lines))]
#print(seed_text + '\n')

# generate new text
#generated = generate_seq(model, tokenizer, seq_length, seed_text, 50)
#print(generated)


# Generate and assess 10 sequences
for _ in range(10):
    seed_text = lines[randint(0, len(lines))]
    generated = generate_seq(model, tokenizer, seq_length, seed_text, 50)
    #print(f"Seed Text: {seed_text}\nGenerated Sequence: {generated}\n")
    print(f"\nGenerated Sequence: {generated}\n")
    #print("Basic Assessment: [Your assessment here]\n")



[38]
[141]
[45]
[2527]
[4730]
[294]
[20]
[31]
[4564]
[1337]
[4565]
[208]
[7]
[3]
[201]
[8]
[17]
[784]
[3]
[418]
[287]
[23]
[2]
[8]
[9]
[809]
[179]
[16]
[227]
[7]
[3]
[3678]
[7]
[1]
[3679]
[1]
[65]
[74]
[15]
[21]
[2]
[473]
[57]
[415]
[15]
[47]
[114]
[5]
[413]
[5]

Generated Sequence: there must have lasted indefinitely except for an overwound clock recovering himself in a minute he had often a policeman married him and he was leaning against her world in a lull in the entertainment the man looked at me and smiled back glanced at daisy want to speak to

[948]
[214]
[9]
[1060]
[12]
[177]
[181]
[186]
[2085]
[7]
[1]
[1313]
[2788]
[4]
[1961]
[1]
[1700]
[6]
[129]
[18]
[28]
[143]
[24]
[3]
[545]
[2496]
[124]
[2319]
[34]
[55]
[57]
[22]
[8]
[74]
[15]
[11]
[592]
[751]
[30]
[2]
[16]
[69]
[343]
[168]
[39]
[73]
[1480]
[2467]
[5]
[2581]

Generated Sequence: damned every was sorry it three baker sat discreetly in the tragic griefs of speech the wedding i began on all girl from a trembling match where p