In [1]:
import tensorflow.keras

from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import InputLayer

from tensorflow.keras.utils import to_categorical

import numpy as np
import nltk
from nltk.tokenize import RegexpTokenizer
import matplotlib.pyplot as plt
import random

from tqdm.notebook import tqdm as log_progress

In [2]:
#load and clean data
raw = open("data.txt", "r")

lines = []
for line in raw:
    if str(line) != '\n':
        lines.append(str(line).lower())

#putting the entirety of the input data into one string
text = ""
for line in lines:
    text += line
tokenizer = RegexpTokenizer(r'\w+')
tokenized = tokenizer.tokenize(text)
#tokenized = nltk.word_tokenize(text)

#converting the text into numbers to be processed by the embedding layer of the model
words = [] #one of each word in tokenized will be in here
filtering = lambda x : not x in words #for finding if the word should be added to the words array
find = lambda x : words.index(x) if x in words else -1 #convert each word into a number. -1 means that the item isn't in the vocabulary

for word in tokenized:
    if filtering(word):
        words.append(word)

#x data is every single word in the data set, in order
#y data is every single word that comes after the corresponding x value
x = []
y = []
for i in range(len(tokenized)):
    x.append(find(tokenized[i]))
    if i+1 < len(tokenized):
        y.append(find(tokenized[i+1]))
    else:
        y.append(find("."))

x = np.asarray(x)
y = np.asarray(y)
y = to_categorical(y)

print(x.shape)
print(y.shape)

vocab_size = len(words)

(18314,)
(18314, 3203)


In [12]:
#create and compile model
model = Sequential()

model.add(InputLayer(batch_input_shape=(100,1)))

model.add(Embedding(vocab_size, 32, input_length=1)) #embedding: size of vocabulary, dimension of each value
model.add(LSTM(40, return_sequences=True, stateful=True, dropout=0.01))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.05))
model.add(Dense(y.shape[1], activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
print(model.summary())

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (100, 1, 32)              102496    
_________________________________________________________________
lstm_2 (LSTM)                (100, 1, 40)              11680     
_________________________________________________________________
dense_2 (Dense)              (100, 1, 32)              1312      
_________________________________________________________________
dropout_1 (Dropout)          (100, 1, 32)              0         
_________________________________________________________________
dense_3 (Dense)              (100, 1, 3203)            105699    
Total params: 221,187
Trainable params: 221,187
Non-trainable params: 0
_________________________________________________________________
None


In [13]:
#train model
hist = model.fit(x, y, epochs=200, verbose=1, batch_size=100)
model.save("shakespear2.0.h5")

plt.title("loss")
plt.legend(['loss', 'val_loss'])
plt.plot(hist.history['loss'], label='loss')
plt.plot(hist.history['val_loss'], label='val_loss')

Epoch 1/200


ValueError: in user code:

    C:\Users\lukec\miniconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:855 train_function  *
        return step_function(self, iterator)
    C:\Users\lukec\miniconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:845 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    C:\Users\lukec\miniconda3\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:1285 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\Users\lukec\miniconda3\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2833 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\Users\lukec\miniconda3\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:3608 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\Users\lukec\miniconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:838 run_step  **
        outputs = model.train_step(data)
    C:\Users\lukec\miniconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:796 train_step
        loss = self.compiled_loss(
    C:\Users\lukec\miniconda3\lib\site-packages\tensorflow\python\keras\engine\compile_utils.py:204 __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    C:\Users\lukec\miniconda3\lib\site-packages\tensorflow\python\keras\losses.py:155 __call__
        losses = call_fn(y_true, y_pred)
    C:\Users\lukec\miniconda3\lib\site-packages\tensorflow\python\keras\losses.py:259 call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    C:\Users\lukec\miniconda3\lib\site-packages\tensorflow\python\util\dispatch.py:206 wrapper
        return target(*args, **kwargs)
    C:\Users\lukec\miniconda3\lib\site-packages\tensorflow\python\keras\losses.py:1643 categorical_crossentropy
        return backend.categorical_crossentropy(
    C:\Users\lukec\miniconda3\lib\site-packages\tensorflow\python\util\dispatch.py:206 wrapper
        return target(*args, **kwargs)
    C:\Users\lukec\miniconda3\lib\site-packages\tensorflow\python\keras\backend.py:4862 categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)
    C:\Users\lukec\miniconda3\lib\site-packages\tensorflow\python\framework\tensor_shape.py:1161 assert_is_compatible_with
        raise ValueError("Shapes %s and %s are incompatible" % (self, other))

    ValueError: Shapes (None, 3203) and (100, 1, 3203) are incompatible


In [15]:
#test model
sample_length = 100
model = load_model("shakespear2.0.h5")

root = random.choice(words)
output = ""

for i in log_progress(range(sample_length)):
    num = find(root)
    pred = model.predict([num])
    next_word = pred.argmax()
    #next_word = np.random.choice(len(pred[0]), p=pred[0])
    output += words[next_word] + " "
    root = words[pred.argmax()] #setting the next word

print(output)

HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))


yours alive then things could so you husband by more when losing rise . that such come forsworn her ever ensconce forsaken though is an rotten when died my shamed to i have , thy for to confin love scarcely say i when change am thine me not pray perjur and i sing . are i , rotten pitiful outward love debarre my thriftless do near and age in mortgag on how will attainted reserve water before mortgag who curse bring be do found break suffic darkening within see to and beauty ; attainted still increase am forsaken she i 
