In [1]:
import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg
print(gutenberg.fileids())


['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\LOQ\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [2]:
data = gutenberg.raw('shakespeare-hamlet.txt')

with open('shakespeare-hamlet.txt', 'w') as f:
    f.write(data)

In [3]:
##Preprocessing

import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
total_words = len(tokenizer.word_index) + 1
total_words
##total number of unique words in the corpus

4818

In [5]:
### Create input sequences using a sliding window

input_sequences = []
for line in data.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[ : i+1]
        input_sequences.append(n_gram_sequence)

In [6]:
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, max_sequence_len, padding='pre'))

In [7]:
input_sequences

array([[   0,    0,    0, ...,    0,    1,  687],
       [   0,    0,    0, ...,    1,  687,    4],
       [   0,    0,    0, ...,  687,    4,   45],
       ...,
       [   0,    0,    0, ...,    4,   45, 1047],
       [   0,    0,    0, ...,   45, 1047,    4],
       [   0,    0,    0, ..., 1047,    4,  193]])

In [12]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
X = input_sequences[:, :-1]
y = input_sequences[:, -1]
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [19]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(LSTM(150,return_sequences=True))
model.add(Dropout(0.2)) ##Disable 20% of neurons(while training) to avoid overfitting
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))



In [None]:
## WITH GRU

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout

model_gru = Sequential()
model_gru.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model_gru.add(GRU(150,return_sequences=True))
model_gru.add(Dropout(0.2)) ##Disable 20% of neurons(while training) to avoid overfitting
model_gru.add(GRU(100))
model_gru.add(Dense(total_words, activation='softmax'))

In [20]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [21]:
model.build(input_shape=(None, max_sequence_len-1))
model.summary()

In [None]:
## 150 epoches done
history = model.fit(X_train, y_train, epochs=100, validation_data=(X_test, y_test), verbose=1)

Epoch 1/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 20ms/step - accuracy: 0.4711 - loss: 2.3851 - val_accuracy: 0.0503 - val_loss: 11.2924
Epoch 2/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 17ms/step - accuracy: 0.4809 - loss: 2.3448 - val_accuracy: 0.0548 - val_loss: 11.3483
Epoch 3/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 16ms/step - accuracy: 0.4897 - loss: 2.3138 - val_accuracy: 0.0525 - val_loss: 11.4058
Epoch 4/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 16ms/step - accuracy: 0.4927 - loss: 2.2841 - val_accuracy: 0.0527 - val_loss: 11.5159
Epoch 5/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 16ms/step - accuracy: 0.5009 - loss: 2.2561 - val_accuracy: 0.0513 - val_loss: 11.5361
Epoch 6/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 16ms/step - accuracy: 0.5035 - loss: 2.2265 - val_accuracy: 0.0521 - val_loss: 11.6013
Epoc

In [26]:
##Prediction

def generate_text(seed_text, next_words, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = np.argmax(model.predict(token_list, verbose=0), axis=-1)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

print(generate_text("To be or not to be", 20, max_sequence_len))


To be or not to be that is the question stages and the kings countenance hast without without the any torches thinke it shall haue shew


In [None]:
## To increase Accuracy
#Increase sequence length
#More LSTM units / layers (e.g., 256 units × 2 layers).
#Train longer

In [27]:
model.save('lstm_text_gen_model.h5')

