In [5]:
# Data Collection
import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg
import pandas as pd
gutenberg.fileids()
data = gutenberg.raw('shakespeare-hamlet.txt')

# Save all data to CSV, and txt
with open('hamlet.txt', 'w', encoding='utf-8') as f:
    f.write(data)

df= pd.DataFrame({'text': [data]})
df.to_csv('hamlet_text.csv', index=False)



# texts = [gutenberg.raw(fileid) for fileid in gutenberg.fileids()]
# df = pd.DataFrame({'text': texts})
# df.to_csv('gutenberg_texts.csv', index=False)

[nltk_data] Downloading package gutenberg to C:\Users\Raj Kalash
[nltk_data]     Tiwari\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [12]:
# Data Preprocessing
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load data
with open('hamlet.txt', 'r', encoding='utf-8') as f:
    data = f.read()

# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
total_words = len(tokenizer.word_index) + 1
total_words


4818

In [13]:
# Create input sequences
input_sequences = []
for line in data.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)
input_sequences[:5]

[[1, 687],
 [1, 687, 4],
 [1, 687, 4, 45],
 [1, 687, 4, 45, 41],
 [1, 687, 4, 45, 41, 1886]]

In [15]:
# Padding sequences
import numpy as np
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
input_sequences= np.array(input_sequences)
input_sequences[:5]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    1,  687],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           1,  687,    4],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    1,
         687,    4,   45],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    1,  687,
           4,   45,   41],
       [   0,    0,    0,    0,    0,    0,    0,    0,    1,  687,    4,
          45,   41, 1886]])

In [18]:
# Create predictors and label
import tensorflow as tf
X = input_sequences[:,:-1]
y = input_sequences[:,-1]
y = tf.keras.utils.to_categorical(y, num_classes=total_words)


In [19]:
# Split the data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
# Model Building

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard

model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(LSTM(150, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'], )
model.summary()




In [27]:
# Run the model
early_stop = EarlyStopping(monitor='val_loss', patience=3)
model_checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True, verbose=1)
tensorboard_callback = TensorBoard(log_dir='./logs', histogram_freq=1)
history = model.fit(X_train, y_train, epochs=10, batch_size=128, validation_data=(X_val, y_val), )

Epoch 1/10
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 36ms/step - accuracy: 0.7707 - loss: 0.9837 - val_accuracy: 0.0476 - val_loss: 15.3002
Epoch 2/10
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 35ms/step - accuracy: 0.7717 - loss: 0.9842 - val_accuracy: 0.0493 - val_loss: 15.3505
Epoch 3/10
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 34ms/step - accuracy: 0.7721 - loss: 0.9816 - val_accuracy: 0.0482 - val_loss: 15.3581
Epoch 4/10
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 34ms/step - accuracy: 0.7739 - loss: 0.9733 - val_accuracy: 0.0492 - val_loss: 15.3553
Epoch 5/10
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 34ms/step - accuracy: 0.7714 - loss: 0.9800 - val_accuracy: 0.0462 - val_loss: 15.3573
Epoch 6/10
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 35ms/step - accuracy: 0.7734 - loss: 0.9726 - val_accuracy: 0.0480 - val_loss: 15.3892
Epoch 7/10
[1m1



In [37]:
# Function to generate text
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict(token_list, verbose=0)
        predicted_word_index = tf.argmax(predicted, axis=-1).numpy()[0]
        
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted_word_index:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text
print(generate_text("Farewell, and let you", 10, model, max_sequence_len))


Farewell, and let you grant and fiue and guildensterne doth moue am long power


In [34]:
# Save the tokenizer and model
import pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
model.save('lstm_text_generator.h5')

