In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Read the text file
with open('sherlock_holmes.txt', 'r', encoding='utf-8') as file:
    text = file.read()


In [2]:
# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1

In [3]:
# Prepare input sequences
input_sequences = []
for line in text.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

In [4]:
# Pad sequences
max_sequence_len = max([len(seq) for seq in input_sequences])
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

# Shuffle input_sequences
np.random.seed(100)
np.random.shuffle(input_sequences)

# Determine the split index (80% training, 20% testing)
split_index = int(len(input_sequences) * 0.8)

# Split into training and testing sets
train_sequences = input_sequences[:split_index]
test_sequences = input_sequences[split_index:]

# For training data
X_train = train_sequences[:, :-1]
y_train = train_sequences[:, -1]

# For testing data
X_test = test_sequences[:, :-1]
y_test = test_sequences[:, -1]

# Convert labels to one-hot encoding
y_train = tf.keras.utils.to_categorical(y_train, num_classes=total_words)
y_test = tf.keras.utils.to_categorical(y_test, num_classes=total_words)

In [5]:
# Model definition
model = Sequential()
model.add(Embedding(total_words, 100))  # Removed input_length
model.add(LSTM(150))
model.add(Dense(total_words, activation='softmax'))

# Build the model with dummy data
dummy_input = np.zeros((1, max_sequence_len-1))
model(dummy_input)

# Print the model summary
print(model.summary())

None


In [6]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', 'f1_score'])
model.fit(X_train, y_train, epochs=200, verbose=1)

Epoch 1/200
[1m2408/2408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 30ms/step - accuracy: 0.0561 - f1_score: 4.5025e-05 - loss: 6.6122
Epoch 2/200
[1m2408/2408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 29ms/step - accuracy: 0.1075 - f1_score: 3.4173e-04 - loss: 5.6411
Epoch 3/200
[1m2408/2408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 29ms/step - accuracy: 0.1398 - f1_score: 7.8333e-04 - loss: 5.2101
Epoch 4/200
[1m2408/2408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 32ms/step - accuracy: 0.1581 - f1_score: 0.0016 - loss: 4.8645
Epoch 5/200
[1m2408/2408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 31ms/step - accuracy: 0.1793 - f1_score: 0.0048 - loss: 4.5328
Epoch 6/200
[1m2408/2408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 31ms/step - accuracy: 0.1982 - f1_score: 0.0161 - loss: 4.2306
Epoch 7/200
[1m2408/2408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 30ms/step - accuracy: 0.2276 - f1_score: 0.

<keras.src.callbacks.history.History at 0x1321a5391f0>

In [11]:
# Evaluate the model on the test set
test_loss, test_accuracy, test_f1 = model.evaluate(X_test, y_test, verbose=1)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")
print(f"Test F1 Score: {test_f1}")
print(f"average F1 Score: {np.mean(np.array(test_f1))}")

[1m602/602[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.0837 - f1_score: 0.0042 - loss: 16.2926
Test Loss: 16.236291885375977
Test Accuracy: 0.08326844125986099
Test F1 Score: [0.         0.24425526 0.13356763 ... 0.         0.         0.        ]
average F1 Score: 0.0059158471412956715


In [12]:
seed_text = "I will leave if they"
next_words = 3

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = np.argmax(model.predict(token_list), axis=-1)
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break
    seed_text += " " + output_word

print(seed_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
I will leave if they had at the
