In [1]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import requests

In [3]:
# Step 1: Download and Preprocess the Corpus
# Download the text of "Alice in Wonderland"
url = "https://www.gutenberg.org/files/11/11-0.txt"
response = requests.get(url)
text = response.text.lower()  # Convert the text to lowercase
corpus = text.split("\n")[:100]  # Use the first 00 lines for simplicity

# Expected output:
# - Text converted to lowercase
# - Corpus as a list of sentences (first 500 lines)
# Example:
# corpus[:3] => ['alice's adventures in wonderland', '', 'chapter i. down the rabbit hole']


In [5]:
# Step 2: Tokenization and Sequence Creation
tokenizer = Tokenizer()  # Initialize the tokenizer
tokenizer.fit_on_texts(corpus)  # Learn the vocabulary from the corpus

# Expected output:
# - Each unique word gets a numerical ID
# Example:
# tokenizer.word_index => {'the': 1, 'and': 2, 'to': 3, ...}

In [7]:
total_words = len(tokenizer.word_index) + 1  # Total number of unique words
print(f"Total Unique Words: {total_words}")

# Create input-output pairs
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]  # Convert line to sequence of IDs
    for i in range(1, len(token_list)):  # Generate n-grams
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)
        
# Expected output:
# - A list of n-gram sequences
# Example (for the line "the rabbit ran"):
# token_list => [1, 123, 456] (IDs for "the", "rabbit", "ran")
# input_sequences => [[1, 123], [1, 123, 456]]

Total Unique Words: 380


In [9]:
# Pad sequences to ensure all inputs have the same length
max_len = max([len(seq) for seq in input_sequences])  # Maximum sequence length
input_sequences = pad_sequences(input_sequences, maxlen=max_len, padding='pre')

# Expected output:
# - Padded sequences with zeros at the start
# Example:
# Original sequence: [1, 123]
# Padded sequence: [0, 0, 1, 123]

# Split data into input (X) and output (y)
X = input_sequences[:, :-1]  # All but the last word
y = input_sequences[:, -1]   # Last word as the output

# One-hot encode the output labels
y = np.eye(total_words)[y]  # Convert word IDs to one-hot vectors

# Expected output:
# X => Padded input sequences (2D array)
# y => One-hot encoded output (2D array)
# Example:
# X[0] => [0, 0, 1]
# y[0] => [0, 0, 0, 1, 0, ...]


In [11]:
# Step 3: Build the Model
model = Sequential([
    Embedding(input_dim=total_words, output_dim=100, input_length=max_len-1),  # Word embedding
    LSTM(150, return_sequences=False),  # Learn patterns in sequences
    Dense(total_words, activation='softmax')  # Predict next word
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Expected output:
# - A summary of the model showing the number of parameters and layer details
# Example:
# Layer (type)                Output Shape              Param #   
# embedding (Embedding)       (None, 10, 100)           100000    
# lstm (LSTM)                 (None, 150)               150600    
# dense (Dense)               (None, 1000)              151000    




In [13]:
 #Step 4: Train the Model
model.fit(X, y, epochs=100, batch_size=128, verbose=1)

# Expected output:
# - Training loss and accuracy for each epoch
# Example:
# Epoch 1/20
# loss: 4.9234 - accuracy: 0.1234
# Epoch 20/20
# loss: 2.5632 - accuracy: 0.4567


Epoch 1/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 36ms/step - accuracy: 0.0235 - loss: 5.9366
Epoch 2/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 0.0358 - loss: 5.8801
Epoch 3/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - accuracy: 0.0527 - loss: 5.5515
Epoch 4/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - accuracy: 0.0391 - loss: 5.3858
Epoch 5/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.0327 - loss: 5.3538
Epoch 6/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - accuracy: 0.0398 - loss: 5.3066
Epoch 7/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 0.0488 - loss: 5.2828
Epoch 8/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.0442 - loss: 5.2134
Epoch 9/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

<keras.src.callbacks.history.History at 0x1ffc4b13c80>

In [15]:
# Step 5: Sentence Completion Function
def complete_sentence(seed_text, num_words):
    for _ in range(num_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]  # Tokenize the input
        token_list = pad_sequences([token_list], maxlen=max_len-1, padding='pre')  # Pad sequence
        predicted_probs = model.predict(token_list, verbose=0)  # Predict probabilities
        predicted_word_index = np.argmax(predicted_probs)  # Get index of highest probability
        for word, index in tokenizer.word_index.items():  # Map index back to word
            if index == predicted_word_index:
                seed_text += " " + word  # Append predicted word to the seed text
                break
    return seed_text


In [17]:
# Step 6: Test the Model for Sentence Completion
seed_text = "Alice"
completed_sentence = complete_sentence(seed_text, num_words=10)
print(f"Seed: {seed_text}\nCompleted Sentence: {completed_sentence}")

# Expected output:
# - A completed sentence based on the seed text
# Example:
# Seed: the rabbit
# Completed Sentence: the rabbit was in a hurry and he ran into the hall

Seed: Alice
Completed Sentence: Alice was beginning to get very tired of sitting by her
