<a href="https://colab.research.google.com/github/sneharreddy25/project_2025/blob/main/Project_Next_Word_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Project: Next Word Prediction

In [37]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import regex as re

In [45]:
def file_to_sentence_list(file_path):
    with open(file_path, 'r') as file:
        text = file.read()

    # Splitting the text into sentences using
    # delimiters like '.', '?', and '!'
    sentences = [sentence.strip() for sentence in re.split(
        r'(?<=[.!?])\s+', text) if sentence.strip()]

    return sentences

file_path = 'pizza.txt'
text_data = file_to_sentence_list(file_path)

# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_data)
total_words = len(tokenizer.word_index) + 1

# Create input sequences
input_sequences = []
for line in text_data:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

# Pad sequences and split into predictors and label
max_sequence_len = max([len(seq) for seq in input_sequences])
input_sequences = np.array(pad_sequences(
    input_sequences, maxlen=max_sequence_len, padding='pre'))
X, y = input_sequences[:, :-1], input_sequences[:, -1]

# Convert target data to one-hot encoding
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

In [46]:
# Define the model
model = Sequential()
model.add(Embedding(total_words, 10,
                    input_length=max_sequence_len-1))
model.add(LSTM(128))
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])



In [51]:
# Assuming the model weights from epoch 495 are successfully loaded here
# using the ipywidgets FileUpload method as discussed previously.

# Now continue training specifically for epochs 496, 497, 498, 499, and 500.
# Ensure X and y are available from previous cells.
if 'X' in globals() and 'y' in globals():
    print("Starting model training for epochs 496 through 500...")
    # To train from epoch 496 to 500:
    # - Set 'epochs' to the total number of epochs you want to reach (500).
    # - Set 'initial_epoch' to the epoch number *before* the one you want to start (495).
    #   This makes the first epoch trained be epoch 496.
    model.fit(X, y, epochs=500, initial_epoch=495, verbose=1)
else:
    print("Error: X or y data is not available. Please run the previous cells to load and preprocess data.")

Starting model training for epochs 496 through 500...
Epoch 496/500
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 63ms/step - accuracy: 0.0969 - loss: 5.2123
Epoch 497/500
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 42ms/step - accuracy: 0.0955 - loss: 5.0973
Epoch 498/500
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 39ms/step - accuracy: 0.1022 - loss: 4.9646
Epoch 499/500
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 40ms/step - accuracy: 0.1017 - loss: 4.8483
Epoch 500/500
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 41ms/step - accuracy: 0.0987 - loss: 4.7956


In [52]:
# Generate next word predictions
seed_text = "Pizza have different "
next_words = 5

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences(
        [token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted_probs = model.predict(token_list)
    predicted_word = tokenizer.index_word[np.argmax(predicted_probs)]
    seed_text += " " + predicted_word

print("Next predicted words:", seed_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
Next predicted words: Pizza have different  become and pizza of pizza
