In [None]:
# 📌 Import necessary libraries
import numpy as np
import re
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

In [None]:
# Path to your dataset file (txt)
file_path = '/content/dataset_book.txt'

# Parallel lists: input_texts[i] is the context whose next word is next_words[i].
input_texts = []
next_words = []

with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        # Each line is treated as "<context words ...> <next_word>",
        # e.g. "I love to play football" -> ("I love to play", "football").
        # split() with no arguments also discards surrounding whitespace and
        # returns [] for blank lines.
        parts = line.split()

        # A usable pair needs at least one context word plus the target word;
        # a single-word line would otherwise produce an empty context.
        if len(parts) < 2:
            continue

        input_texts.append(" ".join(parts[:-1]))  # all except last word
        next_words.append(parts[-1])              # last word is the word to predict

if not input_texts:
    raise ValueError(f"No lines with at least two words found in {file_path!r}")

# Show the context and target from the SAME row so the sample is a real pair;
# fall back to row 0 when the file yields only one pair.
sample_idx = min(1, len(input_texts) - 1)
print("Sample input:", input_texts[sample_idx])
print("Sample next word:", next_words[sample_idx])


Sample input: It is a truth universally acknowledged, that a single man
Sample next word: wife.


In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer # Import Tokenizer

# Convert input texts and next words into integer sequences

# Create and fit the tokenizer on contexts and targets together so every
# target word is guaranteed to have an index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(input_texts + next_words)
total_words = len(tokenizer.word_index) + 1  # +1 because index 0 is never assigned to a word

context_sequences = tokenizer.texts_to_sequences(input_texts)
label_sequences = tokenizer.texts_to_sequences(next_words)

# Keep only pairs where both sides survived tokenization. A target that
# tokenizes to nothing (e.g. pure punctuation) must not be mapped to class 0,
# because 0 is the padding index and would become a bogus prediction target.
# When a target splits into several tokens, the first one is used as the label.
valid_pairs = [
    (context, label[0])
    for context, label in zip(context_sequences, label_sequences)
    if context and label
]

# Raise instead of calling exit(): exit() tears down the notebook kernel.
if not valid_pairs:
    raise ValueError("No valid input sequences found in the file.")

input_sequences = [context for context, _ in valid_pairs]
labels = [label for _, label in valid_pairs]

# Left-pad every context to the longest one so the most recent words sit
# at the end of each row.
max_seq_len = max(len(seq) for seq in input_sequences)
X = pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre')
y = to_categorical(labels, num_classes=total_words)

In [None]:
# Inspect the one-hot label matrix built by to_categorical (total_words columns per row).
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# Inspect the padded input matrix: token-id sequences left-padded to max_seq_len.
X

array([[   0,    0,    0, ...,    0,    0,    0],
       [   0,    0,    0, ...,    6, 1052,  119],
       [   0,    0,    0, ...,  349,    3,    6],
       ...,
       [   0,    0,    0, ...,    1,  181,  165],
       [   0,    0,    0, ..., 1735,    4,   92],
       [   0,    0,    0, ...,  217,  130,  592]], dtype=int32)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Input

# Build the model: token ids -> 10-dim embeddings -> bidirectional LSTM ->
# softmax over the whole vocabulary.
# The explicit Input layer replaces Embedding's `input_length` argument
# (deprecated in Keras 3) and builds the model immediately, so summary()
# can report output shapes and parameter counts.
model = Sequential([
    Input(shape=(max_seq_len,), dtype="int32"),
    Embedding(input_dim=total_words, output_dim=10),
    Bidirectional(LSTM(100)),
    Dense(total_words, activation='softmax'),
])

# Compile the model; categorical cross-entropy matches the one-hot labels in y.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()




None


In [None]:
# Fit the network on the padded contexts (X) and one-hot targets (y)
# for 30 epochs, showing per-epoch progress.
model.fit(x=X, y=y, epochs=30, verbose=1)

[1m378/378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 64ms/step - accuracy: 0.0379 - loss: 7.3646


<keras.src.callbacks.history.History at 0x790eec496010>

In [None]:
# Function to predict the next word for a given input text
def predict_next_word(model, tokenizer, text, max_seq_len):
    """Return the most probable next word for ``text``.

    Args:
        model: Trained model whose ``predict`` returns one probability per
            vocabulary index for each input row.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and
            ``word_index`` (and, when available, ``index_word``).
        text: Context string to continue.
        max_seq_len: Sequence length the model was trained with; the encoded
            text is left-padded to this length.

    Returns:
        The predicted word, or ``None`` when the winning index has no word
        (for example index 0, which is used for padding).
    """
    token_list = tokenizer.texts_to_sequences([text])[0]
    padded = pad_sequences([token_list], maxlen=max_seq_len, padding='pre')
    predicted_probs = model.predict(padded, verbose=0)
    predicted_index = int(np.argmax(predicted_probs))

    # Direct index -> word lookup instead of scanning the whole vocabulary.
    # Fall back to inverting word_index for tokenizers without index_word.
    index_to_word = getattr(tokenizer, "index_word", None) or {
        index: word for word, index in tokenizer.word_index.items()
    }
    return index_to_word.get(predicted_index)

# Quick sanity check of the trained model on a short prompt
input_text = "You know "
predicted_word = predict_next_word(
    model=model,
    tokenizer=tokenizer,
    text=input_text,
    max_seq_len=max_seq_len,
)
print(f"Input: '{input_text}' -> Predicted next word: '{predicted_word}'")


Input: 'You know ' -> Predicted next word: 'the'


# Transformer From Hugging Face

In [None]:
from transformers import T5Config, T5ForConditionalGeneration, T5Tokenizer

model_name = "allenai/t5-small-next-word-generator-qoogle"

# T5-specific names keep the Keras `model` / `tokenizer` defined earlier intact,
# so the LSTM prediction cells can still be re-run after this cell.
t5_tokenizer = T5Tokenizer.from_pretrained(model_name)
t5_model = T5ForConditionalGeneration.from_pretrained(model_name)

def run_model(input_string, **generator_args):
    """Generate a continuation for ``input_string`` with the pretrained T5 model.

    Args:
        input_string: Prompt text to encode and feed to the model.
        **generator_args: Extra keyword arguments forwarded unchanged to
            ``t5_model.generate``.

    Returns:
        List of decoded output strings with special tokens removed. The list
        is also printed.
    """
    input_ids = t5_tokenizer.encode(input_string, return_tensors="pt")
    res = t5_model.generate(input_ids, **generator_args)
    output = t5_tokenizer.batch_decode(res, skip_special_tokens=True)
    print(output)
    return output


In [None]:
# Generate a continuation for the prompt "Which"; run_model prints the decoded output and returns it.
run_model("Which")

https://huggingface.co/allenai/t5-small-next-word-generator-qoogle