# Next word predictions

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import time


In [None]:

# ---------- Raw FAQ Text ----------
# Training corpus: the FAQ as a single multi-line string with one heading,
# question, answer or list item per line.
# NOTE(review): the final two lines inside the literal ("..." and the
# "(Trimmed for brevity; ...)" remark) are part of the string itself, so they
# are passed to the tokenizer together with the real FAQ lines. Replace them
# with the full FAQ text before training.
faqs = """About the Program
What is the course fee for  Data Science Mentorship Program (CSWS 2025)
The course follows a monthly subscription model where you have to make monthly payments of Rs 799/month.
What is the total duration of the course?
The total duration of the course is 7 months. So the total course fee becomes 799*7 = Rs 5600(approx.)
What is the syllabus of the mentorship program?
We will be covering the following modules:
Python Fundamentals
Python libraries for Data Science
Data Analysis
SQL for Data Science
Maths for Machine Learning
ML Algorithms
Practical ML
MLOPs
Case studies
You can check the detailed syllabus here - https://learnwith.campusx.in/courses/CampusX-Data-Science-Mentorship-Program-637339afe4b0615a1bbed390
Will Deep Learning and NLP be a part of this program?
No, NLP and Deep Learning both are not a part of this program’s curriculum.
What if I miss a live session? Will I get a recording of the session?
Yes all our sessions are recorded, so even if you miss a session you can go back and watch the recording.
Where can I find the class schedule?
Checkout this google sheet to see month by month time table of the course - https://docs.google.com/spreadsheets/d/16OoTax_A6ORAeCg4emgexhqqPv3noQPYKU7RJ6ArOzk/edit?usp=sharing.
...
(Trimmed for brevity; use full FAQ from your original post)
"""


In [None]:
# ---------- Step 1: Tokenization ----------
# Learn the word -> integer-index vocabulary from the whole FAQ corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([faqs])

# Size of the model's output layer / embedding table: one slot per known word
# plus one extra slot for the padding token.
total_words = 1 + len(tokenizer.word_index)


In [None]:
# ---------- Step 2: Create Input Sequences ----------
# Encode the corpus line by line and keep every prefix of at least two tokens
# as one training sample; the last token of each prefix is the word to predict
# from the tokens before it.
input_sequences = [
    tokens[:end]
    for tokens in tokenizer.texts_to_sequences(faqs.split('\n'))
    for end in range(2, len(tokens) + 1)
]


In [None]:
# ---------- Step 3: Padding ----------
# Left-pad every prefix to the length of the longest one so the samples form a
# rectangular array whose last column always holds the target word.
max_sequence_len = max(map(len, input_sequences))
padded_sequences = pad_sequences(
    input_sequences,
    padding='pre',
    maxlen=max_sequence_len,
)


In [None]:

# Features are all columns except the last; the last column is the next word,
# one-hot encoded over the vocabulary to match the softmax output layer.
X = padded_sequences[:, :-1]
y = to_categorical(padded_sequences[:, -1], num_classes=total_words)



In [None]:
# ---------- Step 4: Model ----------
# Stacked SimpleRNN next-word classifier:
# token ids -> 100-d embeddings -> two 150-unit SimpleRNN layers -> softmax
# over the vocabulary.
# `input_length` is deliberately not passed to Embedding: Keras 3 deprecates
# the argument and only emits a warning for it. The input shape is set when
# the model is built.
modelrnn = Sequential([
    Embedding(input_dim=total_words, output_dim=100),
    # return_sequences=True hands the full sequence of hidden states to the
    # second recurrent layer instead of only the final state.
    tf.keras.layers.SimpleRNN(150, return_sequences=True),
    tf.keras.layers.SimpleRNN(150),
    Dense(total_words, activation='softmax')
])

In [None]:
# Build for inputs of shape (batch, max_sequence_len - 1) so the weights exist
# and summary() can report output shapes and parameter counts, then set up
# training.
modelrnn.build(input_shape=(None, max_sequence_len - 1))
modelrnn.compile(
    optimizer='adam',
    loss='categorical_crossentropy',  # targets are one-hot encoded
    metrics=['accuracy'],
)
modelrnn.summary()

In [None]:
# ---------- Step 5: Train the Model ----------
# Train the SimpleRNN model on the full (X, y) set for 10 epochs, showing
# per-epoch progress.
modelrnn.fit(x=X, y=y, verbose=1, epochs=10)

In [None]:
# ---------- Step 6: Generate Text ----------
# Greedy decoding with the SimpleRNN model: repeatedly feed the growing text
# back into the model and append the most probable next word.
seed_text = "can i find detail syllabus"
next_words = 10

for _ in range(next_words):
    # Encode the text generated so far and left-pad it to the model's input
    # length. The tokenizer has no OOV token, so unknown words are dropped.
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = modelrnn.predict(token_list, verbose=0)
    # `predicted` holds one row (batch of one); take the argmax over the
    # vocabulary axis rather than over the flattened array.
    predicted_word_index = int(np.argmax(predicted, axis=-1)[0])

    # Constant-time reverse lookup instead of scanning word_index every step.
    word = tokenizer.index_word.get(predicted_word_index)
    if word is None:
        # The predicted index (e.g. 0, the padding slot) maps to no word. The
        # text would not change, so every further step would produce the same
        # prediction again; stop generating.
        break

    seed_text += " " + word
    print(seed_text)
    time.sleep(1)  # pause so the text visibly grows one word at a time

# ---------- Optional: Check tokenizer mapping ----------
# print(tokenizer.word_index)


## LSTM

In [None]:
# ---------- Step 4: Model ----------
# Stacked LSTM next-word classifier:
# token ids -> 100-d embeddings -> two 150-unit LSTM layers -> softmax over
# the vocabulary.
# `input_length` is deliberately not passed to Embedding: Keras 3 deprecates
# the argument and only emits a warning for it. The input shape is set when
# the model is built.
model = Sequential([
    Embedding(input_dim=total_words, output_dim=100),
    # return_sequences=True hands the full sequence of hidden states to the
    # second LSTM layer instead of only the final state.
    LSTM(150, return_sequences=True),
    LSTM(150),
    Dense(total_words, activation='softmax')
])

In [None]:
# Build for inputs of shape (batch, max_sequence_len - 1) so the weights exist
# and summary() can report output shapes and parameter counts, then set up
# training.
model.build(input_shape=(None, max_sequence_len - 1))
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',  # targets are one-hot encoded
    metrics=['accuracy'],
)
model.summary()

In [None]:
# ---------- Step 5: Train the Model ----------
# Train the LSTM model on the full (X, y) set for 10 epochs, showing per-epoch
# progress.
model.fit(x=X, y=y, verbose=1, epochs=10)

In [None]:
# ---------- Step 6: Generate Text ----------
# Prompt and number of words to generate.
# NOTE(review): both values are reassigned at the top of the next cell before
# any generation runs, so this cell currently has no effect on the output.
next_words = 10
seed_text = "what is the fee"


In [None]:
# ---------- Step 6: Generate Text ----------
# Greedy decoding with the LSTM model: repeatedly feed the growing text back
# into the model and append the most probable next word.
seed_text = "can i find detail syllabus"
next_words = 10

for _ in range(next_words):
    # Encode the text generated so far and left-pad it to the model's input
    # length. The tokenizer has no OOV token, so unknown words are dropped.
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    # `predicted` holds one row (batch of one); take the argmax over the
    # vocabulary axis rather than over the flattened array.
    predicted_word_index = int(np.argmax(predicted, axis=-1)[0])

    # Constant-time reverse lookup instead of scanning word_index every step.
    word = tokenizer.index_word.get(predicted_word_index)
    if word is None:
        # The predicted index (e.g. 0, the padding slot) maps to no word. The
        # text would not change, so every further step would produce the same
        # prediction again; stop generating.
        break

    seed_text += " " + word
    print(seed_text)
    time.sleep(1)  # pause so the text visibly grows one word at a time

# ---------- Optional: Check tokenizer mapping ----------
# print(tokenizer.word_index)


## GRU

The Gated Recurrent Unit (GRU) is a commonly used type of gated recurrent neural network and a simplified version of the LSTM (Long Short-Term Memory). Compared to LSTM, GRU uses one fewer gating unit, which reduces the number of network parameters and can reduce the risk of overfitting.


In [19]:
# ---------- Step 4: Model (GRU) ----------
# Stacked GRU next-word classifier:
# token ids -> 100-d embeddings -> two 150-unit GRU layers -> softmax over
# the vocabulary.
# `input_length` is deliberately not passed to Embedding: Keras 3 deprecates
# the argument and only emits a warning for it. The input shape is set when
# the model is built.
model_gru = Sequential([
    Embedding(input_dim=total_words, output_dim=100),
    # return_sequences=True hands the full sequence of hidden states to the
    # second GRU layer instead of only the final state.
    tf.keras.layers.GRU(150, return_sequences=True),
    tf.keras.layers.GRU(150),
    Dense(total_words, activation='softmax')
])

In [20]:
# Build for inputs of shape (batch, max_sequence_len - 1) so the weights exist
# and summary() can report output shapes and parameter counts, then set up
# training.
model_gru.build(input_shape=(None, max_sequence_len - 1))
model_gru.compile(
    optimizer='adam',
    loss='categorical_crossentropy',  # targets are one-hot encoded
    metrics=['accuracy'],
)
model_gru.summary()

In [21]:
# ---------- Step 5: Train the Model ----------
# Train the GRU model on the full (X, y) set for 10 epochs, showing per-epoch
# progress.
# NOTE(review): in the recorded run, accuracy stays at roughly 5-11% through
# epoch 8 and the generated text degenerates into repeated "the"/"of".
# Consider training for more epochs.
model_gru.fit(x=X, y=y, verbose=1, epochs=10)

Epoch 1/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.0389 - loss: 4.8084  
Epoch 2/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.0917 - loss: 4.7342
Epoch 3/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.0893 - loss: 4.5355
Epoch 4/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.0789 - loss: 4.3707
Epoch 5/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.0538 - loss: 4.4088
Epoch 6/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.1085 - loss: 4.2708
Epoch 7/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.0919 - loss: 4.2905
Epoch 8/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.0878 - loss: 4.1329
Epoch 9/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

<keras.src.callbacks.history.History at 0x7b1a54164c20>

In [22]:
# ---------- Step 6: Generate Text ----------
# Greedy decoding with the GRU model: repeatedly feed the growing text back
# into the model and append the most probable next word.
seed_text = "can i find detail syllabus"
next_words = 10

for _ in range(next_words):
    # Encode the text generated so far and left-pad it to the model's input
    # length. The tokenizer has no OOV token, so unknown words are dropped.
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = model_gru.predict(token_list, verbose=0)
    # `predicted` holds one row (batch of one); take the argmax over the
    # vocabulary axis rather than over the flattened array.
    predicted_word_index = int(np.argmax(predicted, axis=-1)[0])

    # Constant-time reverse lookup instead of scanning word_index every step.
    word = tokenizer.index_word.get(predicted_word_index)
    if word is None:
        # The predicted index (e.g. 0, the padding slot) maps to no word. The
        # text would not change, so every further step would produce the same
        # prediction again; stop generating.
        break

    seed_text += " " + word
    print(seed_text)
    time.sleep(1)  # pause so the text visibly grows one word at a time

# ---------- Optional: Check tokenizer mapping ----------
# print(tokenizer.word_index)


can i find detail syllabus the
can i find detail syllabus the the
can i find detail syllabus the the the
can i find detail syllabus the the the of
can i find detail syllabus the the the of of
can i find detail syllabus the the the of of of
can i find detail syllabus the the the of of of of
can i find detail syllabus the the the of of of of of
can i find detail syllabus the the the of of of of of of
can i find detail syllabus the the the of of of of of of of
