In [12]:
import nltk

In [13]:
nltk.download('gutenberg')
from nltk.corpus import gutenberg
import pandas as pd 

dat = gutenberg.raw('shakespeare-hamlet.txt')
with open('hamlet.txt','w') as file:
  file.write(dat)

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [14]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences 
from sklearn.model_selection import train_test_split



In [15]:
with open('hamlet.txt','r') as file:
  text = file.read().lower()

tokenizer = Tokenizer()
# So in your code, the tokenizer is doing:
# Step by step
# Reads text
# Splits into words (internally)
# Assigns each word a unique number
# Replaces words with numbers
#The tokenizer converts raw text into numerical sequences by building a vocabulary and mapping each word to an integer.

tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index)+1
#total words is total unique words 
# tokenizer.word_index returns the dictionary of word and index of that word ie the number used to represent that word {'the': 1,
#  'and': 2,
#  'to': 3,
#  'of': 4 and so on....}


# Creating n-gram sequences from text

Suppose we have this text:

```python
text = "I love AI\nIt is fun"


And assume our tokenizer converts words to integers like this:

"I" -> 1, "love" -> 2, "AI" -> 3, "It" -> 4, "is" -> 5, "fun" -> 6

Step 1: Initialize list for sequences
inputsequences = []


Purpose: Creates an empty list to store n-gram sequences.
Example:

inputsequences = []  # currently empty

Step 2: Loop through each line
for line in text.split('\n'):


Purpose: Loops through each line of text. split('\n') splits text at newlines.
Example:

text.split('\n')  # → ["I love AI", "It is fun"]
# Loop takes "I love AI" first, then "It is fun"

Step 3: Convert line to token list
token_list = tokenizer.texts_to_sequences([line])[0]


Purpose: Converts a line of text into a list of integers using the tokenizer. [0] gets the inner list.
Example:

# First line "I love AI" → [1, 2, 3]
# Second line "It is fun" → [4, 5, 6]

Step 4: Generate n-grams
for i in range(1, len(token_list)):
    n_gram_sequence = token_list[:i+1]
    inputsequences.append(n_gram_sequence)


Purpose:

Loops through indices of the token list to generate sequences of increasing length (n-grams).

token_list[:i+1] takes a slice from the start up to index i+1.

Adds each n-gram to inputsequences.

Example:

For the first line [1, 2, 3]:

i = 1 → token_list[:2] = [1, 2]
i = 2 → token_list[:3] = [1, 2, 3]


For the second line [4, 5, 6]:

i = 1 → [4, 5]
i = 2 → [4, 5, 6]


After appending:

inputsequences = [[1, 2], [1, 2, 3], [4, 5], [4, 5, 6]]


✅ Final inputsequences:

[[1, 2], [1, 2, 3], [4, 5], [4, 5, 6]]


These sequences are now ready to be used as input for a predictive text model.



In [16]:
inputsequences = []
for line in text.split("\n"):
  sequence = tokenizer.texts_to_sequences([line])[0]
  for i in range(1, len(sequence)):
    inputsequences.append(sequence[:i+1])


inputsequences



##here last element of an array is output , 
# yo word  paxi yo word type ko 

[[1, 687],
 [1, 687, 4],
 [1, 687, 4, 45],
 [1, 687, 4, 45, 41],
 [1, 687, 4, 45, 41, 1886],
 [1, 687, 4, 45, 41, 1886, 1887],
 [1, 687, 4, 45, 41, 1886, 1887, 1888],
 [1180, 1889],
 [1180, 1889, 1890],
 [1180, 1889, 1890, 1891],
 [57, 407],
 [57, 407, 2],
 [57, 407, 2, 1181],
 [57, 407, 2, 1181, 177],
 [57, 407, 2, 1181, 177, 1892],
 [407, 1182],
 [407, 1182, 63],
 [408, 162],
 [408, 162, 377],
 [408, 162, 377, 21],
 [408, 162, 377, 21, 247],
 [408, 162, 377, 21, 247, 882],
 [18, 66],
 [451, 224],
 [451, 224, 248],
 [451, 224, 248, 1],
 [451, 224, 248, 1, 30],
 [408, 407],
 [451, 25],
 [408, 6],
 [408, 6, 43],
 [408, 6, 43, 62],
 [408, 6, 43, 62, 1893],
 [408, 6, 43, 62, 1893, 96],
 [408, 6, 43, 62, 1893, 96, 18],
 [408, 6, 43, 62, 1893, 96, 18, 566],
 [451, 71],
 [451, 71, 51],
 [451, 71, 51, 1894],
 [451, 71, 51, 1894, 567],
 [451, 71, 51, 1894, 567, 378],
 [451, 71, 51, 1894, 567, 378, 80],
 [451, 71, 51, 1894, 567, 378, 80, 3],
 [451, 71, 51, 1894, 567, 378, 80, 3, 273],
 [451, 71

In [17]:
max_sequence_len = max([len(x) for x in inputsequences])
max_sequence_len



14

In [18]:
input_sequences = np.array(pad_sequences(inputsequences,maxlen=max_sequence_len,padding='pre'))
input_sequences

array([[   0,    0,    0, ...,    0,    1,  687],
       [   0,    0,    0, ...,    1,  687,    4],
       [   0,    0,    0, ...,  687,    4,   45],
       ...,
       [   0,    0,    0, ...,    4,   45, 1047],
       [   0,    0,    0, ...,   45, 1047,    4],
       [   0,    0,    0, ..., 1047,    4,  193]])

In [19]:
import tensorflow as tf

x,y = input_sequences[:,:-1],input_sequences[:,-1:]

# arr[row_start : row_end, col_start : col_end] --> array slicing
#y is 2d array

OneHotEncoder encodes only the classes that appear in the data, and the class order is determined automatically.

tf.keras.utils.to_categorical requires integer labels and usually a fixed num_classes, ensuring a consistent class order.

to_categorical is preferred for deep learning models, while OneHotEncoder is more common in traditional machine learning pipelines

[Learn More ](https://www.notion.so/Label-2cc1feb8f79a8035a929c7306942b1d9)

In [20]:
y = tf.keras.utils.to_categorical(y,num_classes=total_words)

#this can be done using onehotencoder from sklearn but..
#one_hot_encoder_y = OneHotEncoder()
# y_encoded = one_hot_encoder_y.fit_transform(y).toarray()
# Fit the encoder on labels and transform them into one-hot encoded vectors


In [21]:
x_train , x_test, y_train,y_test = train_test_split(x,y,test_size=0.2)

In [58]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM,Embedding, Dropout



In [None]:
#define the model
model = Sequential()
model.add(Embedding(total_words,100,input_length = max_sequence_len-1))
model.add(LSTM(150,return_sequences = True))
#here 150 is the number of neuron of Fully connected hidden layer , it is the size of hidden vector ht-1 and cell state ct
model.add(Dropout (0.2))
#dropout here turns off 20% of the random neurons at each step , step doesnt mean 1 epoch , epoch is when model completes a pass through entire dataset, step is when model pass through 1 batch of data .
#dropout is used--> lstm is contains many parameters, so to avoid overfitting 
model.add(LSTM(100))
model.add(Dense(total_words,activation = "softmax"))

model.compile( loss = 'categorical_crossentropy',optimizer = 'adam',metrics = ['accuracy'])
model.summary()



learn about LSTM [here](https://medium.com/@chunduri11/understanding-lstm-plain-and-simple-96026b4468c6)  


Why is return_sequences used? (The "Stacking" Rule)
The most common reason to use return_sequences=True is Stacking LSTMs.

In Keras, an LSTM layer expects an input with 3 dimensions: (Batch_Size, Time_Steps, Features).

If you set it to False: The output is 2D: (Batch_Size, Features). The "Time" dimension is gone because you only kept the last step. If you try to put another LSTM after this, it will crash.

If you set it to True: The output stays 3D: (Batch_Size, Time_Steps, Features). The second LSTM is happy because it still sees a sequence of data to process.

In [None]:
model.fit(x_train,y_train, validation_data=(x_test,y_test),epochs= 10,verbose=1)

In [None]:
model.save('model.h5')



In [22]:
from tensorflow.keras.models import load_model
model = load_model('model.h5')



In [29]:
def predict_next_word(model, tokenizer, text, max_sequence_len):
  token_list = tokenizer.texts_to_sequences([text])[0]
  if len(token_list) > max_sequence_len :
    token_list = token_list[-(max_sequence_len-1):] 
#list ko agadi ko element lai hatauxa
# Keeps the most recent tokens
# In sequence models (RNN, LSTM, GRU):
# Recent words often matter more → use slicing from the end
# Taking from the start may drop important context
  token_list = pad_sequences([token_list],maxlen = max_sequence_len-1, padding = 'pre')
  predicted = model.predict(token_list)

  predicted_word_index = np.argmax(predicted ,axis = 1)
  for word,index in tokenizer.word_index.items():
    if index == predicted_word_index:
      return word 
  return None



In [56]:
input_text = " Barn. Well, goodnight. If you do meet Horatio mMarcellus, the Riuals of my"
max_sequence_len = model.input_shape[1]+1
print(predict_next_word(model,tokenizer,input_text,max_sequence_len))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
fathers


In [None]:
import pickle as pk
with open ('tokenizer.pkl','wb') as file:
  pk.dump(tokenizer,file,protocol=pk.HIGHEST_PROTOCOL)
# HIGHEST_PROTOCOL ensures the tokenizer is saved using the most efficient
# and fastest pickle format supported by the current Python version