In [1]:
import numpy as np
import pandas as pd

In [2]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [3]:
# Read the whole training corpus into memory as one string.
# The encoding is explicit so the result does not depend on the platform's locale default.
# NOTE(review): assumes the file is UTF-8 encoded - confirm against the data source.
with open('/content/next_word.txt', 'r', encoding='utf-8') as corpus_file:
  text = corpus_file.read()

print(type(text))

<class 'str'>


In [4]:
# Now let’s tokenize the text to create a sequence of words:

In [5]:
# Build the word -> integer-index vocabulary from the corpus.
# The corpus is passed as a one-element list holding the full text.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

In [6]:
# Tabulate the tokenizer vocabulary: one row per word with its integer index.
v = tokenizer.word_index
dd = pd.DataFrame(
    {'word': list(v.keys()), 'index': list(v.values())}
)
dd.head(15)

Unnamed: 0,word,index
0,the,1
1,and,2
2,i,3
3,to,4
4,of,5
5,a,6
6,in,7
7,that,8
8,it,9
9,he,10


In [7]:
# Summary statistics for the numeric 'index' column of the vocabulary table.
dd.describe()

Unnamed: 0,index
count,8199.0
mean,4100.0
std,2366.991762
min,1.0
25%,2050.5
50%,4100.0
75%,6149.5
max,8199.0


In [8]:
# Size used for the model's embedding input and softmax output.
# Word indices start at 1, so one extra slot is added for index 0,
# which is the value pad_sequences fills with.
total_words = len(tokenizer.word_index)+1
total_words

8200

In the above code, the text is tokenized, which means it is divided into individual words or tokens. The ‘Tokenizer’ object is created, which will handle the tokenization process. The ‘fit_on_texts’ method of the tokenizer is called, passing the ‘text’ as input. This method analyzes the text and builds a vocabulary of unique words, assigning each word a numerical index starting at 1. The ‘total_words’ variable is then assigned the length of the word index plus one: the number of distinct words in the text, plus one extra slot for index 0, which is never assigned to a word and is used later as the padding value.

Now let’s create input-output pairs by splitting the text into sequences of tokens and forming n-grams from the sequences:

In [9]:
# Start with an empty list of n-gram training sequences.
input_sequences = []

In [10]:
# Build the training examples: for every line of the corpus, emit each prefix of
# length >= 2 (tokens[:2], tokens[:3], ...). The last token of a prefix is the word
# to predict; the tokens before it are the context.
# The list is (re)created here so that re-running this cell never appends duplicates
# and still works after input_sequences has been turned into a padded array.
input_sequences = []
lines = text.split("\n")
# Tokenize all lines in a single call instead of one call per line.
for line, token_list in zip(lines, tokenizer.texts_to_sequences(lines)):
  # Lines with fewer than two tokens yield an empty range and contribute nothing.
  for i in range(1, len(token_list)):
    input_sequences.append(token_list[:i+1])

In [11]:
# Peek at the first few n-gram sequences (lists of word indices).
input_sequences[:5]

[[1, 1561],
 [1, 1561, 5],
 [1, 1561, 5, 129],
 [1, 1561, 5, 129, 34],
 [647, 4498]]

In [12]:
# Scratch check: `line` is the loop variable left over from the n-gram loop,
# i.e. the last line of the text.
tokenizer.texts_to_sequences([line])

[[]]

In [13]:
# Scratch check: range(1, 0) is empty, so the body never runs and nothing is printed.
# This is why a line with no tokens adds nothing to input_sequences.
for i in range(1, 0):
  print(i)

In [33]:
# Sample string used to try the tokenizer on a single piece of text.
fff = 'The Man with the Twisted Lip'

In [36]:
# Convert the sample string to word indices using the fitted vocabulary.
token_list = tokenizer.texts_to_sequences([fff])[0]
token_list

[1, 56, 18, 1, 1014, 873]

In the above code, the text data is split into lines using the ‘\n’ character as a delimiter. For each line in the text, the ‘texts_to_sequences’ method of the tokenizer is used to convert the line into a sequence of numerical tokens based on the previously created vocabulary. The resulting token list is then iterated over using a for loop. For each iteration, a subsequence, or n-gram, of tokens is extracted, ranging from the beginning of the token list up to and including the token at the current index ‘i’ (so the shortest n-gram has two tokens, and lines with fewer than two tokens produce none).

This n-gram sequence represents the input context, with the last token being the target or predicted word. This n-gram sequence is then appended to the ‘input_sequences’ list. This process is repeated for all lines in the text, generating multiple input-output sequences that will be used for training the next word prediction model.

Now let’s pad the input sequences to have equal length:

In [14]:
# Left-pad every sequence with zeros up to the length of the longest one,
# so the word to predict always sits in the last column.
max_sequence_len = max(map(len, input_sequences))
input_sequences = np.asarray(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)

In the above code, the input sequences are padded to ensure all sequences have the same length. The variable ‘max_sequence_len’ is assigned the maximum length among all the input sequences. The ‘pad_sequences’ function is used to pad or truncate the input sequences to match this maximum length.

The ‘pad_sequences’ function takes the input_sequences list, sets the maximum length to ‘max_sequence_len’, and specifies that the padding should be added at the beginning of each sequence using the ‘padding=pre’ argument. Finally, the input sequences are converted into a numpy array to facilitate further processing.

Now let’s split the sequences into input and output:

In [15]:
# Split each padded row: all columns but the last form the context (X),
# the last column is the word to predict (y).
X, y = input_sequences[:, :-1], input_sequences[:, -1]

In the above code, the input sequences are split into two arrays, ‘X’ and ‘y’, to create the input and output for training the next word prediction model. The ‘X’ array is assigned the values of all rows in the ‘input_sequences’ array except for the last column. It means that ‘X’ contains all the tokens in each sequence except for the last one, representing the input context.

On the other hand, the ‘y’ array is assigned the values of the last column in the ‘input_sequences’ array, which represents the target or predicted word.

In [16]:
# Inspect the padded array: rows are left-padded with 0 and end with the target word.
input_sequences

array([[   0,    0,    0, ...,    0,    1, 1561],
       [   0,    0,    0, ...,    1, 1561,    5],
       [   0,    0,    0, ..., 1561,    5,  129],
       ...,
       [   0,    0,    0, ...,    1, 8198, 8199],
       [   0,    0,    0, ..., 8198, 8199, 3187],
       [   0,    0,    0, ..., 8199, 3187, 3186]], dtype=int32)

In [17]:
# Inspect the model inputs (each row is a padded sequence without its last token).
X

array([[   0,    0,    0, ...,    0,    0,    1],
       [   0,    0,    0, ...,    0,    1, 1561],
       [   0,    0,    0, ...,    1, 1561,    5],
       ...,
       [   0,    0,    0, ...,   28,    1, 8198],
       [   0,    0,    0, ...,    1, 8198, 8199],
       [   0,    0,    0, ..., 8198, 8199, 3187]], dtype=int32)

In [18]:
# Inspect the targets (the last token of each sequence, as integer word indices).
y

array([1561,    5,  129, ..., 8199, 3187, 3186], dtype=int32)

Now let’s convert the output to one-hot encoded vectors:

In [19]:
# One-hot encode the target word indices (one column per vocabulary index).
# The labels are read from the last column of input_sequences rather than from y itself,
# so re-running this cell cannot one-hot encode an already one-hot array.
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)

In [20]:
# Inspect the one-hot encoded targets.
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In the above code, we are converting the output array into a suitable format for training a model, where each target word is represented as a binary vector.

In [21]:
# Next-word model: embedding -> LSTM -> softmax over the vocabulary.
model = Sequential()
# 100-dimensional word embeddings; each input row holds max_sequence_len-1 token indices.
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(LSTM(150))
# One output unit per vocabulary index, matching the width of the one-hot targets.
model.add(Dense(total_words, activation='softmax'))
# summary() prints the table itself and returns None, so wrapping it in print()
# only added a stray "None" line to the output.
model.summary()



None


The code above defines the model architecture for the next word prediction model. The ‘Sequential’ model is created, which represents a linear stack of layers. The first layer added to the model is the ‘Embedding’ layer, which is responsible for converting the input sequences into dense vectors of fixed size. It takes three arguments:

‘total_words’, which is the size of the embedding’s input vocabulary (the number of distinct words plus one, because index 0 is used for padding);
‘100’, which denotes the dimensionality of the word embeddings;
and ‘input_length’, which specifies the length of the input sequences.
The next layer added is the ‘LSTM’ layer, a type of recurrent neural network (RNN) layer designed for capturing sequential dependencies in the data. It has 150 units, which means it will learn 150 internal representations or memory cells.

Finally, the ‘Dense’ layer is added, which is a fully connected layer that produces the output predictions. It has ‘total_words’ units and uses the ‘softmax’ activation function to convert the predicted scores into probabilities, indicating the likelihood of each word being the next one in the sequence.

Now let’s compile and train the model:

In [None]:
# Multi-class setup: the targets are one-hot rows, hence categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# Train on the full set of (context, next word) pairs.
model.fit(X, y, epochs=100, verbose=1)

In [None]:
# Greedy generation: repeatedly predict the most likely next word and append it to the seed.
seed_text = "I will leave if they"
next_words = 3

for _ in range(next_words):
    # Encode the current text and left-pad it to the context length used for training.
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    padded_tokens = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    # predict() returns one probability row per input; take the argmax of the only row
    # as a plain int so the lookup below uses a scalar rather than a length-1 array.
    predicted = int(np.argmax(model.predict(padded_tokens, verbose=0)[0]))
    # index_word is the tokenizer's reverse vocabulary (index -> word). Index 0 is the
    # padding value and has no word, so it falls back to an empty string.
    output_word = tokenizer.index_word.get(predicted, "")
    seed_text += " " + output_word

print(seed_text)

Output: I will leave if they have already married