Importing the dataset using a Kaggle API key

In [2]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

In [3]:
# Download the dataset archive with the Kaggle CLI (the recorded output shows it landing in /content).
!kaggle datasets download -d ronikdedhia/next-word-prediction

Downloading next-word-prediction.zip to /content
100% 228k/228k [00:00<00:00, 552kB/s]
100% 228k/228k [00:00<00:00, 552kB/s]


In [4]:
import zipfile

# Extract the downloaded archive; the context manager closes the file
# even if extraction raises.
with zipfile.ZipFile('/content/next-word-prediction.zip', 'r') as zip_ref:
    zip_ref.extractall('/content')

Importing necessary libraries and modules

In [5]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense
from keras.models import load_model
import time

In [6]:
# This is a text-generation task framed as supervised learning:
# the inputs and the targets are both built from different parts of the same text.

In [7]:
# Keras tokenizer with default settings; it is fitted on the corpus in a later cell.
tokenizer=Tokenizer()

In [8]:
file_path = '/content/1661-0.txt'
# Decode explicitly as UTF-8: the corpus contains curly quotes, and the
# platform default encoding is not guaranteed to be UTF-8 everywhere.
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

In [9]:
# Build the vocabulary from the whole text, passed as a single document.
tokenizer.fit_on_texts([text])

In [10]:
# Inspect the word -> integer index mapping (indices start at 1 in the recorded output).
# NOTE(review): this displays the whole mapping; a small slice would keep the output short.
tokenizer.word_index

{'the': 1,
 'and': 2,
 'to': 3,
 'of': 4,
 'a': 5,
 'i': 6,
 '”': 7,
 'in': 8,
 'that': 9,
 'it': 10,
 'he': 11,
 'was': 12,
 'you': 13,
 'his': 14,
 'is': 15,
 'my': 16,
 'have': 17,
 'with': 18,
 'as': 19,
 'had': 20,
 'at': 21,
 'which': 22,
 'for': 23,
 'be': 24,
 'not': 25,
 'me': 26,
 'but': 27,
 'from': 28,
 'we': 29,
 'this': 30,
 'said': 31,
 'upon': 32,
 'there': 33,
 'holmes': 34,
 'him': 35,
 'so': 36,
 'her': 37,
 'she': 38,
 'all': 39,
 '’': 40,
 'been': 41,
 'your': 42,
 'on': 43,
 'very': 44,
 'by': 45,
 'one': 46,
 'are': 47,
 '“i': 48,
 'were': 49,
 'an': 50,
 'no': 51,
 'would': 52,
 'out': 53,
 'what': 54,
 'then': 55,
 'up': 56,
 'when': 57,
 'man': 58,
 'could': 59,
 'has': 60,
 'do': 61,
 'into': 62,
 'or': 63,
 'little': 64,
 'will': 65,
 'who': 66,
 'mr': 67,
 'if': 68,
 'some': 69,
 'down': 70,
 'see': 71,
 'now': 72,
 'our': 73,
 'should': 74,
 'may': 75,
 'am': 76,
 'us': 77,
 'over': 78,
 'they': 79,
 'can': 80,
 'more': 81,
 'think': 82,
 'about': 83,
 'mu

In [11]:
# Number of distinct tokens in the fitted vocabulary.
len(tokenizer.word_index)

8931

In [12]:
# Build the training n-grams: for each line of the text, every prefix of
# length >= 2 of its token sequence becomes one sample.
input_sequence = []
for sentence in text.split('\n'):
    # Tokenize each line exactly once (the earlier duplicate call discarded its result).
    tokenized_sentence = tokenizer.texts_to_sequences([sentence])[0]

    # i is the position of the target word; the prefix up to and including
    # position i forms one sample.
    for i in range(1, len(tokenized_sentence)):
        input_sequence.append(tokenized_sentence[:i + 1])


In [13]:
# Inspect the generated prefixes; each list extends the previous one by one token.
# NOTE(review): this displays the full list; a slice such as input_sequence[:10] would be enough.
input_sequence

[[145, 4790],
 [145, 4790, 1],
 [145, 4790, 1, 1020],
 [145, 4790, 1, 1020, 4],
 [145, 4790, 1, 1020, 4, 128],
 [145, 4790, 1, 1020, 4, 128, 34],
 [145, 4790, 1, 1020, 4, 128, 34, 45],
 [145, 4790, 1, 1020, 4, 128, 34, 45, 611],
 [145, 4790, 1, 1020, 4, 128, 34, 45, 611, 2235],
 [145, 4790, 1, 1020, 4, 128, 34, 45, 611, 2235, 2236],
 [30, 1021],
 [30, 1021, 15],
 [30, 1021, 15, 23],
 [30, 1021, 15, 23, 1],
 [30, 1021, 15, 23, 1, 275],
 [30, 1021, 15, 23, 1, 275, 4],
 [30, 1021, 15, 23, 1, 275, 4, 394],
 [30, 1021, 15, 23, 1, 275, 4, 394, 2237],
 [30, 1021, 15, 23, 1, 275, 4, 394, 2237, 21],
 [30, 1021, 15, 23, 1, 275, 4, 394, 2237, 21, 51],
 [30, 1021, 15, 23, 1, 275, 4, 394, 2237, 21, 51, 1676],
 [30, 1021, 15, 23, 1, 275, 4, 394, 2237, 21, 51, 1676, 2],
 [30, 1021, 15, 23, 1, 275, 4, 394, 2237, 21, 51, 1676, 2, 18],
 [572, 51],
 [572, 51, 3398],
 [572, 51, 3398, 3399],
 [572, 51, 3398, 3399, 13],
 [572, 51, 3398, 3399, 13, 75],
 [572, 51, 3398, 3399, 13, 75, 817],
 [572, 51, 3398, 33

In [14]:
# Length of the longest prefix; every sample is padded to this length.
max_len = max(map(len, input_sequence))

In [15]:
# Display the longest prefix length.
max_len

20

Padding the sequences and defining X and y: in each padded sub-sequence, every token except the last is the input (X) and the last token is the target (y)

In [16]:
# Pad on the left so every prefix has length max_len and its final token
# stays in the last column.
padded_input_sequence = pad_sequences(
    input_sequence,
    maxlen=max_len,
    padding='pre',
)

In [18]:
# The last column is the word to predict; all columns before it are the context.
X, y = padded_input_sequence[:, :-1], padded_input_sequence[:, -1]

In [19]:
# Multi-class classification task: each target is one word index.

In [20]:
# (number of samples, context length)
X.shape

(101619, 19)

In [21]:
# One integer target per sample at this point (before one-hot encoding).
y.shape

(101619,)

In [22]:
# One-hot encode the target indices.
# NOTE(review): 8950 is hard-coded and has to match the Embedding/Dense sizes of the
# model defined further down; the tokenizer reported 8931 words, so
# len(tokenizer.word_index) + 1 would be the tight value.
# NOTE(review): this overwrites y, so re-running the cell would encode the
# already one-hot matrix again.
y=to_categorical(y,num_classes=8950)

In [23]:
# (number of samples, number of classes) after one-hot encoding.
y.shape

(101619, 8950)

In [24]:
# First one-hot target row.
y[0]

array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

Making the Tensorflow model

In [25]:
# Derive the layer sizes from the training arrays instead of repeating
# literals, so the model cannot drift out of sync with the data.
num_classes = y.shape[1]   # width of the one-hot targets
context_len = X.shape[1]   # number of context tokens per sample

# Embedding -> LSTM -> softmax over all classes.
model = Sequential()
model.add(Embedding(num_classes, 100, input_length=context_len))
model.add(LSTM(150))
model.add(Dense(num_classes, activation='softmax'))  # softmax as the task is multiclass classification


In [26]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 19, 100)           895000    
                                                                 
 lstm (LSTM)                 (None, 150)               150600    
                                                                 
 dense (Dense)               (None, 8950)              1351450   
                                                                 
Total params: 2397050 (9.14 MB)
Trainable params: 2397050 (9.14 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [27]:
# Categorical cross-entropy matches the one-hot targets; accuracy is reported during training.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [28]:
# Train for 15 epochs on all samples. No validation data is passed, so the
# reported accuracy is measured on the training set only.
# NOTE(review): no random seed is set anywhere above, so results will vary between runs.
model.fit(X,y,epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.src.callbacks.History at 0x7912cc4e3dc0>

In [29]:
# Persist the trained model to 'SD.h5'.
# NOTE(review): the fitted tokenizer is not saved alongside it; reloading the model
# elsewhere would presumably also need the same word index — confirm.
model.save('SD.h5')

  saving_api.save_model(


Testing on a random sample text

In [31]:
# Greedy generation: repeatedly predict the most likely next word and append it.
# `time` is already imported in the imports cell at the top of the notebook.
input_text = "he"
context_len = X.shape[1]  # the model was trained on inputs of exactly this length

for _ in range(10):
    token_text = tokenizer.texts_to_sequences([input_text])
    padded_token_text = pad_sequences(token_text, maxlen=context_len, padding='pre')
    # verbose=0 keeps the per-call progress bar out of the output.
    predictions = model.predict(padded_token_text, verbose=0)
    predicted_index = int(np.argmax(predictions))

    # The output layer has more units than the vocabulary has words, and
    # index 0 is the padding value, so not every class id maps to a word.
    # Stop instead of raising KeyError when that happens.
    predicted_word = tokenizer.index_word.get(predicted_index)
    if predicted_word is None:
        break

    input_text += " " + predicted_word
    print(input_text)

    # Wait for 2 seconds
    time.sleep(2)


he was
he was in
he was in the
he was in the front
he was in the front forward
he was in the front forward opened
he was in the front forward opened the
he was in the front forward opened the window
he was in the front forward opened the window sprang
he was in the front forward opened the window sprang into
