In [1]:
# Training corpus for the next-word model: seven sentences separated by newlines.
# Later cells split on the newline so that each sentence is tokenized on its own.
text = """Data plays a vital role in our everyday life.
Directly or indirectly, for daily life decisions, we depend on some data, be it choosing a novel to read from a list of books, buying a thing after considering the budget, and so on.
Have you ever imagined searching for something on Google or Yahoo generates a lot of data?
This data is essential to analyze user experiences.
Getting recommendations on various e-commerce websites after buying a product and tracking parcels during delivery are part of Data Analytics which involves analyzing the raw data to make informed decisions.
But this raw data does not help make decisions if it has some redundancy, inconsistency, or inaccuracy.
Therefore, this data needs to be cleaned before considering for analysis."""

In [2]:
# Show the raw string; its repr makes the newline separators between sentences visible.
text

'Data plays a vital role in our everyday life.\nDirectly or indirectly, for daily life decisions, we depend on some data, be it choosing a novel to read from a list of books, buying a thing after considering the budget, and so on.\nHave you ever imagined searching for something on Google or Yahoo generates a lot of data?\nThis data is essential to analyze user experiences.\nGetting recommendations on various e-commerce websites after buying a product and tracking parcels during delivery are part of Data Analytics which involves analyzing the raw data to make informed decisions.\nBut this raw data does not help make decisions if it has some redundancy, inconsistency, or inaccuracy.\nTherefore, this data needs to be cleaned before considering for analysis.'

In [3]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [4]:
# Instantiate a Keras Tokenizer with its default settings; it has no
# vocabulary yet — that is learned when it is fitted on the corpus.

tokenizer = Tokenizer()

In [5]:
# Learn the vocabulary from the corpus (passed as a one-element list of texts).
# This populates tokenizer.word_index, the word -> integer mapping used below.
tokenizer.fit_on_texts([text])

In [6]:
# Number of distinct words the tokenizer learned from the corpus.
len(tokenizer.word_index)

87

In [7]:
# Show the corpus one sentence per line; sentences are delimited by '\n'.
print(*text.split('\n'), sep='\n')

Data plays a vital role in our everyday life.
Directly or indirectly, for daily life decisions, we depend on some data, be it choosing a novel to read from a list of books, buying a thing after considering the budget, and so on.
Have you ever imagined searching for something on Google or Yahoo generates a lot of data?
This data is essential to analyze user experiences.
Getting recommendations on various e-commerce websites after buying a product and tracking parcels during delivery are part of Data Analytics which involves analyzing the raw data to make informed decisions.
But this raw data does not help make decisions if it has some redundancy, inconsistency, or inaccuracy.
Therefore, this data needs to be cleaned before considering for analysis.


In [8]:
# Encode all sentences in one call and print each sentence's list of word indices.
for token_ids in tokenizer.texts_to_sequences(text.split('\n')):
  print(token_ids)

[1, 21, 2, 22, 23, 24, 25, 26, 10]
[27, 5, 28, 6, 29, 10, 7, 30, 31, 3, 11, 1, 12, 13, 32, 2, 33, 4, 34, 35, 2, 36, 8, 37, 14, 2, 38, 15, 16, 17, 39, 18, 40, 3]
[41, 42, 43, 44, 45, 6, 46, 3, 47, 5, 48, 49, 2, 50, 8, 1]
[9, 1, 51, 52, 4, 53, 54, 55]
[56, 57, 3, 58, 59, 60, 61, 15, 14, 2, 62, 18, 63, 64, 65, 66, 67, 68, 8, 1, 69, 70, 71, 72, 17, 19, 1, 4, 20, 73, 7]
[74, 9, 19, 1, 75, 76, 77, 20, 7, 78, 13, 79, 11, 80, 81, 5, 82]
[83, 9, 1, 84, 4, 12, 85, 86, 16, 6, 87]


In [9]:
# Build the training n-grams: for every sentence, take each prefix of its
# token list that holds at least two tokens (context word(s) + the next word).
# Prefixes never cross sentence boundaries.
input_sequences = [
    token_ids[:end]
    for token_ids in tokenizer.texts_to_sequences(text.split('\n'))
    for end in range(2, len(token_ids) + 1)
]

In [10]:
# Inspect the generated prefixes: each row extends the previous one by one token
# until the sentence ends, then the next sentence starts again at two tokens.
input_sequences

[[1, 21],
 [1, 21, 2],
 [1, 21, 2, 22],
 [1, 21, 2, 22, 23],
 [1, 21, 2, 22, 23, 24],
 [1, 21, 2, 22, 23, 24, 25],
 [1, 21, 2, 22, 23, 24, 25, 26],
 [1, 21, 2, 22, 23, 24, 25, 26, 10],
 [27, 5],
 [27, 5, 28],
 [27, 5, 28, 6],
 [27, 5, 28, 6, 29],
 [27, 5, 28, 6, 29, 10],
 [27, 5, 28, 6, 29, 10, 7],
 [27, 5, 28, 6, 29, 10, 7, 30],
 [27, 5, 28, 6, 29, 10, 7, 30, 31],
 [27, 5, 28, 6, 29, 10, 7, 30, 31, 3],
 [27, 5, 28, 6, 29, 10, 7, 30, 31, 3, 11],
 [27, 5, 28, 6, 29, 10, 7, 30, 31, 3, 11, 1],
 [27, 5, 28, 6, 29, 10, 7, 30, 31, 3, 11, 1, 12],
 [27, 5, 28, 6, 29, 10, 7, 30, 31, 3, 11, 1, 12, 13],
 [27, 5, 28, 6, 29, 10, 7, 30, 31, 3, 11, 1, 12, 13, 32],
 [27, 5, 28, 6, 29, 10, 7, 30, 31, 3, 11, 1, 12, 13, 32, 2],
 [27, 5, 28, 6, 29, 10, 7, 30, 31, 3, 11, 1, 12, 13, 32, 2, 33],
 [27, 5, 28, 6, 29, 10, 7, 30, 31, 3, 11, 1, 12, 13, 32, 2, 33, 4],
 [27, 5, 28, 6, 29, 10, 7, 30, 31, 3, 11, 1, 12, 13, 32, 2, 33, 4, 34],
 [27, 5, 28, 6, 29, 10, 7, 30, 31, 3, 11, 1, 12, 13, 32, 2, 33, 4, 34, 35],


In [11]:
# Length of the longest prefix; every sequence is padded to this length below.
max_len = max(map(len, input_sequences))

In [12]:
# Longest prefix length (equals the token count of the longest sentence).
max_len

34

# Zero Padding for uniform data

In [13]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Left-pad ('pre') every prefix with zeros up to max_len so that all rows have
# the same width and the word to predict always sits in the last column.
padded_input_sequences = pad_sequences(
    input_sequences,
    padding='pre',
    maxlen=max_len,
)

In [14]:
# Inspect the padded matrix: zeros on the left, the original tokens right-aligned.
padded_input_sequences

array([[ 0,  0,  0, ...,  0,  1, 21],
       [ 0,  0,  0, ...,  1, 21,  2],
       [ 0,  0,  0, ..., 21,  2, 22],
       ...,
       [ 0,  0,  0, ..., 85, 86, 16],
       [ 0,  0,  0, ..., 86, 16,  6],
       [ 0,  0,  0, ..., 16,  6, 87]], dtype=int32)

In [15]:
# Split each padded row into the context (every column but the last) and the
# target word index (the last column).
X, y = padded_input_sequences[:, :-1], padded_input_sequences[:, -1]

In [16]:
# Context matrix: the padded prefixes with their final token removed.
X

array([[ 0,  0,  0, ...,  0,  0,  1],
       [ 0,  0,  0, ...,  0,  1, 21],
       [ 0,  0,  0, ...,  1, 21,  2],
       ...,
       [ 0,  0,  0, ..., 12, 85, 86],
       [ 0,  0,  0, ..., 85, 86, 16],
       [ 0,  0,  0, ..., 86, 16,  6]], dtype=int32)

In [17]:
# Target vector: the word index that follows each context row in X.
y

array([21,  2, 22, 23, 24, 25, 26, 10,  5, 28,  6, 29, 10,  7, 30, 31,  3,
       11,  1, 12, 13, 32,  2, 33,  4, 34, 35,  2, 36,  8, 37, 14,  2, 38,
       15, 16, 17, 39, 18, 40,  3, 42, 43, 44, 45,  6, 46,  3, 47,  5, 48,
       49,  2, 50,  8,  1,  1, 51, 52,  4, 53, 54, 55, 57,  3, 58, 59, 60,
       61, 15, 14,  2, 62, 18, 63, 64, 65, 66, 67, 68,  8,  1, 69, 70, 71,
       72, 17, 19,  1,  4, 20, 73,  7,  9, 19,  1, 75, 76, 77, 20,  7, 78,
       13, 79, 11, 80, 81,  5, 82,  9,  1, 84,  4, 12, 85, 86, 16,  6, 87],
      dtype=int32)

In [18]:
# Word -> index mapping learned by the tokenizer; indices start at 1.
tokenizer.word_index

{'data': 1,
 'a': 2,
 'on': 3,
 'to': 4,
 'or': 5,
 'for': 6,
 'decisions': 7,
 'of': 8,
 'this': 9,
 'life': 10,
 'some': 11,
 'be': 12,
 'it': 13,
 'buying': 14,
 'after': 15,
 'considering': 16,
 'the': 17,
 'and': 18,
 'raw': 19,
 'make': 20,
 'plays': 21,
 'vital': 22,
 'role': 23,
 'in': 24,
 'our': 25,
 'everyday': 26,
 'directly': 27,
 'indirectly': 28,
 'daily': 29,
 'we': 30,
 'depend': 31,
 'choosing': 32,
 'novel': 33,
 'read': 34,
 'from': 35,
 'list': 36,
 'books': 37,
 'thing': 38,
 'budget': 39,
 'so': 40,
 'have': 41,
 'you': 42,
 'ever': 43,
 'imagined': 44,
 'searching': 45,
 'something': 46,
 'google': 47,
 'yahoo': 48,
 'generates': 49,
 'lot': 50,
 'is': 51,
 'essential': 52,
 'analyze': 53,
 'user': 54,
 'experiences': 55,
 'getting': 56,
 'recommendations': 57,
 'various': 58,
 'e': 59,
 'commerce': 60,
 'websites': 61,
 'product': 62,
 'tracking': 63,
 'parcels': 64,
 'during': 65,
 'delivery': 66,
 'are': 67,
 'part': 68,
 'analytics': 69,
 'which': 70,
 'invo

# Since this is a classification problem (one class per vocabulary word) and not regression, we need to one-hot encode the target labels.

In [19]:
from tensorflow.keras.utils import to_categorical

# Number of classes = vocabulary size + 1: word indices start at 1 and index 0
# is the padding value. Derived from the tokenizer instead of hard-coding 88,
# so the cell keeps working if the corpus changes.
vocab_size = len(tokenizer.word_index) + 1

# Encode from the padded array's last column rather than from `y` itself, so
# re-running this cell cannot one-hot encode an already one-hot encoded `y`.
y = to_categorical(padded_input_sequences[:, -1], num_classes=vocab_size)

In [20]:
# (number of samples, number of classes) after one-hot encoding.
y.shape

(119, 88)

In [21]:
# (number of samples, context length); context length is max_len minus the target column.
X.shape

(119, 33)

### **Model Building**

In [22]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [23]:
# Derive the layer sizes from the prepared data instead of hard-coding 88 / 33.
vocab_size = len(tokenizer.word_index) + 1  # +1 because index 0 is the padding value
context_len = X.shape[1]                    # number of context tokens per sample

model = Sequential()
# An explicit Input layer replaces Embedding's `input_length` argument, which
# newer Keras releases deprecate; declaring the input shape up front also lets
# model.summary() report output shapes before the model has been trained.
model.add(tf.keras.Input(shape=(context_len,)))
# Map every word index to a 100-dimensional dense vector.
model.add(Embedding(vocab_size, 100))
# Single LSTM layer with 150 units; only its final output is passed on.
model.add(LSTM(150))
# One probability per vocabulary index for the next word.
model.add(Dense(vocab_size, activation='softmax'))



## `model.add(LSTM(150))` => the LSTM layer has 150 units, so its output for each input sequence is a 150-dimensional vector.

In [24]:
# Targets are one-hot vectors over the vocabulary, hence categorical
# cross-entropy; accuracy is tracked as the fraction of correct next words.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [25]:
# Print the layer-by-layer overview of the model.
model.summary()

In [26]:
# Sanity check before training: the second dimension must match the model's input width.
X.shape

(119, 33)

In [27]:
# Sanity check before training: the second dimension must match the Dense output size.
y.shape

(119, 88)

In [28]:
# Train on all samples for 100 epochs. No validation data is passed, so the
# reported accuracy is training accuracy only.
model.fit(X,y,epochs=100)

Epoch 1/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 64ms/step - accuracy: 0.0034 - loss: 4.4782    
Epoch 2/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step - accuracy: 0.0922 - loss: 4.4594
Epoch 3/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - accuracy: 0.0795 - loss: 4.4253
Epoch 4/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - accuracy: 0.0550 - loss: 4.3326
Epoch 5/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.0610 - loss: 4.2693
Epoch 6/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - accuracy: 0.0688 - loss: 4.2654
Epoch 7/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - accuracy: 0.0613 - loss: 4.2414
Epoch 8/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step - accuracy: 0.0527 - loss: 4.1612
Epoch 9/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0

<keras.src.callbacks.history.History at 0x7e4b51f2f210>

#### **Test the model**

In [29]:
text2 = "Data"

# tokenization: map the seed text to word indices with the fitted tokenizer
token_text = tokenizer.texts_to_sequences([text2])[0]
# padding: pad to the context width the model was trained on (X.shape[1])
# instead of a hard-coded 33, so this stays correct if the corpus changes
padded_text = pad_sequences([token_text], maxlen=X.shape[1], padding='pre')
# model prediction: one probability per vocabulary index for the next word
model.predict(padded_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 205ms/step


array([[3.99350283e-05, 1.05470521e-02, 1.08193923e-02, 2.31505348e-03,
        2.12406390e-03, 1.15899090e-02, 5.03256405e-03, 5.36279731e-05,
        2.14405751e-04, 9.81052518e-02, 8.25280513e-05, 2.39270503e-05,
        3.27085494e-03, 7.82370262e-05, 8.39188651e-05, 3.46236666e-05,
        2.76352279e-04, 7.27597362e-05, 6.55285839e-05, 7.24196993e-03,
        2.11883889e-04, 6.22474670e-01, 4.00825590e-03, 1.33341854e-03,
        8.70304473e-04, 3.21527827e-04, 1.53978908e-04, 2.37743807e-05,
        4.72352607e-03, 6.36460900e-04, 1.44636404e-04, 6.03020490e-05,
        1.45484024e-04, 6.46055632e-05, 4.94307496e-05, 1.04381666e-04,
        1.16434667e-05, 8.08096593e-05, 8.80284497e-05, 1.45652826e-04,
        7.95012602e-06, 2.87989551e-05, 6.36245217e-03, 6.39781822e-03,
        1.78222777e-03, 2.99228937e-04, 1.10665205e-04, 1.27454958e-04,
        3.45684020e-05, 1.33114110e-04, 1.49696323e-04, 1.03713378e-01,
        5.42739965e-03, 6.81088073e-04, 1.57447110e-04, 4.622389

In [30]:
import numpy as np

# Index of the largest probability in the prediction for `padded_text`,
# i.e. the word index the model considers most likely to come next.
next_word_probs = model.predict(padded_text)
pos = next_word_probs.argmax()
pos

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step


np.int64(21)

In [31]:
# Word -> index mapping again, to look up which word the predicted index stands for.
tokenizer.word_index

{'data': 1,
 'a': 2,
 'on': 3,
 'to': 4,
 'or': 5,
 'for': 6,
 'decisions': 7,
 'of': 8,
 'this': 9,
 'life': 10,
 'some': 11,
 'be': 12,
 'it': 13,
 'buying': 14,
 'after': 15,
 'considering': 16,
 'the': 17,
 'and': 18,
 'raw': 19,
 'make': 20,
 'plays': 21,
 'vital': 22,
 'role': 23,
 'in': 24,
 'our': 25,
 'everyday': 26,
 'directly': 27,
 'indirectly': 28,
 'daily': 29,
 'we': 30,
 'depend': 31,
 'choosing': 32,
 'novel': 33,
 'read': 34,
 'from': 35,
 'list': 36,
 'books': 37,
 'thing': 38,
 'budget': 39,
 'so': 40,
 'have': 41,
 'you': 42,
 'ever': 43,
 'imagined': 44,
 'searching': 45,
 'something': 46,
 'google': 47,
 'yahoo': 48,
 'generates': 49,
 'lot': 50,
 'is': 51,
 'essential': 52,
 'analyze': 53,
 'user': 54,
 'experiences': 55,
 'getting': 56,
 'recommendations': 57,
 'various': 58,
 'e': 59,
 'commerce': 60,
 'websites': 61,
 'product': 62,
 'tracking': 63,
 'parcels': 64,
 'during': 65,
 'delivery': 66,
 'are': 67,
 'part': 68,
 'analytics': 69,
 'which': 70,
 'invo

In [32]:
# Translate the predicted index back into a word using the tokenizer's
# index -> word map instead of scanning every entry of word_index.
# Index 0 is the padding value and has no word; nothing is printed in that case.
predicted_word = tokenizer.index_word.get(int(pos))
if predicted_word is not None:
  print(predicted_word)

plays


In [33]:
text3 = "Data plays a vital role"

# tokenization: map the seed text to word indices with the fitted tokenizer
token_text3 = tokenizer.texts_to_sequences([text3])[0]
# padding: pad to the context width the model was trained on (X.shape[1])
# instead of a hard-coded 33
padded_text3 = pad_sequences([token_text3], maxlen=X.shape[1], padding='pre')
# model prediction: predict once and keep the index of the most likely next word
pos3 = np.argmax(model.predict(padded_text3))

# Translate the index back into a word via the tokenizer's index -> word map.
# Index 0 is the padding value and has no word; nothing is printed in that case.
predicted_word3 = tokenizer.index_word.get(int(pos3))
if predicted_word3 is not None:
  print(predicted_word3)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
in


In [34]:
text3 = "Data is a vital role in our everyday life"

# tokenization: map the seed text to word indices with the fitted tokenizer
token_text3 = tokenizer.texts_to_sequences([text3])[0]
# padding: pad to the context width the model was trained on (X.shape[1])
# instead of a hard-coded 33
padded_text3 = pad_sequences([token_text3], maxlen=X.shape[1], padding='pre')
# model prediction: predict once and keep the index of the most likely next word
pos3 = np.argmax(model.predict(padded_text3))

# Translate the index back into a word via the tokenizer's index -> word map.
# Index 0 is the padding value and has no word; nothing is printed in that case.
predicted_word3 = tokenizer.index_word.get(int(pos3))
if predicted_word3 is not None:
  print(predicted_word3)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
life
