In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Toy corpus: four short sentences with mixed capitalisation, used to fit the
# tokenizer and to build the next-word training examples.
data = [
    "Today is a sunny day",
    "today is a Windy Day",
    "Is it sunny Today",
    "I really enjoyed walking in the snow today",
]

In [3]:
# Echo the corpus so the cell displays the raw sentences.
data

['Today is a sunny day',
 'today is a Windy Day',
 'Is it sunny Today',
 'I really enjoyed walking in the snow today']

In [4]:
# Learn a word -> integer-id vocabulary from the corpus. The printed mapping
# shows that words are lower-cased and ids start at 1. num_words=100 is well
# above the 14 distinct words here, so no word is dropped.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(data)

# Vocabulary lookup table, and every sentence re-expressed as a list of ids.
word_ids = tokenizer.word_index
text_to_numbers = tokenizer.texts_to_sequences(data)
print(word_ids)

{'today': 1, 'is': 2, 'a': 3, 'sunny': 4, 'day': 5, 'windy': 6, 'it': 7, 'i': 8, 'really': 9, 'enjoyed': 10, 'walking': 11, 'in': 12, 'the': 13, 'snow': 14}


In [5]:
# Show each sentence encoded as a list of word ids (one inner list per sentence).
print(text_to_numbers)

[[1, 2, 3, 4, 5], [1, 2, 3, 6, 5], [2, 7, 4, 1], [8, 9, 10, 11, 12, 13, 14, 1]]


In [6]:
# Number of tokens in each encoded sentence; the maximum sets the padded width.
lengths_of_strings = list(map(len, text_to_numbers))
print(max(lengths_of_strings))

8


In [7]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Bring every encoded sentence to the length of the longest one by appending
# zeros at the end ('post'); any truncation would also happen at the end.
padded_num_text_data = pad_sequences(
    text_to_numbers,
    maxlen=max(lengths_of_strings),
    padding='post',
    truncating='post',
)
print(padded_num_text_data)

[[ 1  2  3  4  5  0  0  0]
 [ 1  2  3  6  5  0  0  0]
 [ 2  7  4  1  0  0  0  0]
 [ 8  9 10 11 12 13 14  1]]


# Find the vocabulary size

In [8]:
# Word ids start at 1 and id 0 is used for padding, so the number of distinct
# ids is one more than the number of known words.
vocabulary_size = 1 + len(tokenizer.word_index)
print(vocabulary_size)

15


# Prepare input and output data for training
### Example: "Today is a sunny" [input] -> ___? ("day") [output]

In [9]:
# Model inputs: every column of the padded matrix except the last one.
X_train = padded_num_text_data[:, :-1]
X_train

array([[ 1,  2,  3,  4,  5,  0,  0],
       [ 1,  2,  3,  6,  5,  0,  0],
       [ 2,  7,  4,  1,  0,  0,  0],
       [ 8,  9, 10, 11, 12, 13, 14]])

In [10]:
# Targets: the last column of the padded matrix.
# NOTE(review): with the post-padded matrix printed above, the last column is
# the padding id 0 for every sentence shorter than the longest one (the output
# is [0 0 0 1]), so these are not the sentences' final words. The notebook
# re-pads without padding='post' further down and recomputes X_train / y_train
# from that matrix.
y_train = padded_num_text_data[:,-1]
print(y_train)

[0 0 0 1]


In [11]:
# Re-pad with zeros at the front ('pre', the pad_sequences default) so that the
# last column always holds each sentence's final word.
# This replaces the post-padded matrix built earlier under the same name.
padded_num_text_data = pad_sequences(
    text_to_numbers,
    maxlen=max(lengths_of_strings),
    padding='pre',
)
print(padded_num_text_data)

[[ 0  0  0  1  2  3  4  5]
 [ 0  0  0  1  2  3  6  5]
 [ 0  0  0  0  2  7  4  1]
 [ 8  9 10 11 12 13 14  1]]


In [12]:
# Number of distinct ids: the known words plus id 0, which is used for padding.
# (Same value as computed earlier; padding direction does not affect it.)
vocabulary_size = 1 + len(tokenizer.word_index)
print(vocabulary_size)

15


In [13]:
# Model inputs: all but the last column of the pre-padded matrix, i.e. each
# sentence without its final word (left-padded with zeros).
X_train = padded_num_text_data[:, :-1]
X_train

array([[ 0,  0,  0,  1,  2,  3,  4],
       [ 0,  0,  0,  1,  2,  3,  6],
       [ 0,  0,  0,  0,  2,  7,  4],
       [ 8,  9, 10, 11, 12, 13, 14]])

In [14]:
# Targets: the last column of the pre-padded matrix, i.e. the id of each
# sentence's final word.
y_train = padded_num_text_data[:, -1]
print(y_train)

[5 5 1 1]


In [15]:
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.layers import LSTM, GRU, SimpleRNN
from tensorflow.keras.layers import Embedding

In [16]:
# One-hot encode the next-word targets: one row per sentence, one column per
# id in the vocabulary (including id 0).
# The targets are read straight from the last column of the padded matrix
# rather than from y_train itself, so re-running this cell always produces the
# same array instead of one-hot encoding an already-encoded y_train.
# Assumes padded_num_text_data is the pre-padded matrix (final word in the last
# column), as it is when the notebook is run top to bottom.
y_train = to_categorical(padded_num_text_data[:, -1], num_classes=vocabulary_size)
print(y_train)

[[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [17]:
# Fix TensorFlow's global random seed before any layer is created so that the
# randomly initialised weights, and therefore the training run, are repeatable
# on a fresh kernel. Re-running this cell also rebuilds the same model.
tf.random.set_seed(42)

# Next-word classifier:
#   Embedding - maps each of the 7 input ids to a 10-dimensional vector
#   Flatten   - concatenates those vectors into a single 70-value row
#   Dense     - softmax over the vocabulary ids (one probability per id)
model = Sequential()
model.add(Embedding(vocabulary_size, 10,
                    input_length=max(lengths_of_strings) - 1))
model.add(Flatten())
model.add(Dense(vocabulary_size, activation='softmax'))

In [18]:
# Print the layer-by-layer output shapes and parameter counts of the model.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 7, 10)             150       
                                                                 
 flatten (Flatten)           (None, 70)                0         
                                                                 
 dense (Dense)               (None, 15)                1065      
                                                                 
Total params: 1,215
Trainable params: 1,215
Non-trainable params: 0
_________________________________________________________________


In [19]:
# Targets are one-hot vectors, hence categorical cross-entropy; accuracy is
# tracked as an additional metric during training.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [20]:
# Train on the four (prefix, next-word) examples for 500 passes over the data.
# The History object returned by fit is the cell's displayed value.
model.fit(x=X_train, y=y_train, epochs=500)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<keras.callbacks.History at 0x233867d4df0>

In [21]:
# Length of the longest encoded training sentence; the prediction cell pads
# new prompts to this value minus one.
max_sequence_length = max(lengths_of_strings)

In [23]:
# Example prediction
import numpy as np

# Encode the prompt the same way as the training inputs: word ids, left-padded
# with zeros to the model's input width (longest training sentence minus the
# target word).
input_sequence = ["today is a sunny"]
input_sequence = tokenizer.texts_to_sequences(input_sequence)
input_sequence = pad_sequences(input_sequence, maxlen=max_sequence_length - 1)

# One probability per vocabulary id for the single prompt in the batch.
predicted_probs = model.predict(input_sequence)[0]
# Cast to a built-in int so the dictionary lookup below does not depend on a
# NumPy scalar type.
predicted_class_index = int(np.argmax(predicted_probs))

# Convert predicted class index back to word.
# Id 0 is the padding slot and has no entry in index_word, so fall back to a
# placeholder instead of raising KeyError if the model ever predicts it.
predicted_word = tokenizer.index_word.get(predicted_class_index, "<pad>")

print("Predicted word:", predicted_word)

Predicted word: day


# Predict on the following text
## data3 = """ Jack and Jill went up the hill\n
##		To fetch a pail of water\n
##		Jack fell down and broke his crown\n
##		And Jill came tumbling after\n """