In [1]:
import tensorflow as tf 
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Embedding, LSTM, Dense 
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences 
import numpy as np 
import regex as re 

#### Strategy

Corpus = W1 w2 w3 w4. W5 w6 w7 ...... wn


X______________________y

w1_____________________w2

w1 w2__________________w3

w1 w2 w3_______________w4

w5______________________w6

w5 w6_________________w7 and so on

In [2]:
def file_to_sentence_list(file_path): 
    with open(file_path, 'r') as file: 
        text = file.read() 
  
    # Splitting the text into sentences using 
    # delimiters like '.', '?', and '!' 
    sentences = [sentence.strip() for sentence in re.split( 
        r'(?<=[.!?])\s+', text) if sentence.strip()] 
  
    return sentences 

In [18]:
file_path = 'pizza.txt'
text_data = file_to_sentence_list(file_path)

#print (text_data)
  
# Tokenize the text data to build vocabulary
tokenizer = Tokenizer() 
tokenizer.fit_on_texts(text_data) 
total_words = len(tokenizer.word_index) + 1  

print(total_words)
#print(text_data[:5])
print(tokenizer.word_index)
  
# Create input sequences
input_sequences = [] 
for line in text_data: 
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)): 
        n_gram_sequence = token_list[:i+1] 
        input_sequences.append(n_gram_sequence)
        #print(input_sequences)
  
# Pad sequences and split into predictors and label 
max_sequence_len = max([len(seq) for seq in input_sequences]) 
input_sequences = np.array(pad_sequences( 
    input_sequences, maxlen=max_sequence_len, padding='pre')) 
X, y = input_sequences[:, :-1], input_sequences[:, -1] 

# Convert target data to one-hot encoding 
y = tf.keras.utils.to_categorical(y, num_classes=total_words) 

687
{'the': 1, 'and': 2, 'pizza': 3, 'of': 4, 'a': 5, 'to': 6, 'in': 7, 'has': 8, 'its': 9, 'for': 10, 'with': 11, 'it': 12, 'that': 13, 'is': 14, 'as': 15, 'culinary': 16, 'from': 17, 'become': 18, 'their': 19, 'have': 20, 'on': 21, 'flavors': 22, 'cheese': 23, 'toppings': 24, 'also': 25, 'delivery': 26, 'food': 27, 'people': 28, 'world': 29, 'traditional': 30, 'made': 31, 'experience': 32, 'our': 33, 'pizzerias': 34, 'dish': 35, 'diverse': 36, 'like': 37, 'crust': 38, 'delight': 39, 'symbol': 40, 'pizzas': 41, 'more': 42, 'making': 43, 'or': 44, 'iconic': 45, 'creativity': 46, 'cultural': 47, 'italy': 48, 'an': 49, 'combinations': 50, 'ancient': 51, 'who': 52, 'ingredients': 53, 'we': 54, 'this': 55, 'style': 56, 'home': 57, 'indulgence': 58, 'beyond': 59, 'global': 60, 'inspired': 61, 'options': 62, 'those': 63, 'not': 64, 'but': 65, 'together': 66, 'allowing': 67, 'just': 68, 'comfort': 69, 'local': 70, 'may': 71, 'favorite': 72, 'will': 73, 'taste': 74, 'fusion': 75, 'into': 76, '

In [19]:
print (X.shape)

print(y.shape)

print(y)

(1628, 39)
(1628, 687)
[[0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]]


In [25]:
model = Sequential() 

#Embedding layer each word is represented in 100 dim dense vector output that is fed into LSTM
#each sentence has max_sequence_len-1 (39) input size and each word is represented as a 100 dim vector 

model.add(Embedding(max_sequence_len-1, 100, trainable = False)) 

#39 time stamps for each sentence to the network of dimension 150 
model.add(LSTM(150)) 

#output is a softmax layer of vector 687x1 
model.add(Dense(total_words, activation='softmax')) 

model.compile(loss='categorical_crossentropy', 
                optimizer='adam', metrics=['accuracy']) 

687


In [26]:
model.fit(X, y, epochs=100, verbose=1) 

Epoch 1/100
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 42ms/step - accuracy: 0.0481 - loss: 6.3126
Epoch 2/100
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 57ms/step - accuracy: 0.0484 - loss: 5.7080
Epoch 3/100
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 99ms/step - accuracy: 0.0598 - loss: 5.6650
Epoch 4/100
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 89ms/step - accuracy: 0.0849 - loss: 5.6105
Epoch 5/100
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 77ms/step - accuracy: 0.0854 - loss: 5.5624
Epoch 6/100
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 79ms/step - accuracy: 0.0816 - loss: 5.5103
Epoch 7/100
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 74ms/step - accuracy: 0.1014 - loss: 5.3491
Epoch 8/100
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 87ms/step - accuracy: 0.0933 - loss: 5.2070
Epoch 9/100
[1m51/51[0m [32m━━━━━━━━━

<keras.src.callbacks.history.History at 0x27c485ec470>

In [27]:
# Generate next word predictions 
seed_text = "Pizza has "
next_words = 5
  
for _ in range(next_words): 
    token_list = tokenizer.texts_to_sequences([seed_text])[0] 
    token_list = pad_sequences( 
        [token_list], maxlen=max_sequence_len-1, padding='pre') 
    predicted_probs = model.predict(token_list) 
    predicted_word = tokenizer.index_word[np.argmax(predicted_probs)] 
    seed_text += " " + predicted_word 
    print("Next predicted words:", seed_text) 

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 147ms/step
Next predicted words: Pizza has  become
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
Next predicted words: Pizza has  become a
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
Next predicted words: Pizza has  become a symbol
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
Next predicted words: Pizza has  become a symbol of
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
Next predicted words: Pizza has  become a symbol of comfort


#### To improve peformance

1. Train with more data
2. Hyper parameter tuning - optimizer, epoch, learning rate, input nodes etc.
3. Advanced architectures - stacked LSTM, Bi directional LSTM, Transformers (GPT, BERT...)