In [7]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, activations, models, preprocessing, utils
import pandas as pd
import json
import nltk
from nltk.tokenize import word_tokenize


In [8]:
# Load the JSON dataset
file_path = 'D:/CSIT/python/gitChatbot/cakechatbot/intents.json'
with open(file_path, 'r') as file:
    data = json.load(file)

# Initialize lists to store processed data for each type
recipes = []
conversations = []

# Process each entry
for item in data:
    if "recipe_name" in item["output"]:
        # Process as a recipe entry
        ingredients = []
        # Tokenize each ingredient
        for ingredient in item["output"]["ingredients"]:
            tokenized_ingredient = word_tokenize(ingredient["ingredient"])
            ingredients.append({
                "ingredient": tokenized_ingredient,
                "quantity": ingredient["quantity"]
            })

        # Tokenize each instruction
        instructions = [word_tokenize(instruction) for instruction in item["output"]["instructions"]]

        recipe_entry = {
            "recipe_name": word_tokenize(item["output"]["recipe_name"]),
            "ingredients": ingredients,
            "instructions": instructions,
            "temperature": item["output"].get("temperature"),
            "baking_time": item["output"].get("baking_time"),
            "substitutions": item["output"].get("substitutions", []),
            "serving_size": item["output"].get("serving_size"),
            "adjustments": item["output"].get("adjustments", [])
        }
        recipes.append(recipe_entry)
    else:
        # Process as a conversation entry
        conversation_entry = {
            "input": word_tokenize(item["input"]),
            "output": word_tokenize(item["output"])
        }
        conversations.append(conversation_entry)

# Example: Print out the first processed recipe and conversation
print("First Tokenized Recipe Entry:", recipes[0])
print("First Tokenized Conversation Entry:", conversations[0])

First Tokenized Recipe Entry: {'recipe_name': ['Chocolate', 'Cake'], 'ingredients': [{'ingredient': ['flour'], 'quantity': '1 1/2 cups'}, {'ingredient': ['cocoa', 'powder'], 'quantity': '1/2 cup'}, {'ingredient': ['baking', 'soda'], 'quantity': '1 tsp'}, {'ingredient': ['sugar'], 'quantity': '1 cup'}, {'ingredient': ['butter'], 'quantity': '1/2 cup'}, {'ingredient': ['milk'], 'quantity': '1 cup'}, {'ingredient': ['eggs'], 'quantity': '2'}], 'instructions': [['Preheat', 'the', 'oven', 'to', '350°F', '(', '175°C', ')', '.'], ['Grease', 'and', 'flour', 'a', '9-inch', 'round', 'cake', 'pan', '.'], ['In', 'a', 'medium', 'bowl', ',', 'mix', 'together', 'the', 'dry', 'ingredients', '(', 'flour', ',', 'cocoa', 'powder', ',', 'baking', 'soda', ',', 'and', 'sugar', ')', '.'], ['In', 'a', 'separate', 'bowl', ',', 'beat', 'the', 'eggs', 'and', 'mix', 'with', 'butter', ',', 'milk', ',', 'and', 'vanilla', 'extract', '.'], ['Gradually', 'combine', 'the', 'wet', 'and', 'dry', 'ingredients', 'and', 'st

In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Combine recipe names, ingredients, instructions, and conversations into one corpus for tokenization
all_texts = []
for recipe in recipes:
    # Add recipe name tokens
    all_texts.extend(recipe["recipe_name"])
    
    # Add ingredient tokens
    for ingredient in recipe["ingredients"]:
        all_texts.extend(ingredient["ingredient"])
    
    # Add instruction tokens
    for instruction in recipe["instructions"]:
        all_texts.extend(instruction)
    
    # Add temperature and baking time as tokens (if they exist and are not None)
    if recipe.get("temperature"):
        all_texts.extend(str(recipe["temperature"]).split())  # Tokenize temperature if it's a string
    
    if recipe.get("baking_time"):
        all_texts.extend(str(recipe["baking_time"]).split())  # Tokenize baking time if it's a string
    
    # Add substitutions, serving size, and adjustments (if they exist)
    for substitution in recipe.get("substitutions"):
        all_texts.extend(substitution)
    
    if recipe.get("serving_size"):
        all_texts.extend(str(recipe["serving_size"]).split())
    
    for adjustment in recipe.get("adjustments"):
        all_texts.extend(adjustment)


for conversation in conversations:
    all_texts.extend(conversation["input"])
    all_texts.extend(conversation["output"])

# Initialize the Tokenizer
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(all_texts)  # Fit tokenizer on all texts

# Convert text to sequences for recipes and conversations
def text_to_sequence(text):
    return tokenizer.texts_to_sequences([" ".join(text)])[0]

# Convert recipes
for recipe in recipes:
    # Convert recipe name
    recipe["recipe_name"] = text_to_sequence(recipe["recipe_name"])
    
    # Convert ingredients
    recipe["ingredients"] = [{"ingredient": text_to_sequence(ingredient["ingredient"]),
                              "quantity": ingredient["quantity"]} for ingredient in recipe["ingredients"]]
    
    # Convert instructions
    recipe["instructions"] = [text_to_sequence(instruction) for instruction in recipe["instructions"]]
    
    # Convert temperature and baking time if they exist and are strings
    if recipe.get("temperature"):
        recipe["temperature"] = text_to_sequence(str(recipe["temperature"]))
    
    if recipe.get("baking_time"):
        recipe["baking_time"] = text_to_sequence(str(recipe["baking_time"]))
    
    # Convert substitutions
    recipe["substitutions"] = [text_to_sequence(substitution) for substitution in recipe.get("substitutions", [])]
    
    # Convert serving size if it exists and is a string
    if recipe.get("serving_size"):
        recipe["serving_size"] = text_to_sequence(str(recipe["serving_size"]))
    
    # Convert adjustments
    recipe["adjustments"] = [text_to_sequence(adjustment) for adjustment in recipe.get("adjustments", [])]

# Convert conversations
for conversation in conversations:
    conversation["input"] = text_to_sequence(conversation["input"])
    conversation["output"] = text_to_sequence(conversation["output"])

# Pad the sequences for uniform length
# Calculate maximum sequence length across all relevant fields for padding
max_seq_length = max(
    max(len(recipe["recipe_name"]) for recipe in recipes),
    max(max(len(instruction) for instruction in recipe["instructions"]) for recipe in recipes),
    max(max(len(substitution) for substitution in recipe["substitutions"]) if recipe["substitutions"] else 0 for recipe in recipes),
    max(max(len(adjustment) for adjustment in recipe["adjustments"]) if recipe["adjustments"] else 0 for recipe in recipes),
    max(len(conversation["input"]) for conversation in conversations),
    max(len(conversation["output"]) for conversation in conversations)
)

# Pad recipes and conversation sequences
for recipe in recipes:
    # Pad recipe name
    recipe["recipe_name"] = pad_sequences([recipe["recipe_name"]], maxlen=max_seq_length, padding='post')[0]
    
    # Pad ingredients
    for ingredient in recipe["ingredients"]:
        ingredient["ingredient"] = pad_sequences([ingredient["ingredient"]], maxlen=max_seq_length, padding='post')[0]
    
    # Pad instructions
    recipe["instructions"] = pad_sequences(recipe["instructions"], maxlen=max_seq_length, padding='post').tolist()
    
    # Pad substitutions if they exist
    if recipe["substitutions"]:
        recipe["substitutions"] = pad_sequences(recipe["substitutions"], maxlen=max_seq_length, padding='post').tolist()
    
    # Pad adjustments if they exist
    if recipe["adjustments"]:
        recipe["adjustments"] = pad_sequences(recipe["adjustments"], maxlen=max_seq_length, padding='post').tolist()

# Pad conversations
for conversation in conversations:
    conversation["input"] = pad_sequences([conversation["input"]], maxlen=max_seq_length, padding='post')[0]
    conversation["output"] = pad_sequences([conversation["output"]], maxlen=max_seq_length, padding='post')[0]

# Check an example of padded data
print("Padded Recipe Example:", recipes[0])
print("Padded Conversation Example:", conversations[0])


# Check the maximum index in your tokenizer's word_index
max_index = max(tokenizer.word_index.values())
print("Max index in tokenizer:", max_index)


Padded Recipe Example: {'recipe_name': array([47,  5,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]), 'ingredients': [{'ingredient': array([10,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]), 'quantity': '1 1/2 cups'}, {'ingredient': array([92, 19,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]), 'quantity': '1/2 cup'}, {'ingredient': array([ 9, 73,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,

In [10]:
# Check the maximum index in your tokenizer's word_index
max_index = max(tokenizer.word_index.values())
print("Max index in tokenizer:", max_index)


Max index in tokenizer: 583


In [11]:
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.models import Model

# Define model parameters
vocab_size = len(tokenizer.word_index) + 1  # Adjust for padding token
embedding_dim = 256  # Or whatever dimension you choose
lstm_units = 256
max_seq_length = 48  # Define your max sequence length here

# Encoder
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(lstm_units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(max_seq_length,))
decoder_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Summary of the model
model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 48)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, None, 256)    149504      ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, 48, 256)      149504      ['input_2[0][0]']                
                                                                                              

In [12]:
# Add start and end tokens to the tokenizer's word index if they are not already present
if '<start>' not in tokenizer.word_index:
    tokenizer.word_index['<start>'] = len(tokenizer.word_index) + 1
    tokenizer.index_word[tokenizer.word_index['<start>']] = '<start>'

if '<end>' not in tokenizer.word_index:
    tokenizer.word_index['<end>'] = len(tokenizer.word_index) + 2
    tokenizer.index_word[tokenizer.word_index['<end>']] = '<end>'

# Now you can safely add the start and end tokens
encoder_input_data = []
decoder_input_data = []
decoder_output_data = []

# Process conversation data for training
for conversation in conversations:
    # Add encoder input (user query/input sequence)
    encoder_input_data.append(conversation["input"])
    
    # Add decoder input (start token + output sequence)
    decoder_input_data.append([tokenizer.word_index['<start>']] + conversation["output"])
    
    # Add decoder output (output sequence + end token, shifted for teacher forcing)
    decoder_output_data.append(conversation["output"] + [tokenizer.word_index['<end>']])

# Process recipe data for training
for recipe in recipes:
    # Add recipe name as encoder input
    encoder_input_data.append(recipe["recipe_name"])
    
    # Combine ingredients, instructions, and additional info for decoder input/output
    combined_output = []
    for ingredient in recipe["ingredients"]:
        combined_output.extend(ingredient["ingredient"])  # Ingredient tokens
    for instruction in recipe["instructions"]:
        combined_output.extend(instruction)  # Instruction tokens
    
    # Include other recipe details if available
    if recipe.get("temperature"):
        combined_output.extend(recipe["temperature"])
    if recipe.get("baking_time"):
        combined_output.extend(recipe["baking_time"])
    for substitution in recipe.get("substitutions", []):
        combined_output.extend(substitution)
    if recipe.get("serving_size"):
        combined_output.extend(recipe["serving_size"])
    for adjustment in recipe.get("adjustments", []):
        combined_output.extend(adjustment)

    # Add the start and end tokens around the combined recipe data
    decoder_input_data.append([tokenizer.word_index['<start>']] + combined_output)
    decoder_output_data.append(combined_output + [tokenizer.word_index['<end>']])

# Pad sequences so that they all have the same length
max_encoder_len = max(len(seq) for seq in encoder_input_data)
max_decoder_len = max(len(seq) for seq in decoder_input_data)

# Pad sequences so that they all have the same length
encoder_input_data = pad_sequences(encoder_input_data, maxlen=max_encoder_len, padding='post', truncating='post')
decoder_input_data = pad_sequences(decoder_input_data, maxlen=max_seq_length, padding='post', truncating='post')
decoder_output_data = pad_sequences(decoder_output_data, maxlen=max_seq_length, padding='post', truncating='post')

# Convert to numpy arrays
encoder_input_data = np.array(encoder_input_data)
decoder_input_data = np.array(decoder_input_data)
decoder_output_data = np.array(decoder_output_data)

# Adjust decoder_output_data to be compatible with sparse_categorical_crossentropy
decoder_output_data = np.expand_dims(decoder_output_data, -1)

# Now you can train the model
batch_size = 64
epochs = 100

history = model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_output_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2
)

# Save the model for later use
model.save('cake_chatbot_seq2seq.h5')


Epoch 1/100


InvalidArgumentError: Graph execution error:

Detected at node 'model/embedding_1/embedding_lookup' defined at (most recent call last):
    File "c:\Users\dell\anaconda3\envs\cake_chatbot\lib\runpy.py", line 197, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "c:\Users\dell\anaconda3\envs\cake_chatbot\lib\runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "c:\Users\dell\anaconda3\envs\cake_chatbot\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
      app.launch_new_instance()
    File "c:\Users\dell\anaconda3\envs\cake_chatbot\lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
      app.start()
    File "c:\Users\dell\anaconda3\envs\cake_chatbot\lib\site-packages\ipykernel\kernelapp.py", line 739, in start
      self.io_loop.start()
    File "c:\Users\dell\anaconda3\envs\cake_chatbot\lib\site-packages\tornado\platform\asyncio.py", line 205, in start
      self.asyncio_loop.run_forever()
    File "c:\Users\dell\anaconda3\envs\cake_chatbot\lib\asyncio\base_events.py", line 601, in run_forever
      self._run_once()
    File "c:\Users\dell\anaconda3\envs\cake_chatbot\lib\asyncio\base_events.py", line 1905, in _run_once
      handle._run()
    File "c:\Users\dell\anaconda3\envs\cake_chatbot\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "c:\Users\dell\anaconda3\envs\cake_chatbot\lib\site-packages\ipykernel\kernelbase.py", line 545, in dispatch_queue
      await self.process_one()
    File "c:\Users\dell\anaconda3\envs\cake_chatbot\lib\site-packages\ipykernel\kernelbase.py", line 534, in process_one
      await dispatch(*args)
    File "c:\Users\dell\anaconda3\envs\cake_chatbot\lib\site-packages\ipykernel\kernelbase.py", line 437, in dispatch_shell
      await result
    File "c:\Users\dell\anaconda3\envs\cake_chatbot\lib\site-packages\ipykernel\ipkernel.py", line 362, in execute_request
      await super().execute_request(stream, ident, parent)
    File "c:\Users\dell\anaconda3\envs\cake_chatbot\lib\site-packages\ipykernel\kernelbase.py", line 778, in execute_request
      reply_content = await reply_content
    File "c:\Users\dell\anaconda3\envs\cake_chatbot\lib\site-packages\ipykernel\ipkernel.py", line 449, in do_execute
      res = shell.run_cell(
    File "c:\Users\dell\anaconda3\envs\cake_chatbot\lib\site-packages\ipykernel\zmqshell.py", line 549, in run_cell
      return super().run_cell(*args, **kwargs)
    File "c:\Users\dell\anaconda3\envs\cake_chatbot\lib\site-packages\IPython\core\interactiveshell.py", line 3048, in run_cell
      result = self._run_cell(
    File "c:\Users\dell\anaconda3\envs\cake_chatbot\lib\site-packages\IPython\core\interactiveshell.py", line 3103, in _run_cell
      result = runner(coro)
    File "c:\Users\dell\anaconda3\envs\cake_chatbot\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "c:\Users\dell\anaconda3\envs\cake_chatbot\lib\site-packages\IPython\core\interactiveshell.py", line 3308, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "c:\Users\dell\anaconda3\envs\cake_chatbot\lib\site-packages\IPython\core\interactiveshell.py", line 3490, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "c:\Users\dell\anaconda3\envs\cake_chatbot\lib\site-packages\IPython\core\interactiveshell.py", line 3550, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\dell\AppData\Local\Temp\ipykernel_6092\1531357120.py", line 75, in <module>
      history = model.fit(
    File "c:\Users\dell\anaconda3\envs\cake_chatbot\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\dell\anaconda3\envs\cake_chatbot\lib\site-packages\keras\engine\training.py", line 1564, in fit
      tmp_logs = self.train_function(iterator)
    File "c:\Users\dell\anaconda3\envs\cake_chatbot\lib\site-packages\keras\engine\training.py", line 1160, in train_function
      return step_function(self, iterator)
    File "c:\Users\dell\anaconda3\envs\cake_chatbot\lib\site-packages\keras\engine\training.py", line 1146, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\dell\anaconda3\envs\cake_chatbot\lib\site-packages\keras\engine\training.py", line 1135, in run_step
      outputs = model.train_step(data)
    File "c:\Users\dell\anaconda3\envs\cake_chatbot\lib\site-packages\keras\engine\training.py", line 993, in train_step
      y_pred = self(x, training=True)
    File "c:\Users\dell\anaconda3\envs\cake_chatbot\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\dell\anaconda3\envs\cake_chatbot\lib\site-packages\keras\engine\training.py", line 557, in __call__
      return super().__call__(*args, **kwargs)
    File "c:\Users\dell\anaconda3\envs\cake_chatbot\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\dell\anaconda3\envs\cake_chatbot\lib\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "c:\Users\dell\anaconda3\envs\cake_chatbot\lib\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\dell\anaconda3\envs\cake_chatbot\lib\site-packages\keras\engine\functional.py", line 510, in call
      return self._run_internal_graph(inputs, training=training, mask=mask)
    File "c:\Users\dell\anaconda3\envs\cake_chatbot\lib\site-packages\keras\engine\functional.py", line 667, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "c:\Users\dell\anaconda3\envs\cake_chatbot\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\dell\anaconda3\envs\cake_chatbot\lib\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "c:\Users\dell\anaconda3\envs\cake_chatbot\lib\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\dell\anaconda3\envs\cake_chatbot\lib\site-packages\keras\layers\core\embedding.py", line 208, in call
      out = tf.nn.embedding_lookup(self.embeddings, inputs)
Node: 'model/embedding_1/embedding_lookup'
indices[46,0] = 587 is not in [0, 584)
	 [[{{node model/embedding_1/embedding_lookup}}]] [Op:__inference_train_function_5651]