<a href="https://colab.research.google.com/github/sofiaElenaHopartean/NLP_recipes/blob/main/Inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import csv
import os
import pickle
import random
import re
import sys
import time
import unicodedata
import warnings
from functools import partial

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import load_model
from tensorflow.keras.utils import to_categorical

%matplotlib inline


In [20]:
# Notebook-wide settings:
#   save_dir     - folder the pickled artifacts and the model are read from
#   padding_size - `maxlen` handed to pad_sequences
#   num_classes  - `num_classes` handed to to_categorical
params = dict(
    save_dir='drive/MyDrive/recipes/saved_model_tf/model4/',
    padding_size=150,
    num_classes=5,
)

# DataFrame columns used as the text source (colname1) and as the
# comma-separated ingredient list (colname2) when the dataset is rebuilt.
colname1 = 'norm_directions'
colname2 = 'norm_ingredients'

In [11]:
def load_obj(directory, name):
  '''Load a pickled object from ``<directory>/<name>.pkl``.

  Args:
    directory: Folder that contains the pickle file. A trailing path
      separator is accepted but not required.
    name: File name without the ``.pkl`` extension.

  Returns:
    Whatever object ``pickle.load`` reconstructs from the file.

  Raises:
    OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
  '''
  # SECURITY: unpickling can execute arbitrary code. Only point this at
  # files you produced yourself or otherwise trust.
  path = os.path.join(directory, name + ".pkl")
  with open(path, "rb") as f:
    return pickle.load(f)


# Restore the pickled artifacts kept in params['save_dir']. The file stems
# below are listed in the same order as the names they are unpacked into
# (note that `sp_chrs` comes from the file stem "sp_chr").
(word_index, word_lookup, labels_index,
 labels_lookup, tokenizer, sp_chrs) = (
    load_obj(params['save_dir'], stem)
    for stem in ("word_index", "word_lookup", "labels_index",
                 "labels_lookup", "tokenizer", "sp_chr")
)


### Rebuild test set

In [6]:
# Raw recipe table; the CSV uses ';' as its field separator.
recipes_df = pd.read_csv("drive/MyDrive/recipes/recipes.csv", delimiter=";")

# Keep only rows that have directions, and only the two text columns.
# One .loc call plus .copy() yields an independent frame, so adding columns
# to it later does not trigger pandas' SettingWithCopyWarning.
recipes_df_small = recipes_df.loc[
    recipes_df['Directions'].notna(), ["Directions", "Ingredients"]
].copy()

In [14]:
def preprocess_dirs(text, special_chars=None):
  '''Normalize a directions string into space-separated ASCII words.

  Steps, in order: NFKD-decompose and drop every non-ASCII code point,
  replace each special character with a space, delete all digits, and
  collapse runs of whitespace into single spaces.

  Args:
    text: The string to clean.
    special_chars: Iterable of substrings to blank out. Defaults to the
      notebook-level ``sp_chrs`` loaded from the saved-model folder.

  Returns:
    The cleaned string, with no leading/trailing whitespace.
  '''
  if special_chars is None:
    special_chars = sp_chrs
  _dirs = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode("utf-8")
  for sp_chr in special_chars:
    _dirs = _dirs.replace(sp_chr, " ")
  # raw string: '\d' in a normal literal is an invalid escape sequence
  _dirs = re.sub(r'\d', '', _dirs)
  return ' '.join(_dirs.split())

def preprocess_ingred(text, special_chars=None):
  '''Normalize an ingredient-list string while keeping its commas.

  Same pipeline as the directions cleaner (NFKD to ASCII, blank out special
  characters, delete digits, collapse whitespace) except that "," is never
  replaced, so the result can still be split into individual ingredients.

  Args:
    text: The string to clean.
    special_chars: Iterable of substrings to blank out ("," is skipped).
      Defaults to the notebook-level ``sp_chrs``.

  Returns:
    The cleaned string, with no leading/trailing whitespace.
  '''
  if special_chars is None:
    special_chars = sp_chrs
  _ingreds = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode("utf-8")
  for sp_chr in special_chars:
    if sp_chr != ",":
      _ingreds = _ingreds.replace(sp_chr, " ")
  # raw string: '\d' in a normal literal is an invalid escape sequence
  _ingreds = re.sub(r'\d', '', _ingreds)
  return ' '.join(_ingreds.split())

# Derive the lower-cased and normalized text columns in one step.
# .assign returns a new frame instead of writing into a filtered slice (which
# is what triggers SettingWithCopyWarning), and makes the cell safe to re-run.
# Keyword order matters: later lambdas read columns created by earlier ones.
recipes_df_small = recipes_df_small.assign(
    lo_directions=lambda frame: frame["Directions"].str.lower(),
    lo_ingredients=lambda frame: frame["Ingredients"].str.lower(),
    norm_directions=lambda frame: frame["lo_directions"].apply(preprocess_dirs),
    norm_ingredients=lambda frame: frame["lo_ingredients"].apply(preprocess_ingred),
)

In [21]:
def _ingredient_tags(n_words):
  '''Return the space-separated tag string standing in for an ingredient of ``n_words`` words.

  One word -> "S-INGREDIENT"; otherwise "B-INGREDIENT", then one
  "I-INGREDIENT" per inner word, then "E-INGREDIENT".
  '''
  if n_words == 1:
    return "S-INGREDIENT"
  return " ".join(["B-INGREDIENT"] + ["I-INGREDIENT"] * (n_words - 2) + ["E-INGREDIENT"])


# Encode every recipe as (word ids, tag ids). Tags are obtained by replacing
# each ingredient string inside the directions text with its tag sequence and
# then reading the tags back word by word.
X, y = [], []
for directions, ingredient_list in zip(recipes_df_small[colname1], recipes_df_small[colname2]):
  _descr = directions
  for ingred in re.split(r',\s*', ingredient_list):
    # NOTE(review): this is a plain substring replace, so an ingredient also
    # matches inside longer words, and an empty `ingred` (e.g. from a trailing
    # comma) matches everywhere. Left unchanged so the labels stay identical
    # to what this cell produced before - TODO confirm against the training
    # notebook before altering.
    if ingred in _descr:
      _descr = _descr.replace(ingred, _ingredient_tags(len(ingred.split(" "))))

  # words not in the vocabulary map to id 0; anything that is not a known tag is "O"
  X.append([word_index.get(word, 0) for word in directions.split()])
  y.append([labels_index.get(tag, labels_index["O"]) for tag in _descr.split()])

# pad_sequences truncates over-long sequences from the front unless told otherwise.
X = pad_sequences(X, maxlen = params['padding_size'], value = word_index["ENDPAD"], padding = "post")
y = pad_sequences(y, maxlen = params['padding_size'], value = labels_index["O"], padding = "post")
y_cat = [to_categorical(i, num_classes = params['num_classes']) for i in y]

# Fixed seed so the 10% hold-out is reproducible (presumably the same split
# as at training time - TODO confirm).
_, X_test, _, y_test = train_test_split(X,y_cat, train_size=0.9, random_state= 11 )

### Evaluate Model on test set

In [23]:
# Restore the trained network from params['save_dir'] and print its layer table.
model_dir = params['save_dir']
reconstructed_model = load_model(model_dir)
reconstructed_model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 150)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 150, 150)          805200    
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 150, 150)          0         
_________________________________________________________________
bidirectional (Bidirectional (None, 150, 300)          361200    
_________________________________________________________________
time_distributed (TimeDistri (None, 150, 5)            1505      
Total params: 1,167,905
Trainable params: 1,167,905
Non-trainable params: 0
_________________________________________________________________


In [65]:
# y_test is a list of one-hot arrays; stack it into one ndarray for Keras.
test_targets = np.array(y_test)
# evaluate() returns the model's metric values (presumably loss first, then
# accuracy - confirm against the compile() call in the training notebook).
metrics = reconstructed_model.evaluate(X_test, test_targets)
print(metrics)

[0.023968150839209557, 0.9909257888793945]


In [59]:
# Two sample recipes, keyed by an arbitrary name, used to demo the tagger.
# NOTE: the name `input` shadows Python's built-in input() for the rest of
# the notebook; it is kept because the call further down refers to it.
input = dict(
  recipe1="Spray bread machine pan with vegetable oil spray.**Premix ingredients in order listed. Place mixture in bread machine pan.**Select the Quick Bread/Cake cycle.  Press Start. Check after 1 minute to see if mixture is well blended.**Cook until cake cycle stops.  Remove pan, and cool completely before removing bread from pan.**",
  recipe2="Preheat oven to 350 degrees F (175 degrees C). Grease and flour a 9x5 inch loaf pan. In a medium bowl, mix flour, soda, salt, 3/4 teaspoon nutmeg, ginger and cloves. Set aside.**In a large bowl, cream butter and sugar until light and fluffy. Beat in the eggs. Add flour mixture alternately with pumpkin. Stir in chocolate chips and 1/2 cup of the walnuts. Pour batter into loaf pan. Sprinkle remaining nuts on top.**Bake at 350 degrees F (175 degrees C) for 65 to 70 minutes or until toothpick inserted into center of cake comes out clean. While still warm, drizzle with glaze. Cool for 6 hours before serving.**to make the glaze:  In a medium bowl, combine confectioners sugar, nutmeg and cinnamon. Mix and add 1 to 2 teaspoons cream until drizzling consistency.**",
)

def _collect_ingredient_spans(tag_ids, tokens, starts, ends):
  '''Group per-token tag ids into ``[text, start, end]`` ingredient entries.

  A token tagged 1 is emitted on its own. Consecutive tokens tagged > 1 are
  merged into one entry: text joined with spaces, start taken from the first
  token and end from the last. Any other tag closes a pending entry.
  '''
  spans = []
  run = []  # positions of the multi-token ingredient currently being assembled

  def flush_run():
    if run:
      spans.append([" ".join(tokens[k] for k in run), starts[run[0]], ends[run[-1]]])
      del run[:]

  for k, tag in enumerate(tag_ids):
    if tag == 1:
      flush_run()
      spans.append([tokens[k], starts[k], ends[k]])
    elif tag > 1:
      run.append(k)
    else:
      flush_run()
  flush_run()  # an ingredient may end exactly at the last token
  return spans


def inference(input):
  '''Tag the ingredients mentioned in a batch of recipe texts.

  Args:
    input: Dict mapping a recipe name to its raw directions text.

  Returns:
    Dict with the same keys. Each value is a list of ``[text, start, end]``
    entries in reading order. Offsets refer to the lower-cased,
    ``preprocess_dirs``-normalized text (words joined by single spaces), NOT
    to the raw input. Convention (unchanged from earlier versions): ``start``
    is 0 for the very first word, otherwise the cumulative length of all
    preceding words, counting one separator each, plus 1; ``end`` is the
    cumulative length up to and including the entry's last word and its
    separator.

  Assumes tag id 0 means "outside", 1 a single-word ingredient and anything
  above 1 part of a multi-word ingredient - TODO confirm against
  ``labels_index``.
  '''
  keys = list(input.keys())
  if not keys:
    return {}

  maxlen = params['padding_size']
  encoded, layouts = [], []
  for name in keys:
    # Lower-case first: the data the model is evaluated on is lower-cased
    # before preprocess_dirs, so mixed-case words would otherwise miss the
    # vocabulary.
    tokens = preprocess_dirs(input[name].lower()).split()
    encoded.append([word_index.get(tok, 0) for tok in tokens])

    starts, ends, cursor = [], [], 0
    for tok in tokens:
      starts.append(cursor + 1 if starts else 0)
      cursor += len(tok) + 1
      ends.append(cursor)
    # pad_sequences drops the *front* of over-long sequences by default, so
    # keep the matching tail here to stay aligned with the model input.
    layouts.append((tokens[-maxlen:], starts[-maxlen:], ends[-maxlen:]))

  X = pad_sequences(encoded, maxlen = maxlen, value = word_index["ENDPAD"], padding = "post")
  tag_ids = np.argmax(reconstructed_model.predict(np.array(X)), axis=-1)

  output = {}
  for name, row, (tokens, starts, ends) in zip(keys, tag_ids, layouts):
    # Only real tokens are decoded; padded positions carry no text.
    output[name] = _collect_ingredient_spans(row[:len(tokens)], tokens, starts, ends)
  return output

# Tag the sample recipes; the returned dict is shown as this cell's output.
inference(input)

{'recipe1': [['bread', 7, 12], ['bread', 102, 107], ['bread', 294, 299]],
 'recipe2': [['flour', 48, 53],
  ['flour', 93, 98],
  ['salt', 104, 108],
  ['nutmeg', 118, 124],
  ['ginger', 125, 131],
  ['butter', 175, 181],
  ['flour', 236, 241],
  ['pumpkin', 267, 274],
  ['chocolate', 283, 292],
  ['sugar', 609, 614],
  ['nutmeg', 615, 621],
  ['cinnamon', 626, 634]]}