In [45]:
import spacy
import pandas as pd

# read in RecipeNLG Dataset

df = pd.read_csv('RecipeNLG_Dataset.csv')

In [284]:
# takeout relevant columns
# convert from string to list literal

from ast import literal_eval

df_cols = df[["ingredients","NER","directions"]]

data = df_cols.sample(n = 10000, random_state = 42)

data["ingredients"] = data["ingredients"].apply(literal_eval)  
data["directions"] = data["directions"].apply(literal_eval)
data["NER"] = data["NER"].apply(literal_eval)
data.reset_index(inplace=True, drop = True)


import spacy
nlp=spacy.load('en_core_web_sm')
from spacy.tokens import Doc
from spacy.training import Example

In [285]:
# data ingredients
import numpy as np
data_ingredients = data[["ingredients","NER"]]

def ingredients_to_string(x : list):
    return ", ".join(x)

def get_NER_labels(row):
    ingredients_sentence = row["ingredients"]

    sentence_indexed = np.array([0] * len(ingredients_sentence))

    NER = row["NER"]
    entity_list = []
    doc = nlp.make_doc(ingredients_sentence)
    for food in NER:
        try:
            food_start_index = ingredients_sentence.index(food)
            entity_list.append((food_start_index, food_start_index + len(food), "FOOD"))
        except ValueError:
            pass
    
    try:
        return Example.from_dict(doc, {"entities": entity_list})
    except ValueError:
        return Example.from_dict(doc, {"entities": []})

data_ingredients["ingredients"] = data_ingredients["ingredients"].apply(ingredients_to_string)

ingredients_NER_labels = data_ingredients.apply(get_NER_labels, axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_ingredients["ingredients"] = data_ingredients["ingredients"].apply(ingredients_to_string)


In [110]:
import nltk
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()

nltk.download('averaged_perceptron_tagger')

def remove_descriptor_nltk(food_name):
    words = nltk.word_tokenize(food_name)
    tagged_words = nltk.pos_tag(words)
    result_words = []
    for word, tag in tagged_words:
        if tag not in ['JJ', 'JJR', 'JJS']:
            result_words.append(word)
    return " ".join(result_words)
    
wnl.lemmatize(remove_descriptor_nltk("green onions"))

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\randymi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


In [206]:
# data directions

data_directions = data[["directions","NER"]]

def directions_to_string(x : list):
    return " ".join(x)

def get_NER_labels(row):
    ingredients_sentence = row["directions"]

    sentence_indexed = np.array([0] * len(ingredients_sentence))

    NER = row["NER"]
    entity_list = []
    doc = nlp.make_doc(ingredients_sentence)
    for food in NER:
        try:
            food_start_index = ingredients_sentence.index(food)
            entity_list.append((food_start_index, food_start_index + len(food), "FOOD"))
        except ValueError:
            pass
    
    try:
        return Example.from_dict(doc, {"entities": entity_list})
    except ValueError:
        return Example.from_dict(doc, {"entities": []})

data_directions["directions"] = data_directions["directions"].apply(directions_to_string)

directions_NER_labels = data_directions.apply(get_NER_labels, axis = 1)

# filter out examples with no NER labels
directions_NER_labels = directions_NER_labels[directions_NER_labels.apply(lambda x: len(set(x.get_aligned_ner())) != 1)]

directions_NER_labels.reset_index(inplace=True, drop = True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_directions["directions"] = data_directions["directions"].apply(directions_to_string)


In [286]:
# Getting the pipeline component and adding food label
ner = nlp.get_pipe("ner")
ner.add_label("FOOD")

def make_training_set(ingredients_proportion, directions_proportion, count = 1000):
    ingredients = ingredients_NER_labels.sample(n = int(count * ingredients_proportion))
    directions = directions_NER_labels.sample(n = int(count * directions_proportion))
    return pd.concat([ingredients, directions]).reset_index(drop = True)

# 80/20
training_set1 = make_training_set(0.8, 0.2, count = 1000)

# 60/40
training_set2 = make_training_set(0.6, 0.4, count = 1000)

# 50/50
training_set3 = make_training_set(0.5, 0.5, count = 1000)

# 40/60
training_set4 = make_training_set(0.4, 0.6, count = 1000)

# 20/80
training_set5 = make_training_set(0.2, 0.8, count = 1000)

TRAIN_DATA = ingredients_NER_labels


In [282]:
# training

optimizer = nlp.create_optimizer()
optimizer.learn_rate = 0.01


# Disable pipeline components you dont need to change
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

# Import requirements
import random
from spacy.util import minibatch, compounding
from pathlib import Path

def compound(start, stop, factor):

    # initialize counter
    value = start
    counter = 1

    # loop until counter is less than n
    while value < stop:

        # produce the current value of the counter
         
        yield value
        value = value * (factor ** counter)

        # increment the counter
        counter += 1
      

# TRAINING THE MODEL
with nlp.disable_pipes(*unaffected_pipes):

  # Training for 100 iterations
  for iteration in range(100):

    # shuufling examples  before every iteration
    random.shuffle(TRAIN_DATA)
    losses = {}
    # batch up the examples using spaCy's minibatch
    batches = minibatch(TRAIN_DATA, size=compounding(4, 32, 1.05))    

    for batch in batches:
      nlp.update(batch, losses=losses, drop=0.5, sgd = optimizer)
      print(f"Losses {losses}")
    
    print(f"Epoch {iteration} Loses", losses)

    # every 10 epochs, save the model
    if iteration % 10 == 0:
      nlp.to_disk(f'tmp/ingredients_model{iteration}')
    


Losses {'ner': 131.29200112516946}
Losses {'ner': 325.4222207302097}
Losses {'ner': 505.1897349367963}
Losses {'ner': 699.9145621813722}
Losses {'ner': 924.5307927122096}
Losses {'ner': 1189.5927112439217}
Losses {'ner': 1406.0198667996774}
Losses {'ner': 1625.1059985512147}
Losses {'ner': 1847.4772617735125}
Losses {'ner': 2078.770390550257}
Losses {'ner': 2332.1886197018275}
Losses {'ner': 2620.8461362872868}
Losses {'ner': 2907.2317710044845}
Losses {'ner': 3154.7714282126326}
Losses {'ner': 3465.7790194401878}
Losses {'ner': 3741.3430011299656}
Losses {'ner': 4143.440052929263}
Losses {'ner': 4573.36303899277}
Losses {'ner': 4985.639394730563}
Losses {'ner': 5477.899669501465}
Losses {'ner': 5986.8640260928805}
Losses {'ner': 6458.709009510356}
Losses {'ner': 7027.56669078477}
Losses {'ner': 7575.832898193063}
Losses {'ner': 8086.60852722257}
Losses {'ner': 8640.882239585664}
Losses {'ner': 9314.865394836213}
Losses {'ner': 10082.106198562815}
Losses {'ner': 10742.77647532387}


KeyboardInterrupt: 

In [279]:


# iterate over the generator object produced by my_generator
for value in compound(1,10,1.4):

    # print each value produced by generator
    print(value)

1
1.4
2.7439999999999993
7.529535999999997


In [248]:
nlp.to_disk('tmp/ingredients_model')

In [258]:
d = nlp("chicken, rice, and black beans")
d.ents

(chicken, rice, black beans)

In [213]:
minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)).__next__()

[{'doc_annotation': {'cats': {}, 'entities': ['O', 'O', 'O', 'O', 'B-FOOD', 'L-FOOD', 'O', 'O', 'O', 'O', 'O', 'U-FOOD', 'O', 'O', 'O', 'O', 'U-FOOD', 'O', 'O', 'O', 'B-FOOD', 'L-FOOD', 'O', 'O', 'O', 'B-FOOD', 'L-FOOD', 'O', 'O', 'O', 'O', 'B-FOOD', 'L-FOOD', 'O', 'O', 'O', 'O', 'U-FOOD', 'O', 'O', 'O', 'O', 'U-FOOD', 'O', 'O', 'O', 'B-FOOD', 'L-FOOD', 'O', 'O', 'O', 'U-FOOD'], 'spans': {}, 'links': {}}, 'token_annotation': {'ORTH': ['1', '1/2', 'lb', '.', 'ground', 'beef', ',', '1', '1/4', 'tsp', '.', 'onion', ',', '1', 'tsp', '.', 'parsley', ',', '1', 'can', 'mushroom', 'soup', ',', '1', 'c.', 'sour', 'cream', ',', '1/4', 'tsp', '.', 'garlic', 'powder', ',', '1', 'tsp', '.', 'salt', ',', '1/4', 'tsp', '.', 'pepper', ',', '1', 'can', 'tomato', 'soup', ',', '1/2', 'c.', 'milk'], 'SPACY': [True, True, False, True, True, False, True, True, True, False, True, False, True, True, False, True, False, True, True, True, True, False, True, True, True, True, False, True, True, False, True, True

In [160]:
import random
from spacy.util import minibatch, compounding
from pathlib import Path
random.shuffle(TRAIN_DATA)
losses = {}

# batch up the examples using spaCy's minibatch
# minibatch is generator which returns a random sample of train_data of size = n. Compounding is generator returning 4 * (1.001)^i while < 32    
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))

In [181]:
x = [[1,2,3]]
zip(*x)

<zip at 0x17ec1d6d640>