<a href="https://colab.research.google.com/github/ravi-gopalan/DAND_Data_Wrangling/blob/master/custom_entity_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive


In [2]:
cd '/gdrive/My Drive/abv_reviews'

/gdrive/My Drive/abv_reviews


In [3]:
!pip install spacymoji
from spacymoji import Emoji



In [0]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

import random
import srsly
import spacy
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
from spacy import displacy
from spacy.util import minibatch, compounding
from pandas.io.json import json_normalize

In [5]:
import spacy.cli
spacy.cli.download("en_core_web_lg")
import en_core_web_lg

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [0]:
from spacy.gold import docs_to_json, biluo_tags_from_offsets, offsets_from_biluo_tags

In [0]:
import re

In [238]:
ls -l

total 101759
-rw------- 1 root root     2000 Dec 26 18:06  bread_keyword_patterns.npz
-rw------- 1 root root      589 Dec 26 17:27  breads_list_2.jsonl
-rw------- 1 root root    26438 Dec 26 16:44  breads_list.jsonl
-rw------- 1 root root    25359 Dec 15 03:21  CoherenceScore.csv
-rw------- 1 root root     4968 Dec 26 18:06  condiment_keyword_patterns.npz
-rw------- 1 root root     2353 Dec 26 16:29  condiments_list_2.jsonl
-rw------- 1 root root    43320 Dec 26 16:21  condiments_list.jsonl
-rw------- 1 root root  8428475 Dec 21 13:01  datalist_corrected.csv
-rw------- 1 root root    24864 Dec 26 17:05  df_check_breads.csv
-rw------- 1 root root    40909 Dec 26 16:21  df_check_condiments.csv
-rw------- 1 root root   185178 Dec 26 08:09  df_check_de.csv
-rw------- 1 root root  4580904 Dec 26 08:10  df_check_en.csv
-rw------- 1 root root    56341 Dec 26 09:17  df_check_es.csv
-rw------- 1 root root    43595 Dec 26 09:17  df_check_fr.csv
-rw------- 1 root root    72573 Dec 26 17:05  df_ch

In [239]:
df_pattern = pd.read_json('patterns.jsonl',lines=True)
df_pattern.head()

Unnamed: 0,label,pattern,id
0,dish,[{'LOWER': ''mpanatigghi'}],'mpanatigghi
1,dish,[{'LOWER': ''nduja'}],'nduja
2,dish,"[{'LOWER': ''ota'}, {'LOWER': ''ika'}]",'ota_'ika
3,dish,"[{'LOWER': '.amaro'}, {'LOWER': 'ramazzotti'}]",.amaro_ramazzotti
4,dish,"[{'LOWER': '15'}, {'LOWER': 'bean'}, {'LOWER':...",15_bean_soup


In [0]:

def cleanse_text(text):
  step1 = re.sub(r"(\{'LOWER':\s)",r"",str(text))
  step2 = re.sub(r"[\[{\'\,\"}\]]",r"",step1)
  return step2



In [240]:
df_pattern['cleaned_pattern'] = df_pattern.pattern.apply(lambda x: cleanse_text(x))
df_pattern['label'] = 'U_' + df_pattern.label.astype(str)
df_pattern.head()

Unnamed: 0,label,pattern,id,cleaned_pattern
0,U_dish,[{'LOWER': ''mpanatigghi'}],'mpanatigghi,mpanatigghi
1,U_dish,[{'LOWER': ''nduja'}],'nduja,nduja
2,U_dish,"[{'LOWER': ''ota'}, {'LOWER': ''ika'}]",'ota_'ika,ota ika
3,U_dish,"[{'LOWER': '.amaro'}, {'LOWER': 'ramazzotti'}]",.amaro_ramazzotti,.amaro ramazzotti
4,U_dish,"[{'LOWER': '15'}, {'LOWER': 'bean'}, {'LOWER':...",15_bean_soup,15 bean soup


In [241]:
df_pattern['tup_col'] = list(zip(df_pattern.cleaned_pattern, df_pattern.label))
df_pattern['merged'] = df_pattern.apply(lambda row: {row['cleaned_pattern']:row['label']}, axis=1)
df_pattern.head()



Unnamed: 0,label,pattern,id,cleaned_pattern,tup_col,merged
0,U_dish,[{'LOWER': ''mpanatigghi'}],'mpanatigghi,mpanatigghi,"(mpanatigghi, U_dish)",{'mpanatigghi': 'U_dish'}
1,U_dish,[{'LOWER': ''nduja'}],'nduja,nduja,"(nduja, U_dish)",{'nduja': 'U_dish'}
2,U_dish,"[{'LOWER': ''ota'}, {'LOWER': ''ika'}]",'ota_'ika,ota ika,"(ota ika, U_dish)",{'ota ika': 'U_dish'}
3,U_dish,"[{'LOWER': '.amaro'}, {'LOWER': 'ramazzotti'}]",.amaro_ramazzotti,.amaro ramazzotti,"(.amaro ramazzotti, U_dish)",{'.amaro ramazzotti': 'U_dish'}
4,U_dish,"[{'LOWER': '15'}, {'LOWER': 'bean'}, {'LOWER':...",15_bean_soup,15 bean soup,"(15 bean soup, U_dish)",{'15 bean soup': 'U_dish'}


In [242]:

ent_list = df_pattern.tup_col.to_list()
ent_list[50:60]

[('agliata', 'U_dish'),
 ('agneau du périgord', 'U_dish'),
 ('agnolotti', 'U_dish'),
 ('agrodolce', 'U_dish'),
 ('agua de sevilla', 'U_dish'),
 ('agua de valencia', 'U_dish'),
 ('aguachile', 'U_dish'),
 ('aguadito de pollo', 'U_dish'),
 ('aguapanela', 'U_dish'),
 ('aguas frescas', 'U_dish')]

In [0]:
nlp = spacy.load("en_core_web_lg")
ruler = EntityRuler(nlp).from_disk("patterns.jsonl")
emoji = Emoji(nlp)
nlp.add_pipe(emoji, first=True)
nlp.add_pipe(ruler, before='ner')
#merge_ents = nlp.create_pipe("merge_entities")
#nlp.add_pipe(merge_ents, after='ner')

NameError: ignored

In [16]:
ruler.patterns[10:15]

[{'id': '5_hour_energy',
  'label': 'dish',
  'pattern': [{'LOWER': '5'}, {'LOWER': 'hour'}, {'LOWER': 'energy'}]},
 {'id': '7_and_7',
  'label': 'dish',
  'pattern': [{'LOWER': '7'}, {'LOWER': 'and'}, {'LOWER': '7'}]},
 {'id': 'a_gei',
  'label': 'dish',
  'pattern': [{'LOWER': 'a'}, {'LOWER': 'gei'}]},
 {'id': 'a_thoke',
  'label': 'dish',
  'pattern': [{'LOWER': 'a'}, {'LOWER': 'thoke'}]},
 {'id': 'aachener_printen',
  'label': 'dish',
  'pattern': [{'LOWER': 'aachener'}, {'LOWER': 'printen'}]}]

In [244]:
nlp.pipe_names

['emoji', 'tagger', 'parser', 'entity_ruler', 'ner']

In [18]:
ruler.patterns[6200]

{'id': 'litsea_cubeba',
 'label': 'spice',
 'pattern': [{'LOWER': 'litsea'}, {'LOWER': 'cubeba'}]}

In [326]:
doc1 = nlp("I love fettuccine cheese 💚🌿 but hate buffalo wings.")
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc1.ents])

displacy.render(doc1,'ent',jupyter=True)



[('fettuccine', 'dish', 'fettuccine'), ('buffalo wings', 'dish', 'buffalo_wings')]


In [72]:
doc2 = nlp("Avocado 🌿 are good but the ajwain is great")
displacy.render(doc2,'ent',jupyter=True)

In [21]:
nlp.pipe_names

['emoji', 'tagger', 'parser', 'entity_ruler', 'ner']

In [16]:
doc3 = nlp("Baguette is the best bread in the whole world - bagels come close and ciabatta is also up there")
displacy.render(doc3,'ent',jupyter=True)

In [18]:
nlp.disable_pipes('entity_ruler')

[('entity_ruler', <spacy.pipeline.entityruler.EntityRuler at 0x7f5fdaccc630>)]

In [19]:
doc3 = nlp("Baguette is the best chaunk bread in the whole world - bagels come close and ciabatta is also up there")
displacy.render(doc3,'ent',jupyter=True)

In [17]:
reviews = pd.read_csv('reviews_text.csv',usecols=['_id', 'text'])
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56156 entries, 0 to 56155
Data columns (total 2 columns):
_id     56156 non-null object
text    56155 non-null object
dtypes: object(2)
memory usage: 877.6+ KB


In [18]:
reviews.dropna(inplace=True)
reviews.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 56155 entries, 0 to 56155
Data columns (total 2 columns):
_id     56155 non-null object
text    56155 non-null object
dtypes: object(2)
memory usage: 1.3+ MB


In [382]:
texts = reviews['text'][0:1000]
texts

0      Burger joint offers a wide range of cheeseburg...
1      It was really good. The mushroom broth was esp...
2      $8.90 for sesame rice, mushroom rendang, curry...
3      2 mains + 1 green rice bento from greendot. Re...
4      The lion mane mushroom rendang is so delicious...
                             ...                        
995    This dish is 6-spoon appetizer made of tea lea...
996    These churros are so simple, original, and cri...
997    Warm fluffy soft bread with avocado and tofu f...
998    The avo + tofu didn’t fly well together but th...
999    Super original and definitely worth trying if ...
Name: text, Length: 1000, dtype: object

In [0]:
ingredients_list = ['pistachio', 'almond', 'walnut', 'cashew', 'peanuts', 'macadamia', 'hazelnuts', 'pecan', 'brazil nut', 'pine nut' ]
issues_list = ['crumble - maybe a verb', 'milk']
food_times = 
to_be_added_list = ['spirulina', 'herbs', 'micro-greens', 'greens','spinach','eggs', 'nut butters', 'cream cheese', 'cake', 'pepper', 'zucchini', 'aubergines',  'tomato',\
                    'oysters', 'kale', 'cucumber', 'quinoa', 'tomatoes', 'onion' ,'garlic', 'avocado', 'chilli', 'dark chocolate', 'mushrooms', 'coriander' ,'corn chips',\
                    'marinara', 'donuts', 'olives', 'crackers', 'waffles', 'strawberry', 'blueberry', 'raspberry', 'compote', 'mayo', 'mayonnaise', 'soy milk', 'oat milk', \
                    'coconut milk', 'almond milk', 'caffeine', 'milk tea', 'cauliflower', 'matcha', 'sorbet', 'banana', 'goji berry', 'chickpea', 'spinach', 'romaine',\
                    'arugula', 'cacao', 'Portobello', 'pear', 'plant milk', 'BBQ sauce', 'artichoke', 'mango', 'sandwich', 'capsicum', 'wedge', 'kang kong', 'seitan',\
                    'ciabatta', 'focaccia', 'bean sprouts', 'olive oil']

In [27]:
check = np.random.randint(0,50000,3)
print(check)

for doc in list(nlp.pipe(texts[check])):
  displacy.render(doc,'ent',jupyter=True)
#  displacy.render(doc,'dep',jupyter=True,options={'distance': 60, 'fine_grained': True})



[49671 32259 46827]


In [28]:

displacy.render(nlp("Very fresh and juicy coconut FRUIT Lassi DISH . Didn't thought it would be so fresh"),'dep',jupyter=True)

In [29]:
doc = nlp("I like London. I also like Bangalore")
tok_list = [token for token in doc]
print(tok_list, len(tok_list))


[I, like, London, ., I, also, like, Bangalore] 8


In [30]:
tags = ['O' for tok in tok_list]
tags

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

In [0]:
tags[2] = 'U-LOC'
tags[7] = 'U-LOC'
tags

['O', 'O', 'U-LOC', 'O', 'O', 'O', 'O', 'U-LOC']

In [0]:
entities = offsets_from_biluo_tags(doc, tags)
entities

[(7, 13, 'LOC'), (27, 36, 'LOC')]

In [31]:
doc = nlp("I love Punjabi cuisine.")
[ent.label_ for ent in doc.ents]

['NORP']

['NORP']

In [32]:
tok_list = [token for token in doc]
print(tok_list, len(tok_list))
tags = ['O' for tok in tok_list]
print(tags)

[I, love, Punjabi, cuisine, .] 5
['O', 'O', 'O', 'O', 'O']


In [0]:
tags[2] = 'U-NORP'
print(tags)
entities = offsets_from_biluo_tags(doc, tags)
print(entities)


['O', 'O', 'U-NORP', 'O', 'O']
[(7, 15, 'NORP')]


['O', 'O', 'U-LOC', 'O']

In [0]:
json_data = docs_to_json([doc])
json_data['paragraphs'][0]['sentences'][0]['tokens'][2]['ner']

'U-NORP'

In [0]:
import srsly

In [0]:
f = open('rice.txt')
docs = [] # initialize a list to be populated wih strings
lines = f.readlines()  # returns a list of srings from the txt file
for line in lines:
    # print(line[:]) # display the sentence from that line
    doc = nlp(line) # convert string into a spacy doc object using nlp
    docs.append(doc) # add new doc to the list of docs

json_data = docs_to_json(docs) # convert doc into a json file
srsly.write_json('spacy_train.json', [json_data])


In [0]:
f.close()

In [0]:
check_list = [('bok','U_vegetable'),('choy','U_vegetable'),\
              ('bean','U_vegetable'), ('sprouts','U_vegetable'),\
              ('soffritto','U_dish'),('patties','U_dish'),('cakes','U_dish'),('parmesan','U_dish'),('cheese','U_dish'),('breadcrumbs','U_dish'),\
              ('soy','U_condiment'),('sauce','U_condiment'),\
              ('olive','U_condiment'),('oil','U_condiment'),\
              ('wine','U_beverage'),\
              ('3/4-inch','U_QUANTITY'), ('2-inch','U_QUANTITY'),('3-inch','U_QUANTITY'),('6-inch','U_QUANTITY'),\
              ('first','U_ORDINAL'),('second','U_ORDINAL'),('third','U_ORDINAL'),('fourth','U_ORDINAL'),('fifth','U_ORDINAL'),('last','U_ORDINAL'),\
              ('1','U_CARDINAL'),('2','U_CARDINAL'),('3','U_CARDINAL'),('4','U_CARDINAL'),('5','U_CARDINAL'),('6','U_CARDINAL'),('10','U_CARDINAL'),\
              ('100','U_CARDINAL'),('one','U_CARDINAL'),('two','U_CARDINAL'),('three','U_CARDINAL'),('four','U_CARDINAL'),('five','U_CARDINAL'),('six','U_CARDINAL'),\
              ('seven','U_CARDINAL'),('eight','U_CARDINAL'),('nine','U_CARDINAL'),('ten','U_CARDINAL'),\
              ('minute','U_TIME'),('minutes','U_TIME'),('seconds','U_TIME'),('hour','U_TIME'),('hours','U_TIME'),('day','U_TIME')]

In [64]:
''.join(list(set([tup[1] for tup in ent_list + check_list if tup[0] == 'rice'])))

'U_dish'

In [0]:
f = open('rice.txt')
docs = [] # initialize a list to be populated wih strings
lines = f.readlines()  # returns a list of srings from the txt file
for line in lines:
    # print(line[:]) # display the sentence from that line
    doc = nlp(line) # convert string into a spacy doc object using nlp
    docs.append(doc) # add new doc to the list of docs
f.close()



In [0]:
train_gen = (line for line in open('rice.txt'))
type(train_gen)
to_train_ents = []

In [127]:
ls

 bread_keyword_patterns.npz       pattern_pastry.jsonl
 breads_list_2.jsonl              patterns_2019_12_26.jsonl
 breads_list.jsonl                patterns.jsonl
 CoherenceScore.csv               pattern_spice.jsonl
 condiment_keyword_patterns.npz   pattern_vegetable.jsonl
 condiments_list_2.jsonl          [0m[01;34mpending_images[0m/
 condiments_list.jsonl            pyLDAvis_10.html
 datalist_corrected.csv           pyLDAvis_11.html
 df_check_breads.csv              pyLDAvis_12.html
 df_check_condiments.csv          pyLDAvis_13.html
 df_check_de.csv                  pyLDAvis_14.html
 df_check_en.csv                  pyLDAvis_15.html
 df_check_es.csv                  pyLDAvis_16.html
 df_check_fr.csv                  pyLDAvis_17.html
 df_check_fruits.csv              pyLDAvis_18.html
 df_check_fy.csv                  pyLDAvis_19.html
 df_check_id.csv                  pyLDAvis_20.html
 df_check_it.csv                  pyLDAvis_21.html
 df_check_ms.csv                  pyLDAvis_22.

In [364]:
l = next(train_gen)
tok_list = [token.text for token in nlp(l)] 
#print(l, tok_list)
tag_list = ['O' for token in nlp(l)]
#print(tag_list)
matched_ents = [(i,item) for i, item in enumerate(tok_list) if item.lower() in [i[0] for i in ent_list + check_list]] 
#print(matched_ents)

for ent in matched_ents:
  ent_tag = ''.join(list(set([tup[1] for tup in ent_list + check_list if tup[0] == ent[1].lower()])))
  print(ent[0], ent[1], ent_tag)
  tag_list[ent[0]] = ent_tag

#print(tag_list)
entities = offsets_from_biluo_tags(nlp(l), tag_list)
print(entities)
to_train_ents.append((l, dict(entities = entities)))
displacy.render(nlp(l),'ent',jupyter=True)
print(l, entities)

#displacy.render(l,'dep',jupyter=True)

3 five U_CARDINAL
5 seven U_CARDINAL
6 minutes U_TIME
[(16, 20, 'CARDINAL'), (24, 29, 'CARDINAL'), (30, 37, 'TIME')]


Leave alone for five to seven minutes, then serve. [(16, 20, 'CARDINAL'), (24, 29, 'CARDINAL'), (30, 37, 'TIME')]


In [365]:
len(to_train_ents)
#to_train_ents[-5:]

37

In [78]:
for _, annotations in to_train_ents:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

NameError: ignored

In [82]:
to_train_ents = []

for _, doc in enumerate(docs):
  print(_)
  tok_list = [token.text for token in doc] 
#  print(tok_list)
  tag_list = ['O' for token in doc]
#  print(tag_list)
  matched_ents = [(i,item) for i, item in enumerate(tok_list) if item.lower() in [i[0] for i in ent_list + check_list]] 
#  print(matched_ents)
  print(doc)
  for ent in matched_ents:
    ent_tag = ''.join(list(set([tup[1] for tup in ent_list + check_list if tup[0] == ent[1].lower()])))
    print(ent[0], ent[1], ent_tag)
    tag_list[ent[0]] = ent_tag
  
#  print(tag_list)
  entities = offsets_from_biluo_tags(doc, tag_list)
#  print(entities)
  to_train_ents.append((doc, dict(entities = entities)))
  print(doc, entities)

0
If using tofu, cook it.

2 tofu U_dish
If using tofu, cook it.
 [(9, 13, 'dish')]
1
Cook the rice as you normally would, but be careful not to use too much water. The rice should not be too sticky. It may help to let the cooked rice sit overnight in the refrigerator.

2 rice U_dish
19 rice U_dish
33 rice U_dish
Cook the rice as you normally would, but be careful not to use too much water. The rice should not be too sticky. It may help to let the cooked rice sit overnight in the refrigerator.
 [(9, 13, 'dish'), (83, 87, 'dish'), (144, 148, 'dish')]
2
If using onions, dice them into whatever size you prefer.

2 onions U_vegetable
If using onions, dice them into whatever size you prefer.
 [(9, 15, 'vegetable')]
3
If using bok choy, separate the stalk part from the leaf part. Slice the leaf part crosswise into 3/4-inch-thick strips, then tear the strips into 2-inch pieces. These pieces will be treated much like the bean sprouts, though perhaps with slightly more cooking. The stalk part w

In [168]:
to_train_ents[0:5]

[(If using tofu, cook it., {'entities': [(9, 13, 'dish')]}),
 (Cook the rice as you normally would, but be careful not to use too much water. The rice should not be too sticky. It may help to let the cooked rice sit overnight in the refrigerator.,
  {'entities': [(9, 13, 'dish'), (83, 87, 'dish'), (137, 148, 'dish')]}),
 (If using onions, dice them into whatever size you prefer.,
  {'entities': [(9, 15, 'vegetable')]}),
 (If using bok choy, separate the stalk part from the leaf part. Slice the leaf part crosswise into 3/4-inch-thick strips, then tear the strips into 2-inch pieces. These pieces will be treated much like the bean sprouts, though perhaps with slightly more cooking. The stalk part will be treated as celery.,
  {'entities': [(9, 17, 'vegetable'),
    (98, 106, 'QUANTITY'),
    (147, 153, 'QUANTITY'),
    (205, 217, 'vegetable'),
    (296, 302, 'vegetable')]}),
 (If using bean sprouts, wash them and discard any that are not white and crunchy.,
  {'entities': [(9, 21, 'vegeta

In [0]:
nlp = spacy.load("en_core_web_lg")
ruler = EntityRuler(nlp).from_disk("patterns.jsonl")
emoji = Emoji(nlp)
nlp.add_pipe(emoji, first=True)
nlp.add_pipe(ruler, before='ner')
nlp.pipe_names

['emoji', 'tagger', 'parser', 'entity_ruler', 'ner']

In [169]:
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in ["ner" , "entity_ruler", "merge_entities"]]
other_pipes

['emoji', 'tagger', 'parser']

In [0]:
import itertools
merged = list(itertools.chain(*[tup[1]['entities'] for tup in to_train_ents]))

In [286]:
[t[2] for t in merged]

['dish',
 'dish',
 'dish',
 'dish',
 'vegetable',
 'vegetable',
 'vegetable',
 'QUANTITY',
 'QUANTITY',
 'vegetable',
 'vegetable',
 'vegetable',
 'vegetable',
 'vegetable',
 'dish',
 'condiment',
 'dish',
 'dish',
 'condiment',
 'dish',
 'condiment',
 'condiment',
 'dish',
 'vegetable',
 'vegetable',
 'vegetable',
 'condiment',
 'CARDINAL',
 'condiment',
 'condiment',
 'dish',
 'vegetable',
 'vegetable',
 'vegetable',
 'dish',
 'dish',
 'vegetable',
 'condiment',
 'condiment',
 'condiment',
 'dish',
 'dish',
 'condiment',
 'condiment',
 'beverage',
 'dish',
 'dish',
 'beverage',
 'dish',
 'dish',
 'TIME',
 'dish',
 'dish',
 'dish',
 'dish',
 'CARDINAL',
 'dish',
 'TIME',
 'condiment',
 'condiment',
 'condiment',
 'condiment',
 'dish',
 'dish',
 'dish',
 'dish',
 'CARDINAL',
 'dish',
 'dish',
 'dish',
 'condiment',
 'vegetable',
 'vegetable',
 'dish',
 'dish',
 'dish',
 'dish',
 'dish',
 'dish',
 'ORDINAL',
 'condiment',
 'herb',
 'dish',
 'dish',
 'spice',
 'dish',
 'vegetable',
 'veg

In [231]:
[tup for tup in to_train_ents]

[(If using tofu, cook it., {'entities': [(9, 13, 'dish')]}),
 (Cook the rice as you normally would, but be careful not to use too much water. The rice should not be too sticky. It may help to let the cooked rice sit overnight in the refrigerator.,
  {'entities': [(9, 13, 'dish'), (83, 87, 'dish'), (144, 148, 'dish')]}),
 (If using onions, dice them into whatever size you prefer.,
  {'entities': [(9, 15, 'vegetable')]}),
 (If using bok choy, separate the stalk part from the leaf part. Slice the leaf part crosswise into 3/4-inch-thick strips, then tear the strips into 2-inch pieces. These pieces will be treated much like the bean sprouts, though perhaps with slightly more cooking. The stalk part will be treated as celery.,
  {'entities': [(9, 12, 'vegetable'),
    (13, 17, 'vegetable'),
    (98, 106, 'QUANTITY'),
    (147, 153, 'QUANTITY'),
    (205, 209, 'vegetable'),
    (210, 217, 'vegetable'),
    (296, 302, 'vegetable')]}),
 (If using bean sprouts, wash them and discard any that are

In [366]:
model = 'en_core_web_lg'



"""Load the model, set up the pipeline and train the entity recognizer."""
if model is not None:
    nlp = spacy.load(model)  # load existing spaCy model
    print("Loaded model '%s'" % model)
else:
    nlp = spacy.blank("en")  # create blank Language class
    print("Created blank 'en' model")

# nlp.create_pipe works for built-ins that are registered with spaCy
if "ner" not in nlp.pipe_names:
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner, last=True)
# otherwise, get it so we can add labels
else:
    ner = nlp.get_pipe("ner")

# add labels
for _, annotations in to_train_ents:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

Loaded model 'en_core_web_lg'


In [367]:
ner.labels

('PRODUCT',
 'PERCENT',
 'dish',
 'LANGUAGE',
 'condiment',
 'LOC',
 'FAC',
 'ORDINAL',
 'herb',
 'NORP',
 'MONEY',
 'CARDINAL',
 'spice',
 'QUANTITY',
 'beverage',
 'TIME',
 'ORG',
 'EVENT',
 'PERSON',
 'LAW',
 'DATE',
 'GPE',
 'vegetable',
 'WORK_OF_ART')

In [0]:
from spacy.gold import GoldParse

In [374]:
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in ["ner"]]
with nlp.disable_pipes(*other_pipes):  # only train NER
    # reset and initialize the weights randomly – but only if we're
    # training a new model
    if model is None:
        nlp.begin_training()



    for itn in range(100):
      random.shuffle(clean_train_ents)
      losses = {}

#      for item in clean_train_ents:
# batch up the examples using spaCy's minibatch
      batches = minibatch(clean_train_ents, size=compounding(4.0, 32.0, 1.001))

      for batch in batches:
        texts, annotations = zip(*batch)
#        print([item[0]], [item[1]])

#        print(texts)
#        print(annotations)
        nlp.update(texts, annotations, drop=0.5, losses=losses)
        print(losses)

{'ner': 88.65077209472656}
{'ner': 191.41206622123718}
{'ner': 274.0754544734955}
{'ner': 376.19938254356384}
{'ner': 484.4726674556732}
{'ner': 600.2701672315598}
{'ner': 688.1295403242111}
{'ner': 780.2445400953293}
{'ner': 968.3551481962204}
{'ner': 985.2630408851546}
{'ner': 103.13918900489807}
{'ner': 173.14196467399597}
{'ner': 211.9196584224701}
{'ner': 356.4261119365692}
{'ner': 467.49747109413147}
{'ner': 550.5530912876129}
{'ner': 628.9059693813324}
{'ner': 808.8429419994354}
{'ner': 976.6488535404205}
{'ner': 1012.5997161976993}
{'ner': 106.78622198104858}
{'ner': 171.02277731895447}
{'ner': 358.5514028072357}
{'ner': 456.2937424182892}
{'ner': 547.4443485736847}
{'ner': 682.4905865192413}
{'ner': 785.4192407131195}
{'ner': 868.8943121433258}
{'ner': 975.0826132297516}
{'ner': 996.660508826375}
{'ner': 79.75613355636597}
{'ner': 203.94504261016846}
{'ner': 291.61164689064026}
{'ner': 386.6651213169098}
{'ner': 500.7001885175705}
{'ner': 620.7740014791489}
{'ner': 712.7062219

In [375]:
pwd

'/gdrive/My Drive/abv_reviews'

In [0]:
nlp.to_disk('trained_model_2020_01_06')

In [311]:




# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in ["ner"]]
with nlp.disable_pipes(*other_pipes):  # only train NER
    # reset and initialize the weights randomly – but only if we're
    # training a new model
    if model is None:
        nlp.begin_training()



    for itn in range(100):
      random.shuffle(to_train_ents)
      losses = {}
# batch up the examples using spaCy's minibatch
      batches = minibatch(to_train_ents, size=compounding(4.0, 32.0, 1.001))

      for batch in batches:
        texts, annotations = zip(*batch)
#        gold = GoldParse(texts, entities=entities)
        print(texts)
        print(annotations)
        nlp.update(texts, annotations, drop=0.5, losses=losses)
        print(losses)

(Add the soy sauce, meat(s), eggs, and all vegetables save for the onions and bean sprouts.
, Add the wine. A hot pan will cause the alcohol to evaporate more quickly (and create a nice dramatic effect), but make sure you don't burn the rice or the soffritto.
, Left over risotto is not particularly nice reheated on its own. So as not to waste the leftovers, you can form day old risotto into cakes or patties, coat them in flour and shallow fry them in olive oil, making risotto cakes. This can be served with a salad.
, Add a generous amount of oil to a large frying pan, more than you think you'd need to avoid sticking. If using a type of sausage or other excessively fatty meat, not as much oil may be needed because of the amount of fat.
)
({'entities': [(8, 11, 'condiment'), (12, 17, 'condiment'), (28, 32, 'dish'), (66, 72, 'vegetable'), (77, 81, 'vegetable'), (82, 89, 'vegetable')]}, {'entities': [(8, 12, 'beverage'), (143, 147, 'dish'), (155, 164, 'dish')]}, {'entities': [(10, 17, 'dis

ValueError: ignored

In [83]:
ner.labels

('PRODUCT',
 'PERCENT',
 'dish',
 'LANGUAGE',
 'condiment',
 'LOC',
 'FAC',
 'ORDINAL',
 'herb',
 'NORP',
 'MONEY',
 'CARDINAL',
 'spice',
 'QUANTITY',
 'beverage',
 'TIME',
 'ORG',
 'EVENT',
 'PERSON',
 'LAW',
 'DATE',
 'GPE',
 'vegetable',
 'WORK_OF_ART')

In [108]:
[tup for tup in to_train_ents if tup[0] == 'condimentU_spice']

[]

In [105]:
to_train_ents[5][1]['entities']

[(235, 242, 'dish')]

In [0]:
import re


def trim_entity_spans(data: list) -> list:
    """Removes leading and trailing white spaces from entity spans.

    Args:
        data (list): The data to be cleaned in spaCy JSON format.

    Returns:
        list: The cleaned data.
    """
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        entities = annotations['entities']

        print(text, annotations)
        valid_entities = []
        for start, end, label in entities:
          valid_start = start
          valid_end = end
          print(valid_start, valid_end, len(text))
          while valid_start < len(text) and invalid_span_tokens.match(text[valid_start]):
            valid_start += 1
          while valid_end > 1 and invalid_span_tokens.match(text[valid_end - 1]):
            valid_end -= 1
          valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])

    return cleaned_data

In [369]:
clean_train_ents = trim_entity_spans(to_train_ents)


If using tofu, cook it.
 {'entities': [(9, 13, 'dish')]}
9 13 24
Cook the rice as you normally would, but be careful not to use too much water. The rice should not be too sticky. It may help to let the cooked rice sit overnight in the refrigerator.
 {'entities': [(9, 13, 'dish'), (83, 87, 'dish'), (144, 148, 'dish')]}
9 13 184
83 87 184
144 148 184
If using onions, dice them into whatever size you prefer.
 {'entities': [(9, 15, 'vegetable')]}
9 15 58
If using bok choy, separate the stalk part from the leaf part. Slice the leaf part crosswise into 3/4-inch-thick strips, then tear the strips into 2-inch pieces. These pieces will be treated much like the bean sprouts, though perhaps with slightly more cooking. The stalk part will be treated as celery.
 {'entities': [(9, 12, 'vegetable'), (13, 17, 'vegetable'), (98, 106, 'QUANTITY'), (147, 153, 'QUANTITY'), (205, 209, 'vegetable'), (210, 217, 'vegetable'), (296, 302, 'vegetable')]}
9 12 304
13 17 304
98 106 304
147 153 304
205 209 304
210 

In [370]:
clean_train_ents

[['If using tofu, cook it.\n', {'entities': [[9, 13, 'dish']]}],
 ['Cook the rice as you normally would, but be careful not to use too much water. The rice should not be too sticky. It may help to let the cooked rice sit overnight in the refrigerator.\n',
  {'entities': [[9, 13, 'dish'], [83, 87, 'dish'], [144, 148, 'dish']]}],
 ['If using onions, dice them into whatever size you prefer.\n',
  {'entities': [[9, 15, 'vegetable']]}],
 ['If using bok choy, separate the stalk part from the leaf part. Slice the leaf part crosswise into 3/4-inch-thick strips, then tear the strips into 2-inch pieces. These pieces will be treated much like the bean sprouts, though perhaps with slightly more cooking. The stalk part will be treated as celery.\n',
  {'entities': [[9, 12, 'vegetable'],
    [13, 17, 'vegetable'],
    [98, 106, 'QUANTITY'],
    [147, 153, 'QUANTITY'],
    [205, 209, 'vegetable'],
    [210, 217, 'vegetable'],
    [296, 302, 'vegetable']]}],
 ['If using bean sprouts, wash them and dis

In [88]:
doc1 = nlp("I love fettuccine cheese 💚🌿 but hate rice, bok choy  and onion.")
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc1.ents])

displacy.render(doc1,'ent',jupyter=True)

[]


In [0]:
doc1 = nlp("I love fettuccine cheese 💚🌿 but hate rice, bok choy  and onion.")
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc1.ents])

displacy.render(doc1,'ent',jupyter=True)


[('rice', 'dish', ''), ('bok', 'dish', ''), ('onion', 'dish', '')]


In [0]:
ner.labels

('GPE',
 'herb',
 'condiment',
 'ORDINAL',
 'PERCENT',
 'LAW',
 'LANGUAGE',
 'MONEY',
 'PERSON',
 'LOC',
 'fruit',
 'dish',
 'WORK_OF_ART',
 'CARDINAL',
 'bread',
 'spice',
 'ORG',
 'PRODUCT',
 'DATE',
 'TIME',
 'QUANTITY',
 'NORP',
 'EVENT',
 'FAC',
 'vegetable')

In [0]:
for _, annotations in to_train_ents:
    for ent in annotations.get("entities"):
        print(ent[2])

dish
dish
dish
dish
dish
condiment
dish
vegetable
vegetable
TIME
condiment
CARDINAL
dish
condiment
dish
ORDINAL
TIME
dish
dish
CARDINAL
condiment
dish
ORDINAL
ORDINAL
dish
dish
dish
dish
vegetable
vegetable
condiment
condiment
condiment
dish
dish
dish
dish
vegetable
vegetable
dish
dish
dish
dish
dish
dish
dish
dish
ORDINAL
dish
dish
dish
dish
condiment
dish
dish
vegetable
vegetable
vegetable
dish
dish
dish
CARDINAL
CARDINAL
TIME
spice
dish
dish
condiment
dish
ORDINAL
ORDINAL
TIME
condiment
vegetable
vegetable
vegetable
dish
condiment
dish
dish
dish
CARDINAL
TIME
dish
dish
dish
vegetable
vegetable
dish
dish
dish
dish
dish
vegetable
condiment
dish
GPE


In [379]:
check = np.random.randint(0,1000,10)
print(check)



[589 595 386 476 882 141 977 479 612 319]


In [384]:
texts[check]

589    I ordered the high rise pizza dough, red sauce...
595    Spinach and pumpkin flavoured ravioli was an e...
386    The vegan options to customize my salad are aw...
476    I love that the restaurant has a good range of...
882    Like I said ordered 3 different types of dimsu...
141    One of my favorite food at WDSB! Various types...
977    This place hand dips their corn dogs in their ...
479    This dish was a terrible disappointment. The r...
612    As a vegan this is about as good as it gets fo...
319    A rich but not overly creamy soup. Very refres...
Name: text, dtype: object

In [386]:
nlp.pipe_names

['tagger', 'parser', 'ner']

In [0]:
train_rev = (rev for rev in texts)

In [388]:
type(train_rev)

generator

In [397]:
[tup[1] for tup in ent_list + check_list if tup[0] == 'rendang']

['U_dish']

In [425]:
rev = next(train_rev)
tok_list = [token.text for token in nlp(rev)] 
#print(rev, tok_list)
tag_list = ['O' for token in nlp(rev)]
#print(tag_list)
matched_ents = [(i,item) for i, item in enumerate(tok_list) if item.lower() in [i[0] for i in ent_list + check_list]] 
print(matched_ents)

for ent in matched_ents:
  ent_tag = ''.join(list(set([tup[1] for tup in ent_list + check_list if tup[0] == ent[1].lower()])))
  print(ent[0], ent[1], ent_tag)
  tag_list[ent[0]] = ent_tag

#print(tag_list)
entities = offsets_from_biluo_tags(nlp(rev), tag_list)
print(entities)
to_train_ents.append((rev, dict(entities = entities)))
displacy.render(nlp(rev),'ent',jupyter=True)
#print(rev, entities)

#displacy.render(l,'dep',jupyter=True)

n_entities_to_be_added = int(input("How many entities are missing: "))
if n_entities_to_be_added == 0:
  print('You entered {} Thanks. So no new entities to be considered'.format(n_entities_to_be_added))
else:
  for user_input in range(n_entities_to_be_added):
    e_name = input('Provide the name of the entity to be added: ')
    e_label = input('Provide the label of the entity to be added:')
    print(e_name, 'U_'+e_label)
    check_list.append((e_name, 'U_'+e_label))

[(2, 'falafel'), (14, 'falafel'), (27, 'potato'), (34, 'garlic'), (72, 'sauce'), (77, 'eggs'), (80, 'salad'), (87, 'onions')]
2 falafel U_dish
14 falafel U_dish
27 potato U_vegetable
34 garlic U_spiceU_vegetable
72 sauce U_condiment
77 eggs U_dish
80 salad U_dish
87 onions U_vegetable
[(12, 19, 'dish'), (83, 90, 'dish'), (157, 163, 'vegetable'), (198, 204, 'spiceU_vegetable'), (376, 381, 'condiment'), (403, 407, 'dish'), (411, 416, 'dish'), (444, 450, 'vegetable')]


How many entities are missing: 0
You entered 0 Thanks. So no new entities to be considered


In [409]:
check_list

[('bok', 'U_vegetable'),
 ('choy', 'U_vegetable'),
 ('bean', 'U_vegetable'),
 ('sprouts', 'U_vegetable'),
 ('soffritto', 'U_dish'),
 ('patties', 'U_dish'),
 ('cakes', 'U_dish'),
 ('parmesan', 'U_dish'),
 ('cheese', 'U_dish'),
 ('breadcrumbs', 'U_dish'),
 ('soy', 'U_condiment'),
 ('sauce', 'U_condiment'),
 ('olive', 'U_condiment'),
 ('oil', 'U_condiment'),
 ('wine', 'U_beverage'),
 ('3/4-inch', 'U_QUANTITY'),
 ('2-inch', 'U_QUANTITY'),
 ('3-inch', 'U_QUANTITY'),
 ('6-inch', 'U_QUANTITY'),
 ('first', 'U_ORDINAL'),
 ('second', 'U_ORDINAL'),
 ('third', 'U_ORDINAL'),
 ('fourth', 'U_ORDINAL'),
 ('fifth', 'U_ORDINAL'),
 ('last', 'U_ORDINAL'),
 ('1', 'U_CARDINAL'),
 ('2', 'U_CARDINAL'),
 ('3', 'U_CARDINAL'),
 ('4', 'U_CARDINAL'),
 ('5', 'U_CARDINAL'),
 ('6', 'U_CARDINAL'),
 ('10', 'U_CARDINAL'),
 ('100', 'U_CARDINAL'),
 ('one', 'U_CARDINAL'),
 ('two', 'U_CARDINAL'),
 ('three', 'U_CARDINAL'),
 ('four', 'U_CARDINAL'),
 ('five', 'U_CARDINAL'),
 ('six', 'U_CARDINAL'),
 ('seven', 'U_CARDINAL'),
 ('

In [385]:
for doc in list(nlp.pipe(texts[check])):
  displacy.render(doc,'ent',jupyter=True)
#  displacy.render(doc,'dep',jupyter=True,options={'distance': 60, 'fine_grained': True})