In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import warnings

warnings.filterwarnings("ignore")

import pandas as pd

pd.set_option("max_row", 100000)

# used to evaluate model
from sklearn_crfsuite import metrics

# to set random seed
import numpy as np

np.random.seed(42)

import math

<IPython.core.display.Javascript object>

In [3]:
data = pd.read_csv(
    "../1 Scraping Data and Annotating Data/tagged_data.csv", index_col=0
)

<IPython.core.display.Javascript object>

In [4]:
agg_func = lambda s: [
    (w, p, t)
    for w, p, t in zip(
        s["word"].values.tolist(), s["pos"].values.tolist(), s["tag"].values.tolist()
    )
]

<IPython.core.display.Javascript object>

In [5]:
grouped = data.groupby("sentence#").apply(agg_func)

<IPython.core.display.Javascript object>

In [6]:
sentences = [s for s in grouped]

<IPython.core.display.Javascript object>

In [7]:
def sent2labels(sent):
    """Retrieve all the labels from sentences which are lists containing (w, p, t)"""
    return [label for token, postag, label in sent]

<IPython.core.display.Javascript object>

In [8]:
y = [sent2labels(s) for s in sentences]

<IPython.core.display.Javascript object>

In [9]:
# split into train and test
boundary = math.ceil(len(y) * 0.8)

<IPython.core.display.Javascript object>

In [10]:
# test data
y_test = y[boundary:]

<IPython.core.display.Javascript object>

In [11]:
#################################################
# Dependency parser
###################################################

# Import modules
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")

<IPython.core.display.Javascript object>

In [12]:
# Locate validation set from sentence#212 onward (ID:212 to ID:263)
parse_data = data.iloc[2770:3445]

# Inspect the dataset
parse_data.head()

Unnamed: 0,sentence#,word,pos,tag
2770,212.0,Spray,NN,U-Action
2771,212.0,baking,NN,B-Utensil
2772,212.0,sheet,NN,L-Utensil
2773,212.0,with,IN,O
2774,212.0,non,JJ,B-Utensil


<IPython.core.display.Javascript object>

In [13]:
# Set the sentenceID as index for merging ID:212 to ID:263
parse_data.set_index("sentence#", inplace=True)
parse_data.head()

Unnamed: 0_level_0,word,pos,tag
sentence#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
212.0,Spray,NN,U-Action
212.0,baking,NN,B-Utensil
212.0,sheet,NN,L-Utensil
212.0,with,IN,O
212.0,non,JJ,B-Utensil


<IPython.core.display.Javascript object>

In [14]:
# Merge the words back to its orginal form by sentence ID
combined_sentence = (
    parse_data.groupby(["sentence#"])["word"]
    .apply(lambda x: " ".join(x.astype(str)))
    .reset_index()
)
pd.set_option("max_colwidth", 200)
combined_sentence.head()

Unnamed: 0,sentence#,word
0,212.0,Spray baking sheet with non - stick spray .
1,213.0,Spread potatoes onto baking sheet .
2,214.0,Bake for 15 minutes ; toss and turn ; bake for five minutes ; add herbs and toss ; bake a final five minutes until the potatoes are brown and crisp .
3,215.0,Serve immediately .
4,216.0,"In 1/4 cup butter , saute carrots , onion , celery and broccoli stems for 5 minutes ."


<IPython.core.display.Javascript object>

In [15]:
# Convert the dataframe to the nested list
combined_sentences_list = combined_sentence.reset_index()[
    ["sentence#", "word"]
].values.tolist()
for item in combined_sentences_list:
    print(item[1])

Spray baking sheet with non - stick spray .
Spread potatoes onto baking sheet .
Bake for 15 minutes ; toss and turn ; bake for five minutes ; add herbs and toss ; bake a final five minutes until the potatoes are brown and crisp .
Serve immediately .
In 1/4 cup butter , saute carrots , onion , celery and broccoli stems for 5 minutes .
Add thyme , oregano and basil ; saute 5 minutes more .
Add wine and deglaze pan .
Add hot chicken stock and reduce by one - third .
Add Worcestershire sauce , Tabasco , smoked chicken , beans and broccoli florets ; simmer 5 minutes .
Add cream , simmer 5 minutes more and season to taste ( thicken with cornstarch if desired ) .
Drop in remaining butter , piece by piece , stirring until melted and serve immediately .
Smoked Chicken : On a covered grill , slightly smoke boneless chicken , cooking to medium rare ( about 30 minutes ) .
Chef Meskan uses applewood chips and does not allow the grill to become too hot .
Preheat oven to 350 degrees .
Grease 13 x 9 b

<IPython.core.display.Javascript object>

In [16]:
# Print(combined_sentence_list)
dp_picked_sent = []
not_dp_picked_sent = []

for sentence in combined_sentences_list:
    ind_sentences = sentence[1]
    doc = nlp(ind_sentences)
    for token in doc:
        if token.pos_ == "VERB":
            # print(int(sentence[0]), " ".join([token.dep_ for token in doc]))
            dep_parser = " ".join([token.dep_ for token in doc])
            dp_picked_sent.append((int(sentence[0]), dep_parser))

        else:
            non_dep_picked_sent = []
        # dp_picked_sent.append((int(sentence[0]), dep_parser))
        # if token.dep_ == "ROOT" or token.dep_ == "conj" or token.dep_ == "nmod" or token.dep_ == "advcl":
        # print(doc, "---->>>")

<IPython.core.display.Javascript object>

In [17]:
# Print(combined_sentence_list)
dp_picked_sent = []

for sentence in combined_sentences_list:
    ind_sentences = sentence[1]
    doc = nlp(ind_sentences)
    for token in doc:
        dep_parser = " ".join([token.dep_ for token in doc])
        dp_picked_sent.append((int(sentence[0]), dep_parser))
        #############################
        # Code for locate the ACTION words
        #############################
        # if token.pos_ == "VERB":
        # if token.dep_ == "ROOT" or token.dep_ == "conj":
        # print(doc, "---->>>", token.text, token.pos_, token.dep_)
        #############################
        # Code End
        #############################

        # if token.pos_ == "VERB":
        # print(int(sentence[0]), " ".join([token.dep_ for token in doc]))
        # dep_parser = " ".join([token.dep_ for token in doc])
    # dp_picked_sent.append((int(sentence[0]), dep_parser))

    # dp_picked_sent.append((int(sentence[0]), dep_parser))
    # if token.dep_ == "ROOT" or token.dep_ == "conj" or token.dep_ == "nmod" or token.dep_ == "advcl":
    # print(doc, "---->>>")

<IPython.core.display.Javascript object>

In [18]:
# Check the sentence with its ID
dp_picked_sent

[(212, 'compound compound ROOT prep dep dep amod pobj punct'),
 (212, 'compound compound ROOT prep dep dep amod pobj punct'),
 (212, 'compound compound ROOT prep dep dep amod pobj punct'),
 (212, 'compound compound ROOT prep dep dep amod pobj punct'),
 (212, 'compound compound ROOT prep dep dep amod pobj punct'),
 (212, 'compound compound ROOT prep dep dep amod pobj punct'),
 (212, 'compound compound ROOT prep dep dep amod pobj punct'),
 (212, 'compound compound ROOT prep dep dep amod pobj punct'),
 (212, 'compound compound ROOT prep dep dep amod pobj punct'),
 (213, 'compound ROOT prep compound pobj punct'),
 (213, 'compound ROOT prep compound pobj punct'),
 (213, 'compound ROOT prep compound pobj punct'),
 (213, 'compound ROOT prep compound pobj punct'),
 (213, 'compound ROOT prep compound pobj punct'),
 (213, 'compound ROOT prep compound pobj punct'),
 (214,
  'advcl prep nummod pobj punct conj cc conj punct ROOT prep nummod pobj punct conj dobj cc conj punct conj det amod nummod do

<IPython.core.display.Javascript object>

In [19]:
set((int(sentence[0]), dep_parser))

{263, 'ROOT cc conj punct'}

<IPython.core.display.Javascript object>

In [20]:
# Print out the result
print(
    "Total # of sentences:",
    len(combined_sentences_list),
    "\n# of Dependency Parser PICKED sentences:",
    len(set(dp_picked_sent)),
)

Total # of sentences: 52 
# of Dependency Parser PICKED sentences: 52


<IPython.core.display.Javascript object>

In [21]:
# Reform the sentence data

# Remove duplicates
dp_picked_a = set(list(dp_picked_sent))
dp_picked_new = list(dp_picked_a)

# Sort the order by ID
dp_picked_final = sorted(dp_picked_new)
dp_picked_final

[(212, 'compound compound ROOT prep dep dep amod pobj punct'),
 (213, 'compound ROOT prep compound pobj punct'),
 (214,
  'advcl prep nummod pobj punct conj cc conj punct ROOT prep nummod pobj punct conj dobj cc conj punct conj det amod nummod dobj mark det nsubj advcl acomp cc conj punct'),
 (215, 'ROOT advmod punct'),
 (216,
  'prep nummod compound nsubj punct compound conj punct conj punct conj cc conj ROOT prep nummod pobj punct'),
 (217, 'ROOT dobj punct conj cc conj punct conj nummod npadvmod advmod punct'),
 (218, 'ROOT dobj cc compound conj punct'),
 (219, 'ROOT amod compound dobj cc conj prep nummod punct pobj punct'),
 (220,
  'ROOT compound dobj punct appos punct amod conj punct conj cc compound conj punct appos nummod appos punct'),
 (221,
  'compound nsubj punct appos nummod npadvmod advmod cc conj aux relcl punct ROOT prep pobj mark advcl punct punct'),
 (222,
  'ROOT prep amod pobj punct conj prep pobj punct acl mark advcl cc conj advmod punct'),
 (223,
  'amod ROOT punc

<IPython.core.display.Javascript object>

In [22]:
for i in dp_picked_final:
    bb = list(i)

<IPython.core.display.Javascript object>

In [23]:
def convert(lst):
    return [i for item in lst for i in item.split()]

<IPython.core.display.Javascript object>

In [24]:
doc

Freeze and enjoy !

<IPython.core.display.Javascript object>

In [25]:
# Replace the dp parse with project tags

# tages = [("ROOT", "conj")]
# o_targe = []

dep_list = []
for i in dp_picked_final:
    p1 = i[1].split()
    p2 = map(
        lambda x: x
        if x == "ROOT" or x == "conj"  # or x == "nmod" or x == "acl" or x == "pcomp"
        else "O",
        p1,
    )
    new = map(lambda x: x if x == "O" else "U-Action", p2)
    dep_list.append((int(i[0]), list(new)))

dep_list

[(212, ['O', 'O', 'U-Action', 'O', 'O', 'O', 'O', 'O', 'O']),
 (213, ['O', 'U-Action', 'O', 'O', 'O', 'O']),
 (214,
  ['O',
   'O',
   'O',
   'O',
   'O',
   'U-Action',
   'O',
   'U-Action',
   'O',
   'U-Action',
   'O',
   'O',
   'O',
   'O',
   'U-Action',
   'O',
   'O',
   'U-Action',
   'O',
   'U-Action',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'U-Action',
   'O']),
 (215, ['U-Action', 'O', 'O']),
 (216,
  ['O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'U-Action',
   'O',
   'U-Action',
   'O',
   'U-Action',
   'O',
   'U-Action',
   'U-Action',
   'O',
   'O',
   'O',
   'O']),
 (217,
  ['U-Action',
   'O',
   'O',
   'U-Action',
   'O',
   'U-Action',
   'O',
   'U-Action',
   'O',
   'O',
   'O',
   'O']),
 (218, ['U-Action', 'O', 'O', 'O', 'U-Action', 'O']),
 (219, ['U-Action', 'O', 'O', 'O', 'O', 'U-Action', 'O', 'O', 'O', 'O', 'O']),
 (220,
  ['U-Action',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'U-Action',
   'O'

<IPython.core.display.Javascript object>

In [26]:
# Assign modified tagged sentence to y_depend_output
y_depend_output = []
for i in dep_list:
    y_depend_output.append(i[1])

<IPython.core.display.Javascript object>

In [27]:
# Display the result of DP parse
print(
    metrics.flat_classification_report(
        y_test, y_depend_output, labels=["U-Action"], digits=3
    )
)

              precision    recall  f1-score   support

    U-Action      0.523     0.695     0.597        82

   micro avg      0.523     0.695     0.597        82
   macro avg      0.523     0.695     0.597        82
weighted avg      0.523     0.695     0.597        82



<IPython.core.display.Javascript object>

In [28]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Add butter to a saucepan and cover with lid")

displacy.render(
    doc, style="dep", jupyter=True,
)
print(doc.to_json())

{'text': 'Add butter to a saucepan and cover with lid', 'ents': [], 'sents': [{'start': 0, 'end': 43}], 'tokens': [{'id': 0, 'start': 0, 'end': 3, 'pos': 'VERB', 'tag': 'VB', 'dep': 'ROOT', 'head': 0}, {'id': 1, 'start': 4, 'end': 10, 'pos': 'NOUN', 'tag': 'NN', 'dep': 'dobj', 'head': 0}, {'id': 2, 'start': 11, 'end': 13, 'pos': 'ADP', 'tag': 'IN', 'dep': 'prep', 'head': 0}, {'id': 3, 'start': 14, 'end': 15, 'pos': 'DET', 'tag': 'DT', 'dep': 'det', 'head': 4}, {'id': 4, 'start': 16, 'end': 24, 'pos': 'NOUN', 'tag': 'NN', 'dep': 'pobj', 'head': 2}, {'id': 5, 'start': 25, 'end': 28, 'pos': 'CCONJ', 'tag': 'CC', 'dep': 'cc', 'head': 0}, {'id': 6, 'start': 29, 'end': 34, 'pos': 'VERB', 'tag': 'VB', 'dep': 'conj', 'head': 0}, {'id': 7, 'start': 35, 'end': 39, 'pos': 'ADP', 'tag': 'IN', 'dep': 'prep', 'head': 6}, {'id': 8, 'start': 40, 'end': 43, 'pos': 'NOUN', 'tag': 'NN', 'dep': 'pobj', 'head': 7}]}


<IPython.core.display.Javascript object>

In [29]:
##########################
# The end
##########################

<IPython.core.display.Javascript object>