Created on March 21st 2020 by Patrick Rotzetter
Last update on Feb 28th 2021

protzetter@bluewin.ch
https://www.linkedin.com/in/rotzetter/

**Small experiment for order mail processing**




In [1]:
# Import libraries
import spacy
from spacy.pipeline import EntityRuler
from spacy.matcher import Matcher,PhraseMatcher
from spacy.symbols import nsubj, VERB, dobj, NOUN, root, xcomp
from spacy import displacy
from spacy.matcher import Matcher
from pathlib import Path
import random  

In [2]:
# Load the latest spaCy model, which is based on the pre-trained RoBERTa transformer.
# Check the spaCy documentation for installation instructions.

from spacy.lang.en import English
import en_core_web_trf
nlp = en_core_web_trf.load()

In [3]:
# Sample order mail, hard-coded here as a single string (no file is read).
# The double spaces inside the message are part of the original mail text.
text = (
    'Hello,  I would like to order a notebook with 16GB and 256 GB disk, '
    'I would like to spend less than 1000 Francs, '
    'what would be the options  Thanks a lot  Patrick'
)

print(text)


Hello,  I would like to order a notebook with 16GB and 256 GB disk, I would like to spend less than 1000 Francs, what would be the options  Thanks a lot  Patrick


In [4]:
# Run the mail through the standard spaCy pipeline.
doc = nlp(text)


In [5]:
# List every named entity found by the model together with its label.
for entity in doc.ents:
    print(entity.text, entity.label_)


16GB QUANTITY
256 GB QUANTITY
less than 1000 Francs MONEY
Patrick PERSON


In [6]:
# Highlight the detected entities directly in the mail text.
displacy.render(doc, minify=True, style="ent")

The default model does not detect "notebook" or "disk" as entities, but it does identify the sender as a person and the RAM and disk sizes as quantities. This is a good start, but still far from a practical solution, so let us add some domain-specific entities that will help us later on.

In [7]:
# Add domain-specific entities to the pipeline through a rule-based entity ruler.
# Each pattern matches a single token on its lower-cased text.
patterns = [{"label": "CURRENCY", "pattern":  [{"lower": "francs"}]},
            {"label": "PART", "pattern":  [{"lower": "disk"}]}]


config = {
   "phrase_matcher_attr": None,
   "validate": True,
   "overwrite_ents": True,  # ruler matches replace overlapping entities from the model
   "ent_id_sep": "||",
}

# add_pipe raises if a component with the same name is already in the pipeline,
# so drop a ruler left over from a previous run to keep this cell re-runnable.
if "entity_ruler" in nlp.pipe_names:
    nlp.remove_pipe("entity_ruler")
ruler = nlp.add_pipe('entity_ruler', config=config)


In [8]:
# Register the domain-specific patterns with the entity ruler.
ruler.add_patterns(patterns)

In [9]:
# Re-run the pipeline, which now includes the entity ruler, and list
# each entity's text next to its label.
doc = nlp(text)
for entity in doc.ents:
    print(entity.text, entity.label_)



16GB QUANTITY
256 GB QUANTITY
disk PART
Francs CURRENCY
Patrick PERSON


In [10]:
# Highlight the entities again, now including the rule-based ones.
displacy.render(doc, minify=True, style="ent")

In [11]:
# Identify specific phrases or word sequences, here the memory and disk sizes.
matcher = PhraseMatcher(nlp.vocab)
terms = ["16 GB", "256 GB"]
# Only run nlp.make_doc to speed things up (the full pipeline is not needed here).
# Stored under its own name so the entity-ruler `patterns` list is not overwritten.
memory_patterns = [nlp.make_doc(term) for term in terms]
# spaCy v3 signature is add(key, docs); add(key, None, *docs) is the legacy v2 form.
matcher.add("MEMORY", memory_patterns)

doc = nlp(text)
matches = matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

16GB
256 GB


In [12]:
# Dependency parsing
# This is where the trained pipeline and its statistical models come in:
# they let spaCy predict which tag or label most likely applies in this context.
# For each token: text, dependency label, head text, head part of speech, children.
for token in doc:
    head = token.head
    children = list(token.children)
    print(token.text, token.dep_, head.text, head.pos_, children)

Hello intj be AUX [,,  ]
, punct Hello INTJ []
  dep Hello INTJ []
I nsubj like VERB []
would aux like VERB []
like ccomp be AUX [I, would, order]
to aux order VERB []
order xcomp like VERB [to, notebook]
a det notebook NOUN []
notebook dobj order VERB [a, with]
with prep notebook NOUN [GB, disk]
16 nummod GB NOUN []
GB pobj with ADP [16, and, GB]
and cc GB NOUN []
256 nummod GB NOUN []
GB conj GB NOUN [256]
disk pobj with ADP []
, punct be AUX []
I nsubj like VERB []
would aux like VERB []
like ccomp be AUX [I, would, spend]
to aux spend VERB []
spend xcomp like VERB [to, Francs]
less amod 1000 NUM []
than quantmod 1000 NUM []
1000 nummod Francs NOUN [less, than]
Francs dobj spend VERB [1000]
, punct be AUX []
what attr be AUX []
would aux be AUX []
be ROOT be AUX [Hello, like, ,, like, ,, what, would, options,  , Thanks]
the det options NOUN []
options attr be AUX [the]
  dep be AUX []
Thanks dep be AUX [lot]
a det lot NOUN []
lot npadvmod Thanks NOUN [a]
  dep   SPACE [Patrick]
Patr

In [13]:
# Draw the dependency graph of the mail inline in the notebook.
displacy.render(doc, jupyter=True, minify=True, style="dep")

spaCy provides all the tagging required to find the action verbs. We want to know, for example, whether the customer wants to order something or is just interested in some information. Let us iterate through all tokens in the text and search for an open clausal complement (for all possible dependency tags, refer to https://spacy.io/api/annotation#pos-tagging).

In [14]:
# Identify action verbs: tokens attached as an open clausal complement (xcomp)
# to a head that is itself a verb.
verbs = {
    token
    for token in doc
    if token.dep == xcomp and token.head.pos == VERB
}
print(verbs)

{spend, order}


Let us find possible items in the text using the dependency tag ‘dobj’ for direct objects of a verb.

In [15]:
# Candidate items: tokens that are the direct object (dobj) of a verb.
items = {
    token
    for token in doc
    if token.dep == dobj and token.head.pos == VERB
}
print(items)

{notebook, Francs}


In [17]:
# Score each identified object against the word 'laptop'.
# 'notebook' comes out much closer to 'laptop' than 'Francs' does.
# NOTE(review): this rebinds `nlp` to en_core_web_lg, so every cell executed
# after this one no longer uses the transformer pipeline with the entity ruler.
# Presumably the large model is loaded for its word vectors - confirm.
import en_core_web_lg
nlp = en_core_web_lg.load()
orderobject = nlp("laptop")
for item in items:
    item_doc = nlp(item.text)
    print(item.text, item_doc.similarity(orderobject))


notebook 0.8021939740910023
Francs 0.0015887125151089767


In [18]:
# Compare each identified verb with the verb 'order'.
# For the verbs that are similar enough, print their direct objects.
SIMILARITY_THRESHOLD = 0.8
orderword = nlp("order")
order_like_verbs = (
    verb for verb in verbs
    if nlp(verb.text).similarity(orderword) >= SIMILARITY_THRESHOLD
)
for verb in order_like_verbs:
    for child in verb.children:
        if child.dep == dobj:
            print(child.text)


notebook


In [19]:
# Numeric modifiers (nummod) fill specific slots such as the desired quantities;
# each one is printed together with the token it modifies.
numeric_modifiers = (token for token in doc if token.dep_ == 'nummod')
for token in numeric_modifiers:
    print(f"Numerical modifier: {token.text} --> object: {token.head}")

Numerical modifier: 16 --> object: GB
Numerical modifier: 256 --> object: GB
Numerical modifier: 1000 --> object: Francs
