Created on March 21st 2020 by Patrick Rotzetter
protzetter@bluewin.ch
https://www.linkedin.com/in/rotzetter/

**Small experiment for order mail processing**




In [1]:
# Import libraries
import spacy
from spacy.pipeline import EntityRuler
from spacy.matcher import Matcher,PhraseMatcher
from spacy.symbols import nsubj, VERB, dobj, NOUN, root, xcomp
from spacy import displacy
from spacy.matcher import Matcher
from pathlib import Path
import random  

In [None]:
# Download the large English spaCy model (en_core_web_lg) that is loaded further down in this notebook
!python -m spacy download en_core_web_lg

In [2]:
from platform import python_version
print(python_version())
!pip show spacy

3.8.5
Name: spacy
Version: 2.3.4
Summary: Industrial-strength Natural Language Processing (NLP) in Python
Home-page: https://spacy.io
Author: Explosion
Author-email: contact@explosion.ai
License: MIT
Location: c:\users\patrick_rotzetter\anaconda3\envs\base38\lib\site-packages
Requires: tqdm, blis, murmurhash, wasabi, catalogue, plac, cymem, srsly, numpy, thinc, preshed, requests, setuptools
Required-by: texthero, en-core-web-sm, en-core-web-lg


In [3]:
# Load the large English spaCy model; `nlp` is the pipeline object used by the following cells

from spacy.lang.en import English  # NOTE(review): `English` is not referenced in the visible cells - confirm it is still needed
import en_core_web_lg
nlp = en_core_web_lg.load()

In [4]:
# use in case you are in Google Colab
# connect to google drive
#from google.colab import drive
#drive.mount('/content/drive')

In [5]:
# Read the order mail and flatten it onto a single line (newlines become spaces)

# A context manager guarantees the file handle is closed once the text is read
with open('ordermail.txt') as mail_file:
    text = mail_file.read().replace('\n', ' ')

print(text)


Hello,  I would like to order a notebook with 16GB and 256 GB disk, I would like to spend less than 1000 Francs, what would be the options  Thanks a lot  Patrick


In [6]:
# Process the mail through the standard spaCy pipeline
docMail=nlp(text)


In [7]:
# Show every named entity the pipeline detected, followed by its label
for entity in docMail.ents:
    print(entity.text, entity.label_)


16GB QUANTITY
256 GB QUANTITY
less than 1000 Francs MONEY


In [8]:
# Add domain-specific entities and register them in the pipeline.
# Each dict inside a "pattern" list describes exactly ONE token, so a multi-word
# phrase such as "real estate" needs one dict per word: a single
# {"LOWER": "real estate"} can never match because no token contains a space.
patterns = [
    {"label": "OBJECT", "pattern": [{"LOWER": "real"}, {"LOWER": "estate"}]},
    {"label": "CURRENCY", "pattern": [{"LOWER": "francs"}]},
    {"label": "PART", "pattern": [{"LOWER": "disk"}]},
]

# overwrite_ents=True lets these rules replace overlapping entities set by the model
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
# Remove a ruler left over from an earlier run of this cell; otherwise add_pipe
# fails because a component with the same name is already in the pipeline.
if ruler.name in nlp.pipe_names:
    nlp.remove_pipe(ruler.name)
nlp.add_pipe(ruler)


In [9]:
# Run the mail through the pipeline again, this time including the added entities
docMail = nlp(text)
# List each entity span together with its label
for entity in docMail.ents:
    print(entity.text, entity.label_)



16GB QUANTITY
256 GB QUANTITY
disk PART
Francs CURRENCY


In [10]:
# Look for the memory / disk sizes with a phrase matcher
matcher = PhraseMatcher(nlp.vocab)
terms = ["16 GB","256 GB"]
# nlp.make_doc is enough to build the patterns and speeds things up
patterns = list(map(nlp.make_doc, terms))
matcher.add("MEMORY", None, *patterns)

doc = nlp(text)
matches = matcher(doc)
# Each match is (match_id, start, end); only the token span is needed here
for _match_id, first, last in matches:
    print(doc[first:last].text)

16GB
256 GB


In [11]:
# Dump the dependency parse: token, relation, head, head part of speech, children
for token in docMail:
    children = list(token.children)
    print(token.text, token.dep_, token.head.text, token.head.pos_, children)

Hello intj like VERB []
, punct like VERB [ ]
   , PUNCT []
I nsubj like VERB []
would aux like VERB []
like ccomp like VERB [Hello, ,, I, would, order]
to aux order VERB []
order xcomp like VERB [to, notebook, with]
a det notebook NOUN []
notebook dobj order VERB [a]
with prep order VERB [GB]
16 nummod GB PROPN []
GB pobj with ADP [16, and, disk]
and cc GB PROPN []
256 nummod GB NOUN []
GB compound disk NOUN [256]
disk conj GB PROPN [GB]
, punct like VERB []
I nsubj like VERB []
would aux like VERB []
like ROOT like VERB [like, ,, I, would, spend, be]
to aux spend VERB []
spend xcomp like VERB [to, Francs]
less amod 1000 NUM []
than quantmod 1000 NUM []
1000 nummod Francs NOUN [less, than]
Francs dobj spend VERB [1000]
, punct be AUX []
what nsubj be AUX []
would aux be AUX []
be ccomp like VERB [,, what, would, options, Thanks, lot]
the det options NOUN []
options attr be AUX [the,  ]
   options NOUN []
Thanks npadvmod be AUX []
a det lot NOUN []
lot npadvmod be AUX [a,  , Patrick]
 

In [12]:
# Draw the dependency tree of the mail inside the notebook
displacy.render(
    docMail,
    style="dep",
    minify=True,
    jupyter=True,
)

In [13]:
# Highlight the recognised entities in the mail text
displacy.render(
    docMail,
    style='ent',
    minify=True,
)

In [14]:
# Report every numeric modifier together with the token it modifies
for token in docMail:
    if token.dep_ != 'nummod':
        continue
    print(f"Numerical modifier: {token.text} --> object: {token.head}")

Numerical modifier: 16 --> object: GB
Numerical modifier: 256 --> object: GB
Numerical modifier: 1000 --> object: Francs


In [15]:
# Collect every token that is the direct object (dobj) of a verb.
# NOTE: despite its name, `verbs` ends up holding the object tokens, not the verbs.
verbs = {tok for tok in docMail if tok.dep == dobj and tok.head.pos == VERB}
print(verbs)

{notebook, Francs}


In [16]:
# Dependency overview of the mail: text, relation, head text, head POS, children
for tok in docMail:
    print(tok.text, tok.dep_, tok.head.text, tok.head.pos_, list(tok.children))


Hello intj like VERB []
, punct like VERB [ ]
   , PUNCT []
I nsubj like VERB []
would aux like VERB []
like ccomp like VERB [Hello, ,, I, would, order]
to aux order VERB []
order xcomp like VERB [to, notebook, with]
a det notebook NOUN []
notebook dobj order VERB [a]
with prep order VERB [GB]
16 nummod GB PROPN []
GB pobj with ADP [16, and, disk]
and cc GB PROPN []
256 nummod GB NOUN []
GB compound disk NOUN [256]
disk conj GB PROPN [GB]
, punct like VERB []
I nsubj like VERB []
would aux like VERB []
like ROOT like VERB [like, ,, I, would, spend, be]
to aux spend VERB []
spend xcomp like VERB [to, Francs]
less amod 1000 NUM []
than quantmod 1000 NUM []
1000 nummod Francs NOUN [less, than]
Francs dobj spend VERB [1000]
, punct be AUX []
what nsubj be AUX []
would aux be AUX []
be ccomp like VERB [,, what, would, options, Thanks, lot]
the det options NOUN []
options attr be AUX [the,  ]
   options NOUN []
Thanks npadvmod be AUX []
a det lot NOUN []
lot npadvmod be AUX [a,  , Patrick]
 

In [17]:
# Candidate order items: tokens that are the direct object of a verb
items = set(filter(lambda tok: tok.dep == dobj and tok.head.pos == VERB, docMail))
print(items)

{notebook, Francs}


In [18]:
# Score each candidate item by its similarity to the reference word "laptop"
orderobject = nlp("laptop")
for candidate in items:
    score = candidate.similarity(orderobject)
    print(score)


0.8021939809276627
0.001588716816210226


In [19]:
# Tokens attached to a verb as an open clausal complement (xcomp)
verbs = {tok for tok in docMail if tok.dep == xcomp and tok.head.pos == VERB}
print(verbs)

{order, spend}


In [20]:
# Keep only the verbs that are close in meaning to "order" (similarity >= 0.8)
# and print the direct object of each of them.
orderword = nlp("order")
order_like_verbs = (v for v in verbs if v.similarity(orderword) >= 0.8)
for order_verb in order_like_verbs:
    for child in order_verb.children:
        if child.dep == dobj:
            print(child.text)


notebook


In [21]:
# A second, differently phrased order to run the same extraction on.
# NOTE(review): this rebinds `text`, replacing the mail content read from ordermail.txt.
text='I am passing an order for a laptop with 16GB RAM and 256 Disk'

In [22]:
# Run the current `text` through the pipeline; `doc` is rebound to the result
doc=nlp(text)

In [23]:
# Dependency overview of `doc`: text, relation, head text, head POS, children
for word in doc:
    head = word.head
    print(word.text, word.dep_, head.text, head.pos_, list(word.children))


I nsubj passing VERB []
am aux passing VERB []
passing ROOT passing VERB [I, am, order, with]
an det order NOUN []
order dobj passing VERB [an, for]
for prep order NOUN [laptop]
a det laptop NOUN []
laptop pobj for ADP [a]
with prep passing VERB [RAM]
16 nummod RAM PROPN []
GB compound RAM PROPN []
RAM pobj with ADP [16, GB, and, Disk]
and cc RAM PROPN []
256 nummod Disk NOUN []
Disk conj RAM PROPN [256]


In [24]:
# Direct objects of verbs in `doc`, gathered as candidate order items
items = set()
for tok in doc:
    is_verb_object = tok.dep == dobj and tok.head.pos == VERB
    if is_verb_object:
        items.add(tok)
print(items)

{order}
