Created on March 21st 2020 by Patrick Rotzetter
protzetter@bluewin.ch
https://www.linkedin.com/in/rotzetter/

**Small experiment for order mail processing**




In [None]:
# Import libraries
import spacy
from spacy.pipeline import EntityRuler
from spacy.matcher import Matcher,PhraseMatcher
from spacy.symbols import nsubj, VERB, dobj, NOUN, root, xcomp
from spacy import displacy
from spacy.matcher import Matcher
from pathlib import Path
import random  

In [None]:
# Download the large English spaCy model (en_core_web_lg) that is loaded further down
!python -m spacy download en_core_web_lg

Collecting en_core_web_lg==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9MB)
[K     |████████████████████████████████| 827.9MB 1.2MB/s 
Building wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.2.5-cp36-none-any.whl size=829180944 sha256=4fe7dfc7abe659dbaadef98f6e7a1a93f56930adf45401055494fd0b4d1dd0a3
  Stored in directory: /tmp/pip-ephem-wheel-cache-oo7jei6a/wheels/2a/c1/a6/fc7a877b1efca9bc6a089d6f506f16d3868408f9ff89f8dbfc
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [None]:
from platform import python_version
print(python_version())
!pip show spacy

3.6.9
Name: spacy
Version: 2.2.4
Summary: Industrial-strength Natural Language Processing (NLP) in Python
Home-page: https://spacy.io
Author: Explosion
Author-email: contact@explosion.ai
License: MIT
Location: /usr/local/lib/python3.6/dist-packages
Requires: wasabi, tqdm, requests, numpy, setuptools, murmurhash, catalogue, preshed, blis, thinc, srsly, cymem, plac
Required-by: fastai, en-core-web-sm, en-core-web-lg


In [None]:
# Load the spaCy model; `nlp` is the shared pipeline object used by the following cells

# NOTE(review): `English` is not referenced in the cells visible here
from spacy.lang.en import English
import en_core_web_lg
nlp = en_core_web_lg.load()

In [None]:
# Mount Google Drive at /content/drive (Google Colab only) so the mail file can be read from it
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the order mail from Google Drive and flatten it onto a single line.
# The file is opened in a `with` block so the handle is closed as soon as the
# content has been read instead of being left open; the encoding is explicit so
# the result does not depend on the platform default.
with open('/content/drive/My Drive/Colab Notebooks/ordermail.txt', encoding='utf-8') as mail_file:
    text = mail_file.read().replace('\n', ' ')

print(text)


Hello,  I would like to order a notebook with 16GB and 256 GB disk, I would like to spend less than 1000 Francs, what would be the options  Thanks a lot  Patrick


In [None]:
# Process the mail through the standard spaCy pipeline
docMail=nlp(text)


In [None]:
# Show each entity the pipeline detected in the mail, followed by its label
for entity in docMail.ents:
    print(entity.text, entity.label_)


16GB QUANTITY
256 GB QUANTITY
less than 1000 Francs MONEY
Patrick PERSON


In [None]:
# Add domain-specific entities on top of the model's own entity recognition.
# Each dict inside a "pattern" list describes ONE token, so the two-word phrase
# "real estate" has to be written as two token specs: a single spec with the
# value "real estate" can never match because no token contains a space.
# Attribute names use the documented upper-case spelling ("LOWER").
patterns = [
    {"label": "OBJECT", "pattern": [{"LOWER": "real"}, {"LOWER": "estate"}]},
    {"label": "CURRENCY", "pattern": [{"LOWER": "francs"}]},
    {"label": "PART", "pattern": [{"LOWER": "disk"}]},
]

# Make the cell safe to re-run: drop a ruler registered by an earlier run,
# because add_pipe refuses a second component with the same name.
if "entity_ruler" in nlp.pipe_names:
    nlp.remove_pipe("entity_ruler")

# overwrite_ents=True lets rule matches replace overlapping entities from the model
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
nlp.add_pipe(ruler)


In [None]:
# Re-run the mail through the pipeline, which now includes the entity ruler,
# and list every entity together with its label
docMail = nlp(text)
for entity in docMail.ents:
    print(entity.text, entity.label_)



16GB QUANTITY
256 GB QUANTITY
disk PART
Francs CURRENCY
Patrick PERSON


In [None]:
# Find the memory sizes in the mail with an exact phrase matcher
terms = ["16 GB", "256 GB"]
matcher = PhraseMatcher(nlp.vocab)
# nlp.make_doc only tokenizes, which is all the phrase patterns need
matcher.add("MEMORY", None, *(nlp.make_doc(term) for term in terms))

doc = nlp(text)
matches = matcher(doc)
for _match_id, start, end in matches:
    print(doc[start:end].text)

16GB
256 GB


In [None]:
# Dump the dependency parse: token, relation, head text, head POS and the token's children
for token in docMail:
    head = token.head
    print(token.text, token.dep_, head.text, head.pos_, list(token.children))

Hello intj like VERB []
, punct like VERB [ ]
   , PUNCT []
I nsubj like VERB []
would aux like VERB []
like ccomp like VERB [Hello, ,, I, would, order]
to aux order VERB []
order xcomp like VERB [to, notebook]
a det notebook NOUN []
notebook dobj order VERB [a, with]
with prep notebook NOUN [GB]
16 nummod GB PROPN []
GB pobj with ADP [16, and, disk]
and cc GB PROPN []
256 nummod disk NOUN []
GB compound disk NOUN []
disk conj GB PROPN [256, GB]
, punct like VERB []
I nsubj like VERB []
would aux like VERB []
like ROOT like VERB [like, ,, I, would, spend]
to aux spend VERB []
spend xcomp like VERB [to, Francs]
less amod 1000 NUM []
than quantmod 1000 NUM []
1000 nummod Francs NOUN [less, than]
Francs dobj spend VERB [1000]
, punct be AUX []
what nsubj be AUX []
would aux be AUX []
be ROOT be AUX [,, what, would, options]
the det options NOUN []
options attr be AUX [the,  ]
   options NOUN []
Thanks ROOT Thanks NOUN [lot, Patrick]
a det lot NOUN []
lot npadvmod Thanks NOUN [a,  ]
   lot

In [None]:
# Draw the dependency parse of the mail inside the notebook
displacy.render(docMail, style="dep", minify=True, jupyter=True)

In [None]:
# Highlight the named entities of the mail inside the notebook.
# jupyter=True is passed explicitly (as in the dependency rendering) so the
# markup is displayed instead of being returned as a raw HTML string.
displacy.render(docMail, style='ent', minify=True, jupyter=True)

'<div class="entities" style="line-height: 2.5; direction: ltr">Hello,  I would like to order a notebook with <mark class="entity" style="background: #e4e7d2; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">16GB<span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">QUANTITY</span></mark> and <mark class="entity" style="background: #e4e7d2; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">256 GB<span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">QUANTITY</span></mark> <mark class="entity" style="background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">disk<span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; v

In [None]:
# Report every numeric modifier together with the token it modifies
numeric_modifiers = [candidate for candidate in docMail if candidate.dep_ == 'nummod']
for token in numeric_modifiers:
    print(f"Numerical modifier: {token.text} --> object: {token.head}")

Numerical modifier: 16 --> object: GB
Numerical modifier: 256 --> object: disk
Numerical modifier: 1000 --> object: Francs


In [None]:
# NOTE: despite its name, `verbs` holds the direct objects (dobj) whose head is a verb
verbs = {
    candidate
    for candidate in docMail
    if candidate.dep == dobj and candidate.head.pos == VERB
}
print(verbs)

{notebook, Francs}


In [None]:
# Dump the dependency parse: token, relation, head text, head POS and the token's children
for token in docMail:
    head = token.head
    print(token.text, token.dep_, head.text, head.pos_, list(token.children))


Hello intj like VERB []
, punct like VERB [ ]
   , PUNCT []
I nsubj like VERB []
would aux like VERB []
like ccomp like VERB [Hello, ,, I, would, order]
to aux order VERB []
order xcomp like VERB [to, notebook]
a det notebook NOUN []
notebook dobj order VERB [a, with]
with prep notebook NOUN [GB]
16 nummod GB PROPN []
GB pobj with ADP [16, and, disk]
and cc GB PROPN []
256 nummod disk NOUN []
GB compound disk NOUN []
disk conj GB PROPN [256, GB]
, punct like VERB []
I nsubj like VERB []
would aux like VERB []
like ROOT like VERB [like, ,, I, would, spend]
to aux spend VERB []
spend xcomp like VERB [to, Francs]
less amod 1000 NUM []
than quantmod 1000 NUM []
1000 nummod Francs NOUN [less, than]
Francs dobj spend VERB [1000]
, punct be AUX []
what nsubj be AUX []
would aux be AUX []
be ROOT be AUX [,, what, would, options]
the det options NOUN []
options attr be AUX [the,  ]
   options NOUN []
Thanks ROOT Thanks NOUN [lot, Patrick]
a det lot NOUN []
lot npadvmod Thanks NOUN [a,  ]
   lot

In [None]:
# Candidate order items: tokens that are the direct object of a verb
items = {
    candidate
    for candidate in docMail
    if candidate.dep == dobj and candidate.head.pos == VERB
}
print(items)

{notebook, Francs}


In [None]:
# Score each candidate item by its similarity to the reference word "laptop"
orderobject = nlp("laptop")
similarity_scores = [sub.similarity(orderobject) for sub in items]
for score in similarity_scores:
    print(score)


0.8021939809276627
0.0015887124852857469


In [None]:
# Tokens attached to a verb through an xcomp relation (here: "order" and "spend")
verbs = {
    candidate
    for candidate in docMail
    if candidate.dep == xcomp and candidate.head.pos == VERB
}
print(verbs)

{spend, order}


In [None]:
# For every collected verb that is close in meaning to "order",
# print the direct objects hanging off it
orderword = nlp("order")
for verb in verbs:
    is_order_like = verb.similarity(orderword) >= 0.8
    if not is_order_like:
        continue
    for child in verb.children:
        if child.dep == dobj:
            print(child.text)


notebook


In [None]:
# Second test sentence with a different phrasing; this rebinds `text`, which held the mail until now
text='I am passing an order for a laptop with 16GB RAM and 256 Disk'

In [None]:
# Run the current `text` through the pipeline; this rebinds `doc`
doc=nlp(text)

In [None]:
# Dump the dependency parse: token, relation, head text, head POS and the token's children
for token in doc:
    head = token.head
    print(token.text, token.dep_, head.text, head.pos_, list(token.children))


I nsubj passing VERB []
am aux passing VERB []
passing ROOT passing VERB [I, am, order]
an det order NOUN []
order dobj passing VERB [an, for]
for prep order NOUN [laptop]
a det laptop NOUN []
laptop pobj for ADP [a, with]
with prep laptop NOUN [RAM]
16 nummod RAM PROPN []
GB compound RAM PROPN []
RAM pobj with ADP [16, GB, and, Disk]
and cc RAM PROPN []
256 nummod Disk NOUN []
Disk conj RAM PROPN [256]


In [None]:
# Candidate order items in `doc`: tokens that are the direct object of a verb
items = {
    candidate
    for candidate in doc
    if candidate.dep == dobj and candidate.head.pos == VERB
}
print(items)

{order}
