Created on March 21st 2020 by Patrick Rotzetter
Last update on Feb 28th 2021

protzetter@bluewin.ch
https://www.linkedin.com/in/rotzetter/

**Small experiment for insurance claim processing**




In [113]:
# Import libraries
import spacy
from spacy.pipeline import EntityRuler
from spacy.matcher import Matcher,PhraseMatcher
from spacy.symbols import nsubj, VERB, dobj, NOUN, root, xcomp
from spacy import displacy
from spacy.matcher import Matcher
from pathlib import Path
import random  

In [114]:
!python -m spacy validate

[2K[38;5;2m✔ Loaded compatibility table[0m
[1m
[38;5;4mℹ spaCy installation:
/opt/anaconda3/envs/spacy30/lib/python3.8/site-packages/spacy[0m

NAME              SPACY            VERSION                            
en_core_web_lg    >=3.0.0,<3.1.0   [38;5;2m3.0.0[0m   [38;5;2m✔[0m
en_core_web_sm    >=3.0.0,<3.1.0   [38;5;2m3.0.0[0m   [38;5;2m✔[0m
en_core_web_trf   >=3.0.0,<3.1.0   [38;5;2m3.0.0[0m   [38;5;2m✔[0m



In [115]:
from platform import python_version
print(python_version())
!pip show spacy

3.8.5
Name: spacy
Version: 3.0.1
Summary: Industrial-strength Natural Language Processing (NLP) in Python
Home-page: https://spacy.io
Author: Explosion
Author-email: contact@explosion.ai
License: MIT
Location: /opt/anaconda3/envs/spacy30/lib/python3.8/site-packages
Requires: spacy-legacy, requests, typer, wasabi, jinja2, cymem, numpy, blis, setuptools, tqdm, preshed, pydantic, pathy, packaging, catalogue, murmurhash, thinc, srsly
Required-by: texthero, en-core-web-trf, en-core-web-sm, en-core-web-lg


In [116]:
# load spacy model latets modle based on pre-trained transformer model Roberta

from spacy.lang.en import English
import en_core_web_trf
nlp = en_core_web_trf.load()

In [117]:
#read order dialog file

text = open('claimsmail.txt').read().replace('\n', ' ')

print(text)


Hello,  I would like to submit 1 claim for my car accident and send you a picture, how should I proceed.  Thanks a lot  Patrick


In [118]:
#process the mail trough standard spacy pipeline
doc=nlp(text)


In [119]:
# print text entities detected
for ent in doc.ents :
    print(ent.text, ent.label_,)


Patrick PERSON


In [120]:
#Let us visualize the result directly in the text
displacy.render(doc, style='ent', minify=True)

The default model does not seem to detect notebook and disk as entities, but identifies the sender as a person and identifies the RAM and disk size as quantities. This is a good start, but still far away from a practical solution. So, let us add some domain specific entities that will help us later on.

In [121]:
# add domain specific entities and add to the pipeline
# add domain specific entities and add to the pipeline
patterns = [{"label": "INSURE", "pattern":  [{"lower": "claim"}]},
             {"label": "DAMAGE", "pattern":  [{"lower": "accident"}]},
{"label": "OBJECT", "pattern": [{"LOWER": "car"}]},
{"label": "INSURANCE", "pattern": [{"LOWER": "hm"}, {"LOWER": "insurance"}, {"LOWER":"group"}]}]


config = {
   "phrase_matcher_attr": None,
   "validate": True,
   "overwrite_ents": True,
   "ent_id_sep": "||",
}
ruler=nlp.add_pipe('entity_ruler',config=config)


In [122]:
ruler.add_patterns(patterns)

In [123]:
#process the mail again with added entities
doc=nlp(text)
for ents in doc.ents:
    # Print the entity text and its label
    print(ents.text, ents.label_,)



claim INSURE
car OBJECT
accident DAMAGE
Patrick PERSON


In [103]:
#Let us visualize the result directly in the text
displacy.render(doc, style='ent', minify=True)

In [124]:
# Let us try to identify specific phrases or sequence of words, for example to detect an event
matcher = PhraseMatcher(nlp.vocab)
terms = ["car accident"]
# Only run nlp.make_doc to speed things up
patterns = [nlp.make_doc(t) for t in terms]
matcher.add("INCIDENT", None, *patterns)

doc = nlp(text)
matches = matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

car accident


In [125]:
## Part of speech tagging
# This is where the trained pipeline and its statistical models come in, 
# which enable spaCy to make predictions of which tag or label most likely applies in this context
for token in doc:
  print(token.text, token.dep_, token.head.text, token.head.pos_,
            [child for child in token.children])

Hello intj   SPACE []
, punct   SPACE []
  ccomp proceed VERB [Hello, ,]
I nsubj like VERB []
would aux like VERB []
like ccomp proceed VERB [I, would, submit]
to aux submit VERB []
submit xcomp like VERB [to, claim, and, send]
1 nummod claim NOUN []
claim dobj submit VERB [1, for]
for prep claim NOUN [accident]
my poss accident NOUN []
car compound accident NOUN []
accident pobj for ADP [my, car]
and cc submit VERB []
send conj submit VERB [you, picture]
you dative send VERB []
a det picture NOUN []
picture dobj send VERB [a]
, punct proceed VERB []
how advmod proceed VERB []
should aux proceed VERB []
I nsubj proceed VERB []
proceed ROOT proceed VERB [ , like, ,, how, should, I, .]
. punct proceed VERB []
  ROOT   SPACE []
Thanks ROOT Thanks NOUN [lot,  , Patrick]
a det lot NOUN []
lot npadvmod Thanks NOUN [a]
  punct Thanks NOUN []
Patrick npadvmod Thanks NOUN []


In [126]:
# visualize the dependency graph
displacy.render(doc, style="dep", minify=True, jupyter=True)

Spacy provides all the required tagging to find the action verbs, we want to know if the customer wants to order something or is just interested by some information for example. Let us iterate through all tokens in the text and search for an open clausal complement ( refer to for all possible dependency tags https://spacy.io/api/annotation#pos-tagging )

In [127]:
# Identify action verbs
verbs = set()
for possible_verbs in doc:
    if possible_verbs.pos == VERB :
        verbs.add(possible_verbs)
print(verbs)

{submit, send, like, proceed}


In [128]:
# we can imagine comparing the action verb with a pre-defined set of actions that we understand and this way know what the customer wants to do
# let us assume we have defined an action notify and another cancel
import en_core_web_lg
nlp = en_core_web_lg.load()
notifyNlp=nlp("notify")
cancelNlp=nlp("cancel")

verb=verbs.pop().text
verbNlp=nlp(verb)

print("Similarity of {} to notify: {}".format(verb, notifyNlp.similarity(verbNlp)))
print("Similarity of {} cancel: {}".format(verb, cancelNlp.similarity(verbNlp)))

Similarity of submit to notify: 0.5794420418542369
Similarity of submit cancel: 0.4386734820003478


Let us find possible items in the text using the dependency tag ‘dobj’ for direct objects of a verb.

In [129]:
# Let us find possible items in the text using the dependency tag ‘dobj’ for direct objects of a verb.
items = set()
for possible_subject in doc:
    if possible_subject.dep == dobj and possible_subject.head.pos == VERB:
        items.add(possible_subject)
print(items)

{picture, claim}


In [110]:
# We will compare similarities between identified objects and the word ‘notification’. 
# The word ‘claim’ is much closer to ‘notification’ than 'picture'.
import en_core_web_lg
nlp = en_core_web_lg.load()
notification=nlp("notification")
for  sub in items:
  print(sub.text,nlp(sub.text).similarity(notification))


claim 0.2714922100856048
picture 0.13987488498232845


In [111]:
# We will compare similarities between identified verbs and the verb ‘notify’. 
# Then based on this we will identiofy the direct object of the verb
claimword=nlp("notify") 
for  verb in verbs:
  if (nlp(verb.text).similarity(claimword)) >=0.5:
    for v in verb.children:
      if v.dep==dobj:
        print(v.text)


claim


In [112]:
# we can also identify specific slots using numeric modifiers for example to understand the desired quantities
# with corresponding modified objects
for token in doc:  
    if token.dep_ == 'nummod':    
      print(f"Numerical modifier: {token.text} --> object: {token.head}")

Numerical modifier: 1 --> object: claim
