In [2]:
import spacy

In [3]:
# Load the spaCy pipeline used by the cells that follow.
# The model location can be overridden with the SPACY_MODEL_PATH environment
# variable so the notebook can run on machines other than the author's; when
# the variable is unset, the original local en_core_web_sm 2.2.5 directory is
# used, so existing behaviour is unchanged.
import os

MODEL_PATH = os.environ.get(
    'SPACY_MODEL_PATH',
    'C:\\MachineLearning\\repos\\personel\\nlp\\ner-pos\\en_core_web_sm-2.2.5\\en_core_web_sm\\en_core_web_sm-2.2.5',
)
nlp = spacy.load(MODEL_PATH)

In [4]:
def showEnts(document):
    """Print the named entities of ``document``, one per line.

    Each line has the form ``<text>--<label>--<explanation>``, where the
    explanation is whatever ``spacy.explain`` returns for the entity's label
    (converted with ``str``). When ``document.ents`` is empty, the single line
    ``no entities found`` is printed instead.
    """
    if not document.ents:
        print('no entities found')
        return
    for entity in document.ents:
        fields = (entity.text, entity.label_, str(spacy.explain(entity.label_)))
        print('--'.join(fields))

In [6]:
# Baseline: a short greeting with nothing entity-like in it. showEnts takes
# its empty branch and prints "no entities found" (see the output below).
document = nlp('hi, how are you?')
showEnts(document)

no entities found


In [7]:
# Lowercase phrase containing a domain term ('capabilities') and a country
# name. Per the output below the pipeline reports no entities at all for this
# input, so neither 'capabilities' nor the lowercase 'india' is tagged.
document= nlp('capabilities in india')
showEnts(document)

no entities found


In [8]:
# Same phrase with a second domain term ('settlement') prepended. The output
# below shows that still nothing is recognised as an entity.
document= nlp('settlement capabilities in india')
showEnts(document)

no entities found


In [9]:
# More domain terms ('finance', 'reconcilliation', 'capabilities'). Per the
# output below, none of them is recognised as an entity out of the box.
# NOTE(review): 'reconcilliation' is spelled this way in the input text
# (standard spelling is 'reconciliation') -- confirm whether that is intended.
document= nlp('finance and reconcilliation capabilities in india')
showEnts(document)

no entities found


## Entity annotations
`Doc.ents` are token spans with their own set of annotations.
<table>
<tr><td>`ent.text`</td><td>The original entity text</td></tr>
<tr><td>`ent.label`</td><td>The entity type's hash value</td></tr>
<tr><td>`ent.label_`</td><td>The entity type's string label (e.g. `NORP`); pass it to `spacy.explain` for a description</td></tr>
<tr><td>`ent.start`</td><td>The token span's *start* index position in the Doc</td></tr>
<tr><td>`ent.end`</td><td>The token span's *stop* index position in the Doc (exclusive: the index of the first token after the span)</td></tr>
<tr><td>`ent.start_char`</td><td>The entity text's *start* character offset in the Doc</td></tr>
<tr><td>`ent.end_char`</td><td>The entity text's *stop* character offset in the Doc (exclusive: the offset of the first character after the entity)</td></tr>
</table>



## NER Tags
Tags are accessible through the `.label_` property of an entity.
<table>
<tr><th>TYPE</th><th>DESCRIPTION</th><th>EXAMPLE</th></tr>
<tr><td>`PERSON`</td><td>People, including fictional.</td><td>*Fred Flintstone*</td></tr>
<tr><td>`NORP`</td><td>Nationalities or religious or political groups.</td><td>*The Republican Party*</td></tr>
<tr><td>`FAC`</td><td>Buildings, airports, highways, bridges, etc.</td><td>*Logan International Airport, The Golden Gate*</td></tr>
<tr><td>`ORG`</td><td>Companies, agencies, institutions, etc.</td><td>*Microsoft, FBI, MIT*</td></tr>
<tr><td>`GPE`</td><td>Countries, cities, states.</td><td>*France, UAR, Chicago, Idaho*</td></tr>
<tr><td>`LOC`</td><td>Non-GPE locations, mountain ranges, bodies of water.</td><td>*Europe, Nile River, Midwest*</td></tr>
<tr><td>`PRODUCT`</td><td>Objects, vehicles, foods, etc. (Not services.)</td><td>*Formula 1*</td></tr>
<tr><td>`EVENT`</td><td>Named hurricanes, battles, wars, sports events, etc.</td><td>*Olympic Games*</td></tr>
<tr><td>`WORK_OF_ART`</td><td>Titles of books, songs, etc.</td><td>*The Mona Lisa*</td></tr>
<tr><td>`LAW`</td><td>Named documents made into laws.</td><td>*Roe v. Wade*</td></tr>
<tr><td>`LANGUAGE`</td><td>Any named language.</td><td>*English*</td></tr>
<tr><td>`DATE`</td><td>Absolute or relative dates or periods.</td><td>*20 July 1969*</td></tr>
<tr><td>`TIME`</td><td>Times smaller than a day.</td><td>*Four hours*</td></tr>
<tr><td>`PERCENT`</td><td>Percentage, including "%".</td><td>*Eighty percent*</td></tr>
<tr><td>`MONEY`</td><td>Monetary values, including unit.</td><td>*Twenty Cents*</td></tr>
<tr><td>`QUANTITY`</td><td>Measurements, as of weight or distance.</td><td>*Several kilometers, 55kg*</td></tr>
<tr><td>`ORDINAL`</td><td>"first", "second", etc.</td><td>*9th, Ninth*</td></tr>
<tr><td>`CARDINAL`</td><td>Numerals that do not fall under another type.</td><td>*2, Two, Fifty-two*</td></tr>
</table>

In [10]:
# Question that mentions a lowercase person name and a company/country pair.
# Per the output below the pipeline finds no entities here either -- not the
# name 'rahul jain', nor 'Fidelity' or 'india'.
document= nlp('what does rahul jain do in Fidelity india?')
showEnts(document)

no entities found


In [11]:
# Add a word as a named entity by hand: build a Span over the token and append
# it to the Doc's entity list.
from spacy.tokens import Span

document= nlp('settlement capabilities in india')

# Look up the ID of the existing 'NORP' label in the vocab's string store; it
# is passed to Span as the entity label.
# NOTE(review): NORP stands for "Nationalities or religious or political
# groups", which does not really describe 'settlement' -- presumably it is
# borrowed here only because it is an existing label; confirm.
NORP = document.vocab.strings['NORP']
# Token range [0, 1) is just the first token, 'settlement' (see the output).
new_entity = Span(document,0,1,label=NORP)
# Re-assign Doc.ents with whatever the pipeline found plus the new span.
document.ents = list(document.ents) + [new_entity]

showEnts(document)

settlement--NORP--Nationalities or religious or political groups


In [12]:
# Add multiple phrases as entities: find every occurrence of a fixed phrase
# list with a PhraseMatcher and register each hit as a NORP entity on the Doc.
from spacy import displacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
from spacy.util import filter_spans

document= nlp('how many settlement, finance and reconcilliation capabilities in india')

# One pattern Doc per phrase. The matcher compares token text only, so the
# phrases are merely tokenised with nlp.make_doc rather than run through the
# whole pipeline.
matcher = PhraseMatcher(nlp.vocab)
phrase_list = ['settlement','finance','reconcilliation']
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]
matcher.add('capabilities-matcher',None, *phrase_patterns)
found_matches = matcher(document)  # list of (match_id, start, end) triples

NORP = document.vocab.strings['NORP']
new_entity = [Span(document, start, end, label=NORP) for _, start, end in found_matches]

# Assigning overlapping spans to Doc.ents raises an error, which would happen
# if the pipeline had already tagged part of one of the phrases. filter_spans
# drops overlaps, keeping the longest span; the matched phrases are listed
# first so that they win over a pipeline entity of the same length.
document.ents = filter_spans(new_entity + list(document.ents))

showEnts(document)
print ('\n\n')
# NOTE(review): the recorded output shows empty pos_/tag_/dep_ values for every
# token, which suggests the pipeline bound to `nlp` when this cell last ran
# had no tagger/parser -- confirm which model was loaded at that point.
for token in document:
    print((token.text, token.pos_, token.tag_, token.dep_))
displacy.render(document)

settlement--NORP--Nationalities or religious or political groups
finance--NORP--Nationalities or religious or political groups
reconcilliation--NORP--Nationalities or religious or political groups



('how', '', '', '')
('many', '', '', '')
('settlement', '', '', '')
(',', '', '', '')
('finance', '', '', '')
('and', '', '', '')
('reconcilliation', '', '', '')
('capabilities', '', '', '')
('in', '', '', '')
('india', '', '', '')


https://spacy.io/usage/models
  "__main__", mod_spec)


In [36]:
# Process a fresh, shorter text. Entities assigned through `document.ents` in
# earlier cells were attached to those Doc objects only; this new Doc carries
# just the pipeline's own predictions, and the output below shows there are
# none.
document= nlp('settlement capabilities')
showEnts(document)

no entities found


In [49]:
# Download the spaCy model package 'en_trf_distilbertbaseuncased_lg' by
# shelling out to `python -m spacy download`. The log below shows that this
# also pulls in spacy-transformers, transformers, torchcontrib, ftfy and
# dataclasses.
# NOTE(review): `python` here is whichever interpreter is first on PATH, which
# may not be the kernel's -- confirm the package lands in this environment.
!python -m spacy download en_trf_distilbertbaseuncased_lg

Collecting en_trf_distilbertbaseuncased_lg==2.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_trf_distilbertbaseuncased_lg-2.2.0/en_trf_distilbertbaseuncased_lg-2.2.0.tar.gz (245.0 MB)
Collecting spacy-transformers>=0.5.0
  Downloading spacy-transformers-0.5.1.tar.gz (59 kB)
Collecting transformers<2.1.0,>=2.0.0
  Downloading transformers-2.0.0-py3-none-any.whl (290 kB)
Collecting torchcontrib<0.1.0,>=0.0.2
  Downloading torchcontrib-0.0.2.tar.gz (11 kB)
Collecting ftfy<6.0.0,>=5.0.0
  Downloading ftfy-5.6.tar.gz (58 kB)
Collecting dataclasses<0.7,>=0.6
  Downloading dataclasses-0.6-py3-none-any.whl (14 kB)
Building wheels for collected packages: en-trf-distilbertbaseuncased-lg, spacy-transformers, torchcontrib, ftfy
  Building wheel for en-trf-distilbertbaseuncased-lg (setup.py): started
  Building wheel for en-trf-distilbertbaseuncased-lg (setup.py): finished with status 'done'
  Created wheel for en-trf-distilbertbaseuncased-lg: filename=en_trf_disti

In [14]:
# Rebind `nlp` to the 'en_trf_distilbertbaseuncased_lg' package. From here on
# every cell that calls nlp(...) uses this pipeline instead of the
# en_core_web_sm one loaded near the top of the notebook.
# NOTE(review): because the name is reused, re-running an earlier cell after
# this one silently evaluates it with a different model.
nlp = spacy.load('en_trf_distilbertbaseuncased_lg')

In [15]:
# Re-run the 'settlement capabilities in india' example against whichever
# pipeline is currently bound to `nlp`; the output below again shows no
# entities.
# NOTE(review): check nlp.pipe_names -- if the loaded package has no 'ner'
# component, no entities can ever be reported here.
document= nlp('settlement capabilities in india')
showEnts(document)

no entities found


In [13]:
!pip install spacy-transformers



Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.
