In [1]:
import spacy
nlp = spacy.load("en_core_web_md")

In [2]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [3]:
doc = nlp("Tesla Inc is going to acquire Twitter Inc for $45 billion")

for ent in doc.ents:
    print(ent.text, "|", ent.label_, "|", spacy.explain(ent.label_))

Tesla Inc | ORG | Companies, agencies, institutions, etc.
Twitter Inc | ORG | Companies, agencies, institutions, etc.
$45 billion | MONEY | Monetary values, including unit


In [4]:
from spacy import displacy

displacy.render(doc, style="ent")

In [5]:
nlp.pipe_labels['ner']

['CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART']

In [6]:
doc = nlp("Michael Bloomberg founded Bloomberg in 1982")

for ent in doc.ents:
    print(ent.text, "|", ent.label_, "|", spacy.explain(ent.label_))

Michael Bloomberg | PERSON | People, including fictional
Bloomberg | GPE | Countries, cities, states
1982 | DATE | Absolute or relative dates or periods


In [7]:
doc = nlp("Tesla is going to acquire Twitter for $45 billion")

for ent in doc.ents:
    print(ent.text, "|", ent.label_, "|", spacy.explain(ent.label_))

Tesla | GPE | Countries, cities, states
$45 billion | MONEY | Monetary values, including unit


In [8]:
type(doc[2:5])

spacy.tokens.span.Span

In [9]:
from spacy.tokens import Span

s1 = Span(doc, 0, 1, label="ORG")
s2 = Span(doc, 5, 6, label="ORG")

doc.set_ents([s1, s2], default = "unmodified")

In [10]:
for ent in doc.ents:
    print(ent.text, "|", ent.label_, "|", spacy.explain(ent.label_))

Tesla | ORG | Companies, agencies, institutions, etc.
Twitter | ORG | Companies, agencies, institutions, etc.
$45 billion | MONEY | Monetary values, including unit


# NER Lesson

In [11]:
import nltk
import pandas as pd

In [12]:
text = 'Apple acquired Zoom in China on Wednesday 6th May 2020. \ This news has made Apple and Google stock jump by 5% on DOW Jones Index in the \ United States of America'

In [13]:
#tokenize to words
words = nltk.word_tokenize(text)
words

['Apple',
 'acquired',
 'Zoom',
 'in',
 'China',
 'on',
 'Wednesday',
 '6th',
 'May',
 '2020',
 '.',
 '\\',
 'This',
 'news',
 'has',
 'made',
 'Apple',
 'and',
 'Google',
 'stock',
 'jump',
 'by',
 '5',
 '%',
 'on',
 'DOW',
 'Jones',
 'Index',
 'in',
 'the',
 '\\',
 'United',
 'States',
 'of',
 'America']

In [14]:
pos_tags = nltk.pos_tag(words)
pos_tags

[('Apple', 'NNP'),
 ('acquired', 'VBD'),
 ('Zoom', 'NNP'),
 ('in', 'IN'),
 ('China', 'NNP'),
 ('on', 'IN'),
 ('Wednesday', 'NNP'),
 ('6th', 'CD'),
 ('May', 'NNP'),
 ('2020', 'CD'),
 ('.', '.'),
 ('\\', 'VB'),
 ('This', 'DT'),
 ('news', 'NN'),
 ('has', 'VBZ'),
 ('made', 'VBN'),
 ('Apple', 'NNP'),
 ('and', 'CC'),
 ('Google', 'NNP'),
 ('stock', 'NN'),
 ('jump', 'NN'),
 ('by', 'IN'),
 ('5', 'CD'),
 ('%', 'NN'),
 ('on', 'IN'),
 ('DOW', 'NNP'),
 ('Jones', 'NNP'),
 ('Index', 'NNP'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('\\', 'NNP'),
 ('United', 'NNP'),
 ('States', 'NNPS'),
 ('of', 'IN'),
 ('America', 'NNP')]

In [15]:
# Descripiton of Tag
nltk.help.upenn_tagset('NNP')
nltk.help.upenn_tagset('VBD')
nltk.help.upenn_tagset('IN')
nltk.help.upenn_tagset('CC')
nltk.help.upenn_tagset('NNPS')
nltk.help.upenn_tagset('CD')

NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...
VBD: verb, past tense
    dipped pleaded swiped regummed soaked tidied convened halted registered
    cushioned exacted snubbed strode aimed adopted belied figgered
    speculated wore appreciated contemplated ...
IN: preposition or conjunction, subordinating
    astride among uppon whether out inside pro despite on by throughout
    below within for towards near behind atop around if like until below
    next into if beside ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
NNPS: noun, proper, plural
    Americans Americas Amharas Amityvilles Amusements Anarcho-Syndicalists
    Andalusians Andes Andruses Angels Animals Anthony Antilles Antiques
    Apache Apaches Apocrypha ...
CD

In [16]:
chunks = nltk.ne_chunk(pos_tags, binary = True) #either a NE or not a NE

for chunk in chunks:
    print(chunk)

(NE Apple/NNP)
('acquired', 'VBD')
('Zoom', 'NNP')
('in', 'IN')
(NE China/NNP)
('on', 'IN')
('Wednesday', 'NNP')
('6th', 'CD')
('May', 'NNP')
('2020', 'CD')
('.', '.')
('\\', 'VB')
('This', 'DT')
('news', 'NN')
('has', 'VBZ')
('made', 'VBN')
(NE Apple/NNP)
('and', 'CC')
(NE Google/NNP)
('stock', 'NN')
('jump', 'NN')
('by', 'IN')
('5', 'CD')
('%', 'NN')
('on', 'IN')
(NE DOW/NNP Jones/NNP Index/NNP)
('in', 'IN')
('the', 'DT')
('\\', 'NNP')
(NE United/NNP States/NNPS)
('of', 'IN')
(NE America/NNP)


In [17]:
entities = []
labels = []

for chunk in chunks:
    if hasattr(chunk, 'label'):
        #print(chunk)
        entities.append(' '.join(c[0] for c in chunk))
        labels.append(chunk.label())
        
entities_labels = list(set(zip(entities, labels)))
entities_df = pd.DataFrame(entities_labels)
entities_df.columns = ["Entities", "Labels"]
entities_df

Unnamed: 0,Entities,Labels
0,United States,NE
1,Google,NE
2,Apple,NE
3,America,NE
4,DOW Jones Index,NE
5,China,NE


In [18]:
chunks = nltk.ne_chunk(pos_tags, binary = True) #either a NE or not a NE

for chunk in chunks:
    print(chunk)
    
entities = []
labels = []

sentence = nltk.sent_tokenize(text)
for sent in sentence:
    for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent)), binary=False):
        if hasattr(chunk, 'label'):
            entities.append(' '.join(c[0] for c in chunk))
            labels.append(chunk.label())
        
entities_labels = list(set(zip(entities, labels)))

entities_df = pd.DataFrame(entities_labels)
entities_df.columns = ["Entities", "Labels"]
entities_df

(NE Apple/NNP)
('acquired', 'VBD')
('Zoom', 'NNP')
('in', 'IN')
(NE China/NNP)
('on', 'IN')
('Wednesday', 'NNP')
('6th', 'CD')
('May', 'NNP')
('2020', 'CD')
('.', '.')
('\\', 'VB')
('This', 'DT')
('news', 'NN')
('has', 'VBZ')
('made', 'VBN')
(NE Apple/NNP)
('and', 'CC')
(NE Google/NNP)
('stock', 'NN')
('jump', 'NN')
('by', 'IN')
('5', 'CD')
('%', 'NN')
('on', 'IN')
(NE DOW/NNP Jones/NNP Index/NNP)
('in', 'IN')
('the', 'DT')
('\\', 'NNP')
(NE United/NNP States/NNPS)
('of', 'IN')
(NE America/NNP)


Unnamed: 0,Entities,Labels
0,Zoom,PERSON
1,DOW Jones Index,ORGANIZATION
2,United States,GPE
3,America,GPE
4,Apple,PERSON
5,Google,ORGANIZATION
6,China,GPE


# Spacy

In [19]:
import spacy 
from spacy import displacy
#SpaCy 2.x brough significant speed and accuracy improvements
spacy.__version__

'3.3.1'

In [20]:
nlp = spacy.load("en_core_web_md")

In [21]:
doc = nlp(text)

entities = []
labels = []
position_start = []
position_end = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)
    position_start.append(ent.start_char)
    position_end.append(ent.end_char)
    
print(text)
    
df = pd.DataFrame({'Entities':entities,'Labels':labels,'Position_Start':position_start, 'Position_End':position_end})

df

Apple acquired Zoom in China on Wednesday 6th May 2020. \ This news has made Apple and Google stock jump by 5% on DOW Jones Index in the \ United States of America


Unnamed: 0,Entities,Labels,Position_Start,Position_End
0,(Zoom),LOC,15,19
1,(China),GPE,23,28
2,"(Wednesday, 6th, May, 2020)",DATE,32,54
3,"(Apple, and, Google)",ORG,77,93
4,"(5, %)",PERCENT,108,110
5,"(DOW, Jones)",ORG,114,123
6,"(the, \, United, States, of, America)",GPE,133,163


In [22]:
spacy.explain("ORG")

'Companies, agencies, institutions, etc.'

In [23]:
spacy.explain("GPE")

'Countries, cities, states'

In [24]:
spacy.explain("DATE")

'Absolute or relative dates or periods'

In [25]:
spacy.explain("PERCENT")

'Percentage, including "%"'

In [26]:
spacy.explain("LOC")

'Non-GPE locations, mountain ranges, bodies of water'

# Spacy Basics

In [27]:
import spacy
nlp= spacy.load("en_core_web_md")

In [28]:
with open ("data/wiki_us.txt", "r") as f:
    text = f.read()

In [29]:
print(text)

The United States of America (U.S.A. or USA), commonly known as the United States (U.S. or US) or America, is a transcontinental country located primarily in North America. It consists of 50 states, a federal district, five major unincorporated territories, nine minor outlying islands,[i] and 326 Indian reservations with limited sovereignty. It is the third-largest country by both land and total area.[c] The United States shares land borders with Canada to its north and with Mexico to its south. It maintains maritime borders with the Bahamas, Cuba, Russia, and other nations.[j] It has a population of over 331 million,[d] and is the third most populous country in the world after China and India. The national capital is Washington, D.C., and the most populous city and financial center is New York City. The United States is a melting pot of cultures and ethnicities, and its population has been profoundly shaped by centuries of immigration. It has a highly diverse climate and geography and

In [30]:
doc = nlp(text)

In [31]:
print(doc)

The United States of America (U.S.A. or USA), commonly known as the United States (U.S. or US) or America, is a transcontinental country located primarily in North America. It consists of 50 states, a federal district, five major unincorporated territories, nine minor outlying islands,[i] and 326 Indian reservations with limited sovereignty. It is the third-largest country by both land and total area.[c] The United States shares land borders with Canada to its north and with Mexico to its south. It maintains maritime borders with the Bahamas, Cuba, Russia, and other nations.[j] It has a population of over 331 million,[d] and is the third most populous country in the world after China and India. The national capital is Washington, D.C., and the most populous city and financial center is New York City. The United States is a melting pot of cultures and ethnicities, and its population has been profoundly shaped by centuries of immigration. It has a highly diverse climate and geography and

In [32]:
print(len(text)) # counts every index within the txt file
print(len(doc)) # counts every "token" which can be defined as a word or punctuation mark

4596
840


In [33]:
for token in text[:10]:
    print(token)

T
h
e
 
U
n
i
t
e
d


In [34]:
for token in doc[:10]:
    print(token)

The
United
States
of
America
(
U.S.A.
or
USA
)


In [35]:
for token in text.split()[:10]:
    print(token)

The
United
States
of
America
(U.S.A.
or
USA),
commonly
known


In [36]:
for sent in doc.sents:
    print(sent)

The United States of America (U.S.A. or USA), commonly known as the United States (U.S. or US) or America, is a transcontinental country located primarily in North America.
It consists of 50 states, a federal district, five major unincorporated territories, nine minor outlying islands,[i] and 326 Indian reservations with limited sovereignty.
It is the third-largest country by both land and total area.[c]
The United States shares land borders with Canada to its north and with Mexico to its south.
It maintains maritime borders with the Bahamas, Cuba, Russia, and other nations.[j]
It has a population of over 331 million,[d] and is the third most populous country in the world after China and India.
The national capital is Washington, D.C., and the most populous city and financial center is New York City.
The United States is a melting pot of cultures and ethnicities, and its population has been profoundly shaped by centuries of immigration.
It has a highly diverse climate and geography and

In [37]:
sentence1 = doc.sents[0]
print(sentence1)

TypeError: 'generator' object is not subscriptable

In [40]:
sentence1 = list(doc.sents)[0]
sentence1

The United States of America (U.S.A. or USA), commonly known as the United States (U.S. or US) or America, is a transcontinental country located primarily in North America.

In [41]:
for token in doc[:10]:
    print(token)

The
United
States
of
America
(
U.S.A.
or
USA
)


In [42]:
token2 = sentence1[2]
print(token2)

States


In [43]:
token2.text

'States'

In [44]:
token2.left_edge

The

In [45]:
token2.right_edge

,

In [46]:
token2.ent_type

384

In [47]:
token2.ent_type_

'GPE'

In [48]:
token2.ent_iob_

'I'

In [49]:
token2.lemma_

'States'

In [50]:
sentence1[12].lemma_

'know'

In [51]:
for sent in sentence1[0:15]:
    print(sent)

The
United
States
of
America
(
U.S.A.
or
USA
)
,
commonly
known
as
the


In [52]:
token2.morph

Number=Sing

In [53]:
sentence1[12].morph

Aspect=Perf|Tense=Past|VerbForm=Part

In [54]:
token2.pos_ #part of speech

'PROPN'

In [55]:
token2.dep_ #dependency on the sentence

'nsubj'

In [56]:
token2.lang_

'en'

In [57]:
text = "Mike enjoys playing football"

In [58]:
doc2 = nlp(text)

In [59]:
print(doc2)

Mike enjoys playing football


In [60]:
for token in doc2:
    print(token.text, token.pos_, token.dep_)

Mike PROPN nsubj
enjoys VERB ROOT
playing VERB xcomp
football NOUN dobj


In [61]:
from spacy import displacy
displacy.render(doc2, style="dep")

# Named Entity Recoginition

In [62]:
import spacy
nlp= spacy.load("en_core_web_md")

with open ("data/wiki_us.txt", "r") as f:
    text = f.read()
    
doc = nlp(text)

for ent in doc.ents:
    print(ent.text, ent.label_)

The United States of America GPE
U.S.A. GPE
USA GPE
the United States GPE
U.S. GPE
US GPE
America GPE
North America LOC
50 CARDINAL
five CARDINAL
nine CARDINAL
326 CARDINAL
Indian NORP
third ORDINAL
The United States GPE
Canada GPE
Mexico GPE
Bahamas GPE
Cuba GPE
Russia GPE
nations.[j ORG
over 331 CARDINAL
third ORDINAL
China GPE
India GPE
Washington GPE
D.C. GPE
New York City GPE
The United States GPE
centuries DATE
one CARDINAL
17 CARDINAL
Siberia LOC
North American NORP
at least 12,000 years ago DATE
European NORP
the 16th century DATE
The United States GPE
the Thirteen British Colonies ORG
the East Coast LOC
British NORP
Crown ORG
the American Revolution ORG
1765â€“1784 DATE
the late 18th century DATE
U.S. GPE
North America LOC
Native Americans NORP
1848 DATE
the United States GPE
the Confederate States of America ORG
Union ORG
the American Civil War EVENT
Union ORG
the Thirteenth Amendment LAW
1900 DATE
the United States GPE
World War EVENT
Japan GPE
Pearl Harbor GPE
1941 DATE
U.S

In [63]:
from spacy import displacy

displacy.render(doc, style = "ent")

In [64]:
doc = nlp(text)

In [65]:
sentence1 = list(doc.sents)[0]

In [66]:
print(sentence1)

The United States of America (U.S.A. or USA), commonly known as the United States (U.S. or US) or America, is a transcontinental country located primarily in North America.


In [67]:
import numpy as np

your_word = "country"

ms = nlp.vocab.vectors.most_similar(
    np.asarray([nlp.vocab.vectors[nlp.vocab.strings[your_word]]]), n = 10)
words = [nlp.vocab.strings[w] for w in ms[0][0]]
distances = ms[2]
print(words)

['POVERTY', 'inner-city', 'Poverty', 'INTERSECT', 'INEQUALITY', 'Inequality', 'ILLITERACY', 'illiteracy', 'handicaps', 'poorest']


In [68]:
doc1 = nlp("I like salty fries and hamburgers")
doc2 = nlp("Fast food tastes very good")

In [69]:
print(doc1, "<->", doc2, doc1.similarity(doc2))

I like salty fries and hamburgers <-> Fast food tastes very good 0.7353870417719951


In [70]:
doc3 = nlp("The Empire State Building is in New York.")

In [71]:
print(doc1, "<->", doc3, doc1.similarity(doc3))

I like salty fries and hamburgers <-> The Empire State Building is in New York. 0.505982187537456


In [72]:
doc4 = nlp("I enjoy oranges.")
doc5 = nlp("I enjoy apples.")
doc6 = nlp("I enjoy burgers")

In [73]:
print(doc4, "<->", doc5, doc4.similarity(doc5))

I enjoy oranges. <-> I enjoy apples. 0.9522808230936617


In [74]:
print(doc4, "<->", doc6, doc4.similarity(doc6))

I enjoy oranges. <-> I enjoy burgers 0.8391932833041409


In [75]:
french_fries = doc1[2:4]
burgers = doc1[5]
print(french_fries, "<->", burgers, french_fries.similarity(burgers))


salty fries <-> hamburgers 0.5733411908149719


# spaCy Pipeline

In [76]:
nlp = spacy.blank("en")

In [77]:
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x2c442bae680>

In [78]:
nlp.analyze_pipes()

{'summary': {'sentencizer': {'assigns': ['token.is_sent_start', 'doc.sents'],
   'requires': [],
   'scores': ['sents_f', 'sents_p', 'sents_r'],
   'retokenizes': False}},
 'problems': {'sentencizer': []},
 'attrs': {'doc.sents': {'assigns': ['sentencizer'], 'requires': []},
  'token.is_sent_start': {'assigns': ['sentencizer'], 'requires': []}}}

In [79]:
nlp2 = spacy.load("en_core_web_md")

In [80]:
nlp2.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False}},
 'problems': {'tok2vec': [],
  'tagger': [],
  'parser': [],
  'attribute_ruler': [],
  'lemmatizer': [],
  'ner': []},
 'att

# Entity Ruler

In [81]:
nlp = spacy.load("en_core_web_md")
text = "West Chesterfieldville was referenced in Mr. Deeds"

In [82]:
doc = nlp(text)

In [83]:
for ent in doc.ents:
    print(ent.text, ent.label_)

West Chesterfieldville PERSON
Deeds PERSON


In [84]:
ruler = nlp.add_pipe("entity_ruler")

In [85]:
nlp.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False},
  'entity_ruler': {'assigns': ['doc.ents', 'token.ent_type', 'token.ent_iob'],
   'requires': [],
   'scores': ['ents_f', 'ent

In [86]:
patterns = [
    {"label": "GPE", "pattern": "West Chestertenfieldville"}
]

In [87]:
ruler.add_patterns(patterns)

In [88]:
doc2 = nlp(text)
for ent in doc2.ents:
    print(ent.text, ent.label_)

West Chesterfieldville PERSON
Deeds PERSON


In [89]:
nlp2 = spacy.load("en_core_web_md")

In [90]:
ruler = nlp2.add_pipe("entity_ruler", before = "ner")

In [91]:
ruler.add_patterns(patterns)

In [92]:
doc = nlp2(text)

In [93]:
for ent in doc.ents:
    print(ent.text, ent.label_)

West Chesterfieldville PERSON
Deeds PERSON


In [94]:
import spacy
text = "West Chesterfieldville was referenced in Mr. Deeds"
nlp3 = spacy.load("en_core_web_md")

In [95]:
ruler = nlp3.add_pipe("entity_ruler", before = "ner")

In [96]:
patterns = [
    {"label": "GPE", "pattern": "West Chesterfieldville"},
    {"label": "FILM", "pattern": "Mr. Deeds"}
]

In [97]:
ruler.add_patterns(patterns)

In [98]:
doc = nlp3(text)

In [99]:
for ent in doc.ents:
    print(ent.text, ent.label_)

West Chesterfieldville GPE
Mr. Deeds FILM


# spaCy Matcher

In [100]:
import spacy
from spacy.matcher import Matcher

In [101]:
nlp = spacy.load("en_core_web_md")
matcher = Matcher(nlp.vocab)
pattern = [{"LIKE_EMAIL": True}]
matcher.add("EMAIL_ADDRESS", [pattern])
doc = nlp("This is an email address: spencerfox8@gmail.com")
matches = matcher(doc)

In [102]:
nlp.pipe_labels['parser']

['ROOT',
 'acl',
 'acomp',
 'advcl',
 'advmod',
 'agent',
 'amod',
 'appos',
 'attr',
 'aux',
 'auxpass',
 'case',
 'cc',
 'ccomp',
 'compound',
 'conj',
 'csubj',
 'csubjpass',
 'dative',
 'dep',
 'det',
 'dobj',
 'expl',
 'intj',
 'mark',
 'meta',
 'neg',
 'nmod',
 'npadvmod',
 'nsubj',
 'nsubjpass',
 'nummod',
 'oprd',
 'parataxis',
 'pcomp',
 'pobj',
 'poss',
 'preconj',
 'predet',
 'prep',
 'prt',
 'punct',
 'quantmod',
 'relcl',
 'xcomp']

In [103]:
spacy.explain('punct')

'punctuation'

In [104]:
displacy.render(doc, style = "ent")

In [105]:
print(matches)

[(16571425990740197027, 6, 7)]


In [106]:
print(nlp.vocab[matches[0][0]].text)

EMAIL_ADDRESS


In [107]:
with open ("data/wiki_us.txt", "r") as f:
    text = f.read()

In [108]:
print(text)

The United States of America (U.S.A. or USA), commonly known as the United States (U.S. or US) or America, is a transcontinental country located primarily in North America. It consists of 50 states, a federal district, five major unincorporated territories, nine minor outlying islands,[i] and 326 Indian reservations with limited sovereignty. It is the third-largest country by both land and total area.[c] The United States shares land borders with Canada to its north and with Mexico to its south. It maintains maritime borders with the Bahamas, Cuba, Russia, and other nations.[j] It has a population of over 331 million,[d] and is the third most populous country in the world after China and India. The national capital is Washington, D.C., and the most populous city and financial center is New York City. The United States is a melting pot of cultures and ethnicities, and its population has been profoundly shaped by centuries of immigration. It has a highly diverse climate and geography and

In [109]:
nlp = spacy.load("en_core_web_md")
matcher = Matcher(nlp.vocab)
pattern = [{"POS": "PROPN"}]
matcher.add("PROPER_NOUN", [pattern])
doc = nlp(text)
matches = matcher(doc)
print(len(matches))
for match in matches[:10]:
    print(match, doc[match[1]:match[2]])

119
(451313080118390996, 1, 2) United
(451313080118390996, 2, 3) States
(451313080118390996, 4, 5) America
(451313080118390996, 6, 7) U.S.A.
(451313080118390996, 8, 9) USA
(451313080118390996, 15, 16) United
(451313080118390996, 16, 17) States
(451313080118390996, 18, 19) U.S.
(451313080118390996, 20, 21) US
(451313080118390996, 23, 24) America


In [110]:
matcher = Matcher(nlp.vocab)
pattern = [{"POS": "PROPN", "OP": "+"}, {"POS": "VERB"}]
matcher.add("PROPER_NOUN", [pattern], greedy = 'LONGEST')
doc = nlp(text)
matches = matcher(doc)
matches.sort(key = lambda x: x[1])
print(len(matches))
for match in matches[:10]:
    print(match, doc[match[1]:match[2]])

7
(451313080118390996, 78, 81) United States shares
(451313080118390996, 247, 250) United States emerged
(451313080118390996, 294, 296) U.S. began
(451313080118390996, 324, 327) United States spanned
(451313080118390996, 427, 429) U.S. declared
(451313080118390996, 560, 563) United States launching
(451313080118390996, 747, 749) U.S. holds


In [111]:
doc = nlp("Britian is a place. Mary is a doctor.")

In [112]:
for ent in doc.ents:
    print(ent.text, ent.label_)

Britian NORP


In [113]:
from spacy.language import Language

In [114]:
@Language.component("remove_gpe")
def remove_gpe(doc):
    original_ents = list(doc.ents)
    for ent in doc.ents:
        if ent.label_ == "GPE":
            original_ents.remove(ent)
    doc.ents = original_ents
    return(doc)

In [115]:
nlp.add_pipe("remove_gpe")

<function __main__.remove_gpe(doc)>

In [116]:
nlp.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False},
  'remove_gpe': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False}},
 'problems': {'tok2vec': [],
  

In [117]:
doc = nlp("Britian is a place. Mary is a doctor.")

In [118]:
for ent in doc.ents:
    print(ent.text, ent.label_)

Britian NORP


In [119]:
import re

In [120]:
text = "Paul Newman was an American actor, but Paul Hollywood is a British TV Host. The name Paul is quite common."

In [121]:
pattern = r"Paul [A-Z]\w+"

In [122]:
matches = re.finditer(pattern, text)

for match in matches:
    print(match)

<re.Match object; span=(0, 11), match='Paul Newman'>
<re.Match object; span=(39, 53), match='Paul Hollywood'>


In [123]:
import spacy
from spacy.tokens import Span

In [124]:
nlp = spacy.blank("en")

In [125]:
doc = nlp(text)

In [126]:
original_ents = list(doc.ents)

In [127]:
mwt_ents = []

In [128]:
for match in re.finditer(pattern, doc.text):
    start, end = match.span()
    span = doc.char_span(start, end)
    print(span)

Paul Newman
Paul Hollywood


In [129]:
if span is not None:
    mwt_ents.append((span.start, span.end, span.text))

In [130]:
for ent in mwt_ents:
    start, end, name = ent
    per_ent = Span(doc, start, end, label = "PERSON")
    original_ents.append(per_ent)
doc.ents = original_ents
for ent in doc.ents:
    print(ent.text, ent.label_)


Paul Hollywood PERSON


In [131]:
print(mwt_ents)

[(8, 10, 'Paul Hollywood')]


In [132]:
from spacy.language import Language
from spacy.util import filter_spans

In [133]:
@Language.component("cinema_ner")
def cinema_ner(doc):
    pattern = r"Hollywood"
    original_ents = list(doc.ents)
    mwt_ents = []
    for match in re.finditer(pattern, doc.text):
        start, end = match.span()
        span = doc.char_span(start, end)
    if span is not None:
        mwt_ents.append((span.start, span.end, span.text))
    for ent in mwt_ents:
        start, end, name = ent
        per_ent = Span(doc, start, end, label = "CINEMA")
        original_ents.append(per_ent)
    filtered = filter_spans(original_ents)
    doc.ents = filtered
    return(doc)

In [134]:
nlp3 = spacy.load("en_core_web_md")
nlp3.add_pipe("cinema_ner")

#nlp2 = spacy.blank("en")
#nlp2.add_pipe("paul_ner")

<function __main__.cinema_ner(doc)>

In [135]:
doc3 = nlp3(text)
for ent in doc3.ents:
    print(ent.text, ent.label_)

Paul Newman PERSON
American NORP
Paul Hollywood PERSON
British NORP
Paul PERSON
