In [32]:
import spacy 
from spacy.lang.en import English

In [31]:
nlp = spacy.load("en_core_web_md")

In [284]:
import os

In [285]:
nlp = English()

In [286]:
doc = nlp("Hello world!")

In [287]:
for token in doc:
    print(token.text) # word or punctuation chars

Hello
world
!


In [288]:
token = doc[1]

In [289]:
token

world

In [290]:
span = doc[1:4]

In [291]:
span 

world!

In [292]:
span = doc[1:2]

In [293]:
span 

world

In [294]:
doc = nlp('It costs $5.')

In [295]:
print('Tokens: ', [token for token in doc])

Tokens:  [It, costs, $, 5, .]


In [296]:
print('Index: ', [token.i for token in doc])

Index:  [0, 1, 2, 3, 4]


In [297]:
print('is_alpha: ', [token.is_alpha for token in doc])

is_alpha:  [True, True, False, False, False]


In [298]:
print('is_punct: ', [token.is_punct for token in doc])

is_punct:  [False, False, False, False, True]


In [299]:
print('like_num: ', [token.like_num for token in doc])

like_num:  [False, False, False, True, False]


In [300]:
doc = nlp('It costs $5 and ten cents.')

In [301]:
print('like_num: ', [token.like_num for token in doc])
# can detect "TEN" as a number

like_num:  [False, False, False, True, False, True, False, False]


In [302]:
first_token = doc[0]

In [303]:
first_token 

It

In [304]:
# Process the text
doc = nlp("In 1990, more than 60% of people in East Asia were in extreme poverty. Now less than 4% are.")

In [305]:
# Iterate over the tokens in the doc
for token in doc:
    # Check if the token resembles a number
    if token.like_num:
        # Get the next token in the document
        next_token = doc[token.i+1]
        # Check if the next token's text equals '%'
        if next_token.text == '%':
            print('Percentage found:', token.text)

Percentage found: 60
Percentage found: 4


In [306]:
# !python -m spacy download en_core_web_md
import spacy

nlp = spacy.load('en_core_web_md')

In [307]:
doc = nlp('She ate the pizza')

In [308]:
#iterate over the tokens
for token in doc:
    print(f"{token.text:{6}} {token.pos_:->{10}}")

She    ------PRON
ate    ------VERB
the    -------DET
pizza  ------NOUN


In [309]:
for token in doc:
    print(f"{token.text:{6}} {token.pos_:->{10}} {token.dep_:>{10}} {token.head.text:>{8}}")
    # dep_ return dependencies (subject or object)
    # token.head.tetx --> parent token of. Shows the child tokens
    # nsubj (nominal subject), dobj(direct object), det(determiner)

She    ------PRON      nsubj      ate
ate    ------VERB       ROOT      ate
the    -------DET        det    pizza
pizza  ------NOUN       dobj      ate


In [310]:
# Named Entitites

doc = nlp(u'Apple is looking at buying U.K. starup for $1 billion')

In [311]:
print('like_num: ', [token.like_num for token in doc])

like_num:  [False, False, False, False, False, False, False, False, False, True, True]


In [312]:
# Iterate overt the predicted entitites
for ent in doc.ents:
    print(f'{ent.text:{10}} {ent.label_}')

Apple      ORG
U.K.       GPE
$1 billion MONEY


In [313]:
spacy.explain('GPE')

'Countries, cities, states'

In [314]:
spacy.explain('MONEY')

'Monetary values, including unit'

In [315]:
spacy.explain('NNP')  # also for dependency labels

'noun, proper singular'

In [316]:
spacy.explain('dobj')

'direct object'

In [317]:
## Rule-Match 


from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)

## Add the pattern to the macther
pattern = [{'ORTH':'iPhone'}, {'ORTH':'X'}]
matcher.add('IPHONE_PATTERN', None, pattern)  # IPHONE_PATTERN is a unique ID

In [318]:
doc = nlp("New iPhone X relase date leaked")

In [319]:
matches = matcher(doc)

In [320]:
matches

[(9528407286733565721, 1, 3)]

In [321]:
for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(matched_span)

iPhone X


In [322]:
pattern=[{'IS_DIGIT': True},
         {'LOWER':'fifa'},
         {'LOWER':'world'},
         {'LOWER':'cup'},
         {'IS_PUNCT': True}]

# Token include digits, punct and case INsensitive fifa, world, cup

In [323]:
doc = nlp('2018 FIFA World Cup: France Won!')

In [324]:
matcher.add('WORLDCUP_PATTERN', None, pattern)

In [325]:
macthes = matcher(doc)

In [326]:
for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(matched_span)

FIFA World


In [327]:
pattern = [{'LEMMA':'love', 'POS':'VERB'},{'POS':'NOUN'}]

# We are looking for a 'Love' verb followed by a noun

In [328]:
doc = nlp('I loved dogs but now I love cats more')

In [329]:
matcher.add('LOVE_PATTERN', None, pattern)
macthes = matcher(doc)

for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(matched_span)

loved dogs


In [330]:

# Write one pattern that matches adjectives ('ADJ') followed by one or two 'NOUN's 
# (one noun and one optional noun)


doc = nlp("Features of the app include a beautiful design, smart search, automatic labels and optional voice responses.")

# Write a pattern for adjective plus one or two nouns
pattern = [{'POS': 'ADJ'}, {'POS': 'NOUN'}, {'POS': 'NOUN', 'OP': '?'}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add('ADJ_NOUN_PATTERN', None, pattern)
matches = matcher(doc)
print('Total matches found:', len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print('Match found:', doc[start:end].text)
    

Total matches found: 4
Match found: beautiful design
Match found: smart search
Match found: automatic labels
Match found: optional voice responses


In [331]:
help(matcher.remove)

Help on built-in function remove:

remove(...) method of spacy.matcher.matcher.Matcher instance
    Remove a rule from the matcher. A KeyError is raised if the key does
    not exist.
    
    key (unicode): The ID of the match rule.



In [332]:
matcher.__contains__

<method-wrapper '__contains__' of spacy.matcher.matcher.Matcher object at 0x1f10b7de0>

In [333]:
## Write one pattern that only matches mentions of the full iOS versions: 
# "iOS 7", "iOS 11" and "iOS 10".


doc = nlp("""After making the iOS update you won't notice a radical system-wide redesign: 
            nothing like the aesthetic upheaval we got with iOS 7. Most of iOS 11's furniture 
            remains the same as in iOS 10. But you will discover some tweaks once you delve a little deeper.""")

# Write a pattern for full iOS versions ("iOS 7", "iOS 11", "iOS 10")
pattern = [{'ORTH': 'iOS'}, {'IS_DIGIT': True}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add('IOS_VERSION_PATTERN', None, pattern)
matches = matcher(doc)
print('Total matches found:', len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print('Match found:', doc[start:end].text)

Total matches found: 6
Match found: radical system
Match found: wide redesign
Match found: aesthetic upheaval
Match found: iOS 7
Match found: iOS 11
Match found: iOS 10


In [334]:
coffee_hash = nlp.vocab.strings['coffee']

In [335]:
coffee_hash

3197928453018144401

In [336]:
coffee_string = nlp.vocab.strings[coffee_hash]

In [337]:
coffee_string

'coffee'

In [338]:
## LEXEMES

doc = nlp('I love coffee')
lexeme = nlp.vocab['coffee']

# print the lexical attributes

print(lexeme.text, lexeme.orth, lexeme.is_alpha, lexeme.like_num)

coffee 3197928453018144401 True False


In [339]:
## Docs, Span and Entities from sctrach

In [340]:
# Import the Doc and Span classes
from spacy.tokens import Doc, Span

# Create a doc from the words and spaces
doc = Doc(nlp.vocab, words=['I', 'like', 'David', 'Bowie'], spaces=[True, True, True, False])

# Create a span for "David Bowie" from the doc and assign it the label "PERSON"
span = Span(doc, 2, 4, )  # span = Span(doc, 2, 4, label='PERSON') but label does not work 
print(span.text, span.label_)

David Bowie 


In [341]:
# Import the Doc and Span classes
from spacy.tokens import Doc, Span

# Create a doc from the words and spaces
doc = Doc(nlp.vocab, words=['I', 'like', 'David', 'Bowie'], spaces=[True, True, True, False])

# Create a span for "David Bowie" from the doc and assign it the label "PERSON"
span = Span(doc, 2, 4, )

# Add the span to the doc's entities
doc.ents = [span]

# Print entities' text and labels
print([(ent.text, ent.label_) for ent in doc.ents])

[('David Bowie', '')]


In [342]:
# !pip install -U spacy

In [400]:
 nlp = spacy.load('en_core_web_md')

In [345]:
## Comaring two dcoumnets in terms of similarity

In [349]:
# to use similarity attribute we are to use either medium or large model of spaCy

doc1 = nlp('I like fast food')
doc2 = nlp('I like pizza')

print(doc1.similarity(doc2))

0.8627204117787385


In [350]:
doc3 = nlp('I like swimming')
print(doc1.similarity(doc3))

0.8038315411687904


In [351]:
doc3 = nlp('I like skjfhsd')
print(doc1.similarity(doc3))

0.8491934178104581


In [352]:
# Compare 2 tokens

doc = nlp('I like pizza and pasta')
token1 = nlp("pizza")
token2 = nlp('pasta')

token1.similarity(token2)

0.7369546743653412

In [353]:
doc = nlp('I like pizza and pasta')
token1 = nlp("bus")
token2 = nlp('dog')

token1.similarity(token2)

0.21193229601905125

In [354]:
# Compare a document with a token

doc =nlp('I like pizza')
token = nlp("soap")[0]
print(doc.similarity(token))

0.32531983166759537


In [355]:
# Compare a span with a document

span = nlp('I like pizza and pasta')[2:5]
doc = nlp('McDonalds sells burgers')

print(span.similarity(doc))

0.6199092090831612


In [356]:
## Similarity is determined using word vectors
## Word2Vec is used

doc = nlp('I have a banana')

print(doc[3])

banana


In [357]:
print(doc[3].vector)
# multi dimentinal word embediing 

[ 2.0228e-01 -7.6618e-02  3.7032e-01  3.2845e-02 -4.1957e-01  7.2069e-02
 -3.7476e-01  5.7460e-02 -1.2401e-02  5.2949e-01 -5.2380e-01 -1.9771e-01
 -3.4147e-01  5.3317e-01 -2.5331e-02  1.7380e-01  1.6772e-01  8.3984e-01
  5.5107e-02  1.0547e-01  3.7872e-01  2.4275e-01  1.4745e-02  5.5951e-01
  1.2521e-01 -6.7596e-01  3.5842e-01 -4.0028e-02  9.5949e-02 -5.0690e-01
 -8.5318e-02  1.7980e-01  3.3867e-01  1.3230e-01  3.1021e-01  2.1878e-01
  1.6853e-01  1.9874e-01 -5.7385e-01 -1.0649e-01  2.6669e-01  1.2838e-01
 -1.2803e-01 -1.3284e-01  1.2657e-01  8.6723e-01  9.6721e-02  4.8306e-01
  2.1271e-01 -5.4990e-02 -8.2425e-02  2.2408e-01  2.3975e-01 -6.2260e-02
  6.2194e-01 -5.9900e-01  4.3201e-01  2.8143e-01  3.3842e-02 -4.8815e-01
 -2.1359e-01  2.7401e-01  2.4095e-01  4.5950e-01 -1.8605e-01 -1.0497e+00
 -9.7305e-02 -1.8908e-01 -7.0929e-01  4.0195e-01 -1.8768e-01  5.1687e-01
  1.2520e-01  8.4150e-01  1.2097e-01  8.8239e-02 -2.9196e-02  1.2151e-03
  5.6825e-02 -2.7421e-01  2.5564e-01  6.9793e-02 -2

In [358]:
matcher = Matcher(nlp.vocab)
matcher.add("DOG", None, [{'LOWER':'golden'}, {"LOWER":"retriever"}])

doc = nlp('I have a Golden Retriever')

for match_id, start, end in matcher(doc):
    span = doc[start:end]
    print('Matched span: ', span)

Matched span:  Golden Retriever


In [359]:
## Root token

print('Root token: ', span.root.text) # Root token is Retriever
print('Root head token : ', span.root.head.text) # Root Head Token is Have 

# Previous token:

print('Prevoiuos Token: ', doc[start-1].text, doc[start-1].pos_) # prevoius token is a 

Root token:  Retriever
Root head token :  have
Prevoiuos Token:  a DET


In [360]:
## Phrase Matching is used for key word search in the document
## Faster than Matcher
## Great for large word lists

from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)
pattern = nlp('Golden Retriever')
matcher.add('DOG', None, pattern)

doc = nlp('I have a Golden Retriever')

for match_id, start, end in matcher(doc):
    span = doc[start:end]
    print('Matched span: ', span)

Matched span:  Golden Retriever


In [401]:
# Import the PhraseMatcher and initialize it
from spacy.matcher import PhraseMatcher
import spacy
#nlp = spacy.load('en_core_web_md')
list_of_countries = ['Afghanistan',
 'Åland Islands',
 'Albania',
 'Algeria',
 'American Samoa',
 'Andorra',
 'Angola',
 'Anguilla',
 'Antarctica',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Aruba',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bermuda',
 'Bhutan',
 'Bolivia (Plurinational State of)',
 'Bonaire, Sint Eustatius and Saba',
 'Bosnia and Herzegovina',
 'Botswana',
 'Bouvet Island',
 'Brazil',
 'British Indian Ocean Territory',
 'United States Minor Outlying Islands',
 'Virgin Islands (British)',
 'Virgin Islands (U.S.)',
 'Brunei Darussalam',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Cabo Verde',
 'Cayman Islands',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Christmas Island',
 'Cocos (Keeling) Islands',
 'Colombia',
 'Comoros',
 'Congo',
 'Congo (Democratic Republic of the)',
 'Cook Islands',
 'Costa Rica',
 'Croatia',
 'Cuba',
 'Curaçao',
 'Cyprus',
 'Czech Republic',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Eritrea',
 'Estonia',
 'Ethiopia',
 'Falkland Islands',
 'Faroe Islands',
 'Fiji',
 'Finland',
 'France',
 'French Guiana',
 'French Polynesia',
 'French Southern Territories',
 'Gabon',
 'Gambia',
 'Georgia',
 'Germany',
 'Ghana',
 'Gibraltar',
 'Greece',
 'Greenland',
 'Grenada',
 'Guadeloupe',
 'Guam',
 'Guatemala',
 'Guernsey',
 'Guinea',
 'Guinea-Bissau',
 'Guyana',
 'Haiti',
 'Heard Island and McDonald Islands',
 'Holy See',
 'Honduras',
 'Hong Kong',
 'Hungary',
 'Iceland',
 'India',
 'Indonesia',
 "Côte d'Ivoire",
 'Iran',
 'Iraq',
 'Ireland',
 'Isle of Man',
 'Israel',
 'Italy',
 'Jamaica',
 'Japan',
 'Jersey',
 'Jordan',
 'Kazakhstan',
 'Kenya',
 'Kiribati',
 'Kuwait',
 'Kyrgyzstan',
 "Lao People's Democratic Republic",
 'Latvia',
 'Lebanon',
 'Lesotho',
 'Liberia',
 'Libya',
 'Liechtenstein',
 'Lithuania',
 'Luxembourg',
 'Macao',
 'Macedonia',
 'Madagascar',
 'Malawi',
 'Malaysia',
 'Maldives',
 'Mali',
 'Malta',
 'Marshall Islands',
 'Martinique',
 'Mauritania',
 'Mauritius',
 'Mayotte',
 'Mexico',
 'Micronesia',
 'Moldova',
 'Monaco',
 'Mongolia',
 'Montenegro',
 'Montserrat',
 'Morocco',
 'Mozambique',
 'Myanmar',
 'Namibia',
 'Nauru',
 'Nepal',
 'Netherlands',
 'New Caledonia',
 'New Zealand',
 'Nicaragua',
 'Niger',
 'Nigeria',
 'Niue',
 'Norfolk Island',
 "Korea",
 'Northern Mariana Islands',
 'Norway',
 'Oman',
 'Pakistan',
 'Palau',
 'Palestine, State of',
 'Panama',
 'Papua New Guinea',
 'Paraguay',
 'Peru',
 'Philippines',
 'Pitcairn',
 'Poland',
 'Portugal',
 'Puerto Rico',
 'Qatar',
 'Republic of Kosovo',
 'Réunion',
 'Romania',
 'Russian Federation',
 'Rwanda',
 'Saint Barthélemy',
 'Saint Helena, Ascension and Tristan da Cunha',
 'Saint Kitts and Nevis',
 'Saint Lucia',
 'Saint Martin',
 'Saint Pierre and Miquelon',
 'Saint Vincent and the Grenadines',
 'Samoa',
 'San Marino',
 'Sao Tome and Principe',
 'Saudi Arabia',
 'Senegal',
 'Serbia',
 'Seychelles',
 'Sierra Leone',
 'Singapore',
 'Sint Maarten (Dutch part)',
 'Slovakia',
 'Slovenia',
 'Solomon Islands',
 'Somalia',
 'South Africa',
 'South Georgia and the South Sandwich Islands',
 'Korea (Republic of)',
 'South Sudan',
 'Spain',
 'Sri Lanka',
 'Sudan',
 'Suriname',
 'Svalbard and Jan Mayen',
 'Swaziland',
 'Sweden',
 'Switzerland',
 'Syrian Arab Republic',
 'Taiwan',
 'Tajikistan',
 'Tanzania, United Republic of',
 'Thailand',
 'Timor-Leste',
 'Togo',
 'Tokelau',
 'Tonga',
 'Trinidad and Tobago',
 'Tunisia',
 'Turkey',
 'Turkmenistan',
 'Turks and Caicos Islands',
 'Tuvalu',
 'Uganda',
 'Ukraine',
 'United Arab Emirates',
 'United Kingdom',
 'UK',
 'United States of America',
 'USA',
 'U.S.',
 'U.S.A',                   
 'Uruguay',
 'Uzbekistan',
 'Vanuatu',
 'Venezuela',
 'Viet Nam',
 'Wallis and Futuna',
 'Western Sahara',
 'Yemen',
 'Zambia',
 'Zimbabwe']

doc = nlp('Czech Republic may help Slovakia in economy')
matcher = PhraseMatcher(nlp.vocab)

# Create pattern Doc objects and add them to the matcher
# This is the faster version of: [nlp(country) for country in COUNTRIES]
patterns = list(nlp.pipe(list_of_countries))
matcher.add('COUNTRY', None, *patterns)

# Call the matcher on the test document and print the result
matches = matcher(doc)
print([doc[start:end] for match_id, start, end in matches])

[Czech Republic, Slovakia]


In [402]:
## CREATING A CUSTOM ENTITY RECOGNIZER/MATCHER (COUNTRY FOR THIS CASE)

In [403]:
class EntityMatcher(object):
    name = 'entity_matcher'

    def __init__(self, nlp, terms, label):
        patterns = [nlp(term) for term in terms]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add(label, None, *patterns)

    def __call__(self, doc):
        matches = self.matcher(doc)
        spans = []
        for label, start, end in matches:
            span = Span(doc, start, end, label=label)
            spans.append(span)
        doc.ents = spans
        return doc

In [404]:
entity_matcher = EntityMatcher(nlp, list_of_countries, 'COUNTRY')
nlp.add_pipe(entity_matcher, before='ner')
#nlp.add_pipe(entity_matcher)
print(nlp.pipe_names)

['tagger', 'parser', 'entity_matcher', 'ner']


In [40]:
## Extraction a specific entity form the text

text = """After the Cold War, the UN saw a radical expansion in its peacekeeping duties, taking on more
missions in ten years than it had in the previous four decades.Between 1988 and 2000, the number of adopted
Security Council resolutions more than doubled, and the peacekeeping budget increased more than tenfold. The 
UN negotiated an end to the Salvadoran Civil War, launched a successful peacekeeping mission in Namibia, and 
oversaw democratic elections in post-apartheid South Africa and post-Khmer Rouge Cambodia. In 1991, the UN
authorized a US-led coalition that repulsed the Iraqi invasion of Kuwait. Brian Urquhart, Under-Secretary-General
from 1971 to 1985, later described the hopes raised by these successes as a "false renaissance" for the organization,
given the more troubled missions that followed. Though the UN Charter had been written primarily to prevent aggression
by one nation against another, in the early 1990s the UN faced a number of simultaneous, serious crises within nations
such as Somalia, Haiti, Mozambique, and the former Yugoslavia. The UN mission in Somalia was widely viewed as a 
failure after the US withdrawal following casualties in the Battle of Mogadishu, and the UN mission to Bosnia faced
'worldwide ridicule' for its indecisive and confused mission in the face of ethnic cleansing. In 1994, the UN 
Assistance Mission for Rwanda failed to intervene in the Rwandan genocide amid indecision in the Security Council. 
Beginning in the last decades of the Cold War, American and European critics of the UN condemned the organization for
perceived mismanagement and corruption. In 1984, the US President, Ronald Reagan, withdrew his nation\'s funding from
UNESCO (the United Nations Educational, Scientific and Cultural Organization, founded 1946) over allegations of
mismanagement, followed by Britain and Singapore. Boutros Boutros-Ghali, Secretary-General from 1992 to 1996, 
initiated a reform of the Secretariat, reducing the size of the organization somewhat. His successor, Kofi Annan
(1997–2006), initiated further management reforms in the face of threats from the United States to withhold its UN
dues. In the late 1990s and 2000s, international interventions authorized by the UN took a wider variety of forms. 
The UN mission in the Sierra Leone Civil War of 1991–2002 was supplemented by British Royal Marines, and the invasion
of Afghanistan in 2001 was overseen by NATO. In 2003, the United States invaded Iraq despite failing to pass a UN 
Security Council resolution for authorization, prompting a new round of questioning of the organization\'s 
effectiveness. Under the eighth Secretary-General, Ban Ki-moon, the UN has intervened with peacekeepers in crises
including the War in Darfur in Sudan and the Kivu conflict in the Democratic Republic of Congo and sent observers 
and chemical weapons inspectors to the Syrian Civil War. In 2013, an internal review of UN actions in the final 
battles of the Sri Lankan Civil War in 2009 concluded that the organization had suffered "systemic failure". One 
hundred and one UN personnel died in the 2010 Haiti earthquake, the worst loss of life in the organization\'s history. 
The Millennium Summit was held in 2000 to discuss the UN\'s role in the 21st century. The three day meeting was the 
largest gathering of world leaders in history, and culminated in the adoption by all member states of the Millennium 
Development Goals (MDGs), a commitment to achieve international development in areas such as poverty reduction, 
gender equality, and public health. Progress towards these goals, which were to be met by 2015, was ultimately uneven.
The 2005 World Summit reaffirmed the UN\'s focus on promoting development, peacekeeping, human rights, and global 
security. The Sustainable Development Goals were launched in 2015 to succeed the Millennium Development Goals. In 
addition to addressing global challenges, the UN has sought to improve its accountability and democratic legitimacy 
by engaging more with civil society and fostering a global constituency. In an effort to enhance transparency, in 
2016 the organization held its first public debate between candidates for Secretary-General. On 1 January 2017, 
Portuguese diplomat António Guterres, who previously served as UN High Commissioner for Refugees, became the ninth 
Secretary-General. Guterres has highlighted several key goals for his administration, including an emphasis on 
diplomacy for preventing conflicts, more effective peacekeeping efforts, 
and streamlining the organization to be more responsive and versatile to global needs. New York, Ankara in May 2000."""

In [410]:
#text = "Iraq, Italy, Belgium , apple book came here. In 1990 the USA was perfect but Turkey"

In [415]:
from spacy.pipeline import EntityRuler

#nlp = spacy.load('en_core_web_md')
doc = nlp(text)
#ruler = EntityRuler(nlp) 
#patterns = list(nlp.pipe(list_of_countries))
#ruler.add_patterns(patterns)
#nlp.add_pipe(ruler)

for ent in doc.ents:
    if ent.label_=='COUNTRY':
        print(f'{ent.text:{15}} --> {ent.label_:>{10}}')


Namibia         -->    COUNTRY
South Africa    -->    COUNTRY
Cambodia        -->    COUNTRY
Kuwait          -->    COUNTRY
Somalia         -->    COUNTRY
Haiti           -->    COUNTRY
Mozambique      -->    COUNTRY
Somalia         -->    COUNTRY
Rwanda          -->    COUNTRY
Singapore       -->    COUNTRY
Sierra Leone    -->    COUNTRY
Afghanistan     -->    COUNTRY
Iraq            -->    COUNTRY
Sudan           -->    COUNTRY
Congo           -->    COUNTRY
Haiti           -->    COUNTRY


In [413]:
for ent in doc.ents: # not working properly, can not catch the dates and other GPE such as New York and Ankara
    print(f'{ent.text:{25}} --> {ent.label_:.>{15}}')

the Cold War              --> ..........EVENT
UN                        --> ............ORG
ten years                 --> ...........DATE
the previous four decades --> ...........DATE
Between 1988 and 2000     --> ...........DATE
Security Council          --> ............ORG
UN                        --> ............ORG
the Salvadoran Civil War  --> ..........EVENT
Namibia                   --> ........COUNTRY
South Africa              --> ........COUNTRY
Cambodia                  --> ........COUNTRY
Kuwait                    --> ........COUNTRY
Somalia                   --> ........COUNTRY
Haiti                     --> ........COUNTRY
Mozambique                --> ........COUNTRY
Somalia                   --> ........COUNTRY
Rwanda                    --> ........COUNTRY
Singapore                 --> ........COUNTRY
Sierra Leone              --> ........COUNTRY
Afghanistan               --> ........COUNTRY
Iraq                      --> ........COUNTRY
Sudan                     --> ....

In [384]:
# ## We are using the COUNTRIES matcher described above
# matcher = PhraseMatcher(nlp.vocab)
# matcher.add('COUNTRY', None, *patterns)

# from spacy.tokens import Doc, Span
# for match_id, start, end in matcher(doc):
    
#     # Create a span with the label of "GPE"
#     span = Span(doc2, start, end, label='GPE')
#     doc.ents = list(doc.ents) + [span]
#     #print(doc.ents)
# # Print the only COUNTRY entities in the document
# print([(ent.text, ent.label_) for ent in doc.ents if ent.label_ == 'GPE'])

In [270]:
# ! pip install spacy-nightly

In [416]:
nlp.pipe_names

['tagger', 'parser', 'entity_matcher', 'ner']

In [417]:
print(nlp.pipeline)

[('tagger', <spacy.pipeline.pipes.Tagger object at 0x217c8a588>), ('parser', <spacy.pipeline.pipes.DependencyParser object at 0x18b1f87c8>), ('entity_matcher', <__main__.EntityMatcher object at 0x2964d9b38>), ('ner', <spacy.pipeline.pipes.EntityRecognizer object at 0x18b1f8828>)]


In [None]:
# Define the custom component
def length_component(doc):
    # Get the doc's length
    doc_length = len(doc)
    print("This document is {} tokens long.".format(doc_length))
    # Return the doc
    return doc

# Load the small English model
#nlp = spacy.load('en_core_web_sm')

# Add the component first in the pipeline and print the pipe names
nlp.add_pipe(length_component, first=True)
print(nlp.pipe_names)

In [9]:
# Process a text
doc = nlp('This is a sentence')

This document is 4 tokens long.


In [None]:
# Define the custom component
def animal_component(doc):
    # Apply the matcher to the doc
    matches = matcher(doc)
    # Create a Span for each match and assign the label 'ANIMAL'
    spans = [Span(doc, start, end, label='ANIMAL')
             for match_id, start, end in matches]
    # Overwrite the doc.ents with the matched spans
    doc.ents = spans
    return doc

In [10]:
## Setting Custom Attributes

from spacy.tokens import Doc, Token, Span

In [14]:
# Register the Token extension attribute 'is_country' with the default value False
Token.set_extension('is_country', default=False)

# Process the text and set the is_country attribute to True for the token "Spain"
doc = nlp("I live in Spain.")
doc[3]._.is_country = True

# Print the token text and the is_country attribute for all tokens
print([(token.text, token._.is_country) for token in doc])

This document is 5 tokens long.
[('I', False), ('live', False), ('in', False), ('Spain', True), ('.', False)]


In [19]:
# Define the getter function that takes a token and returns its reversed text
def get_reversed(token):
    return token.text[::-1]
  
# Register the Token property extension 'reversed' with the getter get_reversed
Token.set_extension('reversed', getter=get_reversed, force=True) # If you don't put foce=True, you will get an error 
# as of 2nd run

# Process the text and print the reversed attribute for each token
doc = nlp("All generalizations are false, including this one.")
for token in doc:
    print('reversed:', token._.reversed)

This document is 9 tokens long.
reversed: llA
reversed: snoitazilareneg
reversed: era
reversed: eslaf
reversed: ,
reversed: gnidulcni
reversed: siht
reversed: eno
reversed: .


In [20]:
# Define the getter function
def get_has_number(doc):
    # Return if any of the tokens in the doc return True for token.like_num
    return any(token.like_num for token in doc)

# Register the Doc property extension 'has_number' with the getter get_has_number
Doc.set_extension('has_number', getter=get_has_number)

# Process the text and check the custom has_number attribute 
doc = nlp("The museum closed for five years in 2012.")
print('has_number:', doc._.has_number)

This document is 9 tokens long.
has_number: True


In [21]:
def get_wikipedia_url(span):
    # Get a Wikipedia URL if the span has one of the labels
    if span.label_ in ('PERSON', 'ORG', 'GPE', 'LOCATION'):
        entity_text = span.text.replace(' ', '_')
        return "https://en.wikipedia.org/w/index.php?search=" + entity_text

# Set the Span extension wikipedia_url using get getter get_wikipedia_url
Span.set_extension('wikipedia_url', getter=get_wikipedia_url)

doc = nlp("In over fifty years from his very first recordings right through to his last album, David Bowie was at the vanguard of contemporary culture.")
for ent in doc.ents:
    # Print the text and Wikipedia URL of the entity
    print(ent.text, ent._.wikipedia_url)

This document is 26 tokens long.
over fifty years None
first None
David Bowie https://en.wikipedia.org/w/index.php?search=David_Bowie


In [22]:
def countries_component(doc):
    # Create an entity Span with the label 'GPE' for all matches
    doc.ents = [Span(doc, start, end, label='GPE')
                for match_id, start, end in matcher(doc)]
    return doc

# Add the component to the pipeline
nlp.add_pipe(countries_component)

In [None]:
# def countries_component(doc):
#     # Create an entity Span with the label 'GPE' for all matches
#     doc.ents = [Span(doc, start, end, label='GPE')
#                 for match_id, start, end in matcher(doc)]
#     return doc

# # Add the component to the pipeline
# nlp.add_pipe(countries_component)

# # Register capital and getter that looks up the span text in country capitals
# Span.set_extension('capital', getter=lambda span: capitals.get(span.text), force=True)

# # Process the text and print the entity text, label and capital attributes
# doc = nlp("Czech Republic may help Slovakia protect its airspace")
# print([(ent.text, ent.label_, ent._.capital) for ent in doc.ents])

In [None]:
## BAD 

docs =[nlp(text) for text in LOTS_OF_TEXT]

## GOOD

docs = list(nlp.pipe(LOT_SOF_TEXT))

In [33]:
data = [('This is a text', {'id':1, 'page_number':15}),
        ('And another text', {'id':2, 'page_number':16})]

In [34]:
for doc, context in nlp.pipe(data, as_tuples=True):
    print(doc.text, context['page_number'])

This is a text 15
And another text 16


In [35]:
from spacy.tokens import Doc
Doc.set_extension('id', default=None)
Doc.set_extension('page_number', default=None)

for doc, context in nlp.pipe(data, as_tuples=True):
    doc._.id = context['id']
    doc._.page_number = context['page_number']

In [39]:
# IF WE ONLY NEED TEXT ATTRIBUTE/COMPONENT OF NLP THEN:
## BAD 

doc = nlp('Hello world')

## GOOD

doc = nlp.make_doc("Hello World")

In [41]:
text = 'Hello world, I am Serdar from USA'

In [42]:
## WE CAN TEMPORARILY DISABLE SOME COMPONENTS TO FASTER PROCESS

with nlp.disable_pipes('tagger', 'parser'): # out of with block they will be restored
    doc = nlp(text)
    print(doc.ents)

(Serdar, USA)


In [43]:
nlp.pipe_names

['tagger', 'parser', 'ner']

In [44]:
TEXTS = ['McDonalds is my favorite restaurant.',
 'Here I thought @McDonalds only had precooked burgers but it seems they only have not cooked ones?? I have no time to get sick..',
 'People really still eat McDonalds :(',
 'The McDonalds in Spain has chicken wings. My heart is so happy ',
 '@McDonalds Please bring back the most delicious fast food sandwich of all times!!....The Arch Deluxe :P',
 'please hurry and open. I WANT A #McRib SANDWICH SO BAD! :D',
 'This morning i made a terrible decision by gettin mcdonalds and now my stomach is payin for it']

In [45]:
# Process the texts and print the adjectives

## This is inefficient way
for text in TEXTS:
    doc = nlp(text)
    print([token.text for token in doc if token.pos_ == 'ADJ'])

['favorite']
['sick']
[]
['happy']
['delicious', 'fast']
[]
['terrible', 'gettin', 'payin']


In [49]:
# Efficient way
for doc in nlp.pipe(TEXTS):
    print([token.text for token in doc if token.pos_ == 'ADJ'])

['favorite']
['sick']
[]
['happy']
['delicious', 'fast']
[]
['terrible', 'gettin', 'payin']


In [50]:
# Process the texts and print the entities
docs = list(nlp.pipe(TEXTS))
entities = [doc.ents for doc in docs]
print(*entities)

(McDonalds,) () (McDonalds,) (McDonalds, Spain) (@McDonalds,) () (This morning,)


In [51]:
people = ['David Bowie', 'Angela Merkel', 'Lady Gaga']

# Create a list of patterns for the PhraseMatcher
patterns = list(nlp.pipe(people))

In [52]:
TEXTS = ['How to preorder the iPhone X',
 'iPhone X is coming',
 'Should I pay $1,000 for the iPhone X?',
 'The iPhone 8 reviews are here',
 'Your iPhone goes up to 11 today',
 'I need a new phone! Any tips?']

In [57]:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

# Two tokens whose lowercase forms match 'iphone' and 'x'
pattern1 = [{'LOWER': 'iphone'}, {'LOWER': 'x'}]

# Token whose lowercase form matches 'iphone' and an optional digit
pattern2 = [{'LOWER': 'iphone'}, {'OP':'?', 'IS_DIGIT': True}]

# Add patterns to the matcher
matcher.add('GADGET', None, pattern1, pattern2)

In [58]:
# Create a blank 'en' model
nlp = spacy.blank('en')

# Create a new entity recognizer and add it to the pipeline
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner)

# Add the label 'GADGET' to the entity recognizer
ner.add_label('GADGET')

In [59]:
TRAINING_DATA = [('How to preorder the iPhone X', {'entities': [(20, 28, 'GADGET')]}),
 ('iPhone X is coming', {'entities': [(0, 8, 'GADGET')]}),
 ('Should I pay $1,000 for the iPhone X?', {'entities': [(28, 36, 'GADGET')]}),
 ('The iPhone 8 reviews are here', {'entities': [(4, 12, 'GADGET')]}),
 ('Your iPhone goes up to 11 today', {'entities': [(5, 11, 'GADGET')]}),
 ('I need a new phone! Any tips?', {'entities': []})]

In [65]:
import random
# Start the training
nlp.begin_training()

# Loop for 10 iterations
for itn in range(10):
    # Shuffle the training data
    random.shuffle(TRAINING_DATA)
    losses = {}
    
    # Batch the examples and iterate over them
    for batch in spacy.util.minibatch(TRAINING_DATA, size=2):
        texts = [text for text, entities in batch]
        annotations = [entities for text, entities in batch]
        
        # Update the model
        nlp.update(texts, annotations, losses=losses)
        print(losses)

{'ner': 4.672516345977783}
{'ner': 9.683455526828766}
{'ner': 12.075345039367676}
{'ner': 1.2624063352122903}
{'ner': 3.51289764046669}
{'ner': 5.7956129572412465}
{'ner': 0.4537782919669553}
{'ner': 1.980706626929532}
{'ner': 3.642047315284799}
{'ner': 1.1778354898560792}
{'ner': 1.9670406673976686}
{'ner': 2.088610317414954}
{'ner': 0.20141411817726151}
{'ner': 0.21626807155368288}
{'ner': 1.6583324711172316}
{'ner': 0.009449234382515215}
{'ner': 0.012873385455405728}
{'ner': 1.0180944650129042}
{'ner': 0.00017365360730181578}
{'ner': 0.0018220643187084606}
{'ner': 0.04926770868093577}
{'ner': 0.0002627248097750723}
{'ner': 0.0013336976211744123}
{'ner': 0.020566556761281607}
{'ner': 5.074081209199832e-06}
{'ner': 0.00013164226499752865}
{'ner': 0.0005426717944250377}
{'ner': 2.7057295832466632e-06}
{'ner': 0.00011574770890507291}
{'ner': 0.00011914996419994106}


you've successfully trained your first spaCy model. The numbers printed to the IPython shell represent the loss on each iteration, the amount of work left for the optimizer. The lower the number, the better. In real life, you normally want to use a lot more data than this, ideally at least a few hundred or a few thousand examples.

In [66]:
TEST_DATA = ['Apple is slowing down the iPhone 8 and iPhone X - how to stop it',
 "I finally understand what the iPhone X 'notch' is for",
 'Everything you need to know about the Samsung Galaxy S9',
 'Looking to compare iPad models? Here’s how the 2018 lineup stacks up',
 'The iPhone 8 and iPhone 8 Plus are smartphones designed, developed, and marketed by Apple',
 'what is the cheapest ipad, especially ipad pro???',
 'Samsung Galaxy is a series of mobile computing devices designed, manufactured and marketed by Samsung Electronics']

In [69]:
# Process each text in TEST_DATA
for doc in nlp.pipe(TEST_DATA):

    # Print the document text and entitites
    print(doc.text)
    print(doc.ents, '\n')

Apple is slowing down the iPhone 8 and iPhone X - how to stop it
(iPhone 8, iPhone X) 

I finally understand what the iPhone X 'notch' is for
(iPhone X,) 

Everything you need to know about the Samsung Galaxy S9
() 

Looking to compare iPad models? Here’s how the 2018 lineup stacks up
() 

The iPhone 8 and iPhone 8 Plus are smartphones designed, developed, and marketed by Apple
(iPhone 8, iPhone 8) 

what is the cheapest ipad, especially ipad pro???
() 

Samsung Galaxy is a series of mobile computing devices designed, manufactured and marketed by Samsung Electronics
() 



In [71]:
TRAINING_DATA = [
    ("Reddit partners with Patreon to help creators build communities", 
     {'entities': [(0, 6, 'WEBSITE'), (21, 28, 'WEBSITE')]}),
  
    ("PewDiePie smashes YouTube record", 
     {'entities': [(0, 9, 'PERSON'), (18, 25, 'WEBSITE')]}),
  
    ("Reddit founder Alexis Ohanian gave away two Metallica tickets to fans", 
     {'entities': [(0, 6, 'WEBSITE'), (15, 29, 'PERSON')]}),]