In [1]:
import json

import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

In [2]:
# Load the list of country names (the file holds a JSON array of strings).
# The encoding is passed explicitly: the names include non-ASCII characters
# (e.g. 'Åland Islands'), and the platform default encoding used by open()
# is not guaranteed to be UTF-8, which JSON text is expected to use.
with open('countries.json', 'r', encoding='utf-8') as f:
    countries = json.load(f)

# Last expression of the cell: display the loaded list
countries

['Afghanistan',
 'Åland Islands',
 'Albania',
 'Algeria',
 'American Samoa',
 'Andorra',
 'Angola',
 'Anguilla',
 'Antarctica',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Aruba',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bermuda',
 'Bhutan',
 'Bolivia (Plurinational State of)',
 'Bonaire, Sint Eustatius and Saba',
 'Bosnia and Herzegovina',
 'Botswana',
 'Bouvet Island',
 'Brazil',
 'British Indian Ocean Territory',
 'United States Minor Outlying Islands',
 'Virgin Islands (British)',
 'Virgin Islands (U.S.)',
 'Brunei Darussalam',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Cabo Verde',
 'Cayman Islands',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Christmas Island',
 'Cocos (Keeling) Islands',
 'Colombia',
 'Comoros',
 'Congo',
 'Congo (Democratic Republic of the)',
 'Cook Islands',
 'Costa Rica',
 'Croatia',
 'Cuba',
 'Curaça

In [3]:
# Read the sample text that mentions several country names.
# An explicit encoding keeps the result independent of the platform's
# default locale encoding.
# NOTE(review): assumes country_text.txt is stored as UTF-8 -- confirm.
with open('country_text.txt', 'r', encoding='utf-8') as f:
    country_text = f.read()

# Last expression of the cell: display the raw text
country_text

'After the Cold War, the UN saw a radical expansion in its peacekeeping duties, taking on more missions in ten years than it had in the previous four decades.Between 1988 and 2000, the number of adopted Security Council resolutions more than doubled, and the peacekeeping budget increased more than tenfold. The UN negotiated an end to the Salvadoran Civil War, launched a successful peacekeeping mission in Namibia, and oversaw democratic elections in post-apartheid South Africa and post-Khmer Rouge Cambodia. In 1991, the UN authorized a US-led coalition that repulsed the Iraqi invasion of Kuwait. Brian Urquhart, Under-Secretary-General from 1971 to 1985, later described the hopes raised by these successes as a "false renaissance" for the organization, given the more troubled missions that followed. Though the UN Charter had been written primarily to prevent aggression by one nation against another, in the early 1990s the UN faced a number of simultaneous, serious crises within nations su

In [4]:
# Load the small English pipeline and process the text into a Doc
nlp = spacy.load('en_core_web_sm')
doc = nlp(country_text)

# One pattern Doc per country name. A PhraseMatcher created with default
# settings compares token text only, so tokenizing with nlp.make_doc is
# sufficient; running the whole pipeline over every name is wasted work.
country_pattern = [nlp.make_doc(name) for name in countries]

matcher = PhraseMatcher(nlp.vocab)
# Patterns are passed as a single list. The older form
# matcher.add(key, None, *patterns) is rejected by spaCy v3; the list form
# is accepted from spaCy 2.2.2 onwards (verify the installed version).
matcher.add('COUNTRY_PATTERN', country_pattern)

# Each match is a (match_id, start_token, end_token) tuple
matches = matcher(doc)

matches

[(3675018568124021177, 74, 75),
 (3675018568124021177, 84, 86),
 (3675018568124021177, 91, 92),
 (3675018568124021177, 110, 111),
 (3675018568124021177, 186, 187),
 (3675018568124021177, 188, 189),
 (3675018568124021177, 190, 191),
 (3675018568124021177, 201, 202),
 (3675018568124021177, 252, 253),
 (3675018568124021177, 333, 334),
 (3675018568124021177, 416, 418),
 (3675018568124021177, 433, 434),
 (3675018568124021177, 448, 449),
 (3675018568124021177, 499, 500),
 (3675018568124021177, 509, 510),
 (3675018568124021177, 565, 566)]

In [5]:
# Wrap every matcher hit in a Span labelled 'GPE'; the match id is not needed
candidate_spans = [
    Span(doc, start_index, end_index, label='GPE')
    for _, start_index, end_index in matches
]

# doc.ents does not accept overlapping spans, and a phrase list can produce
# them (a shorter name contained in a longer one). filter_spans keeps the
# longest span of each overlapping group and returns the rest unchanged.
country_spans = spacy.util.filter_spans(candidate_spans)

# Replace whatever entities the pipeline predicted with the country spans,
# in a single assignment rather than rebuilding doc.ents once per match
doc.ents = country_spans

for span in country_spans:
    # Text of the syntactic head of the span's root token, then the span text
    print(span.root.head.text, '-->', span.text)

in --> Namibia
in --> South Africa
Africa --> Cambodia
of --> Kuwait
as --> Somalia
Somalia --> Haiti
Haiti --> Mozambique
in --> Somalia
for --> Rwanda
Britain --> Singapore
War --> Sierra Leone
of --> Afghanistan
invaded --> Iraq
in --> Sudan
of --> Congo
earthquake --> Haiti


In [6]:
# Collect (text, label) for each entity on the doc labelled 'GPE', then print them
gpe_pairs = [(entity.text, entity.label_) for entity in doc.ents if entity.label_ == 'GPE']
print(gpe_pairs)

[('Namibia', 'GPE'), ('South Africa', 'GPE'), ('Cambodia', 'GPE'), ('Kuwait', 'GPE'), ('Somalia', 'GPE'), ('Haiti', 'GPE'), ('Mozambique', 'GPE'), ('Somalia', 'GPE'), ('Rwanda', 'GPE'), ('Singapore', 'GPE'), ('Sierra Leone', 'GPE'), ('Afghanistan', 'GPE'), ('Iraq', 'GPE'), ('Sudan', 'GPE'), ('Congo', 'GPE'), ('Haiti', 'GPE')]
