In [1]:
import json
from spacy.lang.en import English
from spacy.matcher import PhraseMatcher

In [2]:
# Load the list of country names from the JSON file next to the notebook.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) because the
# list contains non-ASCII names such as 'Åland Islands'; without it, open()
# falls back to the platform locale encoding and may mis-decode those names.
with open('countries.json', 'r', encoding='utf-8') as f:
    countries = json.load(f)

countries

['Afghanistan',
 'Åland Islands',
 'Albania',
 'Algeria',
 'American Samoa',
 'Andorra',
 'Angola',
 'Anguilla',
 'Antarctica',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Aruba',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bermuda',
 'Bhutan',
 'Bolivia (Plurinational State of)',
 'Bonaire, Sint Eustatius and Saba',
 'Bosnia and Herzegovina',
 'Botswana',
 'Bouvet Island',
 'Brazil',
 'British Indian Ocean Territory',
 'United States Minor Outlying Islands',
 'Virgin Islands (British)',
 'Virgin Islands (U.S.)',
 'Brunei Darussalam',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Cabo Verde',
 'Cayman Islands',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Christmas Island',
 'Cocos (Keeling) Islands',
 'Colombia',
 'Comoros',
 'Congo',
 'Congo (Democratic Republic of the)',
 'Cook Islands',
 'Costa Rica',
 'Croatia',
 'Cuba',
 'Curaça

In [3]:
# Blank English pipeline (no trained model is loaded).
nlp = English()

# The matcher is built on the pipeline's vocab, the same vocab used to
# create the sample document below.
matcher = PhraseMatcher(nlp.vocab)

# Sample sentence to search for country names.
doc = nlp('Czech Republic may help Slovakia protect its airspace')

doc, matcher

(Czech Republic may help Slovakia protect its airspace,
 <spacy.matcher.phrasematcher.PhraseMatcher at 0x1c73ee0f200>)

In [4]:
# Build one Doc per country name to use as phrase patterns.
# Matching the exact phrases may be more efficient than writing
# token-level patterns that describe them.
# nlp.pipe processes the names as a stream; the result is the same as
# calling nlp(country) on each name in turn.
countries_pattern = [country_doc for country_doc in nlp.pipe(countries)]
countries_pattern

[Afghanistan,
 Åland Islands,
 Albania,
 Algeria,
 American Samoa,
 Andorra,
 Angola,
 Anguilla,
 Antarctica,
 Antigua and Barbuda,
 Argentina,
 Armenia,
 Aruba,
 Australia,
 Austria,
 Azerbaijan,
 Bahamas,
 Bahrain,
 Bangladesh,
 Barbados,
 Belarus,
 Belgium,
 Belize,
 Benin,
 Bermuda,
 Bhutan,
 Bolivia (Plurinational State of),
 Bonaire, Sint Eustatius and Saba,
 Bosnia and Herzegovina,
 Botswana,
 Bouvet Island,
 Brazil,
 British Indian Ocean Territory,
 United States Minor Outlying Islands,
 Virgin Islands (British),
 Virgin Islands (U.S.),
 Brunei Darussalam,
 Bulgaria,
 Burkina Faso,
 Burundi,
 Cambodia,
 Cameroon,
 Canada,
 Cabo Verde,
 Cayman Islands,
 Central African Republic,
 Chad,
 Chile,
 China,
 Christmas Island,
 Cocos (Keeling) Islands,
 Colombia,
 Comoros,
 Congo,
 Congo (Democratic Republic of the),
 Cook Islands,
 Costa Rica,
 Croatia,
 Cuba,
 Curaçao,
 Cyprus,
 Czech Republic,
 Denmark,
 Djibouti,
 Dominica,
 Dominican Republic,
 Ecuador,
 Egypt,
 El Salvador,
 Equa

In [5]:
# Register every country Doc under the single rule key 'COUNTRIES'.
# The second positional argument is the on_match callback slot; None means
# no callback is attached.
# NOTE(review): this is the variadic call form; newer spaCy releases document
# matcher.add(key, [docs]) instead - confirm against the installed version.
matcher.add('COUNTRIES', None, *countries_pattern)
# Calling the matcher on the Doc returns a list of (match_id, start, end) tuples.
matches = matcher(doc)
matches

[(2988880774438019688, 0, 2), (2988880774438019688, 4, 5)]

In [6]:
# Show the text of every matched span.
# Each match is a (match_id, start, end) tuple; only the two boundaries are
# needed to slice the span back out of the document.
for _match_id, span_start, span_end in matches:
    matched_span = doc[span_start:span_end]
    print(matched_span)

Czech Republic
Slovakia
