In [33]:
import pandas as pd
import geonamescache as gnc
import json
import re
import unidecode as ud

# Normalise a line of input text before any matching is done.
def sanitize(line):
    """Return *line* trimmed and transliterated to plain ASCII.

    Leading and trailing whitespace is stripped first; the result is then
    passed through ``unidecode`` so that accents, diacritics and umlauts are
    replaced by their closest unaccented equivalents.  The same
    transliteration is applied to the geonames city names, so both sides of
    the comparison use the same spelling.
    """
    trimmed = line.strip()
    return ud.unidecode(trimmed)

# Get cities and countries from geonames.
# One shared GeonamesCache instance is used for every lookup instead of
# constructing a fresh one each time (NOTE(review): each instance presumably
# loads its own copy of the data -- confirm against the geonamescache source).
geonames = gnc.GeonamesCache()
countries = geonames.get_countries()
cities = geonames.get_cities().values()

# Map each "unidecoded" (accent-free) city name to a geonames city record.
# Several cities can share a name; the comprehension keeps whichever record
# is seen last for a given name.
city_name_lookup = {ud.unidecode(c['name']): c for c in cities}

# Build one regular expression that finds any known city name.
# -- Alternation tries the alternatives left to right and stops at the first
#    one that fits, so the names are sorted longest first; otherwise a short
#    name could win where a longer name starting at the same position was
#    actually used in the headline.
# -- Every name is passed through re.escape: names may contain characters
#    such as "(", ")" or "." that would otherwise be read as regex syntax
#    (and unescaped parentheses would silently add capture groups).
# -- Empty names are dropped, since an empty alternative matches everywhere.
city_names = sorted(
    (name for name in city_name_lookup if name), key=len, reverse=True
)

piped_city_names = '|'.join(re.escape(name) for name in city_names)
city_name_list = f'({piped_city_names})'
# (?<!\w) / (?!\w) act like \b for names that start and end with a word
# character, but unlike \b they also allow names that begin or end with
# punctuation (e.g. a closing parenthesis) to match.
city_name_regex = re.compile(rf'(?<!\w){city_name_list}(?!\w)')

collector = {'headline': [], 'city': [], 'country': []}


def extract_city_and_country_from(headline):
    """Find the first city named in *headline* and look up its country.

    The headline is sanitized (stripped and transliterated to ASCII) and
    searched for the leftmost occurrence of a known city name; when several
    names start at that position, the longest one is chosen.  The matching
    is case-sensitive.

    Args:
        headline: raw headline text, e.g. one line read from a file.

    Returns:
        A dict with the keys ``headline`` (the sanitized headline), ``city``
        and ``country``.  ``city`` and ``country`` are ``None`` when no
        known city name occurs in the headline; in that case a message is
        also printed.

    Raises:
        KeyError: if the country code of the matched city is not present in
            ``countries``.
    """
    sanitized_headline = sanitize(headline)

    # search() returns the leftmost match; group(1) is the captured city
    # name regardless of how findall() would shape its results.
    match = city_name_regex.search(sanitized_headline)
    if match is None:
        print('i could not find any city in ', headline)
        return dict(headline=sanitized_headline, city=None, country=None)

    # Translate the accent-free spelling back to the official geonames name.
    city_name = city_name_lookup[match.group(1)]['name']

    # get_cities_by_name returns a list of single-entry {geonameid: record}
    # dicts; pick the first matching city record.
    candidates = geonames.get_cities_by_name(city_name)
    first_matching_city = next(iter(candidates[0].values()))
    corresponding_country = countries[first_matching_city['countrycode']]

    return dict(
        headline=sanitized_headline,
        city=first_matching_city['name'],
        country=corresponding_country['name'],
    )
    
# Resolve every headline (one per line of the input file) to a
# headline/city/country record ...
with open('data/headlines.txt') as headlines:
    headline_cities_and_countries = list(
        map(extract_city_and_country_from, headlines)
    )

# ... and store all records as a single JSON array.
with open('data/headline_cities_and_countries.json', 'w') as output:
    serialized_records = json.dumps(headline_cities_and_countries)
    output.write(serialized_records)

i could not find any city in  Louisiana Zika cases up to 26

i could not find any city in  Zika infects pregnant woman in Cebu

i could not find any city in  Spanish Flu Sighted in Antigua

i could not find any city in  Carnival under threat in Rio De Janeiro due to Zika outbreak

i could not find any city in  Zika case reported in Oton

i could not find any city in  Hillsborough uses innovative trap against Zika 20 minutes ago

i could not find any city in  Maka City Experiences Influenza Outbreak

i could not find any city in  More Zika patients reported in Mcallen

i could not find any city in  West Nile Virus Outbreak in Saint Johns

i could not find any city in  More people in Mclean are infected with Hepatitis A every year

i could not find any city in  Malaria Exposure in Sussex

i could not find any city in  Greenwich Establishes Zika Task Force

i could not find any city in  Will West Nile Virus vaccine help Parsons?

i could not find any city in  Yulee takes a hit from Spread