In [1]:
import geonamescache
import re
import unidecode
import pandas as pd
from geonamescache.mappers import country
# Translator from an ISO country code to the country's name.
mapper = country(from_key='iso', to_key='name')

gc = geonamescache.GeonamesCache()

# Every country name known to geonamescache, in sorted order.
countries = tuple(sorted(gc.get_countries_by_names()))
assert len(countries) == 252

# Distinct city names, sorted. Only the name is kept: the same name may
# belong to cities in several countries.
city_names = {record['name'] for record in gc.get_cities().values()}
cities = tuple(sorted(city_names))

# US state names, sorted.
us_states = tuple(sorted(gc.get_us_states_by_names().keys()))

# Distinct FIRST words of the US county names, sorted.
county_first_words = {entry['name'].split()[0] for entry in gc.get_us_counties()}
us_counties = tuple(sorted(county_first_words))

# Sanity checks on the lookup tables built above.
assert 'Louisiana' not in cities
assert 'Antigua' not in cities
assert 'Antigua Guatemala' in cities
assert 'Sussex' not in cities
assert 'Sussex' in us_counties
assert 'Louisiana' in us_states
assert len(cities) == 23090

In [2]:
%%time

# Build one output row per headline that mentions a known city name, then
# write the rows to 'output.csv'.
#
# Uses names defined in the first cell: `cities` (sorted city names), `gc`
# (the GeonamesCache instance) and `mapper` (ISO code -> country name).

# Compile one whole-word, case-insensitive pattern per city ONCE, up front.
# Rebuilding every pattern for every headline was the dominant cost of this
# cell. `re.escape` stops punctuation in a city name (e.g. '.', '(', "'")
# from being interpreted as regex syntax.
city_patterns = [
    (city, re.compile(rf"\b{re.escape(unidecode.unidecode(city))}\b", re.IGNORECASE))
    for city in cities
]

# Rows are collected in a plain list and turned into a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0 and copies the whole frame
# on every call.
records = []
with open("data/headlines.txt") as f:
    for counter, line in enumerate(f.read().splitlines(), start=1):
        if counter % 20 == 0:
            print(f"{counter} lines have been processed!")

        # Each headline contains the city, never the country alone. So test every city.
        # However we could have an US state or a county. Ignore them for now.
        matches = [city for city, pattern in city_patterns if pattern.search(line)]
        if not matches:
            # Should we extend the search to US states/counties?
            print(f"No one single match found in headline '{line}'.")
            continue

        # In 'Could Zika Reach New York City?', we will find 'New York City' AND 'York'.
        # Consider the longest match.
        match = max(matches, key=len)
        gc_cities = gc.get_cities_by_name(match)
        if not gc_cities:
            print(f"NO match for city name '{match}'.")
            continue

        if len(gc_cities) > 1:
            # Several cities share this name: the country is ambiguous.
            country_name = None
        else:
            # Each entry is a single-item dict; take its only value (the city record).
            gc_city = next(iter(gc_cities[0].values()))
            country_name = mapper(gc_city['countrycode'])

        records.append({'headline': line, 'countries': country_name,
                        'cities': unidecode.unidecode(match)})

# Passing `columns` keeps the column order and header even when no row matched.
df = pd.DataFrame(records, columns=['headline', 'countries', 'cities'])
df.to_csv('output.csv', index=False, encoding='utf-8')
print("Finished!")
                

No one single match found in headline 'Louisiana Zika cases up to 26'.
20 lines have been processed!
No one single match found in headline 'Zika infects pregnant woman in Cebu'.
40 lines have been processed!
No one single match found in headline 'Spanish Flu Sighted in Antigua'.
60 lines have been processed!
No one single match found in headline 'Zika case reported in Oton'.
No one single match found in headline 'Hillsborough uses innovative trap against Zika 20 minutes ago'.
80 lines have been processed!
No one single match found in headline 'Maka City Experiences Influenza Outbreak'.
100 lines have been processed!
120 lines have been processed!
140 lines have been processed!
No one single match found in headline 'West Nile Virus Outbreak in Saint Johns'.
160 lines have been processed!
180 lines have been processed!
200 lines have been processed!
220 lines have been processed!
No one single match found in headline 'Malaria Exposure in Sussex'.
240 lines have been processed!
No one sin

KeyboardInterrupt: 