In [88]:
import geonamescache
import re
import unidecode
import pandas as pd
from geonamescache.mappers import country
mapper = country(from_key='iso', to_key='name')

gc = geonamescache.GeonamesCache()
# Gets all countries
countries = tuple(sorted(gc.get_countries_by_names().keys()))
assert len(countries) == 252
# Gets all city names (country does NOT matter, just the name).
cities = tuple(sorted(set([unidecode.unidecode(v['name']) for (k, v) in gc.get_cities().items()])))
us_states = tuple(sorted(gc.get_us_states_by_names()))
us_counties = tuple(sorted(set([county['name'].split()[0].strip() for county in gc.get_us_counties()])))
assert 'Louisiana' not in cities
assert 'Antigua' not in cities and 'Antigua Guatemala' in cities
assert 'Sussex' not in cities and 'Sussex' in us_counties
assert 'Louisiana' in us_states
assert len(cities) == 23022

In [98]:
# Creating an empty Dataframe with column names only
df = pd.DataFrame(columns=['headline', 'city', 'country'])
with open("data/headlines.txt") as f:
    for line in f.read().splitlines():
        matches = []
        # Each headline contains the city, never the country alone. So loop over the cities.
        # However we could have an US state or a county. Ignore them for now.
        for city in cities:
            if re.search(f"\\b{city}\\b", line, re.IGNORECASE):
                matches.append(city)
        # In 'Could Zika Reach New York City?', we will find 'New York City' AND 'York'
        if matches:
            # Consider the longest match
            match = max(matches, key=len)
            gc_cities = gc.get_cities_by_name(match)
            if not gc_cities:
                print(f"NO match for city name '{match}'.")
            elif len(gc_cities) > 1:
                print(f"Too many entries found for city name '{match}'.")
            else:
                gc_city = list(gc_cities[0].values())[0]
                df = df.append({'headline': line, 'city': city, 'country': mapper(gc_city['countrycode'])}, ignore_index=True)
        else:
            # Should we extend the search to US states/counties
            print(f"No one single match found in headline '{line}'.")
df.to_csv(index=False, 'output.csv', encoding='utf-8')
print("Finished!")
                

Too many entries found for city name 'Dallas'.
Too many entries found for city name 'Trinidad'.
NO match for city name 'Geneve'.
NO match for city name 'Sao Paulo'.
Too many entries found for city name 'Brownsville'.
Too many entries found for city name 'San Juan'.
No one single match found in headline 'Louisiana Zika cases up to 26'.
No one single match found in headline 'Zika infects pregnant woman in Cebu'.
Too many entries found for city name 'Flint'.
Too many entries found for city name 'London'.
Too many entries found for city name 'Boston'.
Too many entries found for city name 'Paris'.
Too many entries found for city name 'San Diego'.
Too many entries found for city name 'San Francisco'.
Too many entries found for city name 'Santa Rosa'.
Too many entries found for city name 'Cleveland'.
Too many entries found for city name 'Austin'.
Too many entries found for city name 'Lima'.
NO match for city name 'Bogota'.
No one single match found in headline 'Spanish Flu Sighted in Antigua'

NO match for city name 'Ribeirao Preto'.
No one single match found in headline 'Martinsville tests new cure for Measles'.
No one single match found in headline 'More Patients in Magnolia are Getting Diagnosed with Malaria'.
Too many entries found for city name 'Durango'.
Too many entries found for city name 'Gladstone'.
Too many entries found for city name 'London'.
NO match for city name 'Custodia'.
No one single match found in headline 'Rumors about Syphilis spreading in Penal have been refuted'.
Too many entries found for city name 'Madrid'.
Too many entries found for city name 'Barcelona'.
Too many entries found for city name 'Madrid'.
Too many entries found for city name 'Barcelona'.
Too many entries found for city name 'Redmond'.
Too many entries found for city name 'Concord'.
No one single match found in headline 'Fort Belvoir tests new cure for Hepatitis C'.
No one single match found in headline 'More people in Oak Brook are infected with Respiratory Syncytial Virus every year'

'headline,city,country\nZika Outbreak Hits Miami,maalot Tarshiha,USA\nCould Zika Reach New York City?,maalot Tarshiha,USA\nFirst Case of Zika in Miami Beach,maalot Tarshiha,USA\n"Mystery Virus Spreads in Recife, Brazil",maalot Tarshiha,\nZika Concerns are Spreading in Houston,maalot Tarshiha,USA\nThe CDC in Atlanta is Growing Worried,maalot Tarshiha,USA\nMosquito control efforts in St. Louis take new tactics with Zika threat,maalot Tarshiha,USA\n"Flu outbreak in Galveston, Texas",maalot Tarshiha,USA\nZika alert – Manila now threatened,maalot Tarshiha,\nZika afflicts 7 in Iloilo City,maalot Tarshiha,\nNew Los Angeles Hairstyle goes Viral,maalot Tarshiha,USA\nOrlando volunteers aid Zika research,maalot Tarshiha,USA\nChicago\'s First Zika Case Confirmed,maalot Tarshiha,USA\nTampa Bay Area Zika Case Count Climbs,maalot Tarshiha,USA\nBaltimore plans for Zika virus,maalot Tarshiha,USA\nZika cases in Vietnam\'s Ho Chi Minh City surge,maalot Tarshiha,\nPhiladelphia experts track pandemic,maalo