In [1]:
# Load the headlines text file into a list, one headline per line.
# Only the trailing newline is stripped; all other whitespace is kept as-is.
with open("data/headlines.txt") as f:
    headlines = [raw_line.rstrip('\n') for raw_line in f]

In [2]:
# Display the loaded list to inspect the raw headlines
headlines

['Zika Outbreak Hits Miami',
 'Could Zika Reach New York City?',
 'First Case of Zika in Miami Beach',
 'Mystery Virus Spreads in Recife, Brazil',
 'Dallas man comes down with case of Zika',
 'Trinidad confirms first Zika case',
 'Zika Concerns are Spreading in Houston',
 'Geneve Scientists Battle to Find Cure',
 'The CDC in Atlanta is Growing Worried',
 'Zika Infested Monkeys in Sao Paulo',
 'Brownsville teen contracts Zika virus',
 'Mosquito control efforts in St. Louis take new tactics with Zika threat',
 'San Juan reports 1st U.S. Zika-related death amid outbreak',
 'Flu outbreak in Galveston, Texas',
 'Zika alert – Manila now threatened',
 'Zika afflicts 7 in Iloilo City',
 'New Los Angeles Hairstyle goes Viral',
 'Louisiana Zika cases up to 26',
 'Orlando volunteers aid Zika research',
 'Zika infects pregnant woman in Cebu',
 "Chicago's First Zika Case Confirmed",
 'Tampa Bay Area Zika Case Count Climbs',
 'Bad Water Leads to Sickness in Flint, Michigan',
 'Baltimore plans for Zi

In [3]:
import geonamescache
from unidecode import unidecode
import pandas as pd
import re
from collections import defaultdict

Output: a pandas DataFrame with the columns Headline, City, and Country.

In [4]:
# Fetch the reference country and city tables from geonamescache.
# Both results are later iterated with .values(), reading each entry's 'name'.
gc = geonamescache.GeonamesCache()
cities = gc.get_cities()
countries = gc.get_countries()

In [5]:
# Build plain lists of country and city names for headline matching.
# Each name is passed through unidecode before being stored.
country_list = list(map(unidecode, (entry['name'] for entry in countries.values())))
city_list = list(map(unidecode, (entry['name'] for entry in cities.values())))

In [7]:
def build_word_dict(source_list):
    '''Group the entries of source_list by their first word.

    The first word is everything before the first space character
    (the whole entry when it contains no space).

    Returns a defaultdict(list) mapping each distinct first word to the
    list of entries that start with it, in their original order.
    '''
    grouped = defaultdict(list)

    for entry in source_list:
        first_word, _, _ = entry.partition(' ')
        grouped[first_word].append(entry)

    return grouped

In [8]:
# Index the country and city names by their first word for fast lookup
country_dict = build_word_dict(source_list=country_list)
city_dict = build_word_dict(source_list=city_list)

In [12]:
def match_item(headline, item_dict):
    '''Search the headline for occurrences of the items indexed in item_dict.

    Parameters
    ----------
    headline : str
        The headline text to search.
    item_dict : mapping of str -> list of str
        Candidate items keyed by their first word (as produced by
        build_word_dict). The mapping is only read, never modified.

    Returns
    -------
    tuple
        (best_match, all_matches), where all_matches is the list of matched
        substrings as they appear in the headline and best_match is the one
        with the most characters (the first one on a tie).
        (None, None) when nothing matches.

    Notes
    -----
    Candidates are selected by an exact (case-sensitive) lookup of each
    headline word in item_dict; the candidate itself is then located in the
    headline case-insensitively and must be a whole-word match.
    '''

    #Create empty result list
    match_list = []

    #Split the headlines into words so we can process word by word, which limits the search space
    words = headline.split(" ")

    #Do some cleanup of the words
    #Cleaning up 's does impact a few city names
    words = [word.replace("'s","") if word[-2:] == "'s" else word for word in words]
    #Clean up punctuation
    words = [word[0:-1] if word[-1:] in [",","?","!"] else word for word in words]

    for word in words:
        # Use .get() rather than indexing: item_dict is usually a defaultdict,
        # and indexing it would insert an empty list for every headline word.
        for item in item_dict.get(word, ()):
            # Escape the item so characters such as "." or "(" are matched
            # literally, and require a non-word character (or the string
            # edge) on both sides so "San Jose" does not match "San Joseville".
            regex = r"(?<!\w)" + re.escape(item) + r"(?!\w)"
            result = re.search(regex, headline, flags=re.IGNORECASE)
            if result is not None:
                match_list.append(result.group())

    if not match_list:
        return (None, None)

    # max() returns the first element among equally long matches
    return (max(match_list, key=len), match_list)

In [13]:
# Build the output dataset: one record per headline holding the headline text
# plus the best city match and best country match (None when nothing matched).
data = []
for headline in headlines:
    # match_item returns (best_match, all_matches); only the best match is kept
    city, _ = match_item(headline, city_dict)
    country, _ = match_item(headline, country_dict)

    row = {'Headline': headline, 'City': city, 'Country': country}
    data.append(row)

df = pd.DataFrame(data)

In [14]:
# Show full headline text instead of truncating long cells.
# None means "no limit"; the former -1 spelling was deprecated in pandas 1.0
# and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)
# Spot-check a random sample of 20 rows
df.sample(20)

Unnamed: 0,Headline,City,Country
430,Schools in Bentonville Closed Due to Hepatitis B Outbreak,Bentonville,
475,More people in Palo Alto are infected with HIV every year,Palo Alto,
366,More people in Huron are infected with Dengue every year,,
254,More Patients in Orange are Getting Diagnosed with Chickenpox,Orange,
482,Rumors about Syphilis spreading in Penal have been refuted,,
538,Authorities a Miami,Miami,
554,Respiratory Syncytial Virus Hits Henderson,Henderson,
44,"Zika outbreak in Piracicaba, Brazil",Piracicaba,Brazil
125,New medicine wipes out West Nile Virus in Ventura,Ventura,
400,Zika spreads to Lewisville,Lewisville,
