In [2]:
# Load the headlines text file into a list, one headline per entry.
# Only the trailing newline is stripped; other whitespace is kept as-is.
with open("data/headlines.txt") as f:
    headlines = [line.rstrip('\n') for line in f]

In [3]:
import geonamescache
from unidecode import unidecode
import pandas as pd
import re
from collections import defaultdict

Output: a pandas DataFrame with the columns Headline, City, and Country.

In [4]:
# Load the country and city reference data from geonamescache.
# Both results are used below as mappings: only their .values() are read,
# and each value is expected to carry a 'name' entry.
gc = geonamescache.GeonamesCache()
countries = gc.get_countries()
cities = gc.get_cities()

In [5]:
# Build flat lists of country and city names for use in headline matching.
# Each name is passed through unidecode (ASCII transliteration) so that names
# with accented characters are stored in plain-ASCII form.
# NOTE(review): the headlines themselves are not passed through unidecode in
# the cells below, so a headline that contains accented characters would not
# match its transliterated name -- confirm the headlines are plain ASCII.
country_list = [unidecode(x['name']) for x in countries.values()]
city_list = [unidecode(x['name']) for x in cities.values()]

In [6]:
def build_word_dict(source_list):
    '''Group the entries of source_list by their first word.

    The first word of an entry is everything before its first space (the
    whole entry when it contains no space).

    Returns a defaultdict(list) that maps each distinct first word to the
    list of entries starting with that word, in their original order.
    '''
    grouped = defaultdict(list)

    for entry in source_list:
        # Only the text before the first space is needed as the key
        first_word = entry.split(' ', 1)[0]
        grouped[first_word].append(entry)

    return grouped

In [7]:
# Index the country and city names by their first word, so that match_item
# can look up the candidate names for each word of a headline instead of
# testing every name against every headline.
country_dict = build_word_dict(country_list)
city_dict = build_word_dict(city_list)

In [8]:
def match_item(headline, item_dict):
    '''Search the headline for occurrences of the items in item_dict.

    Parameters
    ----------
    headline : str
        The headline text to search.
    item_dict : mapping of str -> list of str
        Candidate items keyed by their first word (the structure produced by
        build_word_dict). Any mapping with a .get() method works; it is never
        modified by this function.

    Returns
    -------
    tuple
        (best_match, match_list), where match_list holds every matched
        substring of the headline in the order found and best_match is the
        longest of them by character count (the first one on a tie).
        (None, None) when nothing matches.

    Notes
    -----
    Candidates are selected by an exact, case-sensitive lookup of each
    headline word; the selected candidates are then searched for in the
    headline case-insensitively, anchored at a word boundary on the left
    only.
    '''

    #Split the headline into words so we can process word by word, which limits the search space
    words = headline.split(" ")

    #Do some cleanup of the words
    #Cleaning up 's does impact a few city names
    words = [word.replace("'s", "") if word[-2:] == "'s" else word for word in words]
    #Clean up punctuation (a single trailing character only)
    words = [word[0:-1] if word[-1:] in [",", "?", "!"] else word for word in words]

    match_list = []
    for word in words:
        #Use .get() instead of item_dict[word]: indexing a defaultdict would
        #insert an empty list for every headline word, and a plain dict would
        #raise KeyError
        for item in item_dict.get(word) or ():
            #Escape the item so characters such as ( ) . + in a name are
            #matched literally instead of being read as regex syntax
            compiled_re = re.compile(r"\b(" + re.escape(item) + r")", flags=re.IGNORECASE)
            result = compiled_re.search(headline)
            if result is not None:
                match_list.append(result.group())

    if not match_list:
        return (None, None)

    #max() keeps the first of several equally long matches
    return (max(match_list, key=len), match_list)

In [9]:
#Run country and city matches to generate an output dataset
#One record per headline; City / Country are None when match_item finds nothing
data = []
for headline in headlines:
    #Keep only the best match (first element of the returned pair).
    #Indexing is used rather than unpacking into `_`, because assigning to `_`
    #at the top level of a notebook shadows IPython's last-output variable.
    city = match_item(headline, city_dict)[0]
    country = match_item(headline, country_dict)[0]

    data.append({'Headline': headline, 'City': city, 'Country': country})

#Build the frame once from the full list of records
df = pd.DataFrame(data)

In [10]:
#Show the full headline text instead of truncating long cells.
#None means "no limit"; a negative value such as -1 is deprecated since
#pandas 1.0 and is rejected by newer pandas versions.
pd.set_option('display.max_colwidth', None)
df.sample(10)

Unnamed: 0,Headline,City,Country
630,Chikungunya has not Left Pismo Beach,,
637,Dengue Outbreak in Easton,Easton,
546,Rumors about Rotavirus Spreading in Joliet have been Refuted,Joliet,
144,The Spread of Measles in Spokane has been Confirmed,Spokane,
111,Authorities are Worried about the Spread of Influenza in Savannah,Savannah,
507,Authorities are Worried about the Spread of Chickenpox in Hemet,Hemet,
181,Sick Livestock Leads to Serious Trouble for Belfort,Belfort,
178,New medicine wipes out Measles in Fresno,Fresno,
405,Iowa City Encounters Severe Symptoms of Rhinovirus,Iowa City,
264,Tuberculosis Outbreak in Hartford,Hartford,
