In [1]:
import pathlib
import pandas as pd
import geonamescache
import re

In [2]:
gc = geonamescache.GeonamesCache()
countries = gc.get_countries()
cities = gc.get_cities()

In [3]:
headlines_file = pathlib.Path().joinpath('data', 'headlines.txt')
with open(headlines_file, mode='r') as headlines:
    all_headlines = [next_headline.strip() for next_headline in headlines]
my_headlines_df = pd.DataFrame(all_headlines, columns=['Headline'])
my_headlines_df.head()

Unnamed: 0,Headline
0,Zika Outbreak Hits Miami
1,Could Zika Reach New York City?
2,First Case of Zika in Miami Beach
3,"Mystery Virus Spreads in Recife, Brazil"
4,Dallas man comes down with case of Zika


In [4]:
def create_regex_from_list(a_list):
    all_items_string = '|'.join([v['name'] for k, v in a_list.items()])
    return re.compile('\\b(' + all_items_string + ')\\b', re.IGNORECASE)


In [5]:
def get_country_codes_for_city(city):
    return [list(c.values())[0]['countrycode'] for c in gc.get_cities_by_name(city)]

In [6]:
def get_state_codes_for_city(city):
    return [list(c.values())[0]['admin1code'] for c in gc.get_cities_by_name(city)]    

In [7]:
country_search_rec = create_regex_from_list(countries)
city_search_rec = create_regex_from_list(cities)

In [8]:
def find_city_state_country_in_headline(a_headline):
    my_country = None
    my_country_codes = []
    my_city = None
    my_state_codes = []
    
    my_country_match = country_search_rec.search(a_headline)
    if my_country_match:
        my_country = my_country_match.groups()[0]
        
    my_city_match = city_search_rec.search(a_headline)
    if my_city_match:
        my_city = my_city_match.groups()[0]
        my_country_codes = get_country_codes_for_city(my_city)
        my_state_codes = get_state_codes_for_city(my_city)
    return (my_city, my_state_codes, my_country, my_country_codes)

In [9]:
my_headlines_df['Tuples'] = my_headlines_df['Headline'].apply(lambda h: find_city_state_country_in_headline(h))

In [10]:
my_headlines_df['City'] = my_headlines_df['Tuples'].apply(lambda x: x[0])

In [11]:
my_headlines_df['State Codes'] = my_headlines_df['Tuples'].apply(lambda x: x[1])

In [12]:
my_headlines_df['Country'] = my_headlines_df['Tuples'].apply(lambda x: x[2])

In [13]:
my_headlines_df['Country Codes'] = my_headlines_df['Tuples'].apply(lambda x: x[3])

In [14]:
del my_headlines_df['Tuples']

In [15]:
my_headlines_df.head()

Unnamed: 0,Headline,City,State Codes,Country,Country Codes
0,Zika Outbreak Hits Miami,Miami,[FL],,[US]
1,Could Zika Reach New York City?,New York City,[NY],,[US]
2,First Case of Zika in Miami Beach,of,[],,[]
3,"Mystery Virus Spreads in Recife, Brazil",Recife,[30],Brazil,[BR]
4,Dallas man comes down with case of Zika,Dallas,"[TX, OR]",,"[US, US]"
