In [1]:
import pathlib
import pandas as pd
import geonamescache
import re
import unidecode

In [2]:
gc = geonamescache.GeonamesCache()
countries = gc.get_countries()
cities = gc.get_cities()

In [3]:
headlines_file = pathlib.Path().joinpath('data', 'headlines.txt')
with open(headlines_file, mode='r') as headlines:
    all_headlines = [unidecode.unidecode(next_headline.strip()) for next_headline in headlines]
#     all_headlines = [next_headline.strip() for next_headline in headlines]
my_headlines_df = pd.DataFrame(all_headlines, columns=['Headline'])
my_headlines_df.head()

Unnamed: 0,Headline
0,Zika Outbreak Hits Miami
1,Could Zika Reach New York City?
2,First Case of Zika in Miami Beach
3,"Mystery Virus Spreads in Recife, Brazil"
4,Dallas man comes down with case of Zika


In [4]:
def create_regex_from_list(a_list):
    # ensure list is sorted by length of item in reverse order - so biggest are first . . .
    all_items_string = r"\b|\b".join(sorted([unidecode.unidecode(v['name']) for v in a_list.values()], key= lambda x: len(x), reverse=True))
#     all_items_string = r"\b|\b".join(sorted([v['name'] for v in a_list.values()], key= lambda x: len(x), reverse=True))
    return all_items_string


In [5]:
def get_country_codes_for_city(city):
    return [list(c.values())[0]['countrycode'] for c in gc.get_cities_by_name(city)]

In [6]:
def get_state_codes_for_city(city):
    return [list(c.values())[0]['admin1code'] for c in gc.get_cities_by_name(city)]    

In [7]:
country_search_rec = create_regex_from_list(countries)
city_search_rec = create_regex_from_list(cities)

In [8]:
def find_city_state_country_in_headline(a_headline):
    my_country = None
    my_country_codes = []
    my_city = None
    my_state_codes = []
    my_latitude = None
    my_longitude = None
#     my_country_match = country_search_rec.search(a_headline)
    my_country_match = re.search(country_search_rec, a_headline)
    if my_country_match:
        my_country = my_country_match.groups(0)
        
#     my_city_match = city_search_rec.search(a_headline)
    my_city_match = re.search(city_search_rec, a_headline)
    if my_city_match:
        my_city = my_city_match.group(0)
        my_country_codes = get_country_codes_for_city(my_city)
        
        my_city_by_name = gc.get_cities_by_name(my_city)
        if my_city_by_name:
            my_city_object = list(my_city_by_name[0].values())[0]        
            if my_country is None or my_country ==():
                my_country = countries[my_city_object['countrycode']]['name']
            my_latitude = my_city_object['latitude']
            my_longitude = my_city_object['longitude']
        my_state_codes = get_state_codes_for_city(my_city)
    return (my_city, my_state_codes, my_country, my_country_codes, my_latitude, my_longitude)

In [9]:
my_headlines_df['Tuples'] = my_headlines_df['Headline'].apply(lambda h: find_city_state_country_in_headline(h))

In [10]:
my_headlines_df['City'] = my_headlines_df['Tuples'].apply(lambda x: x[0])

In [11]:
my_headlines_df['State Codes'] = my_headlines_df['Tuples'].apply(lambda x: x[1])

In [12]:
my_headlines_df['Country'] = my_headlines_df['Tuples'].apply(lambda x: x[2])

In [13]:
my_headlines_df['Country Codes'] = my_headlines_df['Tuples'].apply(lambda x: x[3])

In [14]:
my_headlines_df['Lattitude'] = my_headlines_df['Tuples'].apply(lambda x: x[4])

In [15]:
my_headlines_df['Longitude'] = my_headlines_df['Tuples'].apply(lambda x: x[5])

In [16]:
del my_headlines_df['Tuples']

In [17]:
my_headlines_df.head()

Unnamed: 0,Headline,City,State Codes,Country,Country Codes,Lattitude,Longitude
0,Zika Outbreak Hits Miami,Miami,[FL],United States,[US],25.77427,-80.19366
1,Could Zika Reach New York City?,New York City,[NY],United States,[US],40.71427,-74.00597
2,First Case of Zika in Miami Beach,Miami Beach,[FL],United States,[US],25.79065,-80.13005
3,"Mystery Virus Spreads in Recife, Brazil",Recife,[30],Brazil,[BR],-8.05389,-34.88111
4,Dallas man comes down with case of Zika,Dallas,"[TX, OR]",United States,"[US, US]",32.78306,-96.80667


In [18]:
my_headlines_df['Num Countries'] = my_headlines_df['Country Codes'].apply(lambda x: len(x))

In [19]:
my_headlines_df[my_headlines_df['Num Countries'] ==0]

Unnamed: 0,Headline,City,State Codes,Country,Country Codes,Lattitude,Longitude,Num Countries
7,Geneve Scientists Battle to Find Cure,Geneve,[],,[],,,0
9,Zika Infested Monkeys in Sao Paulo,Sao Paulo,[],,[],,,0
17,Louisiana Zika cases up to 26,,[],,[],,,0
19,Zika infects pregnant woman in Cebu,,[],,[],,,0
47,18 new Zika Cases in Bogota,Bogota,[],,[],,,0
48,Spanish Flu Sighted in Antigua,,[],,[],,,0
63,Carnival under threat in Rio De Janeiro due to...,,[],,[],,,0
64,Second Zika Paitient in Brasilia,Brasilia,[],,[],,,0
73,Zika case reported in Oton,,[],,[],,,0
76,Hillsborough uses innovative trap against Zika...,,[],,[],,,0


In [21]:
my_headlines_df[my_headlines_df['City'] == 'San Diego']

Unnamed: 0,Headline,City,State Codes,Country,Country Codes,Lattitude,Longitude,Num Countries
29,Key Zika Findings in San Diego Institute,San Diego,"[02, CA]",Costa Rica,"[CR, US]",9.89898,-84.00287,2


In [24]:
my_headlines_df.head(10)

Unnamed: 0,Headline,City,State Codes,Country,Country Codes,Lattitude,Longitude,Num Countries,Unaccented Headline
0,Zika Outbreak Hits Miami,Miami,[FL],United States,[US],25.77427,-80.19366,1,Zika Outbreak Hits Miami
1,Could Zika Reach New York City?,New York City,[NY],United States,[US],40.71427,-74.00597,1,Could Zika Reach New York City?
2,First Case of Zika in Miami Beach,Miami Beach,[FL],United States,[US],25.79065,-80.13005,1,First Case of Zika in Miami Beach
3,"Mystery Virus Spreads in Recife, Brazil",Recife,[30],Brazil,[BR],-8.05389,-34.88111,1,"Mystery Virus Spreads in Recife, Brazil"
4,Dallas man comes down with case of Zika,Dallas,"[TX, OR]",United States,"[US, US]",32.78306,-96.80667,2,Dallas man comes down with case of Zika
5,Trinidad confirms first Zika case,Trinidad,"[03, 14, 06]",Bolivia,"[BO, CU, UY]",-14.83333,-64.9,3,Trinidad confirms first Zika case
6,Zika Concerns are Spreading in Houston,Houston,[TX],United States,[US],29.76328,-95.36327,1,Zika Concerns are Spreading in Houston
7,Geneve Scientists Battle to Find Cure,Geneve,[],,[],,,0,Geneve Scientists Battle to Find Cure
8,The CDC in Atlanta is Growing Worried,Atlanta,[GA],United States,[US],33.749,-84.38798,1,The CDC in Atlanta is Growing Worried
9,Zika Infested Monkeys in Sao Paulo,Sao Paulo,[],,[],,,0,Zika Infested Monkeys in Sao Paulo


In [25]:
my_headlines_df.tail(5)

Unnamed: 0,Headline,City,State Codes,Country,Country Codes,Lattitude,Longitude,Num Countries,Unaccented Headline
645,Rumors about Rabies spreading in Jerusalem hav...,Jerusalem,[06],Israel,[IL],31.76904,35.21633,1,Rumors about Rabies spreading in Jerusalem hav...
646,More Zika patients reported in Indang,Indang,[40],Philippines,[PH],14.19528,120.87694,1,More Zika patients reported in Indang
647,Suva authorities confirmed the spread of Rotav...,Suva,[01],Fiji,[FJ],-18.14161,178.44149,1,Suva authorities confirmed the spread of Rotav...
648,More Zika patients reported in Bella Vista,Bella Vista,"[24, 34, AR]",Argentina,"[AR, DO, US]",-27.03424,-65.30196,3,More Zika patients reported in Bella Vista
649,Zika Outbreak in Wichita Falls,Wichita Falls,[TX],United States,[US],33.91371,-98.49339,1,Zika Outbreak in Wichita Falls
