In [1]:
import geonamescache
import numpy as np
import os
import pandas as pd

In [2]:
# Relative directory that the input CSV is read from and the output CSV is written to.
DATA_DIR = "data"

In [3]:
# Shared geonamescache handle used below for the country-name and city-name lookups.
# NOTE(review): the name `gc` would shadow Python's stdlib `gc` module if that were ever imported here.
gc = geonamescache.GeonamesCache()

In [4]:
# Load the headlines together with the city and country already extracted for each one.
headlines_df = pd.read_csv(
    os.path.join(DATA_DIR, "headlines-with-city-country.csv")
)
headlines_df.head()

Unnamed: 0,headline,city,country
0,Zika Outbreak Hits Miami,Miami,United States
1,Could Zika Reach New York City?,New York City,United States
2,First Case of Zika in Miami Beach,Miami Beach,United States
3,"Mystery Virus Spreads in Recife, Brazil",Recife,Brazil
4,Dallas man comes down with case of Zika,Dallas,United States


In [5]:
def isNaN(x):
    """Tell whether ``x`` is NaN by checking that it compares unequal to itself.

    Strings, ``None`` and ordinary numbers compare equal to themselves, so
    they yield ``False``; a float NaN yields ``True``.
    """
    differs_from_itself = (x != x)
    return differs_from_itself

# Country records indexed by country name; get_country_code reads each record's "iso" field.
countries_by_name = gc.get_countries_by_names()
def get_country_code(country_name):
    """Return the ISO country code for ``country_name``.

    Args:
        country_name: country name as stored in the ``country`` column; may
            be NaN when no country was extracted for a headline.

    Returns:
        The ``"iso"`` value of the matching ``countries_by_name`` record, or
        ``None`` when ``country_name`` is NaN or is not a known country name.
    """
    if isNaN(country_name):
        return None
    # Unknown names yield None instead of raising KeyError, so a single
    # unrecognised country cannot abort a whole-column lookup. get_latlon
    # already treats a None country code as "no location".
    record = countries_by_name.get(country_name)
    return None if record is None else record["iso"]

# Spot-check the lookup on a few known country names. To audit every row
# instead, run get_country_code over headlines_df["country"].
print(
    *(get_country_code(name) for name in ("United States", "India", "Brazil")),
    sep="\n",
)

US
IN
BR


In [6]:
# Derive the ISO code from the country name. Series.map applies the lookup to
# one column directly, avoiding the per-row Series that apply(axis=1) builds;
# NaN countries are still passed to get_country_code and come back as None.
headlines_df["countrycode"] = headlines_df["country"].map(get_country_code)
headlines_df.head()

Unnamed: 0,headline,city,country,countrycode
0,Zika Outbreak Hits Miami,Miami,United States,US
1,Could Zika Reach New York City?,New York City,United States,US
2,First Case of Zika in Miami Beach,Miami Beach,United States,US
3,"Mystery Virus Spreads in Recife, Brazil",Recife,Brazil,BR
4,Dallas man comes down with case of Zika,Dallas,United States,US


In [7]:
def get_latlon(city_name, country_code):
    """Look up the coordinates of a city within a given country.

    Args:
        city_name: city name to look up; may be NaN when no city was
            extracted for a headline.
        country_code: country code the city must belong to; ``None`` (or NaN)
            when the country is unknown.

    Returns:
        A ``(countrycode, latitude, longitude)`` tuple taken from the first
        record returned for ``city_name`` whose ``"countrycode"`` equals
        ``country_code``. When either input is missing or no record matches,
        returns ``(None, nan, nan)``.

    Note:
        If several cities in the same country share the name, the first one
        in lookup order wins; no further disambiguation is attempted.
    """
    not_found = (None, np.nan, np.nan)
    if isNaN(city_name) or country_code is None or isNaN(country_code):
        return not_found
    # get_cities_by_name yields a list of mappings whose values are the city
    # records; stop at the first record in the requested country.
    for entry in gc.get_cities_by_name(city_name):
        for record in entry.values():
            if record["countrycode"] == country_code:
                return (record["countrycode"], record["latitude"], record["longitude"])
    return not_found
    
    
# Spot-check the coordinate lookup on a few known (city, country code) pairs.
print(
    *(
        get_latlon(city, code)
        for city, code in (("Dallas", "US"), ("San Diego", "US"), ("Recife", "BR"))
    ),
    sep="\n",
)

('US', 32.78306, -96.80667)
('US', 32.71571, -117.16472)
('BR', -8.05389, -34.88111)


In [8]:
# Resolve each (city, country code) pair once. get_latlon returns a
# (countrycode, latitude, longitude) tuple, with NaN coordinates when the
# city cannot be matched, so both columns come from a single lookup per row.
latlon_matches = [
    get_latlon(city, code)
    for city, code in zip(headlines_df["city"], headlines_df["countrycode"])
]
headlines_df["latitude"] = [lat for _, lat, _ in latlon_matches]
headlines_df["longitude"] = [lon for _, _, lon in latlon_matches]

# Drop the now-redundant country name, then every row with a missing value
# (including rows whose coordinates could not be resolved). Reassigning and
# tolerating an already-dropped "country" column keeps this cell re-runnable.
headlines_df = headlines_df.drop(columns=["country"], errors="ignore").dropna(axis=0)

headlines_df.head()

Unnamed: 0,headline,city,countrycode,latitude,longitude
0,Zika Outbreak Hits Miami,Miami,US,25.77427,-80.19366
1,Could Zika Reach New York City?,New York City,US,40.71427,-74.00597
2,First Case of Zika in Miami Beach,Miami Beach,US,25.79065,-80.13005
3,"Mystery Virus Spreads in Recife, Brazil",Recife,BR,-8.05389,-34.88111
4,Dallas man comes down with case of Zika,Dallas,US,32.78306,-96.80667


In [9]:
# Persist the geocoded headlines without writing the DataFrame index.
headlines_df.to_csv(
    os.path.join(DATA_DIR, "headlines-with-latlon.csv"),
    index=False,
)

In [10]:
# Number of headlines that remain after dropping rows with missing values.
headlines_df.shape[0]

600