In [1]:
import geonamescache
import os
import pandas as pd
import re
import string
import unidecode

In [2]:
# Directory (relative to the working directory) that holds the input
# headlines file and receives the output CSV.
DATA_DIR = "data"

In [3]:
# Load the geonamescache lookup tables that all matching below is built from.
gc = geonamescache.GeonamesCache()

# Each table is a mapping whose values are records with a "name" field;
# city records are additionally read for "countrycode" further down.
countries = gc.get_countries()
us_states = gc.get_us_states()
cities = gc.get_cities()

# Display how many entries each table holds.
len(countries), len(us_states), len(cities)

(252, 51, 24336)

In [4]:
# Map each unidecode-normalised city name to the list of (normalised) country
# names it occurs in. A name shared by several city records gets one list
# entry per record, in the iteration order of `cities`.
city2country = {}
for city_record in cities.values():
    ascii_city = unidecode.unidecode(city_record["name"])
    ascii_country = unidecode.unidecode(countries[city_record["countrycode"]]["name"])
    city2country.setdefault(ascii_city, []).append(ascii_country)

len(city2country)

23022

In [5]:
def convert_to_list(geoname_dict):
    """Return the ``"name"`` of every record in ``geoname_dict`` as a list.

    Each name is passed through ``unidecode.unidecode``. Names are returned in
    the dict's iteration order, so positions are stable for a given dict.
    """
    return [unidecode.unidecode(record["name"]) for record in geoname_dict.values()]
    
# Flatten each lookup table into a list of names. The position of a name in
# these lists is what the pattern lists built later are aligned against.
countries_list = convert_to_list(countries)
us_states_list = convert_to_list(us_states)
cities_list = convert_to_list(cities)

# Display the list sizes.
len(countries_list), len(us_states_list), len(cities_list)

(252, 51, 24336)

In [6]:
def pad_punctuation(s):
    """Isolate punctuation with spaces and pad the string on both sides.

    Every character in ``string.punctuation`` is surrounded by spaces, runs of
    whitespace are then collapsed to a single space, and finally one space is
    added at each end. Text and names normalised this way can be matched as
    whole, space-delimited tokens.

    Args:
        s: text to normalise.

    Returns:
        The normalised string, e.g. ``"Recife, Brazil"`` becomes
        ``" Recife , Brazil "``.
    """
    # re.escape keeps the character class well-formed. Without it the "\]"
    # inside string.punctuation is read as an escaped "]", which silently
    # drops the backslash itself from the set of padded characters.
    s = re.sub("([{:s}])".format(re.escape(string.punctuation)), r" \1 ", s)
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    s = re.sub(r"\s+", " ", s)
    return " " + s + " "

def create_regex(s):
    """Compile a case-insensitive pattern matching ``s`` as literal text.

    ``s`` is first normalised with ``pad_punctuation`` so the pattern lines up
    with text that was normalised the same way.

    Args:
        s: literal text (for example a place name) to look for.

    Returns:
        A compiled pattern using ``re.IGNORECASE``.
    """
    s = pad_punctuation(s)
    # The name is literal text, not a regular expression: escape it so that
    # characters such as ".", "(", ")" or "+" match themselves instead of
    # acting as metacharacters (or making the pattern fail to compile).
    return re.compile(re.escape(s), flags=re.IGNORECASE)
        

# One compiled pattern per name. Pattern i in each list corresponds to name i
# in the matching *_list, which is how a hit is mapped back to a name.
country_patterns = [create_regex(x) for x in countries_list]
city_patterns = [create_regex(x) for x in cities_list]
us_state_patterns = [create_regex(x) for x in us_states_list]

In [7]:
def build_city_prefix_patterns(cities_list):
    """Build patterns for the leading word-runs of every city name.

    Parentheses are removed from a name before it is split on whitespace. A
    name of ``n`` words contributes patterns for its first 1, 2, ..., ``n - 1``
    words, so single-word names contribute nothing.

    Returns:
        A list of ``(pattern, index)`` tuples, where ``index`` is the position
        of the originating name in ``cities_list``.
    """
    prefixes = []
    for city_index, raw_name in enumerate(cities_list):
        words = raw_name.replace("(", "").replace(")", "").split()
        prefixes.extend(
            (create_regex(" ".join(words[:width])), city_index)
            for width in range(1, len(words))
        )
    return prefixes

def build_city_suffix_patterns(cities_list):
    """Build one pattern per city name from the last word of that name.

    Parentheses are removed from a name before it is split on whitespace.
    Names that have no words left after that are skipped.

    Args:
        cities_list: city names.

    Returns:
        A list of ``(pattern, index)`` tuples, where ``index`` is the position
        of the originating name in ``cities_list``.
    """
    suffixes = []
    for i, city in enumerate(cities_list):
        city_words = city.replace("(", "").replace(")", "").split()
        if not city_words:
            # Empty, whitespace-only or parentheses-only name: there is no
            # last word, and indexing [-1] would raise IndexError.
            continue
        suffixes.append((create_regex(city_words[-1]), i))
    return suffixes


# Fallback pattern tables; find_city reads these two names as module-level
# globals rather than receiving them as arguments.
city_prefix_patterns = build_city_prefix_patterns(cities_list)
city_suffix_patterns = build_city_suffix_patterns(cities_list)

In [8]:
def apply_transform(city_name):
    """Rewrite a common spelling variant in ``city_name``, or return ``None``.

    The first rule whose condition holds is applied; its replacement affects
    every occurrence of the substring, not only the one that was tested:

    * starts with ``"Saint "`` -> ``"Saint "`` becomes ``"St . "``
    * ends with ``"borough"``  -> ``"borough"`` becomes ``"boro"``
    * ends with ``"ucau"``     -> ``"ucau"`` becomes ``"caue"``

    Returns ``None`` when no rule applies.
    """
    rules = (
        (city_name.startswith("Saint "), "Saint ", "St . "),
        (city_name.endswith("borough"), "borough", "boro"),
        (city_name.endswith("ucau"), "ucau", "caue"),
    )
    for applies, old, new in rules:
        if applies:
            return city_name.replace(old, new)
    return None

In [9]:
def find_country(s, country_patterns, countries_list):
    """Return the name of the first country whose pattern occurs in ``s``.

    ``country_patterns`` and ``countries_list`` are aligned by index: the
    position of the first matching pattern selects the name that is returned.
    Returns ``None`` when no pattern matches.
    """
    hit_index = next(
        (
            idx
            for idx, pattern in enumerate(country_patterns)
            if re.search(pattern, s) is not None
        ),
        None,
    )
    return None if hit_index is None else countries_list[hit_index]

def find_state(s, us_state_patterns, us_state_list):
    """Return the name of the first US state whose pattern occurs in ``s``.

    Args:
        s: text to search.
        us_state_patterns: patterns aligned by index with ``us_state_list``.
        us_state_list: state names; the entry at the index of the first
            matching pattern is returned.

    Returns:
        The matching state name, or ``None`` when no pattern matches.
    """
    for i, us_state_pattern in enumerate(us_state_patterns):
        if re.search(us_state_pattern, s) is not None:
            # Index the list supplied by the caller, not a module-level
            # global, so the result always corresponds to the given patterns.
            return us_state_list[i]
    return None

def _collect_matches(s, indexed_patterns, cities_list):
    """Return the names from ``cities_list`` whose paired pattern occurs in ``s``."""
    return [
        cities_list[k]
        for p, k in indexed_patterns
        if re.search(p, s) is not None
    ]


def find_city(s, city_patterns, cities_list):
    """Return the longest city name found in ``s``, or ``None``.

    Matching runs in stages; a stage is tried only when all earlier stages
    found nothing:

    1. the full-name patterns in ``city_patterns``;
    2. the same patterns against ``apply_transform(s)``, when that returns a
       string;
    3. the module-level ``city_prefix_patterns``;
    4. the module-level ``city_suffix_patterns``.

    Args:
        s: text to search.
        city_patterns: patterns aligned by index with ``cities_list``.
        cities_list: city names; also indexed by the prefix/suffix tables.

    Returns:
        The longest matched name (the earliest one wins on a tie), or
        ``None`` when no stage produced a match.
    """
    full_name_patterns = [(p, i) for i, p in enumerate(city_patterns)]
    matched_cities = _collect_matches(s, full_name_patterns, cities_list)
    if not matched_cities:
        # Nothing found: apply some common spelling transformations and
        # repeat the full-name matching.
        # NOTE(review): apply_transform tests the start/end of the whole
        # string, so text padded with leading/trailing spaces will not
        # trigger any rule - confirm what input it is meant to receive.
        s_trans = apply_transform(s)
        if s_trans is not None:
            # Search the transformed text; searching ``s`` again would only
            # repeat stage 1.
            matched_cities = _collect_matches(s_trans, full_name_patterns, cities_list)
    if not matched_cities:
        matched_cities = _collect_matches(s, city_prefix_patterns, cities_list)
    if not matched_cities:
        matched_cities = _collect_matches(s, city_suffix_patterns, cities_list)
    if not matched_cities:
        return None
    # max() returns the first of several equally long names, which is the
    # same element a stable descending sort would place first.
    return max(matched_cities, key=len)

def find_country_from_city(city, city2country):
    """Return the first country recorded for ``city``, or ``None``.

    Args:
        city: city name to look up.
        city2country: mapping from city name to a list of country names.

    Returns:
        The first entry of the list stored for ``city``; ``None`` when the
        city is not in the mapping or its list is missing/empty.
    """
    # .get() makes the "unknown city" case return None instead of raising
    # KeyError, which the None check was evidently meant to cover.
    countries = city2country.get(city)
    if not countries:
        return None
    return countries[0]

In [10]:
# Running tallies: headlines read, and how each one resolved.
i, num_missed_city, num_missed_country, num_missed_both, num_got_both = 0, 0, 0, 0, 0
# One [headline, city, country] row per input line.
data = []
# `with` guarantees the file is closed even if matching raises part-way through.
with open(os.path.join(DATA_DIR, "headlines.txt"), "r") as fhl:
    for line in fhl:
        if i % 100 == 0:
            print("{:d} headlines processed".format(i))
        line = unidecode.unidecode(line.strip())
        # Normalise the headline the same way the name patterns were normalised.
        line_m = pad_punctuation(line)
        matched_country = find_country(line_m, country_patterns, countries_list)
        matched_city = find_city(line_m, city_patterns, cities_list)
        if matched_city is not None and matched_country is None:
            # No country named in the headline: infer it from the city.
            matched_country = find_country_from_city(matched_city, city2country)
        if matched_country is not None and matched_city is None:
            num_missed_city += 1
        elif matched_city is not None and matched_country is None:
            num_missed_country += 1
        elif matched_city is None and matched_country is None:
            # Last resort: a US state name implies the country.
            matched_us_state = find_state(line_m, us_state_patterns, us_states_list)
            if matched_us_state is not None:
                matched_country = "United States"
            if matched_country is None:
                num_missed_both += 1
            else:
                num_missed_city += 1
        else:
            num_got_both += 1
        data.append([line, matched_city, matched_country])
        i += 1

print("{:d} headlines processed, COMPLETE".format(i))

print(i, num_got_both, num_missed_city, num_missed_country, num_missed_both)

0 headlines processed
100 headlines processed
200 headlines processed
300 headlines processed
400 headlines processed
500 headlines processed
600 headlines processed
650 headlines processed, COMPLETE
650 634 1 0 15


In [11]:
# Turn the collected rows into a DataFrame and represent unresolved cities /
# countries (None) as NaN.
# `pd.np` was deprecated in pandas 1.0 and removed in 2.0, so a plain float
# NaN is used instead; assigning the result avoids in-place mutation.
data_df = pd.DataFrame(data, columns=["headline", "city", "country"])
data_df = data_df.fillna(value=float("nan"))

len(data_df)

650

In [12]:
# Show the first few rows that have at least one missing value, i.e.
# headlines where a column could not be filled in.
data_df[data_df.isnull().any(axis=1)].head()

Unnamed: 0,headline,city,country
17,Louisiana Zika cases up to 26,,United States
73,Zika case reported in Oton,,
234,Malaria Exposure in Sussex,,
248,Greenwich Establishes Zika Task Force,,
308,More people in Boucau are infected with HIV ev...,,


In [13]:
# Preview the first rows of the result.
data_df.head()

Unnamed: 0,headline,city,country
0,Zika Outbreak Hits Miami,Miami,United States
1,Could Zika Reach New York City?,New York City,United States
2,First Case of Zika in Miami Beach,Miami Beach,United States
3,"Mystery Virus Spreads in Recife, Brazil",Recife,Brazil
4,Dallas man comes down with case of Zika,Dallas,United States


In [14]:
# Write the results to a CSV file inside DATA_DIR, omitting the DataFrame index.
data_df.to_csv(os.path.join(DATA_DIR, "headlines-with-city-country.csv"), index=False)