In [1]:
import re
from typing import List

import geonamescache
import pandas as pd
from unidecode import unidecode as to_ascii

pd.options.display.max_colwidth = 120
pd.options.display.width = 120

### Read the headlines from data/headlines.txt into a pandas dataframe.  Use unidecode.unidecode to transliterate the headlines to ASCII

In [2]:
def get_headlines() -> pd.DataFrame:
    df = pd.read_table('data/headlines.txt', header=None, names=['headline'])
    df['headline'] = df['headline'].apply(to_ascii)
    return df


#### Get the list of cities and countries from geonamescache.  We want to match the longest possible name so sort the lists in reverse order by length.  Use unidecode.unidecode to transliterate the names to ASCII.  We will use the pandsa datafame str method extractall to extract the country and city names from the headlines so use a re pattern like \b(City1|City2|City3....)\b  Some names contain parentheses which re interprets as groups so replace them with '.'s  Use Series.drop_duplicates to remove duplicated city names

In [3]:
def get_cities_re() -> str:
    gc = geonamescache.GeonamesCache()
    cities_list = [re.sub(r'[()]', '.', to_ascii(city['name'])) 
                   for city in gc.get_cities().values()]
    cities_list.sort(key=len, reverse=True)
    cities_list = pd.Series(cities_list).drop_duplicates()
    cities_re = r'\b(' + r'|'.join(cities_list) + r')\b'
    return cities_re


def test_cities_re() -> None:
    cities_re = get_cities_re()
    teststr = 'Zika infects Zurich (Kreis 10) / Hongg in Flint, Michigan Boynton Beach '\
              'Pune experts Philadelphia track Hou Houston, TX? Recife, Brazil ,Ho.pandemic'
    assert re.findall(cities_re, teststr) == \
           ['Zurich (Kreis 10) / Hongg', 'Flint', 'Boynton Beach', 'Pune',
            'Philadelphia', 'Houston', 'Recife', 'Ho']


test_cities_re()

#### Countries are a little easier.  No parentheses or duplicates

In [4]:
def get_countries_re() -> str:
    gc = geonamescache.GeonamesCache()
    countries_list = [re.sub(r'[()]', '.', to_ascii(country["name"]))
                      for country in gc.get_countries().values()]
    countries_list.sort(key=len, reverse=True)
    countries_re = r'\b(' + r'|'.join(countries_list) + r')\b'
    return countries_re


def test_countries_re() -> None:
    countries_re = get_countries_re()
    teststr = "Spreads in Recife, Brazil Reach New York City  in Vietnam's Ho Chi Minh City surge"\
              "wile Bonaire, Saint Eustatius and Saba expect Sahara in U.S. Virgin Islands"
    assert re.findall(countries_re, teststr) == \
           ['Brazil', 'Vietnam', 'Bonaire, Saint Eustatius and Saba ', 'U.S. Virgin Islands']


test_countries_re()

#### Some utilities

In [5]:
def find_cities(pattern: str) -> List[str]:
    """
    return a list of cities with names containing the
    string pattern
    """
    gc = geonamescache.GeonamesCache()
    cities_list = set([re.sub(r'[()]', '.', to_ascii(city['name']))
                       for city in gc.get_cities().values()])
    return [c for c in cities_list if c.find(pattern) >= 0]


def find_countries(pattern: str) -> List[str]:
    """
    return a list of countries with names containing the
    string pattern
    """
    gc = geonamescache.GeonamesCache()
    countries_list = [re.sub(r'[()]', '.', to_ascii(country["name"]))
                      for country in gc.get_countries().values()]

    return [c for c in countries_list if c.find(pattern) >= 0][-9:]



#### extractall returns a dataframe with a multiindex where level 1 is incremented for each match.  See how many headlines have more than one city or country matched

In [6]:
def find_multimatch_headlines(headline_df: pd.DataFrame, colname: str='headline') -> None:
    """
    print headlines with multiple city and country matches
    """
    cities_re = get_cities_re()
    cities = headline_df[colname].str.extractall(cities_re)
    matches = cities.index.get_level_values("match").unique()
    if len(matches) == 1:
        print('No headlines have multiple cities')
    else:
        print('Headlines with multiple city matches')
        ndx = cities.loc[(slice(None), slice(1,None)), :].index
        ndx = ndx.get_level_values(0)
        print(cities.loc[ndx].reset_index(level=1, drop=True)\
                    .join(headline_df.loc[ndx]))
    countrues_re = get_countries_re()
    countries = headline_df[colname].str.extractall(countrues_re)
    matches = countries.index.get_level_values("match").unique()
    if len(matches) == 1:
        print('No headlines have multiple countries')
    else:
        print('Headlines with multiple country matches')
        ndx = countries.loc[(slice(None), slice(1,None)), :].index
        ndx = ndx.get_level_values(0)
        print(countries.loc[ndx].reset_index(level=1, drop=True)\
                       .join(headline_df.loc[ndx]))

find_multimatch_headlines(get_headlines())

Headlines with multiple city matches
                   0                                        headline
21             Tampa           Tampa Bay Area Zika Case Count Climbs
21               Bay           Tampa Bay Area Zika Case Count Climbs
33      Kuala Lumpur              Kuala Lumpur is Hit By Zika Threat
33               Hit              Kuala Lumpur is Hit By Zika Threat
98          Seminole               Zika Patient in Seminole, Florida
98           Florida               Zika Patient in Seminole, Florida
101              Can            Can Zika make it here to Vero Beach?
101       Vero Beach            Can Zika make it here to Vero Beach?
120           Spring  Spring break ruined by Zika in Fort Lauderdale
120  Fort Lauderdale  Spring break ruined by Zika in Fort Lauderdale
134         Batangas   Batangas Tourism Takes a Hit as Virus Spreads
134              Hit   Batangas Tourism Takes a Hit as Virus Spreads
159          Belmont                  Zika in Belmont, Belmont wor

#### So headlines containing Tampa Bay, Hit, Florida, Can, Spring or Belmont, Belmont have incorrect city matches.  Create a modified headlines column for this dataset that modifies these headlines to eliminate the problem strings

In [7]:
def mod_headlines(df: pd.DataFrame, badstr: str, goodstr: str) -> pd.DataFrame:
    """
    replace the string badstr with the string goodstr in column mod_headline
    entries containing goodstr 
    """
    ndx = df.loc[df['headline'].str.contains(badstr)].index
    df.loc[ndx, 'mod_headline'] = df.loc[ndx, 'mod_headline'].str.replace(badstr, goodstr, regex=False)
    return df


def get_cities(df: pd.DataFrame) -> pd.DataFrame:
    """
    return a DataFrame with a countries column containing countries extracted 
    from the column named headline.  
    """
    # add a mod_headline column that will contain headlines without the problematic
    # city strings found above
    df['mod_headline'] = df['headline']
    # should Tampa Bay Area return Tampa for the city or NaN?  Will use
    # NaN since Tampa Bay Area is not very specific
    df = mod_headlines(df=df, badstr='Tampa Bay', goodstr='TampaBay')
    df = mod_headlines(df=df, badstr=', Florida', goodstr=', FL')
    df = mod_headlines(df=df, badstr=' Hit', goodstr='Hit')
    df = mod_headlines(df=df, badstr='Can ', goodstr='Can')
    df = mod_headlines(df=df, badstr='Spring break', goodstr='Spr Brk')
    df = mod_headlines(df=df, badstr='Belmont, Belmont', goodstr='Belmont')
    cities_re = get_cities_re()
    find_multimatch_headlines(headline_df=df, colname='mod_headline')
    cities = df['mod_headline'].str.extractall(cities_re)
    return cities.reset_index(level=1, drop=True)


def get_countries(df: pd.DataFrame) -> pd.DataFrame:
    """
    return a DataFrame with a cites column containing cities extracted 
    from the column named headline.  
    """
    countries_re = get_countries_re()
    countries = df['headline'].str.extractall(countries_re)#, flags=re.IGNORECASE)
    return countries.reset_index(level=1, drop=True)



In [8]:
headline_df = get_headlines()
headline_df['countries'] = get_countries(df=headline_df)
headline_df['cities'] = get_cities(df=headline_df.copy())

No headlines have multiple cities
No headlines have multiple countries


#### Looks like the country names are correct for those headlines with country names.  

In [9]:
headline_df[headline_df['countries'].notnull()]

Unnamed: 0,headline,countries,cities
3,"Mystery Virus Spreads in Recife, Brazil",Brazil,Recife
25,Zika cases in Vietnam's Ho Chi Minh City surge,Vietnam,Ho Chi Minh City
30,Thailand-Zika Virus in Bangkok,Thailand,Bangkok
44,"Zika outbreak in Piracicaba, Brazil",Brazil,Piracicaba
58,"Zika surfaces in Klang, Malaysia",Malaysia,Klang
59,Rumors about Meningitis spreading in Guatemala City have been refuted,Guatemala,Guatemala City
77,Belize City under threat from Zika,Belize,Belize City
78,"Student sick in Campinas, Brazil",Brazil,Campinas
83,Zika outbreak spreads to Mexico City,Mexico,Mexico City
124,"New Zika Case in Kota Kinabalu, Malaysia",Malaysia,Kota Kinabalu


#### Check a few random entries

In [10]:
headline_df.sample(20)

Unnamed: 0,headline,countries,cities
294,Zika case reported in Abilene,,Abilene
121,Villavicencio under Zika threat,,Villavicencio
119,Santa Barbara tests new cure for Hepatitis C,,Santa Barbara
266,Respiratory Syncytial Virus Vaccine is now Required in Tulsa,,Tulsa
532,Zika Troubles come to Jaen,,Jaen
516,Outbreak of Zika in Hutchins,,
65,Jacksonville man hit by Zika,,Jacksonville
253,Will the Mad Cow Outbreak Reach Vienna?,,Vienna
604,More Patients in Maynard are Getting Diagnosed with Syphilis,,
549,Zika Reported in Ciudad Acuna,,Ciudad Acuna


#### Look at the headlines without cities

In [11]:
missing_city_df = headline_df.loc[headline_df['cities'].isnull()]
print(f'{missing_city_df.shape[0]} of {headline_df.shape[0]} headlines are missing a city.')
print(missing_city_df)

44 of 650 headlines are missing a city.
                                                                              headline countries cities
17                                                       Louisiana Zika cases up to 26       NaN    NaN
19                                                 Zika infects pregnant woman in Cebu       NaN    NaN
21                                               Tampa Bay Area Zika Case Count Climbs       NaN    NaN
48                                                      Spanish Flu Sighted in Antigua       NaN    NaN
63                        Carnival under threat in Rio De Janeiro due to Zika outbreak       NaN    NaN
73                                                          Zika case reported in Oton       NaN    NaN
76                       Hillsborough uses innovative trap against Zika 20 minutes ago       NaN    NaN
88                                            Maka City Experiences Influenza Outbreak       NaN    NaN
139                     

#### Chceck geonamescache to see if it contains entries similar to some of the cities

In [12]:
missing_cities = ['Antigua', 'Antioquia', 'Bouc', 'Cebu', 'Coam', 'Dang', 'Janeiro', 'Maka', 
                  'lean', 'Saint J', 'Huron', 'Fres', 'Moreh', 'Pism']
[find_cities(cityname) for cityname in missing_cities]

[['Antigua Guatemala'],
 [],
 ['Port-de-Bouc', 'Boucherville'],
 ['Cebu City'],
 [],
 ['Danghara'],
 ['Rio de Janeiro'],
 ['Makati City',
  'Tha Maka',
  'Makabe',
  'Makassar',
  'Makamba',
  'Makakilo',
  'Makakilo City'],
 ['An Muileann gCearr', 'Orleans', 'Tecpan de Galeana', 'New Orleans'],
 ['Saint Joseph', "Saint John's", 'Saint John'],
 ['Port Huron'],
 ['Fresh Meadows', 'Fresnillo', 'Fresnes', 'Fresno'],
 [],
 []]

#### Some city names are missing because the names are not in geonamescache, some are spelled differently or missing parts of the name, and some have different case than the headlines.  Using re.IGNORECASE found lots of cities named Of and other short names so that's not a quick fix.

TO DO: 
  * Manually fix more city names in the headlines before extracting city names
  * Fill in the country for the city names in geonamescache where possible