In [301]:
%matplotlib inline
import re
import geonamescache
import pandas as pd
import unidecode


In [302]:
# regexp = re.compile("Zika")
gc = geonamescache.GeonamesCache()
from geonamescache.mappers import country
country_name2capital_mapper = country(from_key='name', to_key='capital')
cities_by_name = gc.get_dataset_by_key(gc.get_cities(), 'name')
countries_by_name = gc.get_dataset_by_key(gc.get_countries(), 'name')
country_accent_mapping = {
    unidecode.unidecode(country): country for country in countries_by_name
}

city_accent_mapping = {
    unidecode.unidecode(city): city for city in cities_by_name
}

In [303]:
def get_city_by_name(name):
    city_lat = 0
    city_long = 0
    city_countrycode = ''
    city_name = ''
    if name in city_accent_mapping.keys():
        city = cities_by_name[city_accent_mapping[name]]
        city_lat = city['latitude'] 
        city_long = city['longitude'] 
        city_countrycode = city['countrycode']
        city_name = name
    else:
         city_name = 'NaN'   
    return [city_name, city_lat, city_long, city_countrycode]

In [304]:
def find_city_and_country_in_word(word, prev_word, preprev_word, line_city, line_lat, line_long, line_country, line_capital, line_capital_lat,line_capital_long, line_countrycode):
    
    [prev_line_city, prev_line_lat, prev_line_long, prev_line_countrycode] = [line_city, line_lat, line_long, line_countrycode]
    word3 = preprev_word+' '+prev_word+' '+word
    [line_city, line_lat, line_long, line_countrycode] = get_city_by_name(word3)
    if line_city == 'NaN':
        word2 = prev_word+' '+word
        [line_city, line_lat, line_long, line_countrycode] = get_city_by_name(word2)
        if line_city == 'NaN':
            [line_city, line_lat, line_long, line_countrycode] = get_city_by_name(word)
            if line_city == 'NaN':
                [line_city, line_lat, line_long, line_countrycode] = [prev_line_city, prev_line_lat, prev_line_long, prev_line_countrycode]
                # check if word is country    
                country_capital = country_name2capital_mapper(word)
                if country_capital:
                    line_country = word
                    line_capital = unidecode.unidecode(country_capital)
                    # if there was no city detected in line previously
                    if prev_line_city == '':
                        [line_city, line_lat, line_long, line_countrycode] = get_city_by_name(line_capital)
                        if line_city != 'NaN':
                            line_capital = line_city
                            line_capital_lat = line_lat
                            line_capital_long = line_long
                        else:
                            print(word +' CAPITAL = ' +country_capital+' not found in cities dataset')
    
    return [line_city, line_lat, line_long, line_country, line_capital, line_capital_lat,line_capital_long, line_countrycode]


In [305]:
def find_city_and_country_in_line(line, line_count):
    line_count = line_count + 1
    # init
    preprev_word = ''
    prev_word = ''
    line_city = ''
    line_country = ''
    line_capital = ''
    line_lat = 0
    line_long = 0
    line_capital_lat  = 0
    line_capital_long = 0
    line_countrycode = ''
    # find geo names in line
    line_words = re.split('[ |?|!|\t|\n|\'|,]+', line) #'\W+'#print(line_words)
    for word in line_words:
        word = unidecode.unidecode(word)
        [line_city, line_lat, line_long, line_country, line_capital, line_capital_lat, line_capital_long, line_countrycode] = find_city_and_country_in_word(word, prev_word, preprev_word, line_city, line_lat, line_long, line_country, line_capital, line_capital_lat, line_capital_long, line_countrycode)
        preprev_word = prev_word
        prev_word = word
        
    return [line_city, line_lat, line_long, line_country, line_capital, line_capital_lat,line_capital_long, line_countrycode, line_count]


In [314]:
count = 0
line_count = 0
city_count = 0
city2_count = 0
city3_count = 0
country_count = 0
line_none_count = 0
df_line = []

file = open("data/headlines.txt", 'r')
for line in file.readlines():
    [line_city, line_lat, line_long, line_country, line_capital, line_capital_lat, line_capital_long, line_countrycode, line_count] = find_city_and_country_in_line(line, line_count)        
    # update names list
    line_city_name = 'NaN'
    line_country_name = 'NaN'
    line_countrycode_name = 'NaN'
    if line_city:
        line_city_name = line_city
        line_countrycode_name = line_countrycode
        city_count = city_count+1
        
    if line_country:
        line_country_name = line_country
        country_count = country_count + 1
        
    if not line_city and not line_country:
        line_none_count = line_none_count + 1
        #print('Line by NONE: '+ line)
    
    df_line = df_line + [[line, line_city_name, line_lat, line_long, line_countrycode_name]]
    

df_file = pd.DataFrame(df_line, columns = ['headline', 'cities', 'latitude', 'longitude', 'countrycode'])

        
file.close()


In [315]:
print('total headlines in file: '+ str(line_count))
print('total cities found: '+ str(city_count))
print('total countries found: '+ str(country_count))
print('total headlines with no match: '+ str(line_none_count))



total headlines in file: 650
total cities found: 608
total countries found: 9
total headlines with no match: 42


In [316]:
df_file.shape


(650, 5)

In [317]:
df_file

Unnamed: 0,headline,cities,latitude,longitude,countrycode
0,Zika Outbreak Hits Miami\n,Miami,25.77427,-80.19366,US
1,Could Zika Reach New York City?\n,New York City,40.71427,-74.00597,US
2,First Case of Zika in Miami Beach\n,Miami Beach,25.79065,-80.13005,US
3,"Mystery Virus Spreads in Recife, Brazil\n",Recife,-8.05389,-34.88111,BR
4,Dallas man comes down with case of Zika\n,Dallas,44.91928,-123.31705,US
5,Trinidad confirms first Zika case\n,Trinidad,-33.51650,-56.89957,UY
6,Zika Concerns are Spreading in Houston\n,Houston,29.76328,-95.36327,US
7,Geneve Scientists Battle to Find Cure\n,Geneve,46.20222,6.14569,CH
8,The CDC in Atlanta is Growing Worried\n,Atlanta,33.74900,-84.38798,US
9,Zika Infested Monkeys in Sao Paulo\n,Sao Paulo,-23.54750,-46.63611,BR


In [169]:
word = 'Brazil'#'Istanbul'#'Louisiana'
gc.get_cities_by_name(word)

[]

In [170]:
cities_by_name[word]

KeyError: 'Brazil'

In [171]:
countries_by_name[word]

{'geonameid': 3469034,
 'name': 'Brazil',
 'iso': 'BR',
 'iso3': 'BRA',
 'isonumeric': 76,
 'fips': 'BR',
 'continentcode': 'SA',
 'capital': 'Brasilia',
 'areakm2': 8511965,
 'population': 201103330,
 'tld': '.br',
 'currencycode': 'BRL',
 'currencyname': 'Real',
 'phone': '55',
 'postalcoderegex': '^\\d{5}-\\d{3}$',
 'languages': 'pt-BR,es,en,fr',
 'neighbours': 'SR,PE,BO,UY,GY,PY,GF,VE,CO,AR'}