### Discovering Disease Outbreaks from News Headlines: Unit 1
#### Jeremy Loscheider, March 22, 2020

In [1]:
### Installations
!pip install geonamescache
!pip install unidecode



In [2]:
### Location
filePath = 'C://Users/ThaddeusRyan.000/discovering-disease-outbreaks-base/data'

In [3]:
### Libraries
from pathlib import Path
import pandas as pd
import re as re
import geonamescache as gns
import unidecode as uni
import json as json
from time import time as pytime

In [4]:
### Declare location
data_folder = Path(filePath)
file_to_open = data_folder / "headlines.txt"

### Create empty list to catch each line
theHeadline = []

### Append each line, after decoding and stripping line break
with open(file_to_open) as f:
    for line in f:
        line2 = uni.unidecode(line.strip('\n'))
        theHeadline.append(line2)
f.close()

In [5]:
### Validate we now have usable lines
theHeadline[20:30]

["Chicago's First Zika Case Confirmed",
 'Tampa Bay Area Zika Case Count Climbs',
 'Bad Water Leads to Sickness in Flint, Michigan',
 'Baltimore plans for Zika virus',
 'London Health Unit Tracks Mad Cow Disease',
 "Zika cases in Vietnam's Ho Chi Minh City surge",
 'Philadelphia experts track pandemic',
 'Flu season hits Boston',
 'Scientists in Paris to look for answers',
 'Key Zika Findings in San Diego Institute']

### Problems
-- characters: â€“
-- Irrelevant headlines -> "Party Fever", "go Viral"
-- Common words combine "Mad Cow"
-- Multiple possible countries for a given city name
-- Compound names - "San Diego", "Wichita Falls", "Bella Vista" - assume Look for next instance of Capitalized

### Create a list of city names and a list of their associated countries

In [6]:
### Create dict objects and dismantle these cumbersome things
gc = gns.GeonamesCache()
all_cities = gc.get_cities()
all_countries = gc.get_countries()

In [7]:
### Create a city to country code relation
city_list = []
countrycode_list = []
for f in all_cities.keys():
    city_list.append(uni.unidecode(all_cities[f]['name']))
    countrycode_list.append(uni.unidecode(all_cities[f]['countrycode']))
    
cc_df = pd.DataFrame(city_list,countrycode_list).reset_index()
cc_df.columns=['CountryCode','City']
print(cc_df.head(5))
cc_df.tail(5)

  CountryCode                 City
0          AD     Andorra la Vella
1          AE   Umm Al Quwain City
2          AE  Ras Al Khaimah City
3          AE           Zayed City
4          AE         Khawr Fakkan


Unnamed: 0,CountryCode,City
24331,ZW,Bulawayo
24332,ZW,Bindura
24333,ZW,Beitbridge
24334,ZW,Epworth
24335,ZW,Chitungwiza


In [8]:
### Create a country code to country relation
countrycode_list_dup = []
country_list = []
for g in all_countries.keys():
    countrycode_list_dup.append(uni.unidecode(all_countries[g]['iso']))
    country_list.append(uni.unidecode(all_countries[g]['name']))
co_df = pd.DataFrame(countrycode_list_dup,country_list).reset_index()
co_df.columns = ['Country','CountryCode']
print(co_df.head(5))
co_df.tail(5)
    

                Country CountryCode
0               Andorra          AD
1  United Arab Emirates          AE
2           Afghanistan          AF
3   Antigua and Barbuda          AG
4              Anguilla          AI


Unnamed: 0,Country,CountryCode
247,South Africa,ZA
248,Zambia,ZM
249,Zimbabwe,ZW
250,Serbia and Montenegro,CS
251,Netherlands Antilles,AN


In [9]:
## Combine these to one
## Pattern: df.set_index('key').join(other.set_index('key'))
ci_co_df = cc_df.set_index('CountryCode').join(co_df.set_index('CountryCode')).reset_index()
ci_co_df.head(5)

Unnamed: 0,CountryCode,City,Country
0,AD,Andorra la Vella,Andorra
1,AE,Umm Al Quwain City,United Arab Emirates
2,AE,Ras Al Khaimah City,United Arab Emirates
3,AE,Zayed City,United Arab Emirates
4,AE,Khawr Fakkan,United Arab Emirates


#### Try matching form list one at a time

In [10]:
def findCity(_headline):
    for city in (city_list):
        match= re.search(r"\b%s\b" %city, _headline)
        if match is not None:
            return city
    return "Not Found"

def findCountry(_city):
    if _city != 'Not Found':
#         print("Going on the not found path")
#         country = ci_co_df[ci_co_df['City']==_city]['Country'][0]
        country = ci_co_dict[_city]
#         print(country)
        return country
    else:
        return "Not Found"

In [11]:
### Try dicts instead of dataframes to avoid indexing issues
_country = ci_co_df['Country']
_city = ci_co_df['City']
ci_co_dict = {_city[i]: _country[i] for i in range(len(_city))}
### Validate
print(ci_co_dict['Sao Pedro'])

Brazil


In [12]:
### Test on small version of dataset
smallheadline = theHeadline[26:31]
smallheadline

['Philadelphia experts track pandemic',
 'Flu season hits Boston',
 'Scientists in Paris to look for answers',
 'Key Zika Findings in San Diego Institute',
 'Thailand-Zika Virus in Bangkok']

In [13]:
sh_df = pd.DataFrame(smallheadline,columns=['Headline'])
sh_df['City'] = sh_df['Headline'].apply(findCity)
sh_df['Country'] = sh_df['City'].apply(findCountry)

In [14]:
sh_df.head(5)

Unnamed: 0,Headline,City,Country
0,Philadelphia experts track pandemic,Philadelphia,United States
1,Flu season hits Boston,Boston,United States
2,Scientists in Paris to look for answers,Paris,United States
3,Key Zika Findings in San Diego Institute,San Diego,United States
4,Thailand-Zika Virus in Bangkok,Bangkok,Thailand


### Obviously some issues with dupcliate place names.... probably not a big virology research lab in Paris, Texas

In [15]:
_ = pytime()
final_df = pd.DataFrame(theHeadline,columns=['Headline'])
final_df['City'] = final_df['Headline'].apply(findCity)
final_df['Country'] = final_df['City'].apply(findCountry)
__ = pytime()
print("is this performant? Done in %i seconds..." %(__-_))

is this performant? Done in 501 seconds...


In [16]:
### Show examples
final_df.head(5)

Unnamed: 0,Headline,City,Country
0,Zika Outbreak Hits Miami,Miami,United States
1,Could Zika Reach New York City?,York,United States
2,First Case of Zika in Miami Beach,Miami,United States
3,"Mystery Virus Spreads in Recife, Brazil",Recife,Brazil
4,Dallas man comes down with case of Zika,Dallas,United States


In [17]:
final_df.tail(5)

Unnamed: 0,Headline,City,Country
645,Rumors about Rabies spreading in Jerusalem hav...,Jerusalem,Israel
646,More Zika patients reported in Indang,Indang,Philippines
647,Suva authorities confirmed the spread of Rotav...,Suva,Fiji
648,More Zika patients reported in Bella Vista,Bella Vista,United States
649,Zika Outbreak in Wichita Falls,Wichita,United States
