In [1]:
# Loading headline data
headline_file = open('C:\\Users\\sol23\\Documents\\Discovering Disease Outbreaks\\data\\headlines.txt','r')
headlines = [line.strip()
             for line in headline_file.readlines()]
num_headlines = len(headlines)
print(f"{num_headlines} headlines have been loaded")

650 headlines have been loaded


In [2]:
from unidecode import unidecode
import re

# Converting names to regexes
def name_to_regex(name):
    decoded_name = unidecode(name)
    if name != decoded_name:
        regex = fr'\b({name}|{decoded_name})\b'
    else:
        regex = fr'\b{name}\b'
    return re.compile(regex, flags=re.IGNORECASE)

In [3]:
import geonamescache

gc = geonamescache.GeonamesCache()

# Mapping names to regexes
countries = [country['name']
            for country in gc.get_countries().values()]

country_to_name = {name_to_regex(name): name
                    for name in countries}

cities = [city['name'] for city in gc.get_cities().values()]
city_to_name = {name_to_regex(name): name for name in cities}

In [4]:
# Finding locations in text
def get_name_in_text(text, dictionary):
    for regex, name in sorted(dictionary.items(),
                                key=lambda x:x[1]):
        if regex.search(text):
            return name
    return None

In [5]:
# Finding locations in headlines
import pandas as pd

matched_countries = [get_name_in_text(headline, country_to_name)
                     for headline in headlines]
matched_cities = [get_name_in_text(headline, city_to_name)
                  for headline in headlines]
data = {'Headline': headlines, 'City': matched_cities, 'Country': matched_countries}
df = pd.DataFrame(data)

In [6]:
# Summarizing the location data
summary = df[['City','Country']].describe()
print(summary)

       City   Country
count   619        15
unique  510        10
top      Of  Malaysia
freq     45         3


In [7]:
# Fetching cities named "Of"
of_cities = df[df.City == 'Of'][['City','Headline']]
ten_of_cities = of_cities.head(10)
print(ten_of_cities.to_string(index=False))

City                                           Headline
  Of              Case of Measles Reported in Vancouver
  Of  Authorities are Worried about the Spread of Br...
  Of  Authorities are Worried about the Spread of Ma...
  Of  Rochester authorities confirmed the spread of ...
  Of     Tokyo Encounters Severe Symptoms of Meningitis
  Of  Authorities are Worried about the Spread of In...
  Of            Spike of Pneumonia Cases in Springfield
  Of  The Spread of Measles in Spokane has been Conf...
  Of                    Outbreak of Zika in Panama City
  Of    Urbana Encounters Severe Symptoms of Meningitis


In [8]:
# Finding multicity headlines
def get_cities_in_headline(headline):
    cities_in_headline = set()
    for regex, name in city_to_name.items():
        match = regex.search(headline)
        if match:
            if headline[match.start()].isupper():
                cities_in_headline.add(name)
    return list(cities_in_headline)

df['Cities'] = df['Headline'].apply(get_cities_in_headline)
df['Num_cities'] = df['Cities'].apply(len)
df_multiple_cities = df[df.Num_cities > 1]
num_rows, _ = df_multiple_cities.shape
print(f"{num_rows} headlines match multiple cities")

68 headlines match multiple cities


In [9]:
# Sampling multicity headlines
ten_cities = df_multiple_cities[['Cities','Headline']].head(10)
print(ten_of_cities.to_string(index=False))

City                                           Headline
  Of              Case of Measles Reported in Vancouver
  Of  Authorities are Worried about the Spread of Br...
  Of  Authorities are Worried about the Spread of Ma...
  Of  Rochester authorities confirmed the spread of ...
  Of     Tokyo Encounters Severe Symptoms of Meningitis
  Of  Authorities are Worried about the Spread of In...
  Of            Spike of Pneumonia Cases in Springfield
  Of  The Spread of Measles in Spokane has been Conf...
  Of                    Outbreak of Zika in Panama City
  Of    Urbana Encounters Severe Symptoms of Meningitis


In [10]:
# Selecting the longest city names
def get_longest_city(cities):
    if cities:
        return(max(cities, key=len))
    return None

df['City'] = df['Cities'].apply(get_longest_city)

In [11]:
# Printing the shortest city names
short_cities = df[df.City.str.len() <= 4][['City','Headline']]
print(short_cities.to_string(index=False))

 City                                           Headline
 Lima                Lima tries to address Zika Concerns
 Pune                     Pune woman diagnosed with Zika
 Rome  Authorities are Worried about the Spread of Ma...
 Molo                Molo Cholera Spread Causing Concern
 Miri                               Zika arrives in Miri
 Nadi  More people in Nadi are infected with HIV ever...
 Baud  Rumors about Tuberculosis Spreading in Baud ha...
 Kobe                     Chikungunya re-emerges in Kobe
 Waco                More Zika patients reported in Waco
 Erie                        Erie County sets Zika traps
 Kent                       Kent is infested with Rabies
 Reno  The Spread of Gonorrhea in Reno has been Confi...
 Sibu                      Zika symptoms spotted in Sibu
 Baku    The Spread of Herpes in Baku has been Confirmed
 Bonn  Contaminated Meat Brings Trouble for Bonn Farmers
 Jaén                         Zika Troubles come to Jaen
 Yuma                       Zik

In [12]:
# Fetching headlines with countries
df_countries = df[df.Country.notnull()][['City',
                                         'Country',
                                         'Headline']]
print(df_countries.to_string(index=False))

             City    Country                                           Headline
           Recife     Brazil            Mystery Virus Spreads in Recife, Brazil
 Ho Chi Minh City    Vietnam     Zika cases in Vietnam's Ho Chi Minh City surge
          Bangkok   Thailand                     Thailand-Zika Virus in Bangkok
       Piracicaba     Brazil                Zika outbreak in Piracicaba, Brazil
            Klang   Malaysia                   Zika surfaces in Klang, Malaysia
   Guatemala City  Guatemala  Rumors about Meningitis spreading in Guatemala...
      Belize City     Belize                 Belize City under threat from Zika
         Campinas     Brazil                   Student sick in Campinas, Brazil
      Mexico City     Mexico               Zika outbreak spreads to Mexico City
    Kota Kinabalu   Malaysia           New Zika Case in Kota Kinabalu, Malaysia
      Johor Bahru   Malaysia                 Zika reaches Johor Bahru, Malaysia
        Hong Kong  Hong Kong            

In [13]:
# Dropping countries from the table
df.drop('Country', axis=1, inplace=True)

In [14]:
# Exploring unmatched headlines
df_unmatched = df[df.City.isnull()]
num_unmatched = len(df_unmatched)
print(f"{num_unmatched} headlines contain no city matches.")
print(df_unmatched.head(10)[['Headline']].values)

39 headlines contain no city matches.
[['Louisiana Zika cases up to 26']
 ['Zika infects pregnant woman in Cebu']
 ['Spanish Flu Sighted in Antigua']
 ['Zika case reported in Oton']
 ['Hillsborough uses innovative trap against Zika 20 minutes ago']
 ['Maka City Experiences Influenza Outbreak']
 ['West Nile Virus Outbreak in Saint Johns']
 ['Malaria Exposure in Sussex']
 ['Greenwich Establishes Zika Task Force']
 ['Will West Nile Virus vaccine help Parsons?']]


In [15]:
# Dropping unmatched headlines
df = df[~df.City.isnull()][['City','Headline']]

In [19]:
# Adding headlines with countries
rsl = df.merge(df_countries, on='Headline',how='left')
rsl.drop('City_y', axis=1, inplace=True)
rsl.rename(columns={'City_x': 'City'}, inplace=True)

In [20]:
# Save dataframe for later use
rsl.to_csv('headline_df.csv',index=False)