In [252]:
import numpy as np
import pandas as pd
import json, pprint, spacy, string
from pathlib import Path
from copy import deepcopy
from word2number import w2n
from dateutil import parser
from difflib import SequenceMatcher
from jellyfish import jaro_winkler
from spacy import *
from spacy.pipeline import *
from spacy.lang.en.stop_words import STOP_WORDS

In [253]:
### Load trained model, data, and psgc
nlp = spacy.load('models')
df = pd.read_json('test_data.json')
psgc = pd.read_csv('psgc.csv', encoding='latin1')
provinces = psgc.loc[:, 'Province': 'Province Code'].drop_duplicates()
municipalities = psgc.loc[:, 'Municipality': 'Municipality Code'].drop_duplicates()

In [254]:
def record_to_json(record, f):
    if record == 'incident':
        return {
            'disease': str(f[0]),
            'date-start': str(f[1]),
            'date-end': str(f[2]),
            'title': str(f[3]),
            'url': str(f[4]),
            'code': str(f[5]),
            'incident': w2n.word_to_num(str(f[6]))
        }
    elif record == 'state':
        return {
            'disease': str(f[0]),
            'date-start': str(f[1]),
            'date-end': str(f[2]),
            'title': str(f[3]),
            'url': str(f[4]),
            'code': str(f[5]),
            'state': str(f[6])
        }
    elif record == 'change':
        return {
            'disease': str(f[0]),
            'date-start': str(f[1]),
            'date-end': str(f[2]),
            'title': str(f[3]),
            'url': str(f[4]),
            'code': str(f[5]),
            'change': str(f[6])
        }
    return {}

In [255]:
# Returns a list of tuples composed of the name
# and type of the location

def extract_location(text, ref=False):
    locs = []
    
    # Retrieves the location specified at the start of each article
    if ref:
        for word in text:
            if word.is_punct:
                continue
            elif word.ent_type_ == 'GPE':
                locs.append(str(word))
            else:
                break
                
    # Retrieves the location found in any given sentence
    else:
        for word in text:
            if word not in STOP_WORDS:
                if word.ent_type_ == 'GPE':
                    locs.append(str(word))
    out = search(psgc, locs)     
    return out

In [256]:
def search(data, locations):
    if locations:
        # remove instances of "Philippines"
        if 'Philippines' in locations:
            locations.remove('Philippines')
    
        # fuzzy search for each line; keep the options with the highest similarity
        possible_locs = []
    
        for loc in locations:
            max_similarity = 0
            locs = []
            final_locs = []
        
            # go through all provinces before municipalities
            for _, province in provinces.iterrows():
                jaro_distance = jaro_winkler(province[0].lower(), loc.lower())
                #jaro_distance = SequenceMatcher(None, province[0].lower(), loc.lower()).ratio()

                # if current similarity is higher than stored max similarity,
                # replace all area codes in locs[] with current and update max_similarity
                if jaro_distance > max_similarity:
                    locs = [province[1]]
                    max_similarity = jaro_distance

                # if current similarity is equal to the stored max similarity,
                # append current area code to locs[]
                elif int(jaro_distance) == int(max_similarity):
                    locs.append(province[1])

            # append locs[] to final_locs[] only if similarity is acceptable
            if max_similarity >= 0.5:
                final_locs += locs
        
            max_similarity = 0
            
            # go through all municipalities
            for _, municipal in municipalities.iterrows():
                jaro_distance = SequenceMatcher(None, municipal[0].lower(), loc.lower()).ratio()

                # if current similarity is higher than stored max similarity,
                # replace all area codes in locs[] with current and update max_similarity
                if jaro_distance > max_similarity:
                    locs = [municipal[1]]
                    max_similarity = jaro_distance

                # if current similarity is equal to the stored max similarity,
                # append current area code to locs[]
                elif jaro_distance == max_similarity:
                    locs.append(municipal[1])

            # append locs[] to final_locs[] only if similarity is acceptable
            if max_similarity >= 0.5:
                final_locs += locs

            possible_locs.append(final_locs)
        
        # find intersecting rows for all locations
        possible_locations = []
    
        try:
            _ = possible_locs[1]

            if len(possible_locs) > 1:
                for loc in possible_locs[0]:
                    for loc2 in possible_locs[1]:
                        if str(loc2)[:-5] == str(loc)[:-5]:
                            possible_locations += [loc, loc2]
        except IndexError:
            #return '0' + str(possible_locs[0][0])[:-5] + '00000'
            return '0' + str(possible_locs[0])[:-5] + '00000'

        # choose highest similarity
        possible_locations = list(set(possible_locations))
        return (str(possible_locations[0])[:-5] + '00000').zfill(9)

In [257]:
"""
features:
    disease
    date
    title
    url
    geocode
    count
"""
def extract_incident(sent, refs):
    # Load metadata values
    dis, date, title, url, loc = refs
    date_start = date.strftime('%Y-%m-%d')
    date_end = date.strftime('%Y-%m-%d')
    relations = []
    
    extracted_loc = extract_location(sent) # Look for location specified in the sentence
    #if extracted_loc:
    #    loc = search(extracted_loc)
        
    # Find all CARDINAL entities, then check if they are 
    # linked to a disease, case, or location
    # If they are, they are probably incidence counts
    for number in filter(lambda w: w.ent_type_ == 'CARDINAL', sent):
        if number.dep_ in ('attr', 'dobj'):
            case = [w for w in number.head.lefts if w.ent_type == 'nsubj']
            if case: 
                count = number.text.replace(',', '')
                relations.append(record_to_json('incident', [dis,  date_start, date_end, title, url, loc, count]))
        else:
            case = number.head.ent_type_
            count = number.text.replace(',', '')
            if case == 'CASE':
                relations.append(record_to_json('incident', [dis,  date_start, date_end, title, url, loc, count]))
            if case == 'LOC':
                relations.append(record_to_json('incident', [dis,  date_start, date_end, title, url, loc, count]))
            
    return relations

In [258]:
"""
features:
    disease
    date
    title
    url
    geocode
    state
"""
def extract_status(sent, refs):
    # Load metadata values
    dis, date, title, url, loc = refs
    date_start = date.strftime('%Y-%m-%d')
    date_end = date.strftime('%Y-%m-%d')
    relations = []
    
    extracted_loc = extract_location(sent) # Look for location specified in the sentence
    #if extracted_loc:
    #    loc = search(extracted_loc)
    
    ## I need to find a way to locate a substring of STATE tokens,
    ## concatenate the token.text of each token into one string
    ## then append it into the relations list
    
    ## this one here doesn't implement that yet vvv
    states = filter(lambda x: x.ent_type_ == 'STATE', sent)
    state = ' '.join(map(str, states))
    if state:
        if 'hot' in state:
            state = 'hot'
        elif 'calamity' in state:
            state = 'calamity'
        elif 'outbreak' in state or 'epidemic' in state:
            state = 'outbreak'
        relations.append(record_to_json('state', [dis, date_start, date_end, title, url, loc, state]))
        
    return relations    

In [248]:
"""
features:
    disease
    date
    title
    url
    geocode
    change
"""
def extract_change(sent, refs):
    # Load metadata values
    dis, date, title, url, loc = refs
    date_start = date.strftime('%Y-%m-%d')
    date_end = date.strftime('%Y-%m-%d')
    relations = []
    
    extracted_loc = extract_location(sent) # Look for location specified in the sentence
    #if extracted_loc:
    #    loc = search(extracted_loc)
    
    for change in filter(lambda w: w.ent_type_ == 'CHANGE', sent):
        for child in change.children:
            if child.ent_type_ == 'PERCENT':
                if str(change) in ['high, higher', 'increase', 'increasing', 'increased', 'rise', 'rose', 'more']:
                    change = 'rise'
                elif str(change) in ['low', 'lower', 'decrease', 'decreasing', 'decreased', 'fall', 'fell', 'less']:
                    change = 'fall'
                relations.append(record_to_json('change', [dis, date_start, date_end, title, url, loc, change]))
                
    return relations

In [249]:
def extract_refs(article):
    dis = article['disease']
    date = article['timestamp'] 
    title = article['title']
    url = article['url']
    loc = extract_location(nlp(article['content']), ref=True)
    #loc = extract_location(nlp(article['content']), ref=True)
    return [dis, date, title, url, loc]

In [250]:
def extract(df):
    total_incidents = []
    total_statuses = []
    total_changes = []
    for index, article in df.iterrows():
        doc = nlp(article['content'])
        refs = extract_refs(article)
        deets = [article['title'], article['url']]
        incidents = []
        statuses = []
        changes = []
        for sent in doc.sents:
            i = extract_incident(sent,  refs)
            s = extract_status(sent, refs)
            t = extract_change(sent, refs)
            if i: [incidents.append(x) for x in i]
            if s: [statuses.append(x) for x in s]
            if t: [changes.append(x) for x in t]
        #if incidents: pp.pprint(incidents)
        #if statuses: pp.print(statuses)
        #if changes: pp.print(changes)
    #"""
        if incidents: [total_incidents.append(y) for y in incidents]
        if statuses: [total_statuses.append(y) for y in statuses]
        if changes: [total_changes.append(y) for y in changes]
    
    with open('incidents2.json', 'w', encoding='utf-8') as outfile:
        json.dump(total_incidents, outfile, indent=4, ensure_ascii=False)
    with open('statuses2.json', 'w', encoding='utf-8') as outfile:
        json.dump(total_statuses, outfile, indent=4, ensure_ascii=False)
    with open('changes2.json', 'w', encoding='utf-8') as outfile:
        json.dump(total_changes, outfile, indent=4, ensure_ascii=False)
    #"""

In [251]:
extract(df)

IndexError: list index out of range

In [6]:
#"""
test = nlp(df['content'][0])
sents = [sent.as_doc() for sent in test.sents]
displacy.render(sents[2], style='dep', jupyter=True)
#"""

In [None]:
"""
a = nlp(df['content'][7])
displacy.render(a, style='ent', jupyter=True)
"""