In [142]:
import numpy as np
import pandas as pd
import json, pprint, spacy, string
from pathlib import Path
from copy import deepcopy
from word2number import w2n
from dateutil import parser
from jellyfish import jaro_winkler
from spacy import *
from spacy.pipeline import *
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en.stop_words import STOP_WORDS

In [136]:
### Load trained model, data, and psgc
nlp = spacy.load('models')
df = pd.read_json('test_data.json')
psgc = pd.read_csv('psgc.csv', encoding='latin1')
pp = pprint.PrettyPrinter(compact=True)
lemmatizer = Lemmatizer

In [137]:
def record_to_json(record, f):
    if record == 'incident':
        return {
            'disease': str(f[0]),
            'date-start': str(f[1]),
            'date-end': str(f[2]),
            'title': str(f[3]),
            'url': str(f[4]),
            'code': str(f[5]),
            'incident': w2n.word_to_num(str(f[6]))
        }
    elif record == 'state':
        return {
            'disease': str(f[0]),
            'date-start': str(f[1]),
            'date-end': str(f[2]),
            'title': str(f[3]),
            'url': str(f[4]),
            'code': str(f[5]),
            'state': str(f[6])
        }
    elif record == 'change':
        return {
            'disease': str(f[0]),
            'date-start': str(f[1]),
            'date-end': str(f[2]),
            'title': str(f[3]),
            'url': str(f[4]),
            'code': str(f[5]),
            'change': str(f[6])
        }
    return {}

In [138]:
# Returns a list of tuples composed of the name
# and type of the location

def extract_location(text, ref=False):
    locs = []
    
    # Retrieves the location specified at the start of each article
    if ref:
        for word in text:
            if word.is_punct:
                continue
            elif word.ent_type_ == 'GPE':
                locs.append(word)
            else:
                break
                
    # Retrieves the location found in any given sentence
    else:
        for word in text:
            if word not in STOP_WORDS:
                if word.ent_type_ == 'GPE':
                    locs.append(word.text)
                    
    return locs

In [139]:
def search(location):
    ps = deepcopy(psgc)
    
    # sort list by location type from most general to most specific, w/ 'none' at the end
    loc_types = ['reg', 'prov', 'mun', 'bgy', 'none']
    loc_sorter = [loc_types.index(tup[1]) for tup in location]
    location = [x for _, x in sorted(zip(loc_sorter, location))]

    # list of row indices of possible locations
    possible_locs = [x for x in range(42026)]
    cols = ['Region', 'Province', 'Municipality', 'Barangays']
    col = 0
    
    # narrow down list of possible locations, starting from general to specific
    for tup in location:
        # first, deal with all locations that do not have a 'none' type
        if not tup[1] == 'none':
            ps = ps.iloc[possible_locs]
            col = 'Region'

            if tup[1] == 'prov':
                col = 'Province'
            elif tup[1] == 'mun':
                col = 'Municipality'
            elif tup[1] == 'bgy':
                col = 'Barangays'
            
            # substring search
            possible_locs = [i for i,row in ps.iterrows() if tup[0].lower() in row[col].lower()]
            col = ps.columns.get_loc(col)
            
        # next, deal with 'none' loc_types OR zero matches from previous search
        if tup[1] == 'none' or len(possible_locs) == 0:
            sub_possible_locs = []
            
            # search all columns for all matches per tuple
            for x in cols:
                sub_possible_locs += [i for i,row in ps.iterrows() if tup[0].lower() in row[x].lower()]
            
            possible_locs = list(set(possible_locs) & set(sub_possible_locs))
    
    # still dealing with 'none' loc_types and/or zero matches from previous search
    # keep track of column where most specific area is found
    for tup in location:
        if tup[1] == 'none':
            for loc in possible_locs:
                for x in cols:
                    if (tup[0].lower() in ps.iloc[loc, ps.columns.get_loc(x)].lower()) and (ps.columns.get_loc(x) > col):
                        col = ps.columns.get_loc(x)
    
    col += 1

    # get possible area codes
    # area_codes keeps the first instance row index of each unique area code
    new_locs = []
    area_codes = []
    
    for row in possible_locs:
        if not psgc.iloc[row, col] in new_locs:
            new_locs.append(psgc.iloc[row, col])
            area_codes.append((row, psgc.iloc[row, col]))
    
    print(area_codes)
    
    # if more than one possible area, do similarity check and choose area w/ highest similarity (jaro-winkler distance)
    for tup in location:
        if len(area_codes) != 1:
            # deal first with locations that do not have a 'none' type
            if not tup[1] == 'none':
                col = ps.columns.get_loc('Region')

                if tup[1] == 'prov':
                    col = ps.columns.get_loc('Province')
                elif tup[1] == 'mun':
                    col = ps.columns.get_loc('Municipality')
                elif tup[1] == 'bgy':
                    col = ps.columns.get_loc('Barangays')
                
                jaro_distance = [jaro_winkler(tup[0].lower(), psgc.iloc[area[0], col].lower()) for area in area_codes]
        
        # return None if no area code found
        if not area_codes:
            return None
        # else, return area code
        else:
            return area_codes[0][1]

In [140]:
"""
features:
    disease
    date
    title
    url
    geocode
    count
"""
def extract_incident(sent, refs):
    # Load metadata values
    dis, date, title, url, loc = refs
    date_start = date.strftime('%Y-%m-%d')
    date_end = date.strftime('%Y-%m-%d')
    relations = []
    
    extracted_loc = extract_location(sent) # Look for location specified in the sentence
    #if extracted_loc:
    #    loc = search(extracted_loc)
        
    # Find all CARDINAL entities, then check if they are 
    # linked to a disease, case, or location
    # If they are, they are probably incidence counts
    for number in filter(lambda w: w.ent_type_ == 'CARDINAL', sent):
        if number.dep_ in ('attr', 'dobj'):
            case = [w for w in number.head.lefts if w.ent_type == 'nsubj']
            if case: 
                count = number.text.replace(',', '')
                relations.append(record_to_json('incident', [dis,  date_start, date_end, title, url, loc, count]))
        else:
            case = number.head.ent_type_
            count = number.text.replace(',', '')
            if case == 'CASE':
                relations.append(record_to_json('incident', [dis,  date_start, date_end, title, url, loc, count]))
            if case == 'LOC':
                relations.append(record_to_json('incident', [dis,  date_start, date_end, title, url, loc, count]))
            
    return relations

In [141]:
"""
features:
    disease
    date
    title
    url
    geocode
    state
"""
def extract_status(sent, refs):
    # Load metadata values
    dis, date, title, url, loc = refs
    date_start = date.strftime('%Y-%m-%d')
    date_end = date.strftime('%Y-%m-%d')
    relations = []
    
    extracted_loc = extract_location(sent) # Look for location specified in the sentence
    #if extracted_loc:
    #    loc = search(extracted_loc)
    
    ## I need to find a way to locate a substring of STATE tokens,
    ## concatenate the token.text of each token into one string
    ## then append it into the relations list
    
    ## this one here doesn't implement that yet vvv
    states = filter(lambda x: x.ent_type_ == 'STATE', sent)
    state = ' '.join(map(str, states))
    if state:
        if 'hot' in state:
            state = 'hot'
        elif 'calamity' in state:
            state = 'calamity'
        elif 'outbreak' in state or 'epidemic' in state:
            state = 'outbreak'
        relations.append(record_to_json('state', [dis, date_start, date_end, title, url, loc, state]))
        
    return relations    

In [44]:
"""
features:
    disease
    date
    title
    url
    geocode
    change
"""
def extract_change(sent, refs):
    # Load metadata values
    dis, date, title, url, loc = refs
    date_start = date.strftime('%Y-%m-%d')
    date_end = date.strftime('%Y-%m-%d')
    relations = []
    
    extracted_loc = extract_location(sent) # Look for location specified in the sentence
    #if extracted_loc:
    #    loc = search(extracted_loc)
    
    for change in filter(lambda w: w.ent_type_ == 'CHANGE', sent):
        for child in change.children:
            if child.ent_type_ == 'PERCENT':
                if change in 
                relations.append(record_to_json('change', [dis, date_start, date_end, title, url, loc, change]))
                
    return relations

In [45]:
def extract_refs(article):
    dis = article['disease']
    date = article['timestamp'] 
    title = article['title']
    url = article['url']
    loc = search(extract_location(nlp(article['content']), ref=True))
    #loc = extract_location(nlp(article['content']), ref=True)
    return [dis, date, title, url, loc]

In [46]:
def extract(df):
    total_incidents = []
    total_statuses = []
    total_changes = []
    for index, article in df.iterrows():
        doc = nlp(article['content'])
        refs = extract_refs(article)
        deets = [article['title'], article['url']]
        incidents = []
        statuses = []
        changes = []
        for sent in doc.sents:
            i = extract_incident(sent,  refs)
            s = extract_status(sent, refs)
            t = extract_change(sent, refs)
            if i: [incidents.append(x) for x in i]
            if s: [statuses.append(x) for x in s]
            if t: [changes.append(x) for x in t]
        #if incidents: pp.pprint(incidents)
        #if statuses: pp.print(statuses)
        #if changes: pp.print(changes)
    """
        if incidents: [total_incidents.append(y) for y in incidents]
        if statuses: [total_statuses.append(y) for y in statuses]
        if changes: [total_changes.append(y) for y in changes]
    
    with open('incidents.json', 'w', encoding='utf-8') as outfile:
        json.dump(total_incidents, outfile, indent=4, ensure_ascii=False)
    with open('statuses.json', 'w', encoding='utf-8') as outfile:
        json.dump(total_statuses, outfile, indent=4, ensure_ascii=False)
    with open('changes.json', 'w', encoding='utf-8') as outfile:
        json.dump(total_changes, outfile, indent=4, ensure_ascii=False)
    """

In [47]:
extract(df)

TypeError: 'spacy.tokens.token.Token' object does not support indexing

In [6]:
#"""
test = nlp(df['content'][0])
sents = [sent.as_doc() for sent in test.sents]
displacy.render(sents[2], style='dep', jupyter=True)
#"""

In [None]:
"""
a = nlp(df['content'][7])
displacy.render(a, style='ent', jupyter=True)
"""