In [461]:
import numpy as np
import pandas as pd
import json, spacy, string
from pathlib import Path
from dateutil import parser
from spacy import *
from spacy.pipeline import *
from spacy.lang.en.stop_words import STOP_WORDS

In [462]:
### Load trained model and data
# `nlp` is the spaCy pipeline loaded from 'models' (presumably a local
# directory holding the custom-trained model -- confirm). The extractors below
# test for entity labels such as GPE, LOC_TYPE, CARDINAL, CASE, STATE, CHANGE
# and PERCENT on the tokens it produces.
nlp = spacy.load('models')
# `df` holds one article per row; the code below reads the columns
# 'content', 'disease', 'timestamp', 'title' and 'url'.
df = pd.read_json('test_data.json')

In [463]:
def record_to_json(record, f):
    """Build the JSON-ready dict for one extracted relation.

    Parameters
    ----------
    record : str
        Relation kind: 'incident', 'state' or 'trend'. It is also the key
        under which the seventh feature is stored.
    f : indexable
        Seven features in the order: disease, start date, end date, title,
        url, geocode, relation value. Each one is converted with ``str``.

    Returns
    -------
    dict
        The populated record, or ``{}`` when ``record`` is not a known kind.
    """
    shared_keys = ('disease', 'date-start', 'date-end', 'title', 'url', 'geocode')

    for kind in ('incident', 'state', 'trend'):
        if record == kind:
            payload = {key: str(f[pos]) for pos, key in enumerate(shared_keys)}
            payload[kind] = str(f[6])
            return payload

    return {}

In [464]:
def geolocate(x):
    """Placeholder geocoder: hands the given location back unchanged."""
    return x

In [465]:
def extract_location(text, ref=False):
    """Collect the location tokens found in ``text``.

    Parameters
    ----------
    text : iterable of spaCy tokens
        Typically a parsed document or a single sentence.
    ref : bool, default False
        When True, only the location named at the very start of the text is
        retrieved: punctuation is skipped, GPE tokens are collected, and the
        scan stops at the first token that is neither.
        When False, every GPE or LOC_TYPE token in the text is collected,
        except tokens whose lower-cased text is a stop word.

    Returns
    -------
    list
        The matching tokens, in document order (empty when none are found).
    """
    # Retrieves the location specified at the start of each article
    if ref:
        leading = []
        for word in text:
            if word.is_punct:
                continue
            if word.ent_type_ != 'GPE':
                break
            leading.append(word)
        return leading

    # Retrieves the locations found in any given sentence.
    # STOP_WORDS holds strings, so the token's text (not the token object
    # itself) has to be tested for membership.
    # TODO: reorganize found locations into (name, type) tuples.
    return [
        word for word in text
        if word.ent_type_ in ('GPE', 'LOC_TYPE')
        and word.text.lower() not in STOP_WORDS
    ]

In [466]:
"""
features:
    disease
    date
    title
    url
    geocode
    count
"""
def extract_incident(sent, refs):
    """Extract incidence counts from one sentence.

    Parameters
    ----------
    sent : iterable of spaCy tokens
        The sentence to scan.
    refs : sequence
        Article metadata ``[disease, date, title, url, location]``; ``date``
        must provide ``strftime``.

    Returns
    -------
    list of dict
        One 'incident' record (see ``record_to_json``) per CARDINAL token that
        looks like a case count. The count has thousands separators (commas)
        removed. Start and end date are both the article date. The geocode is
        the location found in the sentence, falling back to the article-level
        location when the sentence names none.
    """
    # Load metadata values
    dis, date, title, url, loc = refs
    date_start = date_end = date.strftime('%Y-%m-%d')

    # A location named in the sentence takes precedence over the article's one
    # (same rule as extract_status).
    loc = extract_location(sent) or loc

    relations = []
    # Find all CARDINAL entities, then check whether they are linked to a
    # subject, a case mention or a location; if so they are probably counts.
    for number in sent:
        if number.ent_type_ != 'CARDINAL':
            continue

        if number.dep_ in ('attr', 'dobj'):
            # The governing word must have a nominal subject on its left.
            # 'nsubj' is a dependency label, so it is matched against dep_.
            is_count = any(w.dep_ == 'nsubj' for w in number.head.lefts)
        else:
            is_count = number.head.ent_type_ in ('CASE', 'LOC')

        if is_count:
            count = number.text.replace(',', '')
            relations.append(record_to_json(
                'incident',
                [dis, date_start, date_end, title, url, geolocate(loc), count],
            ))

    return relations

In [467]:
"""
features:
    disease
    date
    title
    url
    geocode
    state
"""
def extract_status(sent, refs):
    """Extract declared states (STATE entities) from one sentence.

    Parameters
    ----------
    sent : iterable of spaCy tokens
        The sentence to scan.
    refs : sequence
        Article metadata ``[disease, date, title, url, location]``; ``date``
        must provide ``strftime``.

    Returns
    -------
    list of dict
        One 'state' record (see ``record_to_json``) per run of consecutive
        STATE tokens; the tokens of a run are joined into a single phrase
        with their original spacing. Start and end date are both the article
        date. The geocode is the location found in the sentence, falling back
        to the article-level location when the sentence names none.
    """
    # Load metadata values
    dis, date, title, url, loc = refs
    date_start = date_end = date.strftime('%Y-%m-%d')

    # A location named in the sentence takes precedence over the article's one
    loc = extract_location(sent) or loc

    # Group consecutive STATE tokens so a multi-word state yields one record
    phrases = []
    current = []
    for word in sent:
        if word.ent_type_ == 'STATE':
            # IOB tag 'B' marks the first token of an entity: split there so
            # two adjacent STATE entities are not merged into one phrase.
            if current and word.ent_iob_ == 'B':
                phrases.append(current)
                current = []
            current.append(word)
        elif current:
            phrases.append(current)
            current = []
    if current:
        phrases.append(current)

    relations = []
    for tokens in phrases:
        state = ''.join(w.text_with_ws for w in tokens).strip()
        relations.append(record_to_json(
            'state',
            [dis, date_start, date_end, title, url, geolocate(loc), state],
        ))

    return relations

In [468]:
"""
features:
    disease
    date
    title
    url
    geocode
    change
"""
def extract_trend(sent, refs):
    """Extract trend statements (CHANGE entities with a percentage) from one sentence.

    Parameters
    ----------
    sent : iterable of spaCy tokens
        The sentence to scan.
    refs : sequence
        Article metadata ``[disease, date, title, url, location]``; ``date``
        must provide ``strftime``.

    Returns
    -------
    list of dict
        One 'trend' record (see ``record_to_json``) for every PERCENT child of
        every CHANGE token; the recorded value is the CHANGE token. Start and
        end date are both the article date. The geocode is the location found
        in the sentence, falling back to the article-level location when the
        sentence names none.
    """
    # Load metadata values
    dis, date, title, url, loc = refs
    date_start = date_end = date.strftime('%Y-%m-%d')

    # A location named in the sentence takes precedence over the article's one
    # (same rule as extract_status).
    loc = extract_location(sent) or loc

    relations = []
    for change in sent:
        if change.ent_type_ != 'CHANGE':
            continue
        # Only a change qualified by a percentage counts as a trend
        for child in change.children:
            if child.ent_type_ == 'PERCENT':
                relations.append(record_to_json(
                    'trend',
                    [dis, date_start, date_end, title, url, geolocate(loc), change],
                ))
    return relations

In [469]:
def extract_refs(article):
    """Gather the article-level metadata shared by every extracted relation.

    ``article`` is a mapping (e.g. a DataFrame row) with the keys 'disease',
    'timestamp', 'title', 'url' and 'content'. The content is parsed with
    ``nlp`` and the location named at its start is looked up through
    ``extract_location(..., ref=True)``.

    Returns the list ``[disease, timestamp, title, url, location]``.
    """
    metadata = [article[key] for key in ('disease', 'timestamp', 'title', 'url')]
    parsed_content = nlp(article['content'])
    metadata.append(extract_location(parsed_content, ref=True))
    return metadata

In [470]:
def extract(df):
    """Run every extractor over all articles and dump the results as JSON.

    Each row of ``df`` is treated as one article: its 'content' is parsed with
    ``nlp`` and every sentence is passed, together with the article metadata
    from ``extract_refs``, to ``extract_incident``, ``extract_status`` and
    ``extract_trend``. The records of all articles are then written to
    'incidents.json', 'statuses.json' and 'trends.json' (opened in 'w' mode in
    the current working directory).

    Returns None.
    """
    total_incidents = []
    total_statuses = []
    total_trends = []

    for _, article in df.iterrows():
        doc = nlp(article['content'])
        # NOTE: extract_refs parses article['content'] with nlp a second time
        # to find the location named at the start of the article.
        refs = extract_refs(article)
        for sent in doc.sents:
            total_incidents.extend(extract_incident(sent, refs))
            total_statuses.extend(extract_status(sent, refs))
            total_trends.extend(extract_trend(sent, refs))

    outputs = (
        ('incidents.json', total_incidents),
        ('statuses.json', total_statuses),
        ('trends.json', total_trends),
    )
    for filename, records in outputs:
        with open(filename, 'w', encoding='utf-8') as outfile:
            json.dump(records, outfile, indent=4, ensure_ascii=False)

In [471]:
# Run the full extraction over the loaded articles; this writes
# incidents.json, statuses.json and trends.json.
extract(df)

In [None]:
# Exploratory check: parse the first article and draw the dependency tree of
# its fifth sentence.
test = nlp(df['content'][0])
# Each sentence is converted with as_doc() so it can be rendered on its own.
sents = [sent.as_doc() for sent in test.sents]
# NOTE(review): `displacy` is never imported by name in the visible cells;
# presumably it arrives through `from spacy import *` -- confirm.
displacy.render(sents[4], style='dep', jupyter=True)

In [158]:
# Spot-check of the sentence-level location extraction on the second sentence.
# NOTE(review): relies on `sents` from the cell above, and this cell's
# execution count is lower than the preceding cells', so the output shown
# below may predate the current definitions -- re-run after a kernel restart.
extract_location(sents[1])

[municipality, Luna, Isabela, province, northern, Philippines]

In [373]:
# Exploratory check: parse the article at index label 7 and highlight the
# entities the model assigns to it.
a = nlp(df['content'][7])
# NOTE(review): `displacy` is presumably provided by `from spacy import *` -- confirm.
displacy.render(a, style='ent', jupyter=True)