In [280]:
import spacy, json
import numpy as np
import pandas as pd
from pathlib import Path
from dateutil import parser
from spacy import *
from spacy.pipeline import *
from spacy.lang.en.stop_words import STOP_WORDS

In [281]:
# Locations of the trained spaCy pipeline and of the articles to process.
MODEL_PATH = 'models'
DATA_PATH = 'test_data.json'

# Load both once; the cells below reuse `nlp` and `df` as module-level names.
nlp = spacy.load(MODEL_PATH)
df = pd.read_json(DATA_PATH)

In [282]:
def record_to_json(record, f):
    """Build the dictionary for one extracted relation.

    Parameters
    ----------
    record : str
        Kind of relation: 'incident', 'state' or 'trend'. The same string is
        used as the key under which the relation-specific value is stored.
    f : sequence
        Seven positional features, in this order: disease, start date,
        end date, title, url, geocode, relation-specific value.

    Returns
    -------
    dict
        The populated record, or an empty dict when `record` is not one of
        the three known kinds.
    """
    if record not in ('incident', 'state', 'trend'):
        return {}

    shared_keys = ('disease', 'date-start', 'date-end', 'title', 'url', 'geocode')
    # Index explicitly (rather than zip) so a too-short `f` still raises IndexError.
    entry = {key: f[position] for position, key in enumerate(shared_keys)}
    entry[record] = f[6]
    return entry

In [283]:
def geolocate(x):
    """Return `x` unchanged.

    NOTE(review): presumably a placeholder for a real geocoding lookup;
    the extractors already pass their location through this function.
    """
    return x

In [284]:
def extract_location(text, ref=False):
    """Collect the location tokens found in `text`.

    Parameters
    ----------
    text : iterable of spaCy tokens
        Typically a parsed document or a single sentence.
    ref : bool, default False
        * True  -- return the run of 'GPE' tokens at the very start of
          `text` (the location an article opens with). Punctuation is
          skipped; the scan stops at the first other token.
        * False -- return every non-stop-word token tagged 'GPE' or
          'LOC_TYPE' anywhere in `text`.

    Returns
    -------
    list
        The matching tokens, in their original order.
    """
    locs = []

    if ref:
        # Retrieves the location specified at the start of each article
        for word in text:
            if word.is_punct:
                continue
            if word.ent_type_ != 'GPE':
                break
            locs.append(word)
    else:
        # Retrieves the location found in any given sentence
        for word in text:
            # STOP_WORDS holds lower-case strings, so the token itself is
            # never a member of it; compare the token's lower-cased text.
            if word.lower_ in STOP_WORDS:
                continue
            if word.ent_type_ in ('GPE', 'LOC_TYPE'):
                locs.append(word)

    # TODO: found locations shall be reorganized into tuples of
    # location name and its type
    return locs

In [285]:
"""
features:
    disease
    date
    title
    url
    geocode
    count
"""
def extract_incident(sent, refs):
    """Extract incidence-count records from one sentence.

    Parameters
    ----------
    sent : iterable of spaCy tokens
        The sentence to scan.
    refs : sequence
        Article metadata in the order (disease, timestamp, title, url,
        location). The timestamp must provide a ``.date()`` method.

    Returns
    -------
    list
        One 'incident' record (see `record_to_json`) per CARDINAL token that
        looks like a count. The record stores the token itself as the count.
    """
    # Load metadata values
    dis, date, title, url, loc = refs
    date_start = date.date()
    date_end = date.date()

    # NOTE(review): the sentence-level location is not consulted here, so
    # every record carries the article-level `loc`. extract_status() prefers
    # the sentence-level location when one is found -- confirm which
    # behaviour is wanted for incidents.

    relations = []

    # Find all CARDINAL entities, then check if they are linked to a
    # subject, a case, or a location. If they are, they are probably
    # incidence counts.
    for number in sent:
        if number.ent_type_ != 'CARDINAL':
            continue

        if number.dep_ in ('attr', 'dobj'):
            # The subject is identified by its dependency label (`dep_`).
            # `ent_type` is the integer entity id and never equals 'nsubj'.
            linked = any(w.dep_ == 'nsubj' for w in number.head.lefts)
        else:
            linked = number.head.ent_type_ in ('CASE', 'LOC')

        if linked:
            relations.append(record_to_json(
                'incident',
                [dis, date_start, date_end, title, url, geolocate(loc), number],
            ))

    return relations

In [286]:
"""
features:
    disease
    date
    title
    url
    geocode
    state
"""
def extract_status(sent, refs):
    """Extract one 'state' record per STATE-tagged token in a sentence.

    Parameters
    ----------
    sent : iterable of spaCy tokens
        The sentence to scan.
    refs : sequence
        Article metadata in the order (disease, timestamp, title, url,
        location). The timestamp must provide a ``.date()`` method.

    Returns
    -------
    list
        'state' records (see `record_to_json`). Each record stores the
        matching token itself and is geocoded with the locations found in
        the sentence or, when the sentence names none, with the article-level
        location from `refs`.
    """
    disease, stamp, title, url, article_loc = refs
    first_day, last_day = stamp.date(), stamp.date()

    # A location named in the sentence takes precedence over the article's.
    sentence_loc = extract_location(sent)
    where = sentence_loc if sentence_loc else article_loc

    # TODO: consecutive STATE tokens should be located as one substring,
    # their token.text concatenated into a single string, and that string
    # stored in the record. Not implemented yet: each STATE token still
    # yields its own record.
    return [
        record_to_json(
            'state',
            [disease, first_day, last_day, title, url, geolocate(where), token],
        )
        for token in sent
        if token.ent_type_ == 'STATE'
    ]

In [287]:
"""
features:
    disease
    date
    title
    url
    geocode
    change
"""
def extract_trend(sent, refs):
    """Extract trend records from one sentence (not implemented yet).

    Parameters
    ----------
    sent : iterable of spaCy tokens
        The sentence to scan (currently unused).
    refs : sequence
        Article metadata in the order (disease, timestamp, title, url,
        location). It is unpacked, so it must hold exactly five items.

    Returns
    -------
    list
        Always empty for now. A list is returned, not None, so the result
        has the same type as the other extractors' results.
    """
    dis, date, title, url, loc = refs
    relations = []
    # TODO: detect trend ("change") mentions and append 'trend' records.
    return relations

In [288]:
def extract_refs(article):
    """Gather the article-level metadata shared by every extracted record.

    Parameters
    ----------
    article : mapping
        Must provide the keys 'disease', 'timestamp', 'title', 'url' and
        'content'.

    Returns
    -------
    list
        [disease, timestamp, title, url, location], where location is the
        list of tokens `extract_location` finds at the start of the parsed
        content (``ref=True``).
    """
    metadata = [article[field] for field in ('disease', 'timestamp', 'title', 'url')]
    parsed = nlp(article['content'])
    lead_location = extract_location(parsed, ref=True)
    return metadata + [lead_location]

In [289]:
def extract(df):
    """Run every extractor over each article in `df` and print the incidents.

    Parameters
    ----------
    df : pandas.DataFrame
        One article per row. The 'content' column is parsed here; the whole
        row is handed to `extract_refs` for the remaining metadata.

    Returns
    -------
    None
        For each row, the row index and the list of incident records found
        in that article are printed. Statuses and trends are collected per
        article but not reported yet.
    """
    for index, article in df.iterrows():
        doc = nlp(article['content'])
        # NOTE(review): extract_refs() runs nlp() on the same content again,
        # so every article is parsed twice.
        refs = extract_refs(article)

        incidents = []
        statuses = []
        trends = []
        for sent in doc.sents:
            # `or []` keeps an extractor that returns None from breaking extend().
            incidents.extend(extract_incident(sent, refs) or [])
            statuses.extend(extract_status(sent, refs) or [])
            trends.extend(extract_trend(sent, refs) or [])

        print(index, incidents)

    # TODO: accumulate the per-article lists into overall totals and dump
    # them to 'incidents.json', 'statuses.json' and 'trends.json'
    # (json.dump(..., indent=4, ensure_ascii=False), files opened with
    # encoding='utf-8'). The records currently hold date objects and spaCy
    # tokens, which json cannot serialize as-is.

In [290]:
# Run the extraction over every article in `df`; one line of incident
# records is printed per article.
extract(df)

0 [{'disease': 'dengue', 'date-start': datetime.date(2015, 6, 30), 'date-end': datetime.date(2015, 6, 30), 'title': 'Isabela town in state of calamity due to dengue', 'url': 'https://www.rappler.com/nation/97920-luna-isabela-state-of-calamity-dengue', 'geocode': [ISABELA, Philippines], 'incident': 54}, {'disease': 'dengue', 'date-start': datetime.date(2015, 6, 30), 'date-end': datetime.date(2015, 6, 30), 'title': 'Isabela town in state of calamity due to dengue', 'url': 'https://www.rappler.com/nation/97920-luna-isabela-state-of-calamity-dengue', 'geocode': [ISABELA, Philippines], 'incident': 19,946}]
1 []
2 [{'disease': 'dengue', 'date-start': datetime.date(2012, 9, 24), 'date-end': datetime.date(2012, 9, 24), 'title': 'Dengue in Davao up 300%', 'url': 'https://www.rappler.com/nation/12942-dengue-in-davao-up-300', 'geocode': [DAVAO, CITY, Philippines], 'incident': 3,096}, {'disease': 'dengue', 'date-start': datetime.date(2012, 9, 24), 'date-end': datetime.date(2012, 9, 24), 'title': '

KeyboardInterrupt: 

In [155]:
# Parse the article labelled 0 and turn each of its sentences into a
# stand-alone doc.
test = nlp(df['content'][0])
sents = [sent.as_doc() for sent in test.sents]
# Render the dependency parse of the sentence at index 10.
displacy.render(sents[10], style='dep', jupyter=True)

In [158]:
# Sentence-level location lookup (ref=False) on the sentence at index 1.
extract_location(sents[1])

[municipality, Luna, Isabela, province, northern, Philippines]

In [171]:
# Parse the article labelled 2 and render its entity annotations.
a = nlp(df['content'][2])
displacy.render(a, style='ent', jupyter=True)