In [2]:
DATASET_PATH = '../data/promed_dengue.csv'

In [3]:
import pandas as pd
import re
from tqdm import tqdm
tqdm.pandas()

In [4]:
sys.path.append('../Epitator')

In [5]:
from epitator.annotator import AnnoDoc
from epitator.count_annotator import CountAnnotator
from epitator.date_annotator import DateAnnotator
from epitator.geoname_annotator import GeonameAnnotator

In [1]:
from transformers import BartForConditionalGeneration, BartTokenizer
from typing import List

# setup our BART transformer summarization model
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn').cuda()

In [6]:
# helper function to summarize an input text with the BART model
def summarizer(text: str) -> str:
    input_ids = tokenizer(text, return_tensors='pt', max_length=1024, padding=True, truncation=True)['input_ids']
    summary_ids = model.generate(input_ids.cuda())
    summary = ''.join([tokenizer.decode(s) for s in summary_ids])
    summary = summary.replace('<s>', '').replace('</s>', '')
    return summary

In [7]:
# helper function to strip html tags from a string (needed for better accuracy)
def clean_html(raw_html: str, strip=True) -> str:
  cleanr = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
  clean = re.sub(cleanr, '', raw_html)
  if strip:
    clean = re.sub(r'\W+', ' ', clean)
  return clean

In [8]:
# helper function to extract the date the article was published from the header/title
def extract_publish_date(text: str) -> str:
    return re.search(r'[0-9]{4}-(0[1-9]|1[0-2])-(0[1-9]|[1-2][0-9]|3[0-1])', text)[0]

In [9]:
DISEASES = [
    'Chagas disease',
    'Chikungunya',
    'Congo-Crimean haemorrhagic fever',
    'Dengue',
    'Dracunculiasis',
    'guinea-worm disease'
    'Human African trypanosomiasis',
    'Leishmaniasis',
    'Lymphatic filariasis',
    'Lyme disease',
    'Malaria',
    'Onchocerciasis',
    'Schistosomiasis',
    'Yellow fever',
]
# helper function to extract type of vector-borne disease from data
def extract_disease(txt: str) -> str:
    txt = txt.lower()
    for d in DISEASES:
        if d.lower() in txt:
            return d

    return 'Not found'

In [10]:
# function that extracts location names/admin codes/lat/lng, case and death counts, and date ranges from the input string
# uses epitator since it already trained rules for extracting medical/infectious disease data
def epitator_extract(txt, max_ents=1):
    # input string and add annotators
    doc = AnnoDoc(txt)
    doc.add_tiers(GeonameAnnotator())
    doc.add_tiers(CountAnnotator())
    doc.add_tiers(DateAnnotator())

    # extract geographic data
    geos = doc.tiers["geonames"].spans
    geo_admin1s = [x.geoname.admin1_code for x in geos]
    geo_admin2s = [x.geoname.admin2_code for x in geos]
    geo_admin3s = [x.geoname.admin3_code for x in geos]
    geo_admin4s = [x.geoname.admin4_code for x in geos]
    geo_names = [x.geoname.name for x in geos]
    geo_lats = [x.geoname.latitude for x in geos]
    geo_lons = [x.geoname.longitude for x in geos]

    # extract case counts and death counts
    counts = doc.tiers["counts"].spans
    cases_counts = [x.metadata['count'] for x in counts if 'case' in x.metadata['attributes'] and 'death' not in x.metadata['attributes']]
    cases_tags = [x.metadata['attributes'] for x in counts if 'case' in x.metadata['attributes'] and 'death' not in x.metadata['attributes']]
    death_counts = [x.metadata['count'] for x in counts if 'death' in x.metadata['attributes']]
    death_tags = [x.metadata['attributes'] for x in counts if 'death' in x.metadata['attributes']]

    # extract the date range
    dates = doc.tiers["dates"].spans
    dates_start = [pd.to_datetime(x.metadata["datetime_range"][0], errors='coerce') for x in dates]
    dates_end = [pd.to_datetime(x.metadata["datetime_range"][1], errors='coerce') for x in dates]

    # return only max_ents entities from the extracted lists
    # currently set to the first result for each list, since that is usually the most important one
    # and other ones can be filler/garbage data
    return pd.Series([ 
        geo_admin1s[:max_ents],
        geo_admin2s[:max_ents],
        geo_admin3s[:max_ents],
        geo_admin4s[:max_ents],
        geo_names[:max_ents],
        geo_lats[:max_ents],
        geo_lons[:max_ents],
        cases_counts[:max_ents],
        cases_tags[:max_ents],
        death_counts[:max_ents],
        death_tags[:max_ents],
        dates_start[:max_ents],
        dates_end[:max_ents],
    ])

In [11]:
df = pd.read_csv(DATASET_PATH)
print(len(df))
df.head()

1414


Unnamed: 0,id,header,body
0,8322075,"<span class=""blue"">Published Date:</span> 2021...",DENGUE/DHF UPDATE (02): AMERICAS<br>**********...
1,8163513,"<span class=""blue"">Published Date:</span> 2021...",DENGUE/DHF UPDATE (01): AMERICAS<br>**********...
2,8094266,"<span class=""blue"">Published Date:</span> 2021...",CORONAVIRUS DISEASE 2019 UPDATE (14): IMMUNE R...
3,7961056,"<span class=""blue"">Published Date:</span> 2020...","DENGUE/DHF UPDATE (13): ASIA, EUROPE, AFRICA, ..."
4,7945357,"<span class=""blue"">Published Date:</span> 2020...",DENGUE/DHF UPDATE (12): AMERICAS<br>**********...


In [12]:
df['publish_date'] = df['header'].progress_apply(extract_publish_date) # add date column
df['header'] = df['header'].progress_apply(clean_html)
df['body'] = df['body'].progress_apply(clean_html)
df['summary'] = df['body'].progress_apply(summarizer)
df['disease'] = df['body'].progress_apply(extract_disease)
df[['admin1_code',
'admin2_code',
'admin3_code',
'admin4_code',
'location_name',
'location_lat',
'location_lon',
'cases',
'cases_tags',
'deaths',
'deaths_tags',
'dates_start',
'dates_end',]] = df['summary'].progress_apply(epitator_extract)
df = df.applymap(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else x)
df = df.applymap(lambda y: pd.NA if isinstance(y, (list, str)) and len(y) == 0 else y)

Bad date range split: 1908 1926 and 1943 to 1944 ['1908 1926 ', ' 1943 ', ' 1944']
Bad date range split: 1908 1926 and 1943 to 1944 ['1908 1926 ', ' 1943 ', ' 1944']


In [24]:
df.iloc[1]

id                                                         8163513
header           Published Date 2021 02 04 06 39 21Subject PRO ...
body             DENGUE DHF UPDATE 01 AMERICAS A ProMED mail po...
publish_date                                            2021-02-04
summary          Dengue fever in the Americas by country or ter...
disease                                                     Dengue
admin1_code                                                     00
admin2_code                                                   <NA>
admin3_code                                                   <NA>
admin4_code                                                   <NA>
location_name                                        United States
location_lat                                                 39.76
location_lon                                                 -98.5
cases                                                           53
cases_tags                                                  [c

In [14]:
df.to_csv(DATASET_PATH[:-4]+'_parsed.csv')