In [125]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

In [126]:
df = pd.read_csv('promed_malaria.csv')

In [127]:
def clean(content):
	split = content.splitlines()
	last_index = -1
	lower = [x.lower().strip() for x in split]
	if '--' in lower:
		last_index = lower.index('--')
	elif 'communicated by:' in lower:
		last_index = lower.index('communicated by:')-1

	cleaned = split[12:last_index]
	return '\n'.join([x for x in cleaned if x])

In [128]:
df['content'] = df['content'].apply(clean)

In [129]:
import sys
sys.path.append('../EpiTator')

In [130]:
from epitator.annotator import AnnoDoc
from epitator.count_annotator import CountAnnotator
from epitator.date_annotator import DateAnnotator
from epitator.geoname_annotator import GeonameAnnotator

In [131]:
from transformers import BartForConditionalGeneration, BartTokenizer
from typing import List

# setup our BART transformer summarization model
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn').cuda()

In [132]:
# helper function to summarize an input text with the BART model
def summarizer(text: str) -> str:
    input_ids = tokenizer(text, return_tensors='pt', max_length=1024, padding=True, truncation=True)['input_ids']
    summary_ids = model.generate(input_ids.cuda())
    summary = ''.join([tokenizer.decode(s) for s in summary_ids])
    summary = summary.replace('<s>', '').replace('</s>', '')
    return summary

In [133]:
# function that extracts location names/admin codes/lat/lng, case and death counts, and date ranges from the input string
# uses epitator since it already trained rules for extracting medical/infectious disease data
def epitator_extract(txt, max_ents=1):
    # input string and add annotators
    doc = AnnoDoc(txt)
    doc.add_tiers(GeonameAnnotator())
    doc.add_tiers(CountAnnotator())
    doc.add_tiers(DateAnnotator())

    # extract geographic data
    geos = doc.tiers["geonames"].spans
    geo_admin1s = [x.geoname.admin1_code for x in geos]
    geo_admin2s = [x.geoname.admin2_code for x in geos]
    geo_admin3s = [x.geoname.admin3_code for x in geos]
    geo_admin4s = [x.geoname.admin4_code for x in geos]
    geo_names = [x.geoname.name for x in geos]
    geo_lats = [x.geoname.latitude for x in geos]
    geo_lons = [x.geoname.longitude for x in geos]

    # extract case counts and death counts
    counts = doc.tiers["counts"].spans
    cases_counts = [x.metadata['count'] for x in counts if 'case' in x.metadata['attributes'] and 'death' not in x.metadata['attributes']]
    cases_tags = [x.metadata['attributes'] for x in counts if 'case' in x.metadata['attributes'] and 'death' not in x.metadata['attributes']]
    death_counts = [x.metadata['count'] for x in counts if 'death' in x.metadata['attributes']]
    death_tags = [x.metadata['attributes'] for x in counts if 'death' in x.metadata['attributes']]

    # extract the date range
    dates = doc.tiers["dates"].spans
    dates_start = [pd.to_datetime(x.metadata["datetime_range"][0], errors='coerce') for x in dates]
    dates_end = [pd.to_datetime(x.metadata["datetime_range"][1], errors='coerce') for x in dates]

    # return only max_ents entities from the extracted lists
    # currently set to the first result for each list, since that is usually the most important one
    # and other ones can be filler/garbage data
    return pd.Series([ 
        geo_admin1s[:max_ents],
        geo_admin2s[:max_ents],
        geo_admin3s[:max_ents],
        geo_admin4s[:max_ents],
        geo_names[:max_ents],
        geo_lats[:max_ents],
        geo_lons[:max_ents],
        cases_counts[:max_ents],
        cases_tags[:max_ents],
        death_counts[:max_ents],
        death_tags[:max_ents],
        dates_start[:max_ents],
        dates_end[:max_ents],
    ])

In [134]:
df['summary'] = df['content'].progress_apply(summarizer)

 12%|█▏        | 90/743 [01:43<12:32,  1.15s/it]


KeyboardInterrupt: 

In [None]:
df[['admin1_code',
'admin2_code',
'admin3_code',
'admin4_code',
'location_name',
'location_lat',
'location_lon',
'cases',
'cases_tags',
'deaths',
'deaths_tags',
'dates_start',
'dates_end',]] = df['summary'].progress_apply(epitator_extract)
df = df.applymap(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else x)
df = df.applymap(lambda y: pd.NA if isinstance(y, (list, str)) and len(y) == 0 else y)

100%|██████████| 743/743 [01:02<00:00, 11.89it/s]


In [None]:
df.iloc[0]

Unnamed: 0                                                          0
id                                                            8531354
title                                       ['Malaria - Yemen: (TA)']
zoom_lat                                                    13.566667
zoom_lon                                                    44.033333
zoom_level                                                          4
alert_id                                                      8531354
feed_id                                                             1
summary             Health facilities in the province have recorde...
issue_date                                        2021-07-20 14:42:19
load_date                                         2021-07-20 14:42:19
incident_date                                                     NaN
descr               Thousands of citizens have been infected with ...
alert_tag_id                                                      NaN
dup_count           

In [None]:
df.to_csv('promed_malaria_parsed.csv', sep='\t')

In [None]:
len(df)

743

In [None]:
data = 'GPM_3IMERGHH_06_precipitationCal'
lat = '13.566667'
lon = '44.033333'
time_start = '2021-07-20T14:42:19'
time_end = '2021-07-20T14:42:19'

In [173]:
df.iloc[1][['zoom_lat', 'zoom_lon', 'issue_date']]

zoom_lat                36.519981
zoom_lon               103.891769
issue_date    2021-07-06 10:44:55
Name: 1, dtype: object

In [None]:
import requests

In [None]:
f'https://api.giovanni.earthdata.nasa.gov/timeseries?data=${data}&location=%5B${lat}%2C${lon}%5D&time=${time_start}%2F${time_end}'

'https://api.giovanni.earthdata.nasa.gov/timeseries?data=$GPM_3IMERGHH_06_precipitationCal&location=%5B$13.566667%2C$44.033333%5D&time=$2021-07-20T14:42:19%2F$2021-07-20T14:42:19'

In [None]:
from datetime import datetime
from dateutil.relativedelta import relativedelta

In [196]:
def get_gpm(row):
    headers = {
        'authorizationtoken': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1c2VyIjoieWVlYiIsInRva2VuIjoiZXlKMGVYQWlPaUpLVjFRaUxDSnZjbWxuYVc0aU9pSkZZWEowYUdSaGRHRWdURzluYVc0aUxDSmhiR2NpT2lKSVV6STFOaUo5LmV5SjBlWEJsSWpvaVQwRjFkR2dpTENKMWFXUWlPaUo1WldWaUlpd2lZMnhwWlc1MFgybGtJam9pWlRKWFZtczRVSGMyZDJWbFRGVkxXbGxQZUhaVVVTSXNJbVY0Y0NJNk1UWXlPVGt4TVRreU9Dd2lhV0YwSWpveE5qSTNNekU1T1RJNExDSnBjM01pT2lKRllYSjBhR1JoZEdFZ1RHOW5hVzRpZlEuYk9lcVlkQ2UzU0lOVWpsNWNadmtSYU5LU0RvSkN4VWRlRmZMT0tLNHFOYyIsImlhdCI6MTYyNzMxOTkyOH0.O7A7WYds19vi0R3q4THfBHHHXeqYWPJca2djiBsqwGA',
    }

    time = datetime.strptime(row['issue_date'], '%Y-%m-%d %H:%M:%S')
    time_start = time.strftime('%Y-%m-%dT00:00:00')
    time_end = (time + relativedelta(months=1)).strftime('%Y-%m-%dT00:00:00')

    params = (
        ('data', 'GPM_3IMERGHH_06_precipitationCal'),
        ('location', f'[{row["zoom_lat"]},{row["zoom_lon"]}]'),
        ('time', f'{time_start}/{time_end}'),
    )
    print(params)

    r = requests.get('https://api.giovanni.earthdata.nasa.gov/timeseries', headers=headers, params=params)
    print(r.text)
    if 'mean,' in r.text:
        mean_idx = r.text.find('mean,')
        return r.text[mean_idx+5:mean_idx+r.text[mean_idx:].find('\n')]
    # print('uhoh')
    return pd.NA


In [197]:
get_gpm(df.iloc[60])

(('data', 'GPM_3IMERGHH_06_precipitationCal'), ('location', '[75.0,-39.990234]'), ('time', '2018-07-23T00:00:00/2018-08-23T00:00:00'))
{"Message":"User is not authorized to access this resource with an explicit deny"}


<NA>

In [181]:
df['gpm'] = df.progress_apply(get_gpm, axis=1)
df.to_csv('promed_malaria_gpm.csv', sep='\t')

  2%|▏         | 13/743 [00:10<10:15,  1.19it/s]


KeyboardInterrupt: 

In [188]:
headers = {
    'authorizationtoken': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1c2VyIjoieWVlYiIsInRva2VuIjoiZXlKMGVYQWlPaUpLVjFRaUxDSnZjbWxuYVc0aU9pSkZZWEowYUdSaGRHRWdURzluYVc0aUxDSmhiR2NpT2lKSVV6STFOaUo5LmV5SjBlWEJsSWpvaVQwRjFkR2dpTENKMWFXUWlPaUo1WldWaUlpd2lZMnhwWlc1MFgybGtJam9pWlRKWFZtczRVSGMyZDJWbFRGVkxXbGxQZUhaVVVTSXNJbVY0Y0NJNk1UWXlPVGcyTnprNU9Dd2lhV0YwSWpveE5qSTNNamMxT1RrNExDSnBjM01pT2lKRllYSjBhR1JoZEdFZ1RHOW5hVzRpZlEucmFkZFl0ZHRuOXlGQ3RwOTNENHRselp3bVU1Y1ZVRjFoclJYZXY5dlhQMCIsImlhdCI6MTYyNzI3NTk5OX0.0y_IMp4_2cUrUWzZRVmkL5MBdDPI7VzFLPtAQb1_fEk',
}
# df.iloc[1]['zoom_lat'], 'zoom_lon', 'issue_date']]
# params = (
#     ('data', 'GPM_3IMERGHH_06_precipitationCal'),
#     ('location', f'[{df.iloc[1]["zoom_lat"]},{df.iloc[1]["zoom_lon"]}]'),
#     ('time', '2018-07-23T00:00:00/2018-08-23T00:00:00'),
# )
params = (('data', 'GPM_3IMERGHH_06_precipitationCal'), ('location', '[75.0,-39.990234]'), ('time', '2018-07-23T00:00:00/2018-08-23T00:00:00'))

response = requests.get('https://api.giovanni.earthdata.nasa.gov/timeseries', headers=headers, params=params)

In [190]:
print(response.text)

{"Message":"User is not authorized to access this resource with an explicit deny"}


In [191]:
print(response)

<Response [403]>


In [192]:
headers = {
    'authorizationtoken': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1c2VyIjoieWVlYiIsInRva2VuIjoiZXlKMGVYQWlPaUpLVjFRaUxDSnZjbWxuYVc0aU9pSkZZWEowYUdSaGRHRWdURzluYVc0aUxDSmhiR2NpT2lKSVV6STFOaUo5LmV5SjBlWEJsSWpvaVQwRjFkR2dpTENKMWFXUWlPaUo1WldWaUlpd2lZMnhwWlc1MFgybGtJam9pWlRKWFZtczRVSGMyZDJWbFRGVkxXbGxQZUhaVVVTSXNJbVY0Y0NJNk1UWXlPVGcyTnprNU9Dd2lhV0YwSWpveE5qSTNNamMxT1RrNExDSnBjM01pT2lKRllYSjBhR1JoZEdFZ1RHOW5hVzRpZlEucmFkZFl0ZHRuOXlGQ3RwOTNENHRselp3bVU1Y1ZVRjFoclJYZXY5dlhQMCIsImlhdCI6MTYyNzI3NTk5OX0.0y_IMp4_2cUrUWzZRVmkL5MBdDPI7VzFLPtAQb1_fEk',
}

params = (
    ('data', 'GPM_3IMERGHH_06_precipitationCal'),
    ('location', '[4.75,0.55]'),
    ('time', '2000-06-01T00:00:00/2000-06-01T07:30:00'),
)

response = requests.get('https://api.giovanni.earthdata.nasa.gov/timeseries', headers=headers, params=params)

In [193]:
print(response)

<Response [403]>


In [198]:
curl -X GET "https://api.giovanni.earthdata.nasa.gov/timeseries?data=GPM_3IMERGHH_06_precipitationCal&location=%5B4.75%2C0.55%5D&time=2000-06-01T00:00:00%2F2000-06-01T07:30:00" -H "authorizationtoken: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1c2VyIjoieWVlYiIsInRva2VuIjoiZXlKMGVYQWlPaUpLVjFRaUxDSnZjbWxuYVc0aU9pSkZZWEowYUdSaGRHRWdURzluYVc0aUxDSmhiR2NpT2lKSVV6STFOaUo5LmV5SjBlWEJsSWpvaVQwRjFkR2dpTENKMWFXUWlPaUo1WldWaUlpd2lZMnhwWlc1MFgybGtJam9pWlRKWFZtczRVSGMyZDJWbFRGVkxXbGxQZUhaVVVTSXNJbVY0Y0NJNk1UWXlPVGt4TVRreU9Dd2lhV0YwSWpveE5qSTNNekU1T1RJNExDSnBjM01pT2lKRllYSjBhR1JoZEdFZ1RHOW5hVzRpZlEuYk9lcVlkQ2UzU0lOVWpsNWNadmtSYU5LU0RvSkN4VWRlRmZMT0tLNHFOYyIsImlhdCI6MTYyNzMxOTkyOH0.O7A7WYds19vi0R3q4THfBHHHXeqYWPJca2djiBsqwGA"

SyntaxError: invalid syntax (<ipython-input-198-bc5436a29e9f>, line 1)