In [90]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

In [126]:
df = pd.read_csv('promed_malaria.csv')

In [127]:
def clean(content):
	split = content.splitlines()
	last_index = -1
	lower = [x.lower().strip() for x in split]
	if '--' in lower:
		last_index = lower.index('--')
	elif 'communicated by:' in lower:
		last_index = lower.index('communicated by:')-1

	cleaned = split[12:last_index]
	return '\n'.join([x for x in cleaned if x])

In [128]:
df['content'] = df['content'].apply(clean)

In [80]:
import sys
sys.path.append('../EpiTator')

In [81]:
from epitator.annotator import AnnoDoc
from epitator.count_annotator import CountAnnotator
from epitator.date_annotator import DateAnnotator
from epitator.geoname_annotator import GeonameAnnotator

In [43]:
from transformers import BartForConditionalGeneration, BartTokenizer
from typing import List

# setup our BART transformer summarization model
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn').cuda()

In [44]:
# helper function to summarize an input text with the BART model
def summarizer(text: str) -> str:
    input_ids = tokenizer(text, return_tensors='pt', max_length=1024, padding=True, truncation=True)['input_ids']
    summary_ids = model.generate(input_ids.cuda())
    summary = ''.join([tokenizer.decode(s) for s in summary_ids])
    summary = summary.replace('<s>', '').replace('</s>', '')
    return summary

In [85]:
import time

In [86]:
# function that extracts location names/admin codes/lat/lng, case and death counts, and date ranges from the input string
# uses epitator since it already trained rules for extracting medical/infectious disease data
def epitator_extract(txt, max_ents=1):
    # input string and add annotators
    t0 = time.time()

    doc = AnnoDoc(txt)
    print('doc', time.time()-t0)
    doc.add_tiers(GeonameAnnotator())
    print('geoname', time.time()-t0)
    doc.add_tiers(CountAnnotator())
    print('count', time.time()-t0)
    doc.add_tiers(DateAnnotator())
    print('date', time.time()-t0)

    # extract geographic data
    geos = doc.tiers["geonames"].spans
    geo_admin1s = [x.geoname.admin1_code for x in geos]
    geo_admin2s = [x.geoname.admin2_code for x in geos]
    geo_admin3s = [x.geoname.admin3_code for x in geos]
    geo_admin4s = [x.geoname.admin4_code for x in geos]
    geo_names = [x.geoname.name for x in geos]
    geo_lats = [x.geoname.latitude for x in geos]
    geo_lons = [x.geoname.longitude for x in geos]

    # extract case counts and death counts
    counts = doc.tiers["counts"].spans
    cases_counts = [x.metadata['count'] for x in counts if 'case' in x.metadata['attributes'] and 'death' not in x.metadata['attributes']]
    cases_tags = [x.metadata['attributes'] for x in counts if 'case' in x.metadata['attributes'] and 'death' not in x.metadata['attributes']]
    death_counts = [x.metadata['count'] for x in counts if 'death' in x.metadata['attributes']]
    death_tags = [x.metadata['attributes'] for x in counts if 'death' in x.metadata['attributes']]

    # extract the date range
    dates = doc.tiers["dates"].spans
    dates_start = [pd.to_datetime(x.metadata["datetime_range"][0], errors='coerce') for x in dates]
    dates_end = [pd.to_datetime(x.metadata["datetime_range"][1], errors='coerce') for x in dates]

    # return only max_ents entities from the extracted lists
    # currently set to the first result for each list, since that is usually the most important one
    # and other ones can be filler/garbage data
    return pd.Series([ 
        geo_admin1s[:max_ents],
        geo_admin2s[:max_ents],
        geo_admin3s[:max_ents],
        geo_admin4s[:max_ents],
        geo_names[:max_ents],
        geo_lats[:max_ents],
        geo_lons[:max_ents],
        cases_counts[:max_ents],
        cases_tags[:max_ents],
        death_counts[:max_ents],
        death_tags[:max_ents],
        dates_start[:max_ents],
        dates_end[:max_ents],
    ])

In [134]:
df['summary'] = df['content'].progress_apply(summarizer)

 12%|█▏        | 90/743 [01:43<12:32,  1.15s/it]


KeyboardInterrupt: 

In [None]:
df[['admin1_code',
'admin2_code',
'admin3_code',
'admin4_code',
'location_name',
'location_lat',
'location_lon',
'cases',
'cases_tags',
'deaths',
'deaths_tags',
'dates_start',
'dates_end',]] = df['summary'].progress_apply(epitator_extract)
df = df.applymap(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else x)
df = df.applymap(lambda y: pd.NA if isinstance(y, (list, str)) and len(y) == 0 else y)

100%|██████████| 743/743 [01:02<00:00, 11.89it/s]


In [None]:
df.iloc[0]

Unnamed: 0                                                          0
id                                                            8531354
title                                       ['Malaria - Yemen: (TA)']
zoom_lat                                                    13.566667
zoom_lon                                                    44.033333
zoom_level                                                          4
alert_id                                                      8531354
feed_id                                                             1
summary             Health facilities in the province have recorde...
issue_date                                        2021-07-20 14:42:19
load_date                                         2021-07-20 14:42:19
incident_date                                                     NaN
descr               Thousands of citizens have been infected with ...
alert_tag_id                                                      NaN
dup_count           

In [None]:
df.to_csv('promed_malaria_parsed.csv', sep='\t')

In [None]:
len(df)

743

In [None]:
data = 'GPM_3IMERGHH_06_precipitationCal'
lat = '13.566667'
lon = '44.033333'
time_start = '2021-07-20T14:42:19'
time_end = '2021-07-20T14:42:19'

In [173]:
df.iloc[1][['zoom_lat', 'zoom_lon', 'issue_date']]

zoom_lat                36.519981
zoom_lon               103.891769
issue_date    2021-07-06 10:44:55
Name: 1, dtype: object

In [94]:
import requests

In [None]:
f'https://api.giovanni.earthdata.nasa.gov/timeseries?data=${data}&location=%5B${lat}%2C${lon}%5D&time=${time_start}%2F${time_end}'

'https://api.giovanni.earthdata.nasa.gov/timeseries?data=$GPM_3IMERGHH_06_precipitationCal&location=%5B$13.566667%2C$44.033333%5D&time=$2021-07-20T14:42:19%2F$2021-07-20T14:42:19'

In [None]:
from datetime import datetime
from dateutil.relativedelta import relativedelta

In [196]:
def get_gpm(row):
    headers = {
        'authorizationtoken': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1c2VyIjoieWVlYiIsInRva2VuIjoiZXlKMGVYQWlPaUpLVjFRaUxDSnZjbWxuYVc0aU9pSkZZWEowYUdSaGRHRWdURzluYVc0aUxDSmhiR2NpT2lKSVV6STFOaUo5LmV5SjBlWEJsSWpvaVQwRjFkR2dpTENKMWFXUWlPaUo1WldWaUlpd2lZMnhwWlc1MFgybGtJam9pWlRKWFZtczRVSGMyZDJWbFRGVkxXbGxQZUhaVVVTSXNJbVY0Y0NJNk1UWXlPVGt4TVRreU9Dd2lhV0YwSWpveE5qSTNNekU1T1RJNExDSnBjM01pT2lKRllYSjBhR1JoZEdFZ1RHOW5hVzRpZlEuYk9lcVlkQ2UzU0lOVWpsNWNadmtSYU5LU0RvSkN4VWRlRmZMT0tLNHFOYyIsImlhdCI6MTYyNzMxOTkyOH0.O7A7WYds19vi0R3q4THfBHHHXeqYWPJca2djiBsqwGA',
    }

    time = datetime.strptime(row['issue_date'], '%Y-%m-%d %H:%M:%S')
    time_start = time.strftime('%Y-%m-%dT00:00:00')
    time_end = (time + relativedelta(months=1)).strftime('%Y-%m-%dT00:00:00')

    params = (
        ('data', 'GPM_3IMERGHH_06_precipitationCal'),
        ('location', f'[{row["zoom_lat"]},{row["zoom_lon"]}]'),
        ('time', f'{time_start}/{time_end}'),
    )
    print(params)

    r = requests.get('https://api.giovanni.earthdata.nasa.gov/timeseries', headers=headers, params=params)
    print(r.text)
    if 'mean,' in r.text:
        mean_idx = r.text.find('mean,')
        return r.text[mean_idx+5:mean_idx+r.text[mean_idx:].find('\n')]
    # print('uhoh')
    return pd.NA


In [197]:
get_gpm(df.iloc[60])

(('data', 'GPM_3IMERGHH_06_precipitationCal'), ('location', '[75.0,-39.990234]'), ('time', '2018-07-23T00:00:00/2018-08-23T00:00:00'))
{"Message":"User is not authorized to access this resource with an explicit deny"}


<NA>

In [181]:
df['gpm'] = df.progress_apply(get_gpm, axis=1)
df.to_csv('promed_malaria_gpm.csv', sep='\t')

  2%|▏         | 13/743 [00:10<10:15,  1.19it/s]


KeyboardInterrupt: 

In [188]:
headers = {
    'authorizationtoken': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1c2VyIjoieWVlYiIsInRva2VuIjoiZXlKMGVYQWlPaUpLVjFRaUxDSnZjbWxuYVc0aU9pSkZZWEowYUdSaGRHRWdURzluYVc0aUxDSmhiR2NpT2lKSVV6STFOaUo5LmV5SjBlWEJsSWpvaVQwRjFkR2dpTENKMWFXUWlPaUo1WldWaUlpd2lZMnhwWlc1MFgybGtJam9pWlRKWFZtczRVSGMyZDJWbFRGVkxXbGxQZUhaVVVTSXNJbVY0Y0NJNk1UWXpNVEE0TURBd09Td2lhV0YwSWpveE5qSTRORGc0TURBNUxDSnBjM01pT2lKRllYSjBhR1JoZEdFZ1RHOW5hVzRpZlEuZ3RXM1MzdjVQZlNUdk1iaHZiZXJjcV85b2R5WWxpRV84ZGMxM3lxMmVjWSIsImlhdCI6MTYyODQ4ODAwOX0.BvWgKN1ZTpqbs4uHFYU3dLBavtCY2bRvyqf-NVF8AfM',
}
# df.iloc[1]['zoom_lat'], 'zoom_lon', 'issue_date']]
# params = (
#     ('data', 'GPM_3IMERGHH_06_precipitationCal'),
#     ('location', f'[{df.iloc[1]["zoom_lat"]},{df.iloc[1]["zoom_lon"]}]'),
#     ('time', '2018-07-23T00:00:00/2018-08-23T00:00:00'),
# )
params = (('data', 'GPM_3IMERGHH_06_precipitationCal'), ('location',
          '[75.0,-39.990234]'), ('time', '2018-07-23T00:00:00/2018-08-23T00:00:00'))

response = requests.get(
    'https://api.giovanni.earthdata.nasa.gov/timeseries', headers=headers, params=params)


In [190]:
print(response.text)

{"Message":"User is not authorized to access this resource with an explicit deny"}


In [191]:
print(response)

<Response [403]>


In [192]:
headers = {
    'authorizationtoken': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1c2VyIjoieWVlYiIsInRva2VuIjoiZXlKMGVYQWlPaUpLVjFRaUxDSnZjbWxuYVc0aU9pSkZZWEowYUdSaGRHRWdURzluYVc0aUxDSmhiR2NpT2lKSVV6STFOaUo5LmV5SjBlWEJsSWpvaVQwRjFkR2dpTENKMWFXUWlPaUo1WldWaUlpd2lZMnhwWlc1MFgybGtJam9pWlRKWFZtczRVSGMyZDJWbFRGVkxXbGxQZUhaVVVTSXNJbVY0Y0NJNk1UWXlPVGcyTnprNU9Dd2lhV0YwSWpveE5qSTNNamMxT1RrNExDSnBjM01pT2lKRllYSjBhR1JoZEdFZ1RHOW5hVzRpZlEucmFkZFl0ZHRuOXlGQ3RwOTNENHRselp3bVU1Y1ZVRjFoclJYZXY5dlhQMCIsImlhdCI6MTYyNzI3NTk5OX0.0y_IMp4_2cUrUWzZRVmkL5MBdDPI7VzFLPtAQb1_fEk',
}

params = (
    ('data', 'GPM_3IMERGHH_06_precipitationCal'),
    ('location', '[4.75,0.55]'),
    ('time', '2000-06-01T00:00:00/2000-06-01T07:30:00'),
)

response = requests.get('https://api.giovanni.earthdata.nasa.gov/timeseries', headers=headers, params=params)

In [193]:
print(response)

<Response [403]>


In [198]:
curl -X GET "https://api.giovanni.earthdata.nasa.gov/timeseries?data=GPM_3IMERGHH_06_precipitationCal&location=%5B4.75%2C0.55%5D&time=2000-06-01T00:00:00%2F2000-06-01T07:30:00" -H "authorizationtoken: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1c2VyIjoieWVlYiIsInRva2VuIjoiZXlKMGVYQWlPaUpLVjFRaUxDSnZjbWxuYVc0aU9pSkZZWEowYUdSaGRHRWdURzluYVc0aUxDSmhiR2NpT2lKSVV6STFOaUo5LmV5SjBlWEJsSWpvaVQwRjFkR2dpTENKMWFXUWlPaUo1WldWaUlpd2lZMnhwWlc1MFgybGtJam9pWlRKWFZtczRVSGMyZDJWbFRGVkxXbGxQZUhaVVVTSXNJbVY0Y0NJNk1UWXlPVGt4TVRreU9Dd2lhV0YwSWpveE5qSTNNekU1T1RJNExDSnBjM01pT2lKRllYSjBhR1JoZEdFZ1RHOW5hVzRpZlEuYk9lcVlkQ2UzU0lOVWpsNWNadmtSYU5LU0RvSkN4VWRlRmZMT0tLNHFOYyIsImlhdCI6MTYyNzMxOTkyOH0.O7A7WYds19vi0R3q4THfBHHHXeqYWPJca2djiBsqwGA"

SyntaxError: invalid syntax (<ipython-input-198-bc5436a29e9f>, line 1)

In [1]:
import pandas as pd

In [2]:
df = pd.read_feather('dataset.v1.0.feather')

In [11]:
content = df.loc[(df['disease'] == 'malaria')& (df['cases'].isna())][['content', 'summary']]

In [15]:
content.iloc[6]['content']

'Citation: Uwimana A, Umulisa N, Venkatesan M, et al. Association of _Plasmodium falciparum kelch13_ R561H genotypes with delayed parasite clearance in Rwanda: an open-label, single-arm, multicentre, therapeutic efficacy study. Lancet. Epub 14 Apr 2021. <https://doi.org/10.1016/S1473-3099(21)00142-0>\nSummary\n------\nBackground: Partial artemisinin resistance is suspected if delayed parasite clearance (i.e., persistence of parasitaemia on day 3 after treatment initiation) is observed. Validated markers of artemisinin partial resistance in southeast Asia, _Plasmodium falciparum kelch13_ (_Pfkelch13_) R561H and P574L, have been reported in Rwanda, but no association with parasite clearance has been observed. We aimed to establish the efficacy of artemether-lumefantrine and genetic characterisation of _Pfkelch13_ alleles and their association with treatment outcomes.\nMethods: This open-label, single-arm, multicentre, therapeutic efficacy study was done in 2018 in 3 Rwandan sites: Masaka

In [17]:
content.iloc[6]['summary']

'Researchers found the presence of malaria parasite _Plasmodium knowlesi_ in the north Indian population. The zoonotic parasite was found in the states of Uttar Pradesh, Haryana, and Delhi. A previous study at AIIMS had found its presence in the Andaman and Nicobar Islands.'

In [28]:
len(df[df['summary'].str.contains('case')])

1402

In [29]:
len(df)

2991

In [30]:
df[df['summary'].str.contains('case')]['cases'].isna().sum()

271

In [38]:
valid_df = df[(df['summary'].str.contains('case')) & (df['cases'].notna())]

In [40]:
valid_df.iloc[0]

index                                                                    5
Unnamed: 0                                                         7945357
id                                                                 7945357
title                                 ['Dengue/DHF update (12): Americas']
zoom_lat                                                              75.0
zoom_lon                                                        -39.990234
zoom_level                                                               4
alert_id                                                           7945357
feed_id                                                                  1
summary                  Cases in various countries. WHO/PAHO countries...
issue_date                                             2020-11-15 19:55:03
load_date                                              2020-11-15 19:55:03
incident_date                                                          NaN
descr                    

In [42]:
valid_df['precipitation_anomaly'].corr(valid_df['cases'])

-0.013714449294276154

In [45]:
summarizer('''Malaria has taken a turn for the worse in tribal dominated remote
hamlets under Chawmanu rural administrative block of Dhalai district
in North Tripura with 5 deaths reported. The death toll in diarrhoea,
viral fever and malaria has reached 129 so far this year in the area.

Officials here today said 7 special medical teams from the district
headquarters had been dispatched to the affected hamlets with
necessary medical aids.

Medicine, saline and instant malaria parasite testing kits were sent
from Agartala while 7 more doctors were posted in various primary
health centres in the area in the past one month to contain the
situation, said a top health official. He, however, pointed out that
scanty rainfall and humid weather were the main reasons for spread of
the diseases in hilly locations.

The entire Dhalai district had already been identified as malaria
prone and become drug resistant. Spraying of DDT, use of instant
diagnostic kits in highly malaria prone areas besides, distribution of
115 000 medicated nets and fogging machines were on.

Alarmed at the increasing incidence of deaths due to malaria, the
Tripura National Vector Borne Disease Control Programme (NVBDC) had
conducted a comprehensive study in malaria prone areas on
effectiveness of conventional medicines for malaria treatment.

The Central Drug Research Institute (CDRI) categorised the state as
Chloroquine (malaria drug) resistant 3 years back and Sulpha-
Pyramethamine Combination, 2nd-generation anti-malaria drugs had been
prescribed for the patients.

The _Plasmodium falciparum_ (PF) component present in malaria affected
patients and Chloroquine had become inactive for long-time sufferers. New
chemoprophylactic drugs had been supplied as an immediate measure, said
health officials.
''')

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  ..\aten\src\ATen\native\BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


'Malaria has taken a turn for the worse in tribal dominated remote hamlets of North Tripura. Death toll in diarrhoea, viral fever and malaria has reached 129 so far this year in the area. 7 special medical teams from the district headquarters have been dispatched to the affected hamlets.'

In [83]:
extraction = epitator_extract('''Malaria has claimed the lives of more than 700 people in Gombe state
in the last 2.5 years according to Mallam Ahmed Audu. Evaluating
Officer of the World Bank Assisted Malaria Control Booster Project in
Gombe.

Mallam Ahmed Audu made this revelation recently in Dukku LGA [Local
Government Area] of the state during an interactive session organized
for working journalists in the state.

He said more than 100 deaths occurred between February and May this
year [2011] alone even as he pointed out that about 170 000 people
were infected by the malaria scourge in the 11 Local Government Areas
of the State from 2009 to May 2011.

According to the project monitoring officer, about 46 000 out of the
malaria prevalence rate were recorded between February and May this
year [2011]. Audu, however, stated that the project had in February
this year distributed over 130 000 doses of malaria drugs to all
government hospitals in the 11 local government areas in the state.

He further said about 2 million doses of malaria preventive drugs for
pregnant women and children were donated to the state Ministry of
Health for use during the Children Health Week in May this year.

Also speaking, the Project Manager, Dr. Arnold Abel stated that
malaria accounted for the loss of over N130 billion [about USD 844
million] annually in Nigeria as well as for 30 percent deaths in
children and 5 percent in pregnant women.

In his speech, the Project Mobilization officer, Mallam Muhammad
Lawal, said there were plans to produce jingles, drama, and programmes
on malaria attacks and treatment on the state radio and television
stations in an effort to reduce its spread, especially among poor
rural communities. He also said the Project collaborated with the
State Environmental Protection Agency (GOSEPA) on refuse disposal
while plans were underway to resurrect sanitary departments in the 11
local government areas of the state.

[Byline: Williams Attah]
''')

In [91]:
oaa = pd.DataFrame()
oaa[['admin1_code',
'admin2_code',
'admin3_code',
'admin4_code',
'location_name',
'location_lat',
'location_lon',
'cases',
'cases_tags',
'deaths',
'deaths_tags',
'dates_start',
'dates_end',]] = df['content'].iloc[:6].progress_apply(epitator_extract)

  0%|          | 0/6 [00:00<?, ?it/s]

doc 0.0
geoname 4.67788290977478
count 25.025121450424194


 33%|███▎      | 2/6 [00:43<01:27, 21.87s/it]

date 43.693984270095825
doc 0.0
geoname 3.761343240737915
count 8.320590734481812


 50%|█████     | 3/6 [00:55<00:52, 17.55s/it]

date 11.48423433303833
doc 0.0
geoname 2.6495070457458496
count 9.396372318267822


 67%|██████▋   | 4/6 [01:12<00:35, 17.51s/it]

date 17.423551321029663
doc 0.0
geoname 1.6490018367767334
count 6.492999315261841


 83%|████████▎ | 5/6 [01:23<00:15, 15.14s/it]

date 10.70549464225769
doc 0.0
geoname 2.760970115661621
count 13.286296606063843


100%|██████████| 6/6 [01:45<00:00, 17.49s/it]

date 22.202698469161987
doc 0.0
geoname 7.592491626739502
count 18.948084354400635


100%|██████████| 6/6 [02:14<00:00, 22.42s/it]

date 28.867743492126465





In [93]:
print(oaa.iloc[0])

admin1_code                       [12]
admin2_code                      [035]
admin3_code                         []
admin4_code                         []
location_name               [Américas]
location_lat                [18.36455]
location_lon               [-99.54467]
cases                        [2163354]
cases_tags                    [[case]]
deaths                            [53]
deaths_tags            [[case, death]]
dates_start      [2021-01-02 00:00:00]
dates_end        [2021-01-03 00:00:00]
Name: 0, dtype: object


In [92]:
df['content'].iloc[0]



In [None]:
['admin1_code',
'admin2_code',
'admin3_code',
'admin4_code',
'location_name',
'location_lat',
'location_lon',
'cases',
'cases_tags',
'deaths',
'deaths_tags',
'dates_start',
'dates_end',]

In [75]:
df= pd.read_feather('combined_df_anomaly.feather')

In [65]:
bruh = df[~df['content'].str.contains('|'.join(('case', 'cases', 'death', 'deaths')))].reset_index()

In [76]:
df = df[df['content'].str.contains('|'.join(('case', 'cases', 'death', 'deaths')))].reset_index()