<a href="https://colab.research.google.com/github/sindhoora8/DataAnalyticsFall2022_Sindhoora_Mandadi/blob/main/Parse.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Absolute path to the ProMED dengue CSV on the author's local machine.
# NOTE(review): none of the cells shown in this notebook read DATASET_PATH (the load
# cell passes the literal 'promed_Dengue.csv' to pd.read_csv) — confirm it is still needed.
DATASET_PATH = '/Users/sindhooramandadi/Documents/GitHub/Untitled/promed_Dengue.csv'

In [None]:
import pandas as pd
import re
import sys
from tqdm import tqdm
tqdm.pandas()

In [None]:
# Add the sibling '../Epitator' directory to the import search path.
# NOTE(review): presumably a local EpiTator checkout; a later cell also pip-installs
# `epitator` — confirm which copy is meant to be imported.
sys.path.append('../Epitator')


In [None]:
from epitator.annotator import AnnoDoc
from epitator.count_annotator import CountAnnotator
from epitator.date_annotator import DateAnnotator
from epitator.geoname_annotator import GeonameAnnotator

ModuleNotFoundError: ignored

In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer
from typing import List

# setup our BART transformer summarization model
import torch  # imported in this cell so the device check below is self-contained

_BART_CHECKPOINT = 'facebook/bart-large-cnn'
# Use the GPU when one is available, otherwise fall back to the CPU instead of
# crashing inside `.cuda()` on machines without CUDA.
_DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = BartTokenizer.from_pretrained(_BART_CHECKPOINT)
# Code that feeds tensors to `model` should move them to `model.device`.
model = BartForConditionalGeneration.from_pretrained(_BART_CHECKPOINT).to(_DEVICE)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [None]:
# helper function to summarize an input text with the BART model
def summarizer(text: str) -> str:
    """Summarize ``text`` with the module-level BART ``model`` and ``tokenizer``.

    The input is tokenized with ``max_length=1024`` and ``truncation=True``,
    so longer inputs are cut off before generation.

    Args:
        text: The text to summarize.

    Returns:
        The generated summary as a single string, with special tokens
        (such as ``<s>`` and ``</s>``) removed.
    """
    encoded = tokenizer(text, return_tensors='pt', max_length=1024, padding=True, truncation=True)
    # Send the inputs to whichever device the model lives on rather than
    # assuming CUDA; this works for both GPU and CPU models.
    input_ids = encoded['input_ids'].to(model.device)
    summary_ids = model.generate(input_ids)
    # Let the tokenizer drop special tokens while decoding instead of
    # removing '<s>' / '</s>' from the decoded string afterwards.
    return ''.join(tokenizer.batch_decode(summary_ids, skip_special_tokens=True))


In [None]:
# helper function to strip html tags from a string (needed for better accuracy)
# Compiled once: an HTML tag (which may span several lines) or a named / decimal /
# hexadecimal character entity such as &amp; &#39; &#x27;. Matching is
# case-insensitive so entities like &Eacute; or &#X41; are removed too.
_HTML_TAG_OR_ENTITY = re.compile(
    r'<[^>]*>|&(?:[a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});',
    re.IGNORECASE,
)
_NON_WORD_RUN = re.compile(r'\W+')


def clean_html(raw_html: str, strip=True) -> str:
    """Remove HTML tags and character entities from ``raw_html``.

    Args:
        raw_html: Text that may contain HTML markup.
        strip: When true (the default), every run of non-word characters left
            after tag removal is replaced with a single space.

    Returns:
        The text with tags and entities deleted (entities are removed, not
        decoded), optionally with non-word runs collapsed to spaces.
    """
    clean = _HTML_TAG_OR_ENTITY.sub('', raw_html)
    if strip:
        clean = _NON_WORD_RUN.sub(' ', clean)
    return clean

In [None]:
# helper function to extract the date the article was published from the header/title
# First YYYY-MM-DD date with a month of 01-12 and a day of 01-31.
_ISO_DATE = re.compile(r'[0-9]{4}-(0[1-9]|1[0-2])-(0[1-9]|[1-2][0-9]|3[0-1])')


def extract_publish_date(text: str) -> str:
    """Return the first ``YYYY-MM-DD`` date found in ``text``.

    Args:
        text: A string such as a timestamp or article header.

    Returns:
        The matched date string, or ``''`` when ``text`` is not a string
        (for example a pandas NaN) or contains no such date. Returning an
        empty string keeps one bad row from aborting a whole ``.apply``.
    """
    if not isinstance(text, str):
        return ''
    found = _ISO_DATE.search(text)
    return found[0] if found else ''

In [None]:
# Disease names searched for in article text. Order matters: extract_disease
# returns the first entry that matches.
DISEASES = [
    'Chagas disease',
    'Chikungunya',
    'Congo-Crimean haemorrhagic fever',
    'Dengue',
    'Dracunculiasis',
    # The comma after this entry was missing, which made Python concatenate it
    # with the next literal into one string that could never match.
    'guinea-worm disease',
    'Human African trypanosomiasis',
    'Leishmaniasis',
    'Lymphatic filariasis',
    'Lyme disease',
    'Malaria',
    'Onchocerciasis',
    'Schistosomiasis',
    'Yellow fever',
]


# helper function to extract type of vector-borne disease from data
def extract_disease(txt: str) -> str:
    """Return the first entry of ``DISEASES`` that occurs in ``txt``.

    The comparison is a case-insensitive substring check.

    Args:
        txt: Text to search.

    Returns:
        The matching disease name exactly as spelled in ``DISEASES``, or
        ``'Not found'`` when no entry occurs in the text.
    """
    haystack = txt.lower()
    for disease in DISEASES:
        if disease.lower() in haystack:
            return disease

    return 'Not found'

In [None]:
# function that extracts location names/admin codes/lat/lng, case and death counts, and date ranges from the input string
# uses epitator since it already trained rules for extracting medical/infectious disease data
def epitator_extract(txt, max_ents=1):
    """Annotate ``txt`` with EpiTator and return the leading extracted values.

    Geoname, count and date annotators are added to the document, and at most
    ``max_ents`` values are kept for each output field, in the order the
    annotator tier lists its spans.

    Args:
        txt: Text to annotate.
        max_ents: Maximum number of values kept per field (default 1).

    Returns:
        A ``pd.Series`` of 13 lists, in this order: admin1 codes, admin2
        codes, admin3 codes, admin4 codes, location names, latitudes,
        longitudes, case counts, case attribute tags, death counts, death
        attribute tags, date-range starts, date-range ends.
    """
    # build the document and attach the three annotator tiers in turn
    doc = AnnoDoc(txt)
    for annotator_cls in (GeonameAnnotator, CountAnnotator, DateAnnotator):
        doc.add_tiers(annotator_cls())

    # geographic data: one list per geoname attribute
    places = [span.geoname for span in doc.tiers["geonames"].spans]
    geo_fields = (
        'admin1_code',
        'admin2_code',
        'admin3_code',
        'admin4_code',
        'name',
        'latitude',
        'longitude',
    )
    geo_columns = [[getattr(place, field) for place in places] for field in geo_fields]

    # counts: a span tagged 'death' is a death count; any other span tagged
    # 'case' is a case count
    case_spans = []
    death_spans = []
    for span in doc.tiers["counts"].spans:
        tags = span.metadata['attributes']
        if 'death' in tags:
            death_spans.append(span)
        elif 'case' in tags:
            case_spans.append(span)
    count_columns = [
        [span.metadata['count'] for span in case_spans],
        [span.metadata['attributes'] for span in case_spans],
        [span.metadata['count'] for span in death_spans],
        [span.metadata['attributes'] for span in death_spans],
    ]

    # date ranges: start and end, with unparseable values coerced by pandas
    ranges = [span.metadata["datetime_range"] for span in doc.tiers["dates"].spans]
    date_columns = [
        [pd.to_datetime(bounds[0], errors='coerce') for bounds in ranges],
        [pd.to_datetime(bounds[1], errors='coerce') for bounds in ranges],
    ]

    # keep only the first max_ents values of every field; later ones can be
    # filler/garbage data
    all_columns = geo_columns + count_columns + date_columns
    return pd.Series([column[:max_ents] for column in all_columns])

In [None]:
# Load the ProMED dengue export from the working directory; the file is parsed as
# tab-separated.
df = pd.read_csv('promed_Dengue.csv', sep='\t')
# Row count, as a quick check on the load.
print(len(df))

# Preview the first rows.
df.head()

1376


Unnamed: 0.1,Unnamed: 0,id,title,zoom_lat,zoom_lon,zoom_level,alert_id,feed_id,summary,issue_date,...,descr,alert_tag_id,dup_count,dup_of,unique_string,info_hash,submitted_by,reviewed,search_string_id,content
0,8707683,8707683,"['Dengue/DHF update (01): Asia, 2022']",12.38293,103.007812,4,8707683,1,"PRO/AH/EDR> Dengue/DHF update (01): Asia, 2022",2023-01-08 00:07:57,...,,,0,,20230110.0,,22568.0,,,"DENGUE/DHF UPDATE (01): ASIA, 2022\n**********..."
1,8707227,8707227,"['Dengue/DHF update (10): Asia, vaccine']",23.843229,90.268501,5,8707227,1,"PRO/AH/EDR> Dengue/DHF update (10): Asia, vaccine",2022-12-12 19:34:32,...,Takeda's dengue vaccine [trade name Qdenga - M...,,0,,20221210.0,,22568.0,,,"DENGUE/DHF UPDATE (10): ASIA, VACCINE\n*******..."
2,8707497,8707497,['Dengue/DHF update (11): Americas'],24.106649,-102.678223,4,8707497,1,PRO/EDR> Dengue/DHF update (11): Americas,2022-12-28 17:19:48,...,Cases by Country / Week updated / Serotype / T...,,0,,20221230.0,,3419.0,,,DENGUE/DHF UPDATE (11): AMERICAS\n************...
3,8706717,8706717,"['Dengue/DHF update (08): Americas, Asia, Euro...",75.0,-39.990234,4,8706717,1,"PRO/AH/EDR> Dengue/DHF update (08): Americas, ...",2022-11-13 18:47:41,...,Cases by Country / Week updated / Serotype / T...,,0,,20221110.0,,3419.0,,,"DENGUE/DHF UPDATE (08): AMERICAS, ASIA, EUROPE..."
4,8706699,8706699,"['Dengue/DHF update (07): Asia, Africa, Americ...",23.843229,90.268501,5,8706699,1,"PRO/AH/EDR> Dengue/DHF update (07): Asia, Afri...",2022-11-12 15:47:50,...,,,0,,20221110.0,,22568.0,,,"DENGUE/DHF UPDATE (07): ASIA, AFRICA, AMERICAS..."


In [None]:
# Download the spaCy `en_core_web_md` model.
# NOTE(review): presumably required by the EpiTator annotators — confirm.
!python -m spacy download en_core_web_md

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en_core_web_md==2.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.1.0/en_core_web_md-2.1.0.tar.gz (95.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.4/95.4 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: en_core_web_md
  Building wheel for en_core_web_md (setup.py) ... [?25l[?25hdone
  Created wheel for en_core_web_md: filename=en_core_web_md-2.1.0-py3-none-any.whl size=97126236 sha256=0d39852ce5ba9364d5b1573f41f4f4b4c0aa28e1813bca643467e61dd812dabd
  Stored in directory: /tmp/pip-ephem-wheel-cache-1se_qjtu/wheels/3a/b4/ca/4a76b83a984d253fa2cccacea8b11bc69c04efb3e186221dcc
Successfully built en_core_web_md
Installing collected packages: en_core_web_md
Successfully installed en_core_web_md-2.1.0
[38;5;2m✔ Dow

In [None]:
# Colab only: mount Google Drive at /content/drive (the parsed CSV is written under this mount).
from google.colab import drive 
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Colab only: prompt for a file upload; the recorded run saved promed_Dengue.csv.
# NOTE(review): the return value stored in `uploades` is not referenced by any cell shown here.
from google.colab import files
uploades = files.upload()

Saving promed_Dengue.csv to promed_Dengue.csv


In [None]:
pip install epitator

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting epitator
  Downloading EpiTator-1.3.5.tar.gz (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.0/57.0 KB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dateparser==0.7.1
  Downloading dateparser-0.7.1-py2.py3-none-any.whl (351 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m351.4/351.4 KB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
Collecting unicodecsv>=0.14.1
  Downloading unicodecsv-0.14.1.tar.gz (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting spacy==2.1.8
  Downloading spacy-2.1.8.tar.gz (30.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m51.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing

In [None]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m74.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.1 tokenizers-0.13.2 transformers-4.26.1


In [None]:
# Derive publish_date by applying extract_publish_date to every issue_date value.
df['publish_date'] = df['issue_date'].apply(extract_publish_date) # add date column


In [None]:
# Run EpiTator's import_all importer; the recorded run created a database at
# /root/.epitator.sqlitedb.
# NOTE(review): the recorded output ends in a traceback while loading the disease
# ontology — confirm the import completes before relying on the annotators.
!python -m epitator.importers.import_all

Creating database at: /root/.epitator.sqlitedb
Loading disease ontology...
Traceback (most recent call last):
  File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.8/dist-packages/epitator/importers/import_all.py", line 23, in <module>
    import_disease_ontology(args.drop_previous)
  File "/usr/local/lib/python3.8/dist-packages/epitator/importers/import_disease_ontology.py", line 51, in import_disease_ontology
    disease_ontology.parse(os.path.join(os.path.dirname(__file__), "doid_extension.ttl"), format="turtle")
  File "/usr/local/lib/python3.8/dist-packages/rdflib/graph.py", line 1306, in parse
    source = create_input_source(
  File "/usr/local/lib/python3.8/dist-packages/rdflib/parser.py", line 404, in create_input_source
    ) = _create_input_source_from_location(
  File "/usr/local/lib/python3.8/

In [None]:
# Run clean_html over every title (default strip=True) and overwrite the title
# column in place with the cleaned text.
df['title'] = df['title'].progress_apply(clean_html)


100%|██████████| 1433/1433 [00:00<00:00, 71931.32it/s]


In [None]:
# Store the clean_html-processed article content in a new `body` column; later cells
# read `body` for disease tagging and summarization.
df['body'] = df['content'].progress_apply(clean_html)


100%|██████████| 1376/1376 [00:02<00:00, 565.26it/s] 


In [None]:
# Tag each row with the first DISEASES entry found in its cleaned body text
# ('Not found' when none matches).
df['disease'] = df['body'].progress_apply(extract_disease)


100%|██████████| 1433/1433 [00:00<00:00, 5605.44it/s]


In [None]:
# NOTE(review): as written this assigns the title column to itself and changes nothing;
# it looks like a leftover from an earlier experiment — confirm it can be removed.
df['title'] = df['title']

ValueError: ignored

In [None]:
# Summarize every cleaned body with the BART summarizer.
# NOTE(review): the preview of the loaded CSV already shows a `summary` column, which
# this assignment overwrites — confirm that is intended. Requires the `body` cell to have run.
df['summary'] = df['body'].progress_apply(summarizer)

KeyError: ignored

In [None]:
# Run epitator_extract on every summary and assign the 13 lists it returns to these
# columns, in the same order the function returns them.
df[['admin1_code',
'admin2_code',
'admin3_code',
'admin4_code',
'location_name',
'location_lat',
'location_lon',
'cases',
'cases_tags',
'deaths',
'deaths_tags',
'dates_start',
'dates_end',]] = df['summary'].progress_apply(epitator_extract)

100%|██████████| 1433/1433 [01:20<00:00, 17.82it/s]


In [None]:
!pip install geonames

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting geonames
  Downloading geonames-0.1.3-py2.py3-none-any.whl (38 kB)
Installing collected packages: geonames
Successfully installed geonames-0.1.3


In [None]:
pip install geonameszip

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting geonameszip
  Downloading geonameszip-0.3.0.zip (15 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: geonameszip
  Building wheel for geonameszip (setup.py) ... [?25l[?25hdone
  Created wheel for geonameszip: filename=geonameszip-0.3.0-py3-none-any.whl size=10089 sha256=fecadaa42be663bf6560042946882e0ac597becbda6b378760803a612be08123
  Stored in directory: /root/.cache/pip/wheels/f8/62/87/0870b17a368438a78a8a048805718760139d8f0c30eee5ec39
Successfully built geonameszip
Installing collected packages: geonameszip
Successfully installed geonameszip-0.3.0


In [None]:
from epitator.geoname_annotator import GeonameAnnotator

In [None]:
# Run EpiTator's geonames importer. Per the recorded output it downloads
# allCountries.zip from download.geonames.org and imports on the order of 11 million
# rows, so expect this cell to take a long time.
!python -m epitator.importers.import_geonames

Downloading geoname data from: http://download.geonames.org/export/dump/allCountries.zip
Download complete
275000 / 11000000 + geonames imported
550000 / 11000000 + geonames imported
825000 / 11000000 + geonames imported
1100000 / 11000000 + geonames imported
1375000 / 11000000 + geonames imported
1650000 / 11000000 + geonames imported
1925000 / 11000000 + geonames imported
2200000 / 11000000 + geonames imported
2475000 / 11000000 + geonames imported
2750000 / 11000000 + geonames imported
3025000 / 11000000 + geonames imported
3300000 / 11000000 + geonames imported
3575000 / 11000000 + geonames imported
3850000 / 11000000 + geonames imported
4125000 / 11000000 + geonames imported
4400000 / 11000000 + geonames imported
4675000 / 11000000 + geonames imported
4950000 / 11000000 + geonames imported
5225000 / 11000000 + geonames imported
5500000 / 11000000 + geonames imported
5775000 / 11000000 + geonames imported
6050000 / 11000000 + geonames imported
6325000 / 11000000 + geonames imported

In [None]:
# Spot-check: display the row at position 1 (the second row) with all of its columns.
df.iloc[1]

Unnamed: 0                                                    8707497
id                                                            8707497
title                            ['Dengue/DHF update (11): Americas']
zoom_lat                                                    24.106649
zoom_lon                                                  -102.678223
zoom_level                                                          4
alert_id                                                      8707497
feed_id                                                             1
summary                     PRO/EDR> Dengue/DHF update (11): Americas
issue_date                                        2022-12-28 17:19:48
load_date                                         2022-12-28 17:19:48
incident_date                                                     NaN
descr               Cases by Country / Week updated / Serotype / T...
alert_tag_id                                                      NaN
dup_count           

In [None]:
# Write the parsed frame to Google Drive (the Drive mount cell must have run first).
# NOTE(review): to_csv writes the index as an extra column by default; the loaded
# file already shows 'Unnamed: 0' / 'Unnamed: 0.1' columns — consider index=False.
df.to_csv('/content/drive/My Drive/Datasets/promed_Dengue_parsed.csv')

In [None]:
# Debug aid: show the kernel's current working directory ('/content' in the recorded run).
import os
os.getcwd()


'/content'