## Import

In [7]:
# Enable IPython's autoreload extension; mode 2 re-imports edited modules
# before each cell executes, so changes to local packages are picked up live.
%load_ext autoreload
%autoreload 2

In [8]:
import time
import os
import argparse
from subprocess import call


import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
tqdm.pandas()

from IPython.display import display, Markdown, Latex, HTML

import torch

from transformers import GPT2Tokenizer
from transformers import AutoModelForSequenceClassification, AutoTokenizer


from verisci.covid import AbstractRetriever, RationaleSelector, LabelPredictor
from verisci.evaluate.lib.data import GoldDataset
#import wandb

In [42]:
from ipymarkup import show_span_box_markup
from ipymarkup.palette import palette, BLUE, RED, GREEN

In [9]:
# Path to a pickled DataFrame under dfs_generated/linguistic.
# NOTE(review): not read by any cell visible in this notebook section.
loc_df_scispacy_sentence_word_unq_ner_abr_filtered = (
    '../../dfs_generated/linguistic/'
    'df_scispacy_sentence_word_unq_ner_abr_filtered.pkl'
)

## Scifact

### Load Data

In [10]:
# SciFact release files (corpus plus the train / test / dev claim splits),
# all given relative to the notebook's working directory.
loc_corpus, loc_claim_train, loc_claim_test, loc_claim_dev = (
    '../../data/scifact/corpus.jsonl',
    '../../data/scifact/claims_train.jsonl',
    '../../data/scifact/claims_test.jsonl',
    '../../data/scifact/claims_dev.jsonl',
)

In [11]:
from verisci.evaluate.lib.data import GoldDataset

In [12]:
def get_claim_label_from_jsonl(dataset_jsonl):
    """Flatten a claims dataset into one ``{"claim", "label"}`` record per evidence document.

    Parameters
    ----------
    dataset_jsonl : iterable
        Iterable of claim objects. Each claim must expose ``.claim`` (the claim
        text) and ``.evidence`` (a mapping whose values expose ``.label.name``).

    Returns
    -------
    list of dict
        One ``{"claim": <text>, "label": <label name>}`` dict for every
        (claim, evidence document) pair, in iteration order. Claims with an
        empty ``evidence`` mapping contribute no records.
    """
    # The evidence document itself is not needed for the label, so the corpus
    # lookup the earlier version performed (and discarded) is omitted.
    return [
        {"claim": cur_claim.claim, "label": evidence.label.name}
        for cur_claim in dataset_jsonl
        for evidence in cur_claim.evidence.values()
    ]

In [13]:
def get_claim_label_evidence_from_jsonl(dataset_jsonl, source):
    """Build one record per (claim, evidence document) with its rationale sentences.

    Parameters
    ----------
    dataset_jsonl : iterable
        Iterable of claim objects. Each claim must expose ``.claim``,
        ``.evidence`` (mapping of doc id -> evidence object with
        ``.label.name`` and ``.rationales``) and
        ``.release.corpus.get_document(doc_id)`` returning an object with
        ``.sentences``.
    source : Any
        Value stored unchanged under the ``"source"`` key of every record
        (the notebook passes the split name, e.g. ``"train"`` or ``"dev"``).

    Returns
    -------
    list of dict
        Records with keys ``"claim"``, ``"label"``, ``"list_rationales"`` and
        ``"source"``. ``"list_rationales"`` is a flat list holding the
        sentences of *every* rationale of the evidence document, rationale by
        rationale, each rationale's sentences in document order.
    """
    records = []

    for cur_claim in dataset_jsonl:
        claim_txt = cur_claim.claim

        for doc_id, evidence in cur_claim.evidence.items():
            ev_doc = cur_claim.release.corpus.get_document(doc_id)
            doc_sentences = ev_doc.sentences

            # Accumulate across rationales. Previously the list was reassigned
            # on every iteration, so only the last rationale was kept.
            list_rationales = []
            for rationale_sent_ids in evidence.rationales:
                wanted_ids = set(rationale_sent_ids)
                list_rationales.extend(
                    sent
                    for sent_idx, sent in enumerate(doc_sentences)
                    if sent_idx in wanted_ids
                )

            records.append({
                "claim": claim_txt,
                "label": evidence.label.name,
                "list_rationales": list_rationales,
                "source": source,
            })
    return records

In [14]:
# Training split: gold claims loaded together with the shared corpus file.
ds_train = GoldDataset(loc_corpus, loc_claim_train)
# Optional spot check of a single claim:
# claim_train = ds_train.get_claim(39)
# claim_train.pretty_print()
# Despite the `dic_` prefix this is a list of per-evidence record dicts.
dic_train = get_claim_label_evidence_from_jsonl(dataset_jsonl=ds_train, source="train")

In [15]:
# Dev split: gold claims loaded together with the shared corpus file.
ds_valid = GoldDataset(loc_corpus, loc_claim_dev)
# Optional spot check of a single claim:
# claim_valid = ds_valid.get_claim(42)
# claim_valid.pretty_print()
# Despite the `dic_` prefix this is a list of per-evidence record dicts.
dic_valid = get_claim_label_evidence_from_jsonl(dataset_jsonl=ds_valid, source="dev")

In [16]:
# Stack the train and dev records into one frame with a fresh 0..n-1 index.
frames_by_split = [pd.DataFrame(split_records) for split_records in (dic_train, dic_valid)]
df_claim_evid_label = pd.concat(frames_by_split, ignore_index=True)

df_claim_evid_label

Unnamed: 0,claim,label,list_rationales,source
0,1 in 5 million in UK have abnormal PrP positiv...,REFUTES,"[RESULTS Of the 32,441 appendix samples 16 wer...",train
1,32% of liver transplantation programs required...,SUPPORTS,[Policies requiring discontinuation of methado...,train
2,40mg/day dosage of folic acid and 2mg/day dosa...,SUPPORTS,[CONCLUSION Treatment with high doses of folic...,train
3,76-85% of people with severe mental disorder r...,SUPPORTS,[Although disorder severity was correlated wit...,train
4,A T helper 2 cell (Th2) environment impedes di...,REFUTES,"[Thus, in Lyn(-/-) mice, basophils and IgE aut...",train
...,...,...,...,...
768,Women with a higher birth weight are more like...,SUPPORTS,[Increased risk of breast cancer was noted wit...,dev
769,Women with a higher birth weight are more like...,SUPPORTS,[RESULTS We found that heavier birth weights w...,dev
770,aPKCz causes tumour enhancement by affecting g...,REFUTES,"[Taken together, this demonstrates that PKCζ i...",dev
771,cSMAC formation enhances weak ligand signalling.,SUPPORTS,[This conclusion was supported by experiments ...,dev


In [17]:
# Render via to_html() so the claim text is shown in full rather than
# truncated by the default DataFrame repr.
claims_only_html = df_claim_evid_label[['claim']].to_html()
HTML(claims_only_html)

Unnamed: 0,claim
0,1 in 5 million in UK have abnormal PrP positivity.
1,32% of liver transplantation programs required patients to discontinue methadone treatment in 2001.
2,40mg/day dosage of folic acid and 2mg/day dosage of vitamin B12 does not affect chronic kidney disease (CKD) progression.
3,76-85% of people with severe mental disorder receive no treatment in low and middle income countries.
4,A T helper 2 cell (Th2) environment impedes disease development in patients with systemic lupus erythematosus (SLE).
5,A breast cancer patient's capacity to metabolize tamoxifen influences treatment outcome.
6,A country's Vaccine Alliance (GAVI) eligibility is not indictivate of accelerated adoption of the Hub vaccine.
7,A deficiency of folate increases blood levels of homocysteine.
8,A diminished ovarian reserve does not solely indicate infertility in an a priori non-infertile population.
9,"A diminished ovarian reserve is a very strong indicator of infertility, even in an a priori non-infertile population."


## Transformer

In [12]:
# Example claim (positional row 638) used to eyeball the temporal taggers.
df_claim_evid_label['claim'].iloc[638]

'High-sensitivity cardiac troponin T (HSCT-T) dosage may not be diagnostic if the onset of symptoms occurs less than 3 hours before acute myocardial injury (AMI).'

In [44]:
from transformers import BertTokenizer, BertForTokenClassification
from temporal_taggers.tagger import BERTWithDateLayerTokenClassification, BertWithCRF, DateTokenizer
from temporal_taggers.evaluation import merge_tokens, insert_tags_in_raw_text

In [60]:
# The tokenizer and the token-classification model are loaded from the same
# Hugging Face Hub checkpoint.
temporal_sat_bert_checkpoint = "satyaalmasian/temporal_tagger_BERT_tokenclassifier"
tokenizer_temporal_sat_bert = AutoTokenizer.from_pretrained(
    temporal_sat_bert_checkpoint,
    use_fast=False,
)
model_temporal_sat_bert = BertForTokenClassification.from_pretrained(temporal_sat_bert_checkpoint)

In [61]:
#processed_temporal_sat_bert = tokenizer_temporal_sat_bert.tokenize(df_claim_evid_label.iloc[638]['claim'])


In [62]:
# Tag temporal expressions in every claim with the BERT token classifier.
# Loop-invariant setup is hoisted so it is built once instead of per claim.
annotation_id_temporal_sat_bert = 1  # only used by the commented-out insert_tags_in_raw_text call below
id2label_temporal_sat_bert = {v: k for k, v in model_temporal_sat_bert.config.label2id.items()}

list_all_time_ents = []
# (row index, error repr) for every claim the tagger could not process, so
# failures are visible instead of being silently dropped from the result.
list_failed_time_ent_claims = []
for indx_, cur_row in tqdm(df_claim_evid_label.iterrows(), total=len(df_claim_evid_label)):

    try:
        processed_temporal_sat_bert = tokenizer_temporal_sat_bert(cur_row['claim'], return_tensors="pt")

        # Inference only: one forward pass, without tracking gradients.
        with torch.no_grad():
            result_temporal_sat_bert = model_temporal_sat_bert(**processed_temporal_sat_bert)
        # Pick the highest-scoring label id per token from the model's first output.
        classification_temporal_sat_bert = torch.argmax(result_temporal_sat_bert[0], dim=2)

        merged_tokens_temporal_sat_bert = merge_tokens(
            processed_temporal_sat_bert["input_ids"][0],
            classification_temporal_sat_bert[0],
            id2label_temporal_sat_bert,
            tokenizer_temporal_sat_bert,
        )

        # Keep only tokens that received a temporal tag (anything but 'O').
        cur_time_ents = [x_ for x_ in merged_tokens_temporal_sat_bert if x_[1] != 'O']
    except Exception as e:
        # Still best-effort (one bad claim must not abort the run), but the
        # failure is recorded and reported rather than swallowed.
        list_failed_time_ent_claims.append((indx_, repr(e)))
        tqdm.write(f"[temporal tagger] skipped claim at index {indx_}: {e!r}")
        continue

    list_all_time_ents.append({'claim' : cur_row['claim'], 'time_ents' : cur_time_ents})
# merged_tokens = merge_tokens(processed_text["input_ids"][0], classification[0], id2label, text_tokenizer)
# annotated_text, annotation_id = insert_tags_in_raw_text(input_text, merged_tokens, annotation_id)

100%|██████████| 773/773 [01:20<00:00,  9.58it/s]


In [63]:
# Tagged (non-'O') tokens of the most recently processed claim only:
# `merged_tokens_temporal_sat_bert` is whatever the tagging loop left behind.
# Shown as the cell's value instead of printing inside a list comprehension,
# which displayed a list of `None`s.
[x_ for x_ in merged_tokens_temporal_sat_bert if x_[1] != 'O']

[]

In [64]:
# Name the columns explicitly so the frame keeps its schema even when the
# tagger produced no records (an empty list would otherwise give a frame with
# no columns and break the `time_ents` filter in the cells that follow).
df_claim_time_ent_list = pd.DataFrame(list_all_time_ents, columns=['claim', 'time_ents'])

In [65]:
# Show only the claims for which the BERT tagger found at least one entity.
mask_has_time_ents = df_claim_time_ent_list['time_ents'].map(len) > 0
HTML(
    df_claim_time_ent_list
    .loc[mask_has_time_ents]
    .reset_index(drop=True)
    .to_html()
)

Unnamed: 0,claim,time_ents
0,32% of liver transplantation programs required patients to discontinue methadone treatment in 2001.,"[(2001, DATE)]"
1,40mg/day dosage of folic acid and 2mg/day dosage of vitamin B12 does not affect chronic kidney disease (CKD) progression.,"[(day, DURATION)]"
2,A mutation in HNF4A leads to an increased risk of developing diabetes by the age of 14 years.,"[(the, DURATION), (age, DURATION), (14, DURATION), (years, DURATION)]"
3,Combination nicotine replacement therapies with varenicline or bupropion are more effective after 12 weeks of reatment compared to varenicline monotherapy.,"[(12, DURATION), (weeks, DURATION), (of, DURATION), (reatment, DURATION)]"
4,Commelina yellow mottle virus' (ComYMV) genome consists of 2140 baise pairs.,"[(2140, DATE)]"
5,Improvements in OER catalysts show stable activity over several hundred hours.,"[(several, DURATION), (hundred, DURATION), (hours, DURATION)]"
6,"In rhesus macaques, daily subcutaneous injections of tenofovir protects against rectally transmitted simian-human immunodeficiency virus.","[(daily, SET)]"
7,"In young and middle-aged adults, current or remote uses of ADHD medications increase the risk of serious cardiovascular events.","[(current, DATE)]"
8,Incidence of heart failure increased by 10% in women since 1979.,"[(1979, DATE)]"
9,Incidence of sepsis has fallen substantially from 2009 to 2014.,"[(2009, DATE), (2014, DATE)]"


In [44]:
# Persist the claims that have at least one tagged temporal entity.
# The original row index is kept in the CSV.
rows_with_time_ents = df_claim_time_ent_list['time_ents'].map(len) > 0
df_claim_time_ent_list.loc[rows_with_time_ents].to_csv('time_ents.csv')

## sutime
Rule-based temporal tagger (a Python wrapper around Stanford CoreNLP's SUTime): https://github.com/FraBle/python-sutime

In [4]:
import json
from sutime import SUTime


# Smoke test: parse one sentence and pretty-print the entities SUTime returns.
test_case = 'I need a desk for tomorrow from 2pm to 3pm'
# `sutime` is reused by the cells below, so the parser is built only once.
sutime = SUTime(mark_time_ranges=True, include_range=True)
parsed_test_case = sutime.parse(test_case)
print(json.dumps(parsed_test_case, indent=4, sort_keys=True))

[
    {
        "end": 26,
        "start": 18,
        "text": "tomorrow",
        "timex-value": "2022-02-19",
        "type": "DATE",
        "value": "2022-02-19"
    },
    {
        "end": 42,
        "start": 27,
        "text": "from 2pm to 3pm",
        "type": "DURATION",
        "value": {
            "begin": "T14:00",
            "end": "T15:00"
        }
    }
]


In [34]:
# Run SUTime over every claim and keep, per claim, the raw entities plus
# (start, end, type) tuples in the form handed to ipymarkup further down.
list_all_temporal_ents_sutime = []
for claim_text in tqdm(df_claim_evid_label['claim'], total=len(df_claim_evid_label)):
    parsed_ents = sutime.parse(claim_text)
    markup_spans = [(ent['start'], ent['end'], ent['type']) for ent in parsed_ents]
    list_all_temporal_ents_sutime.append({
        'claim': claim_text,
        'su_time_ents': parsed_ents,
        'imarkup_span': markup_spans,
    })

100%|██████████| 773/773 [00:42<00:00, 18.09it/s]


In [35]:
# Name the columns explicitly so the frame keeps its schema even if no claim
# was processed (an empty list would otherwise give a frame with no columns
# and break the `su_time_ents` filter in the next cell).
df_all_claim_temporal_su_time = pd.DataFrame(
    list_all_temporal_ents_sutime,
    columns=['claim', 'su_time_ents', 'imarkup_span'],
)

In [38]:
# Keep only the claims in which SUTime found at least one temporal expression.
mask_has_su_time_ents = df_all_claim_temporal_su_time['su_time_ents'].map(len) > 0
df_all_claim_temporal_su_time_valids = (
    df_all_claim_temporal_su_time
    .loc[mask_has_su_time_ents]
    .reset_index(drop=True)
)

In [43]:
# Render each claim with its SUTime spans highlighted as labelled boxes.
claims_and_spans = zip(
    df_all_claim_temporal_su_time_valids['claim'],
    df_all_claim_temporal_su_time_valids['imarkup_span'],
)
for claim_text, markup_spans in claims_and_spans:
    show_span_box_markup(claim_text, markup_spans)

In [67]:
# Render the full, untruncated table of claims that have SUTime matches.
su_time_valids_html = df_all_claim_temporal_su_time_valids.to_html()
HTML(su_time_valids_html)

Unnamed: 0,claim,su_time_ents,imarkup_span
0,32% of liver transplantation programs required patients to discontinue methadone treatment in 2001.,"[{'timex-value': '2001', 'start': 94, 'end': 98, 'text': '2001', 'type': 'DATE', 'value': '2001'}]","[(94, 98, DATE)]"
1,40mg/day dosage of folic acid and 2mg/day dosage of vitamin B12 does not affect chronic kidney disease (CKD) progression.,"[{'timex-value': 'P1D', 'start': 5, 'end': 8, 'text': 'day', 'type': 'DURATION', 'value': 'P1D'}, {'timex-value': 'P1D', 'start': 38, 'end': 41, 'text': 'day', 'type': 'DURATION', 'value': 'P1D'}]","[(5, 8, DURATION), (38, 41, DURATION)]"
2,A mutation in HNF4A leads to an increased risk of developing diabetes by the age of 14 years.,"[{'timex-value': 'P14Y', 'start': 84, 'end': 92, 'text': '14 years', 'type': 'DURATION', 'value': 'P14Y'}]","[(84, 92, DURATION)]"
3,Anthrax spores are very difficult to dispose once they are dispersed.,"[{'timex-value': 'PAST_REF', 'start': 45, 'end': 49, 'text': 'once', 'type': 'DATE', 'value': 'PAST_REF'}]","[(45, 49, DATE)]"
4,Combination nicotine replacement therapies with varenicline or bupropion are more effective after 12 weeks of reatment compared to varenicline monotherapy.,"[{'timex-value': 'P12W', 'start': 98, 'end': 106, 'text': '12 weeks', 'type': 'DURATION', 'value': 'P12W'}]","[(98, 106, DURATION)]"
5,Commelina yellow mottle virus' (ComYMV) genome consists of 2140 baise pairs.,"[{'timex-value': '2140', 'start': 59, 'end': 63, 'text': '2140', 'type': 'DATE', 'value': '2140'}]","[(59, 63, DATE)]"
6,Improvements in OER catalysts show stable activity over several hundred hours.,"[{'timex-value': 'PT100H', 'start': 64, 'end': 77, 'text': 'hundred hours', 'type': 'DURATION', 'value': 'PT100H'}]","[(64, 77, DURATION)]"
7,"In rhesus macaques, daily subcutaneous injections of tenofovir protects against rectally transmitted simian-human immunodeficiency virus.","[{'timex-value': 'P1D', 'start': 20, 'end': 25, 'text': 'daily', 'type': 'SET', 'value': 'P1D'}]","[(20, 25, SET)]"
8,"In young and middle-aged adults, current or remote uses of ADHD medications increase the risk of serious cardiovascular events.","[{'timex-value': 'PRESENT_REF', 'start': 33, 'end': 40, 'text': 'current', 'type': 'DATE', 'value': 'PRESENT_REF'}]","[(33, 40, DATE)]"
9,Incidence of heart failure increased by 10% in women since 1979.,"[{'timex-value': '1979', 'start': 59, 'end': 63, 'text': '1979', 'type': 'DATE', 'value': '1979'}]","[(59, 63, DATE)]"


## Libraries

### Medical 
- https://github.com/ymnliu/MedTimer
- https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-020-01208-9

## Debug

In [3]:
# Debug aid: list the conda environments on this machine (the active one is
# marked with `*` in the output).
!conda env list

# conda environments:
#
                         /home/qudratealahyratu/anaconda3
                         /home/qudratealahyratu/anaconda3/envs/common
                         /home/qudratealahyratu/anaconda3/envs/gym_gpu
                         /home/qudratealahyratu/anaconda3/envs/gym_tf_gpu
                         /home/qudratealahyratu/anaconda3/envs/knowledge_graph
base                  *  /home/qudratealahyratu/anaconda3/envs/scifact
                         /home/qudratealahyratu/anaconda3/envs/scifact2



In [45]:
# Install into the environment of the running kernel via %pip rather than a
# hard-coded, user-specific path to a pip executable. The version is pinned to
# the one the original install resolved to, so reruns are reproducible.
%pip install sutime==1.0.1

Collecting sutime
  Downloading sutime-1.0.1-py3-none-any.whl (28 kB)
Collecting JPype1<2.0.0,>=1.1.2
  Downloading JPype1-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (448 kB)
[K     |████████████████████████████████| 448 kB 4.9 MB/s eta 0:00:01
Installing collected packages: JPype1, sutime
Successfully installed JPype1-1.3.0 sutime-1.0.1
