# **Medical Case Reports and Biomedical Literature**

PHEE Datasource 5,000 annotated events from medical case reports and biomedical literature, making it the largest such public dataset to date. 

The dataset is extracted from MEDLINE case reports and each sentence features two levels of annotations: coarse-grained and fine-grained.  

*   The coarse-grained annotations contain event trigger word/phrase,event type and text spans indicating the event’s associated subject, treatment, and effect.
*   The fine-grained annotations contain patient demographic information, the context information about the treatments including drug dosage levels, administration routes, frequency, and attributes relating to events (Sun, 2022).

The train, development and test sets are to be recreated with stratified sampling using the demographic information, then try three models: Sequence Labeling, Extractive QA, Generative QA to classify pertinent information in the dataset.

In [4]:
import pickle
import os
import glob

# **Parse Data Files**

In [None]:
def concat_txt_file(input_path,output_path):
  """
    Parse the text files and save the data to .pkl file

    @p:
    input_path (str): Path of the directory with input data
    output_path (str): Path of the directory to save pkl file

  """
  
  temp_dict={'id':[],'text':[]}

  for file_name in glob.glob(input_path + "/*.txt"):
    id=file_name.split('.')[0].split('/')[-1]
    temp_dict['id'].append(id)

    with open(file_name) as f:
      lines=f.readlines()[0]
    
    temp_dict['text'].append(lines)

    with open(output_path, 'wb') as handle:
      pickle.dump(temp_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

def concat_ann_file(input_path,output_path):
  """
    Parse the ann files and save the data to .pkl file

    @p:
    input_path (str): Path of the directory with input data
    output_path (str): Path of the directory to save pkl file

  """

  temp_dict={'id':[],'Subject':[],'Treatment':[],
             'Potential_therapeutic_event':[],
             'Drug':[],'Effect':[],'Adverse_event':[],
             'Race':[],'Age':[],'Gender':[],
             'Population':[],'Disorder':[],'Duration':[],
             'Time_elapsed':[],'Route':[],'Freq':[],
             'Dosage':[],'Combination.Drug':[]}

  label_lst=['Subject','Treatment',
             'Potential_therapeutic_event',
             'Drug','Effect','Adverse_event',
             'Race','Age','Gender',
             'Population','Disorder','Duration',
             'Time_elapsed','Route','Freq',
             'Dosage','Combination.Drug']

  for file_name in glob.glob(input_path + "/*.ann")[0:10]:
    id=file_name.split('.')[0].split('/')[-1]

    temp_dict['id'].append(id)

    ck_cnt=0
    

    with open(file_name, 'r') as document_anno_file:
      lines = document_anno_file.readlines()
      
      s, t, p, d, e, a, r, ag, g, pop, dis, dur, t_e, rout, freq, dos, com = ([] for i in range(17))

      temp_arr_lst=[s, t, p, d, e, a, r, ag, g, pop, dis, dur, t_e, rout, freq, dos, com]

      for line in lines:
        standoff_line=line.split("\t")
        standoff_line.pop(0)

        identifier=standoff_line[0].split()[0].strip()
        
        if identifier=='Drug': d.append(standoff_line[1].strip())
        elif identifier=='Effect': e.append(standoff_line[1].strip())
        elif identifier=='Adverse_event': a.append(standoff_line[1].strip())
        elif identifier=='Subject': s.append(standoff_line[1].strip())
        elif identifier=='Treatment': t.append(standoff_line[1].strip())
        elif identifier=='Potential_therapeutic_event': p.append(standoff_line[1].strip())
        elif identifier=='Race': r.append(standoff_line[1].strip())
        elif identifier=='Age': ag.append(standoff_line[1].strip())
        elif identifier=='Gender': g.append(standoff_line[1].strip())
        elif identifier=='Population': pop.append(standoff_line[1].strip())
        elif identifier=='Disorder': dis.append(standoff_line[1].strip())
        elif identifier=='Duration': dur.append(standoff_line[1].strip())
        elif identifier=='Time_elapsed': t_e.append(standoff_line[1].strip())
        elif identifier=='Route': rout.append(standoff_line[1].strip())
        elif identifier=='Freq': freq.append(standoff_line[1].strip())
        elif identifier=='Dosage': dos.append(standoff_line[1].strip())
        elif identifier=='Combination.Drug': com.append(standoff_line[1].strip())

        for i in range(len(label_lst)):
          temp_dict[label_lst[i]].append(temp_arr_lst[i])

    with open(output_path, 'wb') as handle:
      pickle.dump(temp_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [5]:
path_train='/content/drive/MyDrive/PHEE/input/clean/train'
path_dev='/content/drive/MyDrive/PHEE/input/clean/dev'
path_test='/content/drive/MyDrive/PHEE/input/clean/test'

parse_PHEE_data.concat_txt_file(path_train,'/content/drive/MyDrive/PHEE/output/train_txt.pickle')
concat_txt_file(path_dev,'/content/drive/MyDrive/PHEE/output/dev_txt.pickle')
concat_txt_file(path_test,'/content/drive/MyDrive/PHEE/output/test_txt.pickle')
concat_ann_file(path_train,'/content/drive/MyDrive/PHEE/output/train_ann.pickle')
concat_ann_file(path_dev,'/content/drive/MyDrive/PHEE/output/dev_ann.pickle')
concat_ann_file(path_test,'/content/drive/MyDrive/PHEE/output/test_ann.pickle')

NameError: ignored

# **Load Parsed Data Files**

In [None]:
file_name_lst=['/content/drive/MyDrive/PHEE/output/train_txt.pickle',
'/content/drive/MyDrive/PHEE/output/dev_txt.pickle',
'/content/drive/MyDrive/PHEE/output/test_txt.pickle',
'/content/drive/MyDrive/PHEE/output/train_ann.pickle',
'/content/drive/MyDrive/PHEE/output/dev_ann.pickle',
'/content/drive/MyDrive/PHEE/output/test_ann.pickle']

for f in file_name_lst:
  with open(f, 'rb') as handle:
      d = pickle.load(handle)
      print(len(d))

2
2
2
18
18
18
