# **Parse PHEE Data**

PHEE data is stored in .txt files and .ann files, which are parsed and saved into dictionaries for future analysis

In [12]:
import pickle
import os
import glob
import pandas as pd

In [2]:
def concat_txt_file(input_path,output_path):
  """
    Parse the text files and save the data to .pkl file

    @p:
    input_path (str): Path of the directory with input data
    output_path (str): Path of the directory to save pkl file

  """
  
  temp_dict={}

  for file_name in glob.glob(input_path + "/*.txt"):
    id=file_name.split('.')[0].split('/')[-1]

    with open(file_name) as f:
      lines=f.readlines()[0]
    
    temp_dict[id]=lines

    with open(output_path, 'wb') as handle:
      pickle.dump(temp_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

def concat_ann_file(input_path,output_path,):
  """
    Parse the ann files and save the specific key to .pkl file

    @p:
    input_path (str): Path of the directory with input data
    output_path (str): Path of the directory to save pkl file

  """

  label_lst=['Subject','Treatment',
              'Potential_therapeutic_event',
              'Drug','Effect','Adverse_event',
              'Race','Age','Gender',
              'Population','Disorder','Duration',
              'Time_elapsed','Route','Freq',
              'Dosage','Combination.Drug']

  for l in label_lst:
    temp_dict={}

    for file_name in glob.glob(input_path + "/*.ann"):
      id=file_name.split('.')[0].split('/')[-1]

      with open(file_name, 'r') as document_anno_file:
        lines = document_anno_file.readlines()
        
        temp_lst=[]

        for line in lines:
          standoff_line=line.split("\t")
          standoff_line.pop(0)

          identifier=standoff_line[0].split()[0].strip()
          
          if identifier==l: temp_lst.append(standoff_line[1].strip())
        
        temp_dict[id]=l

    with open(output_path+'_{}.pickle'.format(l), 'wb') as handle:
      pickle.dump(temp_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [3]:
path_train='/content/drive/MyDrive/PHEE/input/clean/train'
path_dev='/content/drive/MyDrive/PHEE/input/clean/dev'
path_test='/content/drive/MyDrive/PHEE/input/clean/test'

concat_txt_file(path_train,'/content/drive/MyDrive/PHEE/output/train_txt.pickle')
concat_txt_file(path_dev,'/content/drive/MyDrive/PHEE/output/dev_txt.pickle')
concat_txt_file(path_test,'/content/drive/MyDrive/PHEE/output/test_txt.pickle')


concat_ann_file(path_train,'/content/drive/MyDrive/PHEE/output/train_ann')
concat_ann_file(path_dev,'/content/drive/MyDrive/PHEE/output/dev_ann')
concat_ann_file(path_test,'/content/drive/MyDrive/PHEE/output/test_ann')

In [4]:
def dict_pkl_load(filename):
  """
    Load the dictionary data from a pickle file into a variable

    @P:
    filename (str): Name of the pkl file
    varname : Name of the df to save the pkl data

    @R:
    varname : Containing the pkl data
  """

  with open(filename, 'rb') as handle:
      df = pickle.load(handle)

  return df

train_txt_dict=dict_pkl_load('/content/drive/MyDrive/PHEE/output/train_txt.pickle')
dev_txt_dict=dict_pkl_load('/content/drive/MyDrive/PHEE/output/dev_txt.pickle')
test_txt_dict=dict_pkl_load('/content/drive/MyDrive/PHEE/output/test_txt.pickle')

# **Merge Dictionaries and Form 1 Dataframe**

In [10]:
def Merge(dict1, dict2):
    """
      Add the contents of dict2 to dict1

      @P:
      dict1 (dict): Dictionary that has info added to it
      dict2 (dict): Dictionary that has the contents to add to dict 1
    """
    dict2.update(dict1)
    return dict2

train_dev_dict=Merge(train_txt_dict, dev_txt_dict)
text_dict=Merge(train_dev_dict, test_txt_dict)

In [18]:
df=pd.DataFrame.from_dict(text_dict,orient='index',columns=['Text'])
df.reset_index(inplace=True)

In [20]:
def add_labels_dict(df_data,Merge):
  """
    Add the label data to the dataframe with the text

    @P:
    Merge: function that merges dictionaries
    df_data (df): Dataframe to hold the information

    @R:
    df_data : Containing the pkl data
  """
  
  label_lst=['Subject','Treatment',
              'Potential_therapeutic_event',
              'Drug','Effect','Adverse_event',
              'Race','Age','Gender',
              'Population','Disorder','Duration',
              'Time_elapsed','Route','Freq',
              'Dosage','Combination.Drug']

  train_path='/content/drive/MyDrive/PHEE/output/train_ann'
  dev_path='/content/drive/MyDrive/PHEE/output/dev_ann'
  test_path='/content/drive/MyDrive/PHEE/output/test_ann'

  for l in label_lst:

    with open(train_path+'_{}.pickle'.format(l), 'rb') as handle:
        train_d=pickle.load(handle)

    with open(dev_path+'_{}.pickle'.format(l), 'rb') as handle:
        dev_d=pickle.load(handle)

    with open(test_path+'_{}.pickle'.format(l), 'rb') as handle:
        test_d=pickle.load(handle)

    train_dev_dict=Merge(train_d, dev_d)
    final_dict=Merge(train_dev_dict, test_d)

    df=pd.DataFrame.from_dict(final_dict,orient='index',columns=[l])
    df.reset_index(inplace=True)

    df_data=df_data.merge(df,how='left',on='index')

  return df_data

raw_df=add_labels_dict(df,Merge)

Unnamed: 0,index,Text,Subject,Treatment,Potential_therapeutic_event,Drug,Effect,Adverse_event,Race,Age,Gender,Population,Disorder,Duration,Time_elapsed,Route,Freq,Dosage,Combination.Drug
0,12851279_15,Tifacogin administration was associated with a...,Subject,Treatment,Potential_therapeutic_event,Drug,Effect,Adverse_event,Race,Age,Gender,Population,Disorder,Duration,Time_elapsed,Route,Freq,Dosage,Combination.Drug
1,14522628_1,Generalised cutaneous rash associated with gan...,Subject,Treatment,Potential_therapeutic_event,Drug,Effect,Adverse_event,Race,Age,Gender,Population,Disorder,Duration,Time_elapsed,Route,Freq,Dosage,Combination.Drug
2,16046172_2,One patient was an 80-year-old woman who was a...,Subject,Treatment,Potential_therapeutic_event,Drug,Effect,Adverse_event,Race,Age,Gender,Population,Disorder,Duration,Time_elapsed,Route,Freq,Dosage,Combination.Drug
3,16288069_1,An objective causality assessment suggests tha...,Subject,Treatment,Potential_therapeutic_event,Drug,Effect,Adverse_event,Race,Age,Gender,Population,Disorder,Duration,Time_elapsed,Route,Freq,Dosage,Combination.Drug
4,17275666_2,Valproic acid is commonly and effectively used...,Subject,Treatment,Potential_therapeutic_event,Drug,Effect,Adverse_event,Race,Age,Gender,Population,Disorder,Duration,Time_elapsed,Route,Freq,Dosage,Combination.Drug


In [22]:
raw_df.to_pickle('/content/drive/MyDrive/PHEE/output/data_df.pkl')