# **Parse PHEE Data**

PHEE data is stored in .txt files and .ann files, which are parsed and then stored in a dataframe.

The dataframe with the data is exported as a '.pkl' file.

**Important Notes for Running Notebook:**


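*   The clean data must be saved in a directory, 'input' (the cells below read from `input/clean/train`, `input/clean/dev`, and `input/clean/test`)
*   A directory, 'output', must already exist; the pickle files are written there and the notebook does not create it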



In [3]:
import pickle
import os
import glob
import pandas as pd

# **Define Functions**



1.   **parse_txt_file**: Parse '.txt' files
2.   **parse_ann_file**: Parse '.ann' files
3.   **Merge**: Helper function to merge two dictionaries
4.   **pkl_load_dict**: Helper function to load dictionaries from pkl file
5.   **df_creation**: Create df of raw data and save for future analysis




In [15]:
def parse_txt_file(input_path,output_path):
  """
    Parse the text files and save the data to .pkl file

    Every '*.txt' file directly inside input_path contributes one entry to a
    dictionary: the key is the file's base name up to its first '.', and the
    value is the first line of the file (trailing newline included, or ''
    when the file is empty). The dictionary is pickled to output_path once,
    after all files have been read; an empty dictionary is written when no
    file matches.

    @p:
    input_path (str): Path of the directory with input data
    output_path (str): Path of the pkl file to write

  """

  temp_dict={}

  for file_name in glob.glob(input_path + "/*.txt"):
    # Derive the id from the base name only, so a '.' in a directory name
    # (e.g. './input' or 'v1.0/train') cannot corrupt it.
    file_id=os.path.basename(file_name).split('.')[0]

    with open(file_name) as f:
      # readline() yields '' for an empty file instead of raising IndexError.
      temp_dict[file_id]=f.readline()

  # Dump once after the loop rather than rewriting the pickle for every file.
  with open(output_path, 'wb') as handle:
    pickle.dump(temp_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

def parse_ann_file(input_path,output_path):
  """
    Parse the ann files and save the specific key to .pkl file

    Each '*.ann' file directly inside input_path is read once. A line is split
    on tabs; the first whitespace-separated token of the second field is the
    line's identifier and the third field is its annotation text. For every
    label in label_lst one pickle named '<output_path>_<label>.pickle' is
    written, holding a dictionary that maps the file id (base name up to its
    first '.') to the '||'-joined annotation texts of the lines whose
    identifier equals that label ('' when the file has none).

    Lines without a second field (e.g. blank lines) are ignored, and a line
    whose identifier matches a label but has no third field contributes
    nothing. The set of all identifiers seen is printed once per call.

    @p:
    input_path (str): Path of the directory with input data
    output_path (str): Path prefix of the pkl files to write

  """

  label_lst=['Subject','Negation_cue',
              'Potential_therapeutic_event',
              'Drug','Effect','Adverse_event',
              'Race','Age','Gender',
              'Population','Disorder','Duration',
              'Time_elapsed','Route','Freq',
              'Dosage','Combination.Drug','Treat-Disorder',
             'Treatment','Severity_cue','Severity',
             'Speculation_cue','Sub-Disorder',
             ]

  seen_identifiers=set()
  # label -> {file id -> '||'-joined annotation texts}
  per_label={label:{} for label in label_lst}

  for file_name in glob.glob(input_path + "/*.ann"):
    # Base name only, so a '.' in a directory name cannot corrupt the id.
    file_id=os.path.basename(file_name).split('.')[0]

    with open(file_name, 'r') as document_anno_file:
      lines = document_anno_file.readlines()

    matches={label:[] for label in label_lst}

    for line in lines:
      fields=line.split("\t")
      # Skip lines that carry no identifier field (blank or malformed).
      if len(fields)<2:
        continue
      tokens=fields[1].split()
      if not tokens:
        continue

      identifier=tokens[0]
      seen_identifiers.add(identifier)

      # Only lines that actually have a third field carry annotation text.
      if identifier in matches and len(fields)>2:
        matches[identifier].append(fields[2].strip())

    for label in label_lst:
      per_label[label][file_id]='||'.join(matches[label])

  print(seen_identifiers)

  for label in label_lst:
    with open(output_path+'_{}.pickle'.format(label), 'wb') as handle:
      pickle.dump(per_label[label], handle, protocol=pickle.HIGHEST_PROTOCOL)

def Merge(dict1, dict2):
  """
    Combine two dictionaries into a new dictionary

    The result holds every key of dict2 (in dict2's order) followed by the
    keys that only dict1 has. When a key is present in both, the value from
    dict1 is kept. Neither input dictionary is modified.

    @P:
    dict1 (dict): Dictionary whose values win on shared keys
    dict2 (dict): Dictionary that provides the remaining entries

    @R:
    dict : New dictionary with the combined contents
  """
  # Copy first so the caller's dict2 is left untouched.
  merged=dict(dict2)
  merged.update(dict1)
  return merged

def pkl_load_dict(filename):
  """
    Read a pickle file and return the object stored in it

    @P:
    filename (str): Path of the pkl file to read

    @R:
    The object that was unpickled from the file
  """
  with open(filename, 'rb') as pkl_file:
    return pickle.load(pkl_file)

def df_creation(df_data,Merge,output_path,ann_dir='/content/drive/MyDrive/PHEE/output'):
  """
    Add the data from the ann to the dataframe with the text

    For every label, the pickles '<split>_ann_<label>.pickle' of the train,
    dev and test splits are loaded from ann_dir, combined with
    Merge(train, dev) followed by Merge(<that result>, test), turned into a
    one-column dataframe named after the label, and left-joined onto df_data
    through its 'index' column. The result is pickled to output_path and its
    shape is printed. The dataframe passed in is not modified.

    Each label is processed exactly once, so every label yields a single
    column (a repeated label would make pandas rename the clashing columns
    with '_x' / '_y' suffixes).

    @P:
    df_data (df): Dataframe to hold the information; must have an 'index' column
    Merge: function that merges dictionaries
    output_path (str): Path to save final dataframe
    ann_dir (str): Directory holding the per-label ann pickles

  """

  label_lst=['Subject','Negation_cue',
              'Potential_therapeutic_event',
              'Drug','Effect','Adverse_event',
              'Race','Age','Gender',
              'Population','Disorder','Duration',
              'Time_elapsed','Route','Freq',
              'Dosage','Combination.Drug','Treat-Disorder',
             'Treatment','Severity_cue','Severity',
             'Speculation_cue','Sub-Disorder',
             ]

  for l in label_lst:

    # NOTE: pickle.load can execute arbitrary code; only point ann_dir at
    # pickles produced by a trusted run of this notebook.
    split_dicts=[]
    for split in ('train','dev','test'):
      pkl_path=os.path.join(ann_dir,'{}_ann_{}.pickle'.format(split,l))
      with open(pkl_path, 'rb') as handle:
        split_dicts.append(pickle.load(handle))
    train_d,dev_d,test_d=split_dicts

    train_dev_dict=Merge(train_d, dev_d)
    final_dict=Merge(train_dev_dict, test_d)

    # reset_index() exposes the ids as an 'index' column, the join key.
    label_df=pd.DataFrame.from_dict(final_dict,orient='index',columns=[l]).reset_index()

    df_data=df_data.merge(label_df,how='left',on='index')

  df_data.to_pickle(output_path)

  print('The shape of the final data set is: {}'.format(df_data.shape))


**Parse the Text and Ann Files**

In [16]:
# Input directories, one per data split
path_train='/content/drive/MyDrive/PHEE/input/clean/train'
path_dev='/content/drive/MyDrive/PHEE/input/clean/dev'
path_test='/content/drive/MyDrive/PHEE/input/clean/test'

# (input directory, output pickle path) for the text files of each split
txt_jobs=[
  (path_train,'/content/drive/MyDrive/PHEE/output/train_txt.pickle'),
  (path_dev,'/content/drive/MyDrive/PHEE/output/dev_txt.pickle'),
  (path_test,'/content/drive/MyDrive/PHEE/output/test_txt.pickle'),
]
for split_dir,txt_pickle in txt_jobs:
  parse_txt_file(split_dir,txt_pickle)

# (input directory, output path prefix) for the ann files of each split
ann_jobs=[
  (path_train,'/content/drive/MyDrive/PHEE/output/train_ann'),
  (path_dev,'/content/drive/MyDrive/PHEE/output/dev_ann'),
  (path_test,'/content/drive/MyDrive/PHEE/output/test_ann'),
]
for split_dir,ann_prefix in ann_jobs:
  parse_ann_file(split_dir,ann_prefix)

{'Combination', 'Age', 'Adverse_event:T16', 'Gender', 'Adverse_event:T19', 'Potential_therapeutic_event:T17', 'Potential_therapeutic_event:T3', 'Adverse_event:T24', 'Population', 'Adverse_event:T27', 'Adverse_event:T7', 'Race', 'Combination:T20', 'Combination:T40', 'Combination:T24', 'Potential_therapeutic_event:T16', 'is_a', 'Combination:T16', 'Adverse_event:T15', 'Potential_therapeutic_event:T11', 'Combination:T9', 'Combination:T13', 'Combination:T21', 'Combination:T17', 'Adverse_event:T2', 'Drug', 'Negation_cue', 'Adverse_event:T4', 'Potential_therapeutic_event:T8', 'Combination:T2', 'Adverse_Effect', 'Treatment', 'Potential_therapeutic_event:T12', 'has_cue', 'Combination:T7', 'has', 'Adverse_event:T26', 'Severity_cue', 'Freq', 'Potential_therapeutic_event:T24', 'Combination:T10', 'Effect', 'Adverse_event:T12', 'Potential_therapeutic_event:T13', 'Dosage', 'Combination:T6', 'Subject', 'Combination:T14', 'Combination:T23', 'Combination:T8', 'Potential_therapeutic_event:T27', 'Route', 

**Load the Parsed File Data and Merge the Text Data**

In [None]:
# Pickles written by parse_txt_file, in train / dev / test order
txt_pickle_paths=(
  '/content/drive/MyDrive/PHEE/output/train_txt.pickle',
  '/content/drive/MyDrive/PHEE/output/dev_txt.pickle',
  '/content/drive/MyDrive/PHEE/output/test_txt.pickle',
)
train_txt_dict,dev_txt_dict,test_txt_dict=[
  pkl_load_dict(pickle_path) for pickle_path in txt_pickle_paths
]

In [None]:
# Fold the three splits into one dictionary, then expose the ids as an
# 'index' column next to the 'Text' column.
train_dev_dict=Merge(dict1=train_txt_dict, dict2=dev_txt_dict)
text_dict=Merge(dict1=train_dev_dict, dict2=test_txt_dict)
df=(
  pd.DataFrame
  .from_dict(text_dict,orient='index',columns=['Text'])
  .reset_index()
)

**Create Dataframe of Raw Data**

In [None]:
# Join the per-label annotation columns onto the text dataframe and save it
df_creation(
  df_data=df,
  Merge=Merge,
  output_path='/content/drive/MyDrive/PHEE/output/data_df.pkl',
)

The shape of the final data set is: (4832, 19)
