In [1]:
# Import our Python client
from hawc_client import HawcClient
# We will need pandas for DataFrames and getpass for login
import pandas as pd
import numpy
import getpass


# Login to HAWC

In [54]:
# Where to connect, and as whom
domain = input("Domain: ")
username = input("Username: ")
# The password is requested through getpass rather than a plain input() prompt
password = getpass.getpass("Password: ")

# Client instance that the later cells use for every HAWC call
client = HawcClient(domain)
client.authenticate(username, password)

print("Authentication successful")

Domain:  https://hawcprd.epa.gov
Username:  rabstejnek.daniel@epa.gov
Password:  ·············


Authentication successful


# Upload Excel file

In [70]:
# Get the pathname for your file. Surrounding whitespace and quotes are removed
# because a path pasted via Windows "Copy as path" arrives wrapped in double
# quotes, which would otherwise make the file impossible to open.
filepath = input("Excel filepath: ").strip().strip("\"'")
hero_column = input("HERO column name: ")
tag_column = input("Tag column name: ")
# Now we read in the file as a dataframe
excel_df = pd.read_excel(filepath)
# Check both columns before failing so one run reports every missing name,
# and list what the sheet does contain to make typos easy to spot.
missing_columns = [
    column for column in (hero_column, tag_column) if column not in excel_df.columns
]
if missing_columns:
    problems = "; ".join(
        f"Column '{column}' not in Excel sheet" for column in missing_columns
    )
    raise ValueError(f"{problems} (available columns: {list(excel_df.columns)})")
print("Excel file read")
print(excel_df[[hero_column, tag_column]])

Excel filepath:  C:\Users\drabstej\OneDrive - Environmental Protection Agency (EPA)\Profile\Documents\PFAS-CF4_PQ_09082020_for_HAWC Tree.xlsx
HERO column name:  HERO ID
Tag column name:  Tag


Excel file read
     HERO ID                                        Tag
0    1053534                            TIAB - Excluded
1    1089845                            TIAB - Excluded
2    1182073                            TIAB - Excluded
3    1186894               TIAB - Supplemental material
4    1186894  Supplemental Material Type -> Mechanistic
..       ...                                        ...
426  6590794                            TIAB - Excluded
427  6590799                            TIAB - Excluded
428  6591429                            TIAB - Excluded
429  6591843                            TIAB - Excluded
430  6591991                            TIAB - Excluded

[431 rows x 2 columns]


# Import HERO IDs

In [71]:
# Convert to int right away: a mistyped ID then fails here with a clear
# ValueError instead of surfacing later inside one of the HAWC API calls.
assessment_id = int(input("Assessment ID: "))

Assessment ID:  100500223


In [37]:
# Ensure that all of the HERO ids are in HAWC as references.
# Blank cells are dropped and the remaining values cast to integers: a single
# empty cell otherwise turns the whole column into floats, which would put
# values such as "1053534.0" and "nan" into the import request.
hero_ids = set(excel_df[hero_column].dropna().astype("int64"))
print("HERO Import Parameters")
title = input("Import title: ")
description = input("Import description: ")

HERO Import Parameters


Import title:  PFAS-CF4_PQ_09082020
Import description:  


In [38]:
print("Importing HERO IDs...")
# The call is the last expression so its return value is shown as the cell output
client.lit.import_hero(
    assessment_id,
    title,
    description,
    hero_ids,
)

Importing HERO IDs...


{'assessment': 100500223,
 'search_type': 'i',
 'source': 2,
 'title': 'PFAS-CF4_PQ_09082020',
 'slug': 'pfas-cf4_pq_09082020',
 'description': '',
 'search_string': '6590464,6590470,6588427,6590476,6588432,4155416,6588447,6588451,6590501,6588460,6588461,6588464,6590515,6590536,6588490,6588491,6588494,6588497,6588498,6588504,6588506,6590570,3569773,6588548,6588549,6588551,6590602,6588561,6588562,6588567,6590616,6590618,6590631,6590632,6588590,6590645,6590649,6590650,6590652,6588607,6588609,6590657,6590659,6588620,6590673,5413076,6588629,6590677,6590679,5923033,6586614,6586615,6586616,6586617,6588663,6588664,6586622,6586624,6586625,6586629,6590729,6586635,6586637,6586638,6586640,6586642,6586643,6588692,6586649,6586652,6586653,6586659,6586660,6588715,6586670,6586671,6320436,1089845,6586677,6588729,6586682,6586685,6586689,6586690,6586695,6586697,6590794,6588749,6328654,6588751,6586704,6586705,6590799,6586707,6586709,6586710,6586711,6586712,6586714,6586716,6586718,6588767,6586720,6586721,6

# Map Excel tags to HAWC tags

## Excel tags

In [72]:
# Distinct tags from the Excel sheet. Blank cells are dropped first so that the
# mapping step is never asked to map a NaN placeholder.
excel_tags = excel_df[tag_column].dropna().unique()
print("Tags in Excel:")
# to_string() prints every tag in full; the default repr abbreviates long
# names with "...", which hides exactly the part needed to tell tags apart.
print(pd.Series(excel_tags).to_string())

Tags in Excel:
0                                       TIAB - Excluded
1                          TIAB - Supplemental material
2             Supplemental Material Type -> Mechanistic
3     Supplemental Material Type -> Non-inhalation r...
4            Supplemental Material Type -> Case reports
5                Supplemental Material Type -> Exposure
6                    Supplemental Material Type -> ADME
7                 Supplemental Material Type -> Mixture
8                                  Full-text - Excluded
9                                       TIAB - Included
10                    Full-text - Supplemental material
11                              Evidence Type -> Animal
12       Supplemental Material Type -> No original data
13    Supplemental Material Type -> Conference abstr...
dtype: object


## HAWC tags

In [73]:
# Get and print assessment tags
assessment_tags = client.lit.tags(assessment_id)
print("Tags in HAWC:")
# to_string() shows every row and the full nested name. The default repr
# shortens long names and elides rows of large frames, which would hide the
# IDs that have to be typed in during the mapping step.
print(assessment_tags[["id", "nested_name"]].to_string())

Tags in HAWC:
           id                                        nested_name
0   100506391                                          Inclusion
1   100506393                             Inclusion|Animal Study
2   100507002                              Inclusion|Human Study
3   100506395                                          Exclusion
4   100506396                                     Exclusion|TIAB
5   100506397                                Exclusion|Full-text
6   100506987                                       Supplemental
7   100506988                           Supplemental|Mechanistic
8   100506989                                  Supplemental|ADME
9   100506990  Supplemental|Non-inhalation route of administr...
10  100506991                              Supplemental|Mixtures
11  100506992                              Supplemental|Exposure
12  100506993                   Supplemental|Conference abstract
13  100506994                          Supplemental|Case reports
14  1005069

## Map tags

In [74]:
print("Map the Excel tags with their matching HAWC tag ID.")
# Build the id -> nested_name lookup once instead of filtering the DataFrame for
# every tag. drop_duplicates keeps the first row per id, i.e. the same row a
# positional first-match lookup would return.
hawc_tag_names = (
    assessment_tags.drop_duplicates("id").set_index("id")["nested_name"].to_dict()
)
tag_id_mapping = {}
tag_name_mapping = {}
for excel_tag in excel_tags:
    # Re-prompt on bad input rather than raising: an exception here would abort
    # the loop and force the user to re-enter every mapping from the start.
    while True:
        value = input(excel_tag).strip()
        if value == "":
            # Blank answer means this Excel tag is deliberately left unmapped
            tag_name_mapping[excel_tag] = "<omitted>"
            print("Tag omitted")
            break
        try:
            tag_id = int(value)
        except ValueError:
            print(f"'{value}' is not a number; enter a HAWC tag ID or leave blank to omit")
            continue
        if tag_id not in hawc_tag_names:
            print(f"Tag ID {tag_id} is not in the HAWC tag list; try again or leave blank to omit")
            continue
        tag_id_mapping[excel_tag] = tag_id
        tag_name_mapping[excel_tag] = hawc_tag_names[tag_id]
        print(f"'{excel_tag}' mapped to '{tag_name_mapping[excel_tag]}'")
        break

Map the Excel tags with their matching HAWC tag ID.


TIAB - Excluded 100506396


'TIAB - Excluded' mapped to 'Exclusion|TIAB'


TIAB - Supplemental material 


Tag omitted


Supplemental Material Type -> Mechanistic 100506988


'Supplemental Material Type -> Mechanistic' mapped to 'Supplemental|Mechanistic'


Supplemental Material Type -> Non-inhalation route of administration 100506990


'Supplemental Material Type -> Non-inhalation route of administration' mapped to 'Supplemental|Non-inhalation route of administration'


Supplemental Material Type -> Case reports 100506994


'Supplemental Material Type -> Case reports' mapped to 'Supplemental|Case reports'


Supplemental Material Type -> Exposure 100506992


'Supplemental Material Type -> Exposure' mapped to 'Supplemental|Exposure'


Supplemental Material Type -> ADME 100506989


'Supplemental Material Type -> ADME' mapped to 'Supplemental|ADME'


Supplemental Material Type -> Mixture 100506991


'Supplemental Material Type -> Mixture' mapped to 'Supplemental|Mixtures'


Full-text - Excluded 100506397


'Full-text - Excluded' mapped to 'Exclusion|Full-text'


TIAB - Included 


Tag omitted


Full-text - Supplemental material 


Tag omitted


Evidence Type -> Animal 100506393


'Evidence Type -> Animal' mapped to 'Inclusion|Animal Study'


Supplemental Material Type -> No original data 100506995


'Supplemental Material Type -> No original data' mapped to 'Supplemental|No original data'


Supplemental Material Type -> Conference abstracts 100506993


'Supplemental Material Type -> Conference abstracts' mapped to 'Supplemental|Conference abstract'


## Result

In [75]:
# Two-column summary of the mapping: one row per Excel tag, in the order entered
tag_mapping_df = pd.DataFrame(
    {
        "Excel Tag": list(tag_name_mapping.keys()),
        "HAWC Tag": list(tag_name_mapping.values()),
    }
)
tag_mapping_df

Unnamed: 0,Excel Tag,HAWC Tag
0,TIAB - Excluded,Exclusion|TIAB
1,TIAB - Supplemental material,<omitted>
2,Supplemental Material Type -> Mechanistic,Supplemental|Mechanistic
3,Supplemental Material Type -> Non-inhalation r...,Supplemental|Non-inhalation route of administr...
4,Supplemental Material Type -> Case reports,Supplemental|Case reports
5,Supplemental Material Type -> Exposure,Supplemental|Exposure
6,Supplemental Material Type -> ADME,Supplemental|ADME
7,Supplemental Material Type -> Mixture,Supplemental|Mixtures
8,Full-text - Excluded,Exclusion|Full-text
9,TIAB - Included,<omitted>


# Import reference tags

In [76]:
# Pair each HERO id with the HAWC tag id chosen for its Excel tag.
# Rows whose tag was omitted (no entry in tag_id_mapping) map to NaN and are
# removed by dropna(), as are rows with a blank HERO id.
# Only the two needed columns are taken, so no copy of the full sheet is made.
hero_tag_df = (
    pd.DataFrame(
        {
            "hero_id": excel_df[hero_column],
            "tag_id": excel_df[tag_column].map(tag_id_mapping),
        }
    )
    .dropna()
    # The NaNs introduced by map() force the column to float; cast back so the
    # ids are written to CSV as "100506396" rather than "100506396.0".
    .astype({"tag_id": "int64"})
)

In [77]:
# Fetch the reference id / HERO id pairs for this assessment.
ref_to_hero = client.lit.reference_ids(assessment_id)
# Join on hero_id to attach tag ids to HAWC reference ids, then keep only the
# (reference_id, tag_id) pairs with repeats removed.
ref_tag_df = (
    pd.merge(ref_to_hero, hero_tag_df, on="hero_id")
    .loc[:, ["reference_id", "tag_id"]]
    .drop_duplicates()
)

In [78]:
# Send the reference/tag pairs to HAWC as CSV text (index column left out).
# The operation can be either "append" or "replace"
client.lit.import_reference_tags(
    assessment_id,
    csv=ref_tag_df.to_csv(index=False),
    operation="append",
)

Unnamed: 0,reference_id,tag_id
0,101010540,100506396
1,101010541,100506396
2,101010542,100506396
3,101010543,100506988
4,101010544,100506396
...,...,...
392,101010926,100506396
393,101010927,100506396
394,101010928,100506396
395,101010929,100506396
