# Migrate data from ToxRefDB to HAWC

Resource: User Guide (https://nepis.epa.gov/Exe/ZyPDF.cgi/P1015KWT.PDF?Dockey=P1015KWT.PDF)

In [None]:
import os

import django
import pandas as pd
import psycopg2
import requests
from asgiref.sync import sync_to_async  # access the ORM safely within the notebook
from django.urls import reverse
from psycopg2.extras import DictCursor

In [None]:
# update environment
# Use the local HAWC settings module unless DJANGO_SETTINGS_MODULE is already
# set, then initialise Django. This has to run before the `hawc.apps` model
# imports in the following cell.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "hawc.main.settings.local")
django.setup()

In [None]:
from hawc.apps.animalv2 import constants
from hawc.apps.assessment.models import DoseUnits, Species, Strain
from hawc.apps.vocab.models import Observation

### Set up API access
https://hawc.readthedocs.io/latest/client/#api-access

In [None]:
# login
# `session` is the shared HTTP session used by every API helper in this notebook.
session = requests.Session()

# base_url = https://hawcproject.org
base_url = "http://127.0.0.1:8000"

# Exchange the credentials for an API token.
login = requests.post(
    f"{base_url}/user/api/token-auth/",
    json={"username": "admin@hawcproject.org", "password": "pw"},
    timeout=10,
)
# Fail fast with the HTTP error rather than an opaque KeyError on "token" below.
login.raise_for_status()
# Every later request made through `session` carries the token header.
session.headers.update(Authorization=f"Token {login.json()['token']}")
login

In [None]:
# define urls
# Each value is a URL path resolved by Django's `reverse`; the request helpers
# below prepend `base_url` to it.
get_study_url = reverse("study:api:study-list")

# animalv2
animalgroup_url = reverse("animalv2:api:animal-group-list")
chem_url = reverse("animalv2:api:chemical-list")
data_extraction_url = reverse("animalv2:api:data-extraction-list")
dose_group_url = reverse("animalv2:api:dose-group-list")
dose_response_group_level_data_url = reverse("animalv2:api:dose-response-group-level-data-list")
endpoint_url = reverse("animalv2:api:endpoint-list")
experiment_url = reverse("animalv2:api:experiment-list")
observation_time_url = reverse("animalv2:api:observation-time-list")
treatment_url = reverse("animalv2:api:treatment-list")

# vocab
# nested ToxRefDB vocabulary, used later to look up endpoint terms
toxrefdb_vocab_url = reverse("vocab:api:toxrefdb-nested")

### Set up database functions

In [None]:
def connect_to_db(db_name, user, password, host, port):
    """Open a psycopg2 connection to ``db_name`` and report success on stdout.

    Returns:
        The open connection; closing it is left to the caller.
    """
    connection_settings = {
        "dbname": db_name,
        "user": user,
        "password": password,
        "host": host,
        "port": port,
    }
    connection = psycopg2.connect(**connection_settings)
    print(f"Connected to {db_name} successfully!")
    return connection


def fetch_data_from_source(query, conn):
    """Execute ``query`` on ``conn`` and return every row of the result.

    The cursor is opened for this call only and released when the ``with``
    block exits.
    """
    with conn.cursor() as db_cursor:
        db_cursor.execute(query)
        result_rows = db_cursor.fetchall()
    return result_rows


def create_model_objects(url, data_list):
    """POST each payload in ``data_list`` to a HAWC API endpoint.

    Args:
        url: Endpoint path relative to ``base_url``; a leading slash (as
            produced by ``reverse``) is accepted.
        data_list: Iterable of payload mappings; one request is sent per item.

    Returns:
        list: The response objects, in the same order as ``data_list``.
        Failures (any status other than 201) are printed, not raised.
    """
    # `reverse()` paths start with "/"; strip it so the joined URL has no "//".
    endpoint = f"{base_url}/{url.lstrip('/')}"
    responses = []
    for data in data_list:
        response = session.post(endpoint, data)
        responses.append(response)
        if response.status_code != 201:
            # Error bodies are not always JSON (e.g. an HTML error page), so
            # fall back to the raw text instead of raising while reporting.
            try:
                detail = response.json()
            except ValueError:
                detail = response.text
            print("status failed: ", detail)

    return responses


def get_model_objects(url):
    """GET a HAWC API endpoint and return the raw response.

    Args:
        url: Endpoint path relative to ``base_url`` (may include a query
            string); a leading slash, as produced by ``reverse``, is accepted.

    Returns:
        The response object; the status is not checked here.
    """
    # `reverse()` paths start with "/"; strip it so the joined URL has no "//".
    return session.get(f"{base_url}/{url.lstrip('/')}")


# Connect to the ToxRefDB source database
source_conn = connect_to_db(
    db_name="toxref_icf",
    user="hawc",
    password="",
    host="localhost",
    port="5432",
)
# DictCursor rows can be read by column name as well as by position
cur = source_conn.cursor(cursor_factory=DictCursor)

### Create ToxRefDB Query

In [None]:
# Study types are found in the User Guide (https://nepis.epa.gov/Exe/ZyPDF.cgi/P1015KWT.PDF?Dockey=P1015KWT.PDF)
# Data is queried one study type at a time to avoid reading too much into memory at once.
# (The official downloadable ToxRefDB seems to be 66.5 MB, so this shouldn't be an issue.)
STUDY_TYPES = ["CHR", "SUB", "SAC", "DEV", "MGR", "REP", "DNT", "ACU", "OTH"]

# Define query for study dose-response data
# Left joins keep studies that have no tg or dose info, so tg/dose/dtg/effect
# columns can be null in the result. The `%s` placeholder receives one study
# type per execution. `SELECT *` returns repeated column names across tables;
# the cells below drop the duplicates after loading into a DataFrame.
query = """SELECT * FROM prod_toxrefdb_2_1.chemical
LEFT JOIN prod_toxrefdb_2_1.study ON study.chemical_id=chemical.chemical_id
LEFT JOIN prod_toxrefdb_2_1.tg ON tg.study_id=study.study_id
LEFT JOIN prod_toxrefdb_2_1.dose ON dose.study_id=study.study_id
LEFT JOIN prod_toxrefdb_2_1.dtg ON dtg.tg_id=tg.tg_id AND dose.dose_id=dtg.dose_id
LEFT JOIN prod_toxrefdb_2_1.tg_effect ON tg.tg_id=tg_effect.tg_id
LEFT JOIN prod_toxrefdb_2_1.effect ON effect.effect_id=tg_effect.effect_id
LEFT JOIN prod_toxrefdb_2_1.endpoint ON endpoint.endpoint_id=effect.endpoint_id
LEFT JOIN prod_toxrefdb_2_1.dtg_effect ON tg_effect.tg_effect_id=dtg_effect.tg_effect_id AND dtg.dtg_id=dtg_effect.dtg_id
WHERE study.study_type = %s"""

# Narrower query (chemical, study and dose tables only) used when creating
# the DSSTox, experiment and chemical objects.
chem_study_query = """SELECT * FROM prod_toxrefdb_2_1.chemical
LEFT JOIN prod_toxrefdb_2_1.study ON study.chemical_id=chemical.chemical_id
LEFT JOIN prod_toxrefdb_2_1.dose ON dose.study_id=study.study_id
WHERE study.study_type = %s;
"""

### Create Experiment and Chemical objects
Uses info from ToxRefDB study and chemical tables

In [None]:
# Read in all chemical dsstox_substance_ids.
# A single set de-duplicates ids shared across study types so each one is
# posted only once; rows without an id are skipped because they cannot form a
# valid payload.
unique_dsstox_ids = set()
for study_type in STUDY_TYPES:
    cur.execute(chem_study_query, (study_type,))
    unique_dsstox_ids.update(
        row["dsstox_substance_id"]
        for row in cur.fetchall()
        if row["dsstox_substance_id"] is not None
    )

# sorted so the requests are issued in a reproducible order
dsstox_ids = sorted(unique_dsstox_ids)

# Create new DSSTox objects if missing
dsstox_array = [{"dtxsid": value} for value in dsstox_ids]

print("### creating dsstox items ###")
responses = create_model_objects(reverse("assessment:api:dsstox-list"), dsstox_array)

In [None]:
# map ToxRef design field values to HAWC constants
# Maps a ToxRefDB study_type code to the HAWC ExperimentDesign choice.
# NOTE(review): STUDY_TYPES also contains "SAC" and "ACU"; they have no entry
# here because the matching ExperimentDesign members are not known from this
# notebook -- TODO add them, otherwise a direct lookup raises KeyError.
experiment_type_map = {
    "CHR": constants.ExperimentDesign.CH,
    "SUB": constants.ExperimentDesign.SB,
    "DEV": constants.ExperimentDesign.DV,
    "MGR": constants.ExperimentDesign.R2,  #  2-generation reproductive
    "REP": constants.ExperimentDesign.RP,
    "DNT": constants.ExperimentDesign.OT,  #  other
    "OTH": constants.ExperimentDesign.OT,  #  other
}

In [None]:
print("### Create a new HAWC Experiment and Chemical for each ToxRef study item ###")

# ToxRefDB study_id -> HAWC experiment id
study_to_exp_map = {}
# ToxRefDB chemical_id -> HAWC chemical id
# NOTE(review): one HAWC chemical is created per study, but this map is keyed by
# chemical_id, so a chemical used by several studies keeps only the id created
# last -- confirm that is the intended lookup for treatments.
toxref_to_hawc_chem_map = {}


def _lookup_hawc_study_id(citation, default=1):
    """Return the id of the HAWC study whose full citation is ``citation``.

    Falls back to ``default`` when the request fails or nothing matches.
    """
    # `params` lets requests URL-encode the citation (spaces, "&", "#", ...).
    response = session.get(f"{base_url}{get_study_url}", params={"full_citation": citation})
    if not response.ok:
        return default
    try:
        payload = response.json()
    except ValueError:
        return default
    # assumes the list endpoint returns either a bare list or a paginated
    # {"results": [...]} object -- TODO confirm against the study API
    results = payload.get("results") if isinstance(payload, dict) else payload
    if not isinstance(results, list):
        return default
    for item in results:
        # when the serializer exposes full_citation, require an exact match so
        # an ignored filter cannot silently attach the wrong study
        if isinstance(item, dict) and item.get("full_citation", citation) == citation:
            return item.get("id", default)
    return default


for study_type in STUDY_TYPES:
    cur.execute(chem_study_query, (study_type,))
    rows = cur.fetchall()

    # Read into a df for data manipulation
    column_names = [desc[0] for desc in cur.description]
    df = pd.DataFrame(rows, columns=column_names)
    df = df.loc[:, ~df.columns.duplicated()]

    # keep the first row of each study; it carries the study and chemical columns
    for _, row in df.drop_duplicates(subset="study_id").iterrows():
        # Create a new experiment object
        # assumption: ToxRefDB study_citation will map to HAWC Study full_citation
        # assumption: all ToxRef studies will exist in HAWC
        hawc_study_id = _lookup_hawc_study_id(row["study_citation"])

        experiment_data = {
            "study_id": hawc_study_id,
            "name": f'{row["dose_end"]} {row["dose_end_unit"]} {row["admin_route"]} {row["study_type"]} {row["preferred_name"]}',  # Ex: 30 day oral CHR Isazofos
            # study types without an explicit mapping fall back to "other"
            # instead of raising KeyError
            "design": experiment_type_map.get(
                row["study_type"].upper(), constants.ExperimentDesign.OT
            ),
            "has_multiple_generations": row["study_type"] == "MGR",
            "guideline": row["study_type_guideline"],
            "comments": row["study_comment"] or "",
        }
        experiment = create_model_objects(experiment_url, [experiment_data])[0].json()
        # map ToxRefDB study to new  experiment
        study_to_exp_map[row["study_id"]] = experiment.get("id")

        # Create a new HAWC Chemical for each ToxRef study (many-to-one study->chemical to one-to-one study->chemical)
        vehicle = row["vehicle"]
        if pd.isna(vehicle):
            inhalation_study = row["admin_route"].lower() == "inhalation"
            vehicle = "not reported, assumed clean air." if inhalation_study else "not reported"

        chemical_data = {
            "name": row["preferred_name"],
            "dtxsid_id": row["dsstox_substance_id"],
            "cas": row["casrn"],
            "experiment": experiment.get("id"),
            "source": row["substance_source_name"] or "",
            "purity": row["substance_purity"] or "",
            "comments": row["substance_comment"] or "",
            "vehicle": vehicle,
        }
        chemical = create_model_objects(chem_url, [chemical_data])[0].json()
        toxref_to_hawc_chem_map[row["chemical_id"]] = chemical.get("id")

print(study_to_exp_map)
print(toxref_to_hawc_chem_map)

### Create HAWC Animalgroup objects

Uses info from ToxRefDB tg and study tables

In [None]:
# map ToxRef field values to HAWC constants
# ToxRefDB life_stage value -> HAWC Lifestage choice; a missing life stage
# (None) maps to an empty string.
life_stage_map = {
    "adult-pregnancy": constants.Lifestage.AG,
    "fetal": constants.Lifestage.DEV,  # verify
    "adult": constants.Lifestage.ADULT,
    "juvenile": constants.Lifestage.JUV,
    None: "",
}

# ToxRefDB admin_route (lower-cased by the caller) -> HAWC RouteExposure choice.
# NOTE(review): only "oral" is mapped, yet the chemical cell explicitly handles
# "inhalation" studies; the treatment cell indexes this map directly, so any
# other route raises KeyError -- TODO add the remaining routes.
route_exposure_map = {"oral": constants.RouteExposure.OR}

# ToxRefDB effect_var_type -> label of the HAWC VarianceType choice.
# An empty string is treated as SD.
variance_type_map = {
    "NA": constants.VarianceType.NA.label,
    "NR": constants.VarianceType.NR.label,
    "SD": constants.VarianceType.SD.label,
    "SE": constants.VarianceType.SE.label,
    "": constants.VarianceType.SD.label,
}

# ToxRefDB time_unit -> label of the HAWC ObservationTimeUnits choice.
observation_time_map = {
    "week": constants.ObservationTimeUnits.WK.label,
    "lactation week": constants.ObservationTimeUnits.WK.label,
    "month": constants.ObservationTimeUnits.MON.label,
    "day": constants.ObservationTimeUnits.DAY.label,
    "LD": constants.ObservationTimeUnits.DAY.label,  # TODO: day vs lactation day?
    "GD": constants.ObservationTimeUnits.GD.label,
    "PND": constants.ObservationTimeUnits.PND.label,
}

# Lower-cased string form of the ToxRefDB treatment_related flag -> label of
# the HAWC TreatmentRelatedEffect choice. Only "true" and "false" are mapped.
treatment_related_map = {
    "true": constants.TreatmentRelatedEffect.YES.label,
    "false": constants.TreatmentRelatedEffect.NO.label,
}

Create a new HAWC treatment for each ToxRef tg, plus a HAWC animalgroup (and its species and strain, if they do not exist yet) when the tg has an associated effect

In [None]:
animalgroup_objects = []
toxref_to_hawc_tg_map = {}

for type in STUDY_TYPES:
    cur.execute(query, (type,))
    rows = cur.fetchall()

    # Read into a df for data manipulation
    column_names = [desc[0] for desc in cur.description]
    df = pd.DataFrame(rows, columns=column_names)
    df = df.loc[:, ~df.columns.duplicated()]

    # filter unique treatment groups
    unique_tgs = df["tg_id"].unique()
    for tg_id in unique_tgs:
        row = df[df["tg_id"] == tg_id].iloc[0]
        experiment_id = study_to_exp_map[row["study_id"]]

        dose_duration = row["dose_duration"]
        if pd.isna(row["dose_duration"]):
            dose_duration = ""

        # TODO: ask preferred name. Ex: 13 Day Oral Tebupirimfos
        exposure_duration_description = f'{dose_duration} {row["dose_duration_unit"]}'
        treatment_data = {
            "experiment": experiment_id,
            "name": f'{exposure_duration_description} {row["admin_route"]} {row["preferred_name"]}',
            "chemical_id": toxref_to_hawc_chem_map[row["chemical_id"]],
            "route_of_exposure": route_exposure_map[row["admin_route"].lower()],
            "exposure_duration": dose_duration,
            "exposure_duration_description": exposure_duration_description,
            "comments": row["tg_comment"] or "",
        }
        treatment = create_model_objects(treatment_url, [treatment_data])
        toxref_to_hawc_tg_map[tg_id] = treatment[0].json().get("id")

        # create new animalgroup using tg and tg_effect
        if pd.notna(row["tg_effect_id"]):
            # create new species and strain if necessary
            species, created = await sync_to_async(Species.objects.get_or_create)(
                name=row["species"]
            )
            strain, created = await sync_to_async(Strain.objects.get_or_create)(
                name=row["strain"], species_id=species.id
            )

            if len(row["generation"]) > 2:
                gen = constants.Generation.F1  # default generation?
            else:
                gen = row["generation"]

            sex = constants.Sex.COMBINED if row["sex"] == "MF" else row["sex"]
            animalgroup_data = {
                "experiment": experiment_id,
                "name": f"{sex} {strain.name} {species.name}",  # ex: female wistar rat
                "sex": sex,
                "comments": row["tg_comment"] or "",
                "generation": gen,
                "lifestage_at_assessment": life_stage_map[row["life_stage"]],
                "lifestage_at_exposure": life_stage_map[row["life_stage"]],
                "species_id": species.id,
                "strain_id": strain.id,
            }
            create_model_objects(animalgroup_url, [animalgroup_data])[0].json()

print(toxref_to_hawc_tg_map)

Create HAWC dosegroup, endpoint, observation time, dataextraction, and dose_response_group_level_data objects from ToxRef dose, dtg, dtg_effect, tg, and tg_effect

In [None]:
# get toxrefdb vocab terms
# Fetch the ToxRefDB vocabulary terms from HAWC and load them into a DataFrame
vocab_response = get_model_objects(toxrefdb_vocab_url)
toxrefdb_terms_df = pd.DataFrame(vocab_response.json())

# Drop the "_id" suffix from the term id columns (e.g. "name_term_id" -> "name_term")
term_fields = ("name_term", "system_term", "effect_term", "effect_subtype_term")
toxrefdb_terms_df = toxrefdb_terms_df.rename(
    columns={f"{field}_id": field for field in term_fields}
)

In [None]:
# arbitrary, unsure of the difference btw id and dose_id
dose_group_id = 13000
toxref_to_hawc_endpoint_map = {}

pd.set_option("future.no_silent_downcasting", True)

for type in STUDY_TYPES:
    cur.execute(query, (type,))
    rows = cur.fetchall()

    # Read into a df for data manipulation
    column_names = [desc[0] for desc in cur.description]
    df = pd.DataFrame(rows, columns=column_names)
    df = df.loc[:, ~df.columns.duplicated()]

    # filter unique doses
    unique_doses = df["dose_id"].unique()
    for dose_id in unique_doses:
        dose_df = df[df["dose_id"] == dose_id]

        # format data
        dose_df.loc[:, "effect_var_type"] = dose_df["effect_var_type"].fillna(
            constants.VarianceType.NA.label
        )
        dose_df.loc[:, "dtg_effect_comment"] = dose_df["dtg_effect_comment"].fillna("")
        dose_df.loc[:, "effect_val_unit"] = dose_df["effect_val_unit"].fillna(
            ""
        )  # set this to the default
        dose_df["critical_effect"] = (
            dose_df["critical_effect"]
            .fillna(False)
            .replace({"True": True, "False": False})
            .astype(bool)
        )

        # filter unique dose treatment groups and effects for each dose
        unique_dtg = dose_df["dtg_id"].unique()
        unique_tg_effect = dose_df["tg_effect_id"].unique()

        # create a hawc dosegroup for each toxrefdb dtg
        for dtg_id in unique_dtg:
            row = dose_df[dose_df["dtg_id"] == dtg_id].iloc[0]

            dose_adjusted = row["dose_adjusted"]
            if pd.isna(dose_adjusted):
                dose_adjusted = row["conc"] if not pd.isna(row["conc"]) else 0
                dose_unit, created = await sync_to_async(DoseUnits.objects.get_or_create)(
                    name="ppm"
                )
            else:
                dose_unit = await sync_to_async(DoseUnits.objects.get)(name="mg/kg/d")

            dosegroup_data = {
                "dose": dose_adjusted,
                "dose_group_id": dose_group_id,  # TODO: id vs dose_group_id?
                "dose_units": dose_unit,
                "treatment_id": toxref_to_hawc_tg_map[row["tg_id"]],
            }
            create_model_objects(dose_group_url, [dosegroup_data])[0].json()
            # increment manual dose group id
            dose_group_id += 1

        # filter unique treatment group effects
        for tg_effect_id in sorted(unique_tg_effect):
            dtg_effect_df = dose_df[dose_df["tg_effect_id"] == tg_effect_id]

            for dtg_effect_id in dtg_effect_df["dtg_effect_id"]:
                row = dtg_effect_df[dtg_effect_df["dtg_effect_id"] == dtg_effect_id].iloc[0]

                # get endpoint mapping
                endpoint_vocab = {
                    "name": row["effect_desc"],
                    "system": row["endpoint_category"],
                    "effect": row["endpoint_type"],
                    "effect_subtype": row["endpoint_target"],
                }
                # get term id info from existing toxrefdb df
                condition = (
                    toxrefdb_terms_df[list(endpoint_vocab)] == pd.Series(endpoint_vocab)
                ).all(axis=1)
                term_info = toxrefdb_terms_df[condition].iloc[0]
                endpoint_data = {"experiment": study_to_exp_map[row["study_id"]], **term_info}
                if (
                    f"{row["endpoint_id"]} {study_to_exp_map[row["study_id"]]}"
                    not in toxref_to_hawc_endpoint_map.keys()
                ):
                    # create a new endpoint
                    endpoint_id = (
                        create_model_objects(endpoint_url, [endpoint_data])[0].json().get("id")
                    )
                    toxref_to_hawc_endpoint_map[
                        f"{row["endpoint_id"]} {study_to_exp_map[row["study_id"]]}"
                    ] = endpoint_id
                else:
                    endpoint_id = toxref_to_hawc_endpoint_map[
                        f"{row["endpoint_id"]} {study_to_exp_map[row["study_id"]]}"
                    ]

                # Create new observation time model if needed
                if not pd.isna(row["time_unit"]):
                    obs_time_data = {
                        "observation_time": row["time"],
                        "observation_time_text": row["time"],
                        "observation_time_units": observation_time_map[row["time_unit"]],
                        "endpoint_id": endpoint_id,
                    }
                    obs_time = create_model_objects(observation_time_url, [obs_time_data])[0].json()

                    # if data extraction info is available, create model
                    data_extraction_data = {
                        "experiment": study_to_exp_map[row["study_id"]],
                        "treatment_id": toxref_to_hawc_tg_map[row["tg_id"]],
                        "endpoint_id": endpoint_id,
                        "dataset_type": constants.DatasetType.NOT_REPORTED,  # no toxrefdb equivalent
                        "method_to_control_for_litter_effects": constants.MethodToControlForLitterEffects.NR.label,  # no toxrefdb equivalent
                        "is_qualitative_only": False
                        if row["no_quant_data_reported"] == "f"
                        else True,
                        "variance_type": variance_type_map[row["effect_var_type"]],
                        "response_units": str(row["effect_val_unit"])[
                            :32
                        ],  # this is limited to 32 characters, unsure if we standardise somewhere
                        "dose_response_observations": row["effect_comment"] or "NA",
                        "observation_timepoint_id": obs_time.get("id"),
                        "result_details": f"{row['dtg_effect_comment']} {row['effect_comment']}",
                    }
                    data_extraction = create_model_objects(
                        data_extraction_url, [data_extraction_data]
                    )[0].json()

                    statistically_significant = (
                        constants.StatisticallySignificant.YES
                        if row["treatment_related"] == "t"
                        else constants.StatisticallySignificant.NO
                    )

                    # select appropriate dose column
                    dose_col = "dose_adjusted"
                    if pd.isna(dose_adjusted):
                        dose_col = "conc"
                    dose_adjusted = row[dose_col]

                    # calculate NOEL: lowest dose with a critical effect
                    filtered_df = dtg_effect_df[dtg_effect_df["critical_effect"]]
                    LOEL = filtered_df[dose_col].min()

                    # calculate LOEL: highest dose with no critical effect
                    filtered_df = dtg_effect_df[~dtg_effect_df["critical_effect"]]
                    NOEL = filtered_df[dose_col].max()

                    effect_val = row["effect_val"]
                    if pd.isna(effect_val):
                        effect_val = 0

                    dose_response_group_level_data = {
                        "data_extraction_id": data_extraction.get("id"),
                        "treatment_name": f'{row["dose_duration"]} {row["dose_duration_unit"]} {row["admin_route"]} {row["preferred_name"]}',  # from treatment model
                        "dose": dose_adjusted,  # TODO: check this out in terms mapping w dtg
                        "n": None if pd.isna(row["n"]) else row["n"],
                        "response": effect_val,  # TODO verify mapping and default
                        "variance": None if pd.isna(row["effect_var"]) else row["effect_var"],
                        "treatment_related_effect": treatment_related_map[
                            str(row["treatment_related"]).lower()
                        ],
                        "statistically_significant": statistically_significant,
                        "p_value": 0.05
                        if row["treatment_related"] == "t"
                        else "",  # TODO: ask about default p_value
                        "NOEL": round(NOEL)
                        if not pd.isna(NOEL)
                        else -999,  # HAWC requires ints not floats
                        "LOEL": round(LOEL)
                        if not pd.isna(LOEL)
                        else -999,  # HAWC requires ints not floats
                    }
                    create_model_objects(
                        dose_response_group_level_data_url, [dose_response_group_level_data]
                    )

## Migrate observation data

In [None]:
obs_query = (
    """SELECT study_id, endpoint_id, tested_status, reported_status FROM prod_toxrefdb_2_1.obs"""
)

obs_data = fetch_data_from_source(obs_query, source_conn)

# only migrate toxrefdb observations linked to an existing toxrefdb endpoint effect?
obs_mapping = [
    {
        "experiment_id": study_to_exp_map[row[0]],
        # key: endpoint_id and experiment_id
        "endpoint_id": f"{toxref_to_hawc_endpoint_map[row[1]]} {study_to_exp_map[row[0]]}",
        "tested_status": row[2],
        "reported_status": row[3],
    }
    for row in obs_data
]

for row in obs_mapping:
    await sync_to_async(Observation.objects.get_or_create)(**row)

In [None]:
# Close the cursor and connection
# Run this last: every query cell above uses `cur`, and the observation cell
# uses `source_conn`.
cur.close()
source_conn.close()