In [55]:
import pandas as pd
import json
from pathlib import Path
from textwrap import dedent
import psycopg2

In [56]:
# Database handle shared by every query cell below.
connection = psycopg2.connect(dsn="dbname=hawc user=hawc")

# The spreadsheet's "original_id" column supplies the study ids to export.
studies_excel = pd.read_excel(io="data/chloroform-studies.xlsx")
original_studies = studies_excel.loc[:, "original_id"].tolist()

# Accumulates the JSON-serialised tables, grouped by section name.
writes = dict()

In [57]:
def simple_query(table, field_ids, field, *additional_fields):
    """Build a ``SELECT *`` statement filtering ``table`` by a list of ids.

    The statement keeps every row where ``field`` -- or any column named in
    ``additional_fields`` -- holds one of ``field_ids``.

    Args:
        table: Name of the table to select from (emitted double-quoted).
        field_ids: Iterable of id values; each is rendered with ``str()``.
        field: Column compared against ``field_ids``.
        *additional_fields: Extra columns OR-ed into the filter, each compared
            against the same id list.

    Returns:
        The SQL text as a string. When ``field_ids`` is empty the statement
        uses ``WHERE FALSE`` so that it is valid SQL and matches no rows.

    Note:
        All values are interpolated directly into the SQL text, so only pass
        trusted table/column names and ids.
    """
    id_list = ", ".join(str(field_id) for field_id in field_ids)

    if not id_list:
        # An empty "IN ()" list is a syntax error in PostgreSQL; with nothing
        # to match, return a query that selects no rows instead.
        return dedent(
            f"""
            SELECT * FROM "{table}"
            WHERE FALSE
            """
        )

    # Each clause keeps its trailing space so the generated text is unchanged
    # from the previous implementation for non-empty input.
    other_clauses = "".join(
        f'OR "{extra_field}" IN ({id_list}) ' for extra_field in additional_fields
    )

    return dedent(
        f"""
        SELECT * FROM "{table}"
        WHERE "{field}" IN ({id_list}) {other_clauses}
        """
    )

## Study

In [58]:
# Study table: rows of study_study whose reference_ptr_id is in original_studies
query = simple_query(
    table="study_study",
    field_ids=original_studies,
    field="reference_ptr_id",
)
study_table = pd.read_sql_query(sql=query, con=connection)
print(f"Number of records: {study_table.shape[0]}")
study_table.head(n=1)

Number of records: 19


Unnamed: 0,reference_ptr_id,short_citation,full_citation,coi_reported,coi_details,funding_source,summary,study_identifier,contact_author,ask_author,published,bioassay,epi,epi_meta,in_vitro,editable
0,471401,"Hoechst Aktiengesellschaft, 1990, 1069586",Baeder C and Hofmann T. Chloroform: Supplement...,6,The study was sponsored by Hoechst AG and Dow ...,The study was sponsored by Hoechst AG and Dow ...,"<p>Dam body weight, dams with live fetuses, fe...","Hoechst Aktiengesellschaft, 1990",False,,True,True,False,False,False,True


In [59]:
# Register the study section, keyed by database table name
writes["study"] = dict(study_study=study_table.to_json())

## Animal

In [60]:
# Experiment table: rows of animal_experiment whose study_id is in original_studies
query = simple_query(
    table="animal_experiment",
    field_ids=original_studies,
    field="study_id",
)
experiment_table = pd.read_sql_query(sql=query, con=connection)
print(f"Number of records: {experiment_table.shape[0]}")
experiment_table.head(n=1)

Number of records: 10


Unnamed: 0,id,name,type,description,created,last_updated,study_id,cas,purity_available,purity,chemical,chemical_source,guideline_compliance,vehicle,purity_qualifier,has_multiple_generations,dtxsid_id
0,100500739,2-Week Inhalation,St,,2020-12-04 16:04:55.735909-05:00,2020-12-04 16:04:55.735936-05:00,101043904,67-66-3,False,,Chloroform,not reported,,air,,False,DTXSID1020306


In [61]:
# Animal group table: rows of animal_animalgroup belonging to the experiments fetched above
experiment_ids = experiment_table.loc[:, "id"].tolist()
query = simple_query(
    table="animal_animalgroup",
    field_ids=experiment_ids,
    field="experiment_id",
)
animal_group_table = pd.read_sql_query(sql=query, con=connection)
print(f"Number of records: {animal_group_table.shape[0]}")
animal_group_table.head(n=1)

Number of records: 20


Unnamed: 0,id,experiment_id,name,species_id,strain_id,sex,siblings_id,created,last_updated,dosing_regime_id,generation,lifestage_assessed,lifestage_exposed,animal_source,comments,diet
0,203169,201725,Wistar Rat (Fetuses),1,3,C,,2017-08-01 12:26:04.839703-04:00,2018-10-26 17:07:39.537774-04:00,202167,F1,Developmental,Developmental,HOE:WISKI(SPF71),<p>Mature initially virginal Wistar rats from ...,Altromin 1310


In [62]:
# Animal group parent table: keep a link row when either end of it
# (from_animalgroup_id or to_animalgroup_id) is one of the fetched animal groups
animal_group_ids = animal_group_table.loc[:, "id"].tolist()
query = simple_query(
    "animal_animalgroup_parents",
    animal_group_ids,
    "from_animalgroup_id",
    "to_animalgroup_id",
)
animal_group_parents_table = pd.read_sql_query(sql=query, con=connection)
print(f"Number of records: {animal_group_parents_table.shape[0]}")
animal_group_parents_table.head(n=1)

Number of records: 1


Unnamed: 0,id,from_animalgroup_id,to_animalgroup_id
0,201505,203169,203130


In [63]:
# Endpoint table: rows of animal_endpoint whose animal_group_id is in animal_group_ids
query = simple_query(
    table="animal_endpoint",
    field_ids=animal_group_ids,
    field="animal_group_id",
)
endpoint_table = pd.read_sql_query(sql=query, con=connection)
print(f"Number of records: {endpoint_table.shape[0]}")
endpoint_table.head(n=1)

Number of records: 111


Unnamed: 0,baseendpoint_ptr_id,animal_group_id,response_units,data_type,NOEL,LOEL,FEL,data_reported,data_extracted,system,...,effect_subtype,observation_time_text,expected_adversity_direction,litter_effect_notes,litter_effects,effect_subtype_term_id,effect_term_id,organ_term_id,system_term_id,name_term_id
0,100200718,203130,g/100 g BW,C,1,2,-999,True,True,Whole Body,...,Clinical Observation,GD1-21,3,Findings were statistically evaluated separate...,YS,,,,,


In [64]:
# Base endpoint table: rows of assessment_baseendpoint whose id matches an
# endpoint's baseendpoint_ptr_id
endpoint_ids = endpoint_table.loc[:, "baseendpoint_ptr_id"].tolist()
query = simple_query(
    table="assessment_baseendpoint",
    field_ids=endpoint_ids,
    field="id",
)
base_endpoint_table = pd.read_sql_query(sql=query, con=connection)
print(f"Number of records: {base_endpoint_table.shape[0]}")
base_endpoint_table.head(n=1)

Number of records: 111


Unnamed: 0,id,assessment_id,name,created,last_updated
0,227913,100000061,"Fetuses, Live",2017-07-25 16:25:38.882943-04:00,2020-08-17 08:37:59.958166-04:00


In [65]:
# Endpoint group table: rows of animal_endpointgroup whose endpoint_id is in endpoint_ids
query = simple_query(
    table="animal_endpointgroup",
    field_ids=endpoint_ids,
    field="endpoint_id",
)
endpoint_group_table = pd.read_sql_query(sql=query, con=connection)
print(f"Number of records: {endpoint_group_table.shape[0]}")
endpoint_group_table.head(n=1)

Number of records: 510


Unnamed: 0,id,dose_group_id,n,incidence,response,variance,significant,significance_level,endpoint_id,lower_ci,upper_ci
0,245938,3,20.0,19.0,95.0,,True,0.05,227913,,


In [66]:
# Add tables to dict, keyed by database table name.
# The parents frame is bound as `animal_group_parents_table` by its query cell;
# the previous reference to `animal_group_parents` raised NameError on a fresh kernel.
writes["animal"] = {
    "animal_experiment": experiment_table.to_json(),
    "animal_animalgroup": animal_group_table.to_json(),
    "animal_animalgroup_parents": animal_group_parents_table.to_json(),
    "animal_endpoint": endpoint_table.to_json(),
    "assessment_baseendpoint": base_endpoint_table.to_json(),
    "animal_endpointgroup": endpoint_group_table.to_json(),
}

## Risk of Bias

In [67]:
# Risk of bias table: rows of riskofbias_riskofbias whose study_id is in original_studies
query = simple_query(
    table="riskofbias_riskofbias",
    field_ids=original_studies,
    field="study_id",
)
rob_table = pd.read_sql_query(sql=query, con=connection)
print(f"Number of records: {rob_table.shape[0]}")
rob_table.head(n=1)

Number of records: 70


Unnamed: 0,id,created,last_updated,author_id,final,study_id,active
0,209895,2017-06-11 20:10:57.093631-04:00,2017-07-13 04:13:42.077103-04:00,641,False,471401,False


In [68]:
# Risk of bias score table: rows of riskofbias_riskofbiasscore attached to the
# risk-of-bias records fetched above
rob_ids = rob_table.loc[:, "id"].tolist()
query = simple_query(
    table="riskofbias_riskofbiasscore",
    field_ids=rob_ids,
    field="riskofbias_id",
)
rob_score_table = pd.read_sql_query(sql=query, con=connection)
print(f"Number of records: {rob_score_table.shape[0]}")
rob_score_table.head(n=1)

Number of records: 686


Unnamed: 0,id,score,notes,metric_id,riskofbias_id,is_default,label,bias_direction
0,235518,22,<p>Study authors did not indicate whether inve...,206006,209894,True,,0


In [69]:
# Risk of bias score override table: rows of
# riskofbias_riskofbiasscoreoverrideobject whose score_id is in rob_score_ids
rob_score_ids = rob_score_table.loc[:, "id"].tolist()
query = simple_query(
    table="riskofbias_riskofbiasscoreoverrideobject",
    field_ids=rob_score_ids,
    field="score_id",
)
rob_score_override_table = pd.read_sql_query(sql=query, con=connection)
print(f"Number of records: {rob_score_override_table.shape[0]}")
rob_score_override_table.head(n=1)


Number of records: 20


Unnamed: 0,id,object_id,content_type_id,score_id
0,4076,100517941,21,100573291


In [70]:
# Register the risk-of-bias section, keyed by database table name
writes["rob"] = {
    table_name: frame.to_json()
    for table_name, frame in (
        ("riskofbias_riskofbias", rob_table),
        ("riskofbias_riskofbiasscore", rob_score_table),
        ("riskofbias_riskofbiasscoreoverrideobject", rob_score_override_table),
    )
}

## Write tables to file

In [71]:
# Serialise every collected section into one JSON document and save it.
# The cell's displayed value is write_text's return value (characters written).
fn = Path("data/raw/writes.json")
fn.write_text(data=json.dumps(obj=writes))

1293389