# Process datasets

## ToxRefDB v2.

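Download the supplemental data archive from the article linked below and unzip it so that the extracted CSV files sit directly in the `root` directory used by this notebook (`~/Desktop`):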
https://www.sciencedirect.com/science/article/pii/S0890623819300875#upi0005

In [1]:
from pathlib import Path
import pandas as pd

In [2]:
# Directory holding the unzipped supplemental CSV files: the current user's Desktop.
root = Path.home() / "Desktop"

### prepare dichotomous data

In [3]:
# Read the dichotomous CSV from `root`, then discard the 'Unnamed: 0' column
# (the label pandas gives a header-less column; presumably a saved row index)
# and tag every row with its source dataset and a JSON-formatted `meta` string.
dichotomous_tr = pd.read_csv(root / 'dichotomous_tr.csv')
dichotomous_tr = dichotomous_tr.drop(columns=['Unnamed: 0']).assign(
    dataset='toxrefdbv2',
    meta="{\"cancer\": false}",
)
dichotomous_tr.head()

Unnamed: 0,id,doses,ns,incidences,dataset,meta
0,1002_137_1500,777867,46384846,1310,toxrefdbv2,"{""cancer"": false}"
1,1002_137_1644,777867,46384846,1271019,toxrefdbv2,"{""cancer"": false}"
2,1002_137_2635,550640,45484646,66811,toxrefdbv2,"{""cancer"": false}"
3,1002_137_826,550640,45484646,10015,toxrefdbv2,"{""cancer"": false}"
4,1002_268_1436,777867,46384846,1310,toxrefdbv2,"{""cancer"": false}"


In [4]:
# Read the cancer CSV from `root`, then discard the 'Unnamed: 0' column
# (the label pandas gives a header-less column; presumably a saved row index)
# and tag every row with its source dataset and a JSON-formatted `meta` string.
cancer_tr = pd.read_csv(root / 'cancer_tr.csv')
cancer_tr = cancer_tr.drop(columns=['Unnamed: 0']).assign(
    dataset='toxrefdbv2',
    meta="{\"cancer\": true}",
)
cancer_tr.head()

Unnamed: 0,id,doses,ns,incidences,dataset,meta
0,1002_19_531,777867,50474950,2116,toxrefdbv2,"{""cancer"": true}"
1,1002_194_2884,777867,50474950,3127,toxrefdbv2,"{""cancer"": true}"
2,1006_137_776,161862243,10101010109,3,toxrefdbv2,"{""cancer"": true}"
3,106_268_1814,301003001000,44442,1,toxrefdbv2,"{""cancer"": true}"
4,1069_137_1156,1949147,50505050,43511,toxrefdbv2,"{""cancer"": true}"


In [5]:
# Stack the non-cancer and cancer dichotomous frames into a single table.
d = pd.concat([dichotomous_tr, cancer_tr])
# The previous check, `d.id.unique().all()`, only verified that every distinct
# id was truthy and could never detect a repeated id. `is_unique` actually
# enforces that no id appears more than once across the two source files.
assert d.id.is_unique, "duplicate ids found in the combined dichotomous data"
print(d.shape[0])
d.head()

5040


Unnamed: 0,id,doses,ns,incidences,dataset,meta
0,1002_137_1500,777867,46384846,1310,toxrefdbv2,"{""cancer"": false}"
1,1002_137_1644,777867,46384846,1271019,toxrefdbv2,"{""cancer"": false}"
2,1002_137_2635,550640,45484646,66811,toxrefdbv2,"{""cancer"": false}"
3,1002_137_826,550640,45484646,10015,toxrefdbv2,"{""cancer"": false}"
4,1002_268_1436,777867,46384846,1310,toxrefdbv2,"{""cancer"": false}"


### prepare continuous data

In [6]:
# Read the body-weight continuous CSV from `root`, then discard the
# 'Unnamed: 0' column (the label pandas gives a header-less column; presumably
# a saved row index) and tag every row with its source dataset and a
# JSON-formatted `meta` string.
continuous_BW = pd.read_csv(root / 'continuous_BW.csv')
continuous_BW = continuous_BW.drop(columns=['Unnamed: 0']).assign(
    dataset='toxrefdbv2',
    meta="{\"body_weight\": true}",
)
continuous_BW.head()

Unnamed: 0,id,doses,ns,means,stdevs,dataset,meta
0,1002_220_119,550640,18232521,2459227323202685,1113383540813,toxrefdbv2,"{""body_weight"": true}"
1,1002_220_1910,550640,18232521,5588513450636180,251465410321421,toxrefdbv2,"{""body_weight"": true}"
2,1002_52_1768,550640,42374236,"46.5,45.3,45.8,43.1","4.7,3.9,4.8,4.6",toxrefdbv2,"{""body_weight"": true}"
3,1006_300_1096,161862243,10101010109,"0.6,0.63,0.64,0.66,0.63,0.71","0.05,0.05,0.05,0.05,0.04,0.06",toxrefdbv2,"{""body_weight"": true}"
4,1011_52_1768,201005001000,44444,"13.07,12.7,12.18,12.1,10.45","1.25,0.83,0.57,1.3,0.47",toxrefdbv2,"{""body_weight"": true}"


In [7]:
# Read the non-body-weight continuous CSV from `root`, then discard the
# 'Unnamed: 0' column (the label pandas gives a header-less column; presumably
# a saved row index) and tag every row with its source dataset and a
# JSON-formatted `meta` string.
continuous_NotBW = pd.read_csv(root / 'continuous_NotBW.csv')
continuous_NotBW = continuous_NotBW.drop(columns=['Unnamed: 0']).assign(
    dataset='toxrefdbv2',
    meta="{\"body_weight\": false}",
)
continuous_NotBW.head()

Unnamed: 0,id,doses,ns,means,stdevs,dataset,meta
0,100_217_238,27141356783,101091010,"54.8,54.6,53.1,53.9,52.9","1.45,1.06,1.28,1.46,1",toxrefdbv2,"{""body_weight"": false}"
1,1006_187_1263,172069281,201010101017,"1.2,1,1.3,1,1.4,2.5","1.4,1.3,1.5,1.3,1.2,1.4",toxrefdbv2,"{""body_weight"": false}"
2,1006_200_2354,161862243,201010101017,"8.58,8.72,7.95,9.58,8.56,16.51","2.13,3.21,1.32,2.12,2.12,8.64",toxrefdbv2,"{""body_weight"": false}"
3,1006_284_161,161862243,201010101017,"969.4,981.7,999.2,1001,1038,1161","108.7,95.5,63.7,63.9,128.7,218.8",toxrefdbv2,"{""body_weight"": false}"
4,1006_32_1907,161862243,201010101017,"42.68,43.85,45.6,40.77,37.76,64.98","4.92,7.64,6.2,4.99,4.37,16.41",toxrefdbv2,"{""body_weight"": false}"


In [8]:
# Stack the body-weight and non-body-weight continuous frames into a single table.
c = pd.concat([continuous_BW, continuous_NotBW])
# The previous check, `c.id.unique().all()`, only verified that every distinct
# id was truthy and could never detect a repeated id. `is_unique` actually
# enforces that no id appears more than once across the two source files.
assert c.id.is_unique, "duplicate ids found in the combined continuous data"
print(c.shape[0])
c.head()

2115


Unnamed: 0,id,doses,ns,means,stdevs,dataset,meta
0,1002_220_119,550640,18232521,2459227323202685,1113383540813,toxrefdbv2,"{""body_weight"": true}"
1,1002_220_1910,550640,18232521,5588513450636180,251465410321421,toxrefdbv2,"{""body_weight"": true}"
2,1002_52_1768,550640,42374236,"46.5,45.3,45.8,43.1","4.7,3.9,4.8,4.6",toxrefdbv2,"{""body_weight"": true}"
3,1006_300_1096,161862243,10101010109,"0.6,0.63,0.64,0.66,0.63,0.71","0.05,0.05,0.05,0.05,0.04,0.06",toxrefdbv2,"{""body_weight"": true}"
4,1011_52_1768,201005001000,44444,"13.07,12.7,12.18,12.1,10.45","1.25,0.83,0.57,1.3,0.47",toxrefdbv2,"{""body_weight"": true}"


### write to disk

In [9]:
# Write both combined tables to the current working directory. `index=False`
# leaves the row index out of the file; pandas' default compression inference
# picks zip from the ".zip" suffix.
for frame, target in (
    (d, 'toxrefdbv2_dichotomous.csv.zip'),
    (c, 'toxrefdbv2_continuous.csv.zip'),
):
    frame.to_csv(target, index=False)