# Process datasets

## ToxRefDB v2.

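Download the supplementary data archive from the article below and unzip its CSV files into the folder that `root` points to (`~/Desktop` in this notebook):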
https://www.sciencedirect.com/science/article/pii/S0890623819300875#upi0005

In [1]:
from pathlib import Path
import pandas as pd

In [2]:
# Folder containing the unzipped supplemental CSVs; the combined file is written here as well.
root = Path.home() / "Desktop"

### prepare dichotomous data

In [3]:
# Dichotomous endpoints: read the CSV, discard the unnamed leading column
# (presumably a row index saved with the export) and tag every row dtype="D".
dichotomous_raw = pd.read_csv(root / 'dichotomous_tr.csv')
dichotomous_tr = dichotomous_raw.drop('Unnamed: 0', axis='columns').assign(dtype="D")
dichotomous_tr.head()

Unnamed: 0,id,doses,ns,incidences,dtype
0,1002_137_1500,777867,46384846,1310,D
1,1002_137_1644,777867,46384846,1271019,D
2,1002_137_2635,550640,45484646,66811,D
3,1002_137_826,550640,45484646,10015,D
4,1002_268_1436,777867,46384846,1310,D


In [4]:
# Cancer endpoints: same layout as the dichotomous table, tagged dtype="DC"
# so the two sources remain distinguishable after they are stacked.
cancer_raw = pd.read_csv(root / 'cancer_tr.csv')
cancer_tr = cancer_raw.drop('Unnamed: 0', axis='columns').assign(dtype="DC")
cancer_tr.head()

Unnamed: 0,id,doses,ns,incidences,dtype
0,1002_19_531,777867,50474950,2116,DC
1,1002_194_2884,777867,50474950,3127,DC
2,1006_137_776,161862243,10101010109,3,DC
3,106_268_1814,301003001000,44442,1,DC
4,1069_137_1156,1949147,50505050,43511,DC


In [5]:
# Stack the dichotomous and cancer tables into one incidence-based frame.
# ignore_index=True assigns a fresh 0..n-1 index; without it both inputs keep
# their own 0-based labels, so a lookup such as d.loc[0] would match two rows.
d = pd.concat([dichotomous_tr, cancer_tr], ignore_index=True)

# Sanity checks: each id must identify exactly one row, and every column
# that describes a dataset must be populated.
assert d["id"].is_unique, "duplicate ids across dichotomous_tr and cancer_tr"
for column in ("id", "doses", "ns", "incidences"):
    assert d[column].notnull().all(), f"missing values in dichotomous column {column!r}"

print(d.shape[0])
d.head()

5040


Unnamed: 0,id,doses,ns,incidences,dtype
0,1002_137_1500,777867,46384846,1310,D
1,1002_137_1644,777867,46384846,1271019,D
2,1002_137_2635,550640,45484646,66811,D
3,1002_137_826,550640,45484646,10015,D
4,1002_268_1436,777867,46384846,1310,D


### prepare continuous data

In [6]:
# Continuous body-weight endpoints: drop the unnamed leading column, tag the
# rows dtype='C' and attach a JSON-formatted meta string marking them as body weight.
continuous_BW_raw = pd.read_csv(root / 'continuous_BW.csv')
continuous_BW = (
    continuous_BW_raw
    .drop('Unnamed: 0', axis='columns')
    .assign(dtype='C', meta="{\"body_weight\": true}")
)
continuous_BW.head()

Unnamed: 0,id,doses,ns,means,stdevs,dtype,meta
0,1002_220_119,550640,18232521,2459227323202685,1113383540813,C,"{""body_weight"": true}"
1,1002_220_1910,550640,18232521,5588513450636180,251465410321421,C,"{""body_weight"": true}"
2,1002_52_1768,550640,42374236,"46.5,45.3,45.8,43.1","4.7,3.9,4.8,4.6",C,"{""body_weight"": true}"
3,1006_300_1096,161862243,10101010109,"0.6,0.63,0.64,0.66,0.63,0.71","0.05,0.05,0.05,0.05,0.04,0.06",C,"{""body_weight"": true}"
4,1011_52_1768,201005001000,44444,"13.07,12.7,12.18,12.1,10.45","1.25,0.83,0.57,1.3,0.47",C,"{""body_weight"": true}"


In [7]:
# Continuous endpoints other than body weight: same processing as the
# body-weight table, but the meta string records body_weight as false.
continuous_NotBW_raw = pd.read_csv(root / 'continuous_NotBW.csv')
continuous_NotBW = (
    continuous_NotBW_raw
    .drop('Unnamed: 0', axis='columns')
    .assign(dtype='C', meta="{\"body_weight\": false}")
)
continuous_NotBW.head()

Unnamed: 0,id,doses,ns,means,stdevs,dtype,meta
0,100_217_238,27141356783,101091010,"54.8,54.6,53.1,53.9,52.9","1.45,1.06,1.28,1.46,1",C,"{""body_weight"": false}"
1,1006_187_1263,172069281,201010101017,"1.2,1,1.3,1,1.4,2.5","1.4,1.3,1.5,1.3,1.2,1.4",C,"{""body_weight"": false}"
2,1006_200_2354,161862243,201010101017,"8.58,8.72,7.95,9.58,8.56,16.51","2.13,3.21,1.32,2.12,2.12,8.64",C,"{""body_weight"": false}"
3,1006_284_161,161862243,201010101017,"969.4,981.7,999.2,1001,1038,1161","108.7,95.5,63.7,63.9,128.7,218.8",C,"{""body_weight"": false}"
4,1006_32_1907,161862243,201010101017,"42.68,43.85,45.6,40.77,37.76,64.98","4.92,7.64,6.2,4.99,4.37,16.41",C,"{""body_weight"": false}"


In [8]:
# Stack the body-weight and non-body-weight tables into one continuous frame.
# ignore_index=True assigns a fresh 0..n-1 index; without it both inputs keep
# their own 0-based labels, so a lookup such as c.loc[0] would match two rows.
c = pd.concat([continuous_BW, continuous_NotBW], ignore_index=True)

# Sanity checks: each id must identify exactly one row, and every column
# that describes a dataset must be populated.
assert c["id"].is_unique, "duplicate ids across continuous_BW and continuous_NotBW"
for column in ("id", "doses", "ns", "means", "stdevs"):
    assert c[column].notnull().all(), f"missing values in continuous column {column!r}"

print(c.shape[0])
c.head()

2115


Unnamed: 0,id,doses,ns,means,stdevs,dtype,meta
0,1002_220_119,550640,18232521,2459227323202685,1113383540813,C,"{""body_weight"": true}"
1,1002_220_1910,550640,18232521,5588513450636180,251465410321421,C,"{""body_weight"": true}"
2,1002_52_1768,550640,42374236,"46.5,45.3,45.8,43.1","4.7,3.9,4.8,4.6",C,"{""body_weight"": true}"
3,1006_300_1096,161862243,10101010109,"0.6,0.63,0.64,0.66,0.63,0.71","0.05,0.05,0.05,0.05,0.04,0.06",C,"{""body_weight"": true}"
4,1011_52_1768,201005001000,44444,"13.07,12.7,12.18,12.1,10.45","1.25,0.83,0.57,1.3,0.47",C,"{""body_weight"": true}"


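### Duplication when merging datasets

A few ids occur in both the continuous and the dichotomous tables (four in this export). The cells below list those rows and then keep only the first occurrence of each id, i.e. the continuous record.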

In [9]:
# Merge continuous (c) and dichotomous (d) records into one table with a fresh index.
data = pd.concat((c, d), axis='index', ignore_index=True)

In [10]:
# List every row whose id occurs more than once in the merged table.
data.loc[data['id'].duplicated(keep=False)]

Unnamed: 0,id,doses,ns,means,stdevs,dtype,meta,incidences
1461,2048_277_610,303003000,2120239,"1,1,1.8,2.1","2.4,1.7,3,2.7",C,"{""body_weight"": false}",
1558,2763_277_610,123877,1617158,"0.4,1.5,2,3.8","0.5,2.2,1.6,2.9",C,"{""body_weight"": false}",
1672,3406_277_610,21050,28313230,"0.9,0.8,0.5,2.8","0.96,1.28,1.11,3.24",C,"{""body_weight"": false}",
2062,791_277_610,5006307501000,111112129,"2,4.9,6.5,12.8,13.2","1.3,4.7,3.6,2.7,1.7",C,"{""body_weight"": false}",
3143,2048_277_610,303003000,2120239,,,D,,2142.0
3743,2763_277_610,123877,1617158,,,D,,246.0
4533,3406_277_610,21050,28313230,,,D,,1.0
6511,791_277_610,5006307501000,111112129,,,D,,22109.0


In [11]:
# Collapse ids that occur in both tables to a single row. c is listed first in
# the concat, so keep='first' retains the continuous record and discards the
# dichotomous one.
merged = pd.concat([c, d], ignore_index=True)
data = merged.drop_duplicates(subset='id', keep='first')

In [12]:
# Repeat the duplicate check; an empty frame means every id is now unique.
data.loc[data['id'].duplicated(keep=False)]

Unnamed: 0,id,doses,ns,means,stdevs,dtype,meta,incidences


### write to disk

In [13]:
# Final integrity checks before export.
# Columns are looked up by key: attribute access (data.dtype) only reaches the
# column because DataFrame defines no attribute of that name.
assert data["id"].is_unique, "ids must be unique in the combined dataset"
for column in ("id", "doses", "ns", "dtype"):
    assert data[column].notnull().all(), f"missing values in column {column!r}"

print(data.shape)

# Write a zipped CSV next to the inputs. The archive member is named explicitly
# so the zip always contains "toxrefdb_v2.csv" instead of whatever pandas infers
# from the ".csv.zip" path.
# NOTE(review): older pandas releases reportedly used the zip's own path as the
# member name; confirm against the installed version.
data.to_csv(
    root / 'toxrefdb_v2.csv.zip',
    index=False,
    compression={"method": "zip", "archive_name": "toxrefdb_v2.csv"},
)

(7151, 8)
