# Library Imports

In [3]:
import os
import pickle
import numpy as np
import pandas as pd
from src.helper_functions import load_data

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Load Data

In [4]:
# load_data() returns four values; only the last two are kept here.
# presumably the two discarded values are the training split -- confirm in src.helper_functions.
# NOTE(review): y_te is rebound in the next cell from tox21_labels_test.csv.gz, so only x_te
# from this call is used unchanged further down.
_, _, x_te, y_te = load_data()

In [7]:
# Directory holding the raw Tox21 CSVs (relative to the notebook's working directory).
raw_data = './data/raw/tox21/'

# Both files are gzip-compressed; the first CSV column becomes the row index.
x_te_dense = pd.read_csv(
    os.path.join(raw_data, 'tox21_dense_test.csv.gz'),
    compression="gzip",
    index_col=0,
)
y_te = pd.read_csv(
    os.path.join(raw_data, 'tox21_labels_test.csv.gz'),
    compression="gzip",
    index_col=0,
)

In [11]:
# Peek at the first two compounds and the first ten dense descriptors.
x_te_dense.iloc[:2, :10]

Unnamed: 0,AW,AWeight,Arto,BertzCT,Chi0,Chi1,Chi10,Chi2,Chi3,Chi3c
NCGC00261900-01,26124820.0,12.688,2.226,3.226,37.329,25.44,3.663,24.2,20.222,4.565
NCGC00260869-01,8333337.0,17.5,2.167,2.923,16.353,10.872,1.193,11.116,9.279,2.693


In [12]:
# Peek at the label rows of the first two compounds (one column per target assay).
y_te.iloc[:2]

Unnamed: 0,NR.AhR,NR.AR,NR.AR.LBD,NR.Aromatase,NR.ER,NR.ER.LBD,NR.PPAR.gamma,SR.ARE,SR.ATAD5,SR.HSE,SR.MMP,SR.p53
NCGC00261900-01,0.0,1.0,,0.0,0.0,0.0,0.0,,0.0,0.0,,0.0
NCGC00260869-01,0.0,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


`modeled_feature_names` was assembled in the Jupyter notebook `input_parameters.ipynb`.

In [4]:
# Load the pickled feature names used as column labels for the model input.
# A context manager closes the file handle deterministically instead of leaving it
# to the garbage collector.
# NOTE: pickle.load executes arbitrary code from the file; only load pickles that this
# project produced itself.
with open('./data/processed/modeled_feature_names.pkl', 'rb') as names_file:
    modeled_feature_names = pickle.load(names_file)

Verify that compounds are in the same order in x_te and y_te:

In [5]:
# True only when both frames list the same compound IDs in the same order.
# Index.equals returns a plain bool even when the two indexes differ in length,
# whereas an element-wise `==` raises in that case instead of reporting False.
y_te.index.equals(x_te_dense.index)

True

For each compound in the test set, how many target assays measured it as toxic?

In [6]:
# Per-compound count of positive assay labels: sum across the label columns of each row
# (pandas' sum skips NaN labels by default), then cast the float totals to integers.
nToxic = y_te.sum(axis='columns').astype(int)

# Number of compounds at each positive-assay count.
nToxic.value_counts()

0    416
1    110
2     53
3     31
4     19
5     10
6      4
7      3
8      1
dtype: int64

# Create a fully labeled x dataframe:

In [7]:
# Attach labels to the feature matrix returned by load_data():
# rows take the compound IDs of the raw dense test file, columns take the modeled feature names.
x = pd.DataFrame(
    x_te,
    index=x_te_dense.index,
    columns=np.array(modeled_feature_names),
)

In [8]:
# Show the labeled frame; pandas abbreviates the middle rows and columns with "..." in the output.
x

Unnamed: 0,AW,AWeight,Arto,BertzCT,Chi0,Chi1,Chi10,Chi2,Chi3,Chi3c,...,dfs8:685185923,dfs8:1231366183,dfs8:77893,dfs8:74857143,dfs8:-1779586931,dfs8:-1046300056,dfs8:-429125098,dfs8:63298235,dfs8:284573023,dfs8:1056993476
NCGC00261900-01,2.612482e+07,12.688,2.226,3.226,37.329,25.440,3.663,24.200,20.222,4.565,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NCGC00260869-01,8.333337e+06,17.500,2.167,2.923,16.353,10.872,1.193,11.116,9.279,2.693,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NCGC00261776-01,4.074000e+00,12.464,2.364,3.043,14.681,10.826,2.149,9.980,9.469,1.342,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NCGC00261380-01,8.000005e+06,13.827,2.080,2.845,16.778,11.720,0.777,10.139,8.207,1.251,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NCGC00261842-01,4.838000e+00,14.509,2.087,2.880,16.872,10.920,0.413,10.035,7.719,2.090,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NCGC00357168-01,2.000000e+00,16.820,1.600,1.157,4.121,2.414,0.000,1.354,0.707,0.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NCGC00357283-01,3.714000e+00,13.208,2.000,2.134,11.096,7.296,0.072,5.671,4.091,0.612,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NCGC00357210-01,1.905000e+00,16.017,2.000,2.295,5.276,3.305,0.000,2.885,2.290,0.471,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NCGC00357118-01,4.186000e+00,15.674,2.190,2.851,15.088,9.935,0.366,9.720,8.086,2.262,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Save each compound to its own file, inside the folder named after the number of targets on which it tested toxic (`non-toxic` when that number is zero):

In [9]:
# Write one pickle per compound to ./data/processed/<folder>/<compound id>.pkl, where
# <folder> is 'non-toxic' for a count of zero and the count itself otherwise.
for i in range(len(x)):
    row = x.iloc[i]
    # nToxic is indexed by compound ID, so a bare integer key (nToxic[i]) depends on
    # pandas' deprecated positional fallback; .iloc makes the positional lookup explicit.
    n_positive = nToxic.iloc[i]
    folder = 'non-toxic' if n_positive == 0 else str(n_positive)

    # Create the target folder on demand so a fresh checkout does not fail here.
    out_dir = os.path.join('./data/processed', folder)
    os.makedirs(out_dir, exist_ok=True)

    # The context manager flushes and closes each file before the next one is opened.
    with open(os.path.join(out_dir, row.name + '.pkl'), 'wb') as out_file:
        pickle.dump(row, out_file)

# Verify a written file is readable:

In [10]:
# Round-trip check: read one of the written compound files back and inspect it.
# The context manager closes the file handle once loading is done.
with open('./data/processed/8/NCGC00357111-01.pkl', 'rb') as sample_file:
    test = pickle.load(sample_file)
print(test.shape)
test

(1644,)


AW                   3.011
AWeight             14.540
Arto                 2.143
BertzCT              2.746
Chi0                10.293
                     ...  
dfs8:-1046300056     0.000
dfs8:-429125098      0.000
dfs8:63298235        0.000
dfs8:284573023       0.000
dfs8:1056993476      0.000
Name: NCGC00357111-01, Length: 1644, dtype: float64