In [1]:
# Make sure numpy, scipy, pandas and scikit-learn are installed; otherwise run:
# pip install numpy scipy pandas scikit-learn
import numpy as np
import pandas as pd
from scipy import io
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

I'm running this sample code on the sample data, which I downloaded into the raw data folder indicated below.

In [2]:
import os

# Project root that contains ./data/raw/tox21/. Set the TOX21_PROJECT_DIR
# environment variable to run the notebook on another machine without editing
# this cell; the fallback is the original author's local checkout.
PROJECT_DIR = os.environ.get(
    'TOX21_PROJECT_DIR',
    '/Users/sean/CloudStation/Metis/projects/project5',
)

# The data-loading cells use paths relative to the working directory,
# so switch to the project root first (raises if the directory is missing).
os.chdir(PROJECT_DIR)

# Pure-Python replacement for the `!pwd` shell escape: it also works outside
# IPython and on platforms without a `pwd` command.
print(os.getcwd())

/Users/sean/CloudStation/Metis/projects/project5


In [3]:
# Load the label tables, the dense feature tables and the sparse feature
# matrices for the train and test splits.
raw_data = './data/raw/tox21/'


def _read_gz_table(file_name):
    """Read a gzip-compressed CSV from `raw_data`, using its first column as the index."""
    return pd.read_csv(raw_data + file_name, index_col=0, compression="gzip")


def _read_sparse_csc(file_name):
    """Read a file from `raw_data` with scipy's `mmread` and convert it to CSC format."""
    return io.mmread(raw_data + file_name).tocsc()


# Labels stay as DataFrames so the per-target columns remain addressable by name.
y_tr = _read_gz_table('tox21_labels_train.csv.gz')
y_te = _read_gz_table('tox21_labels_test.csv.gz')

# Dense features are reduced to plain arrays via `.values`.
x_tr_dense = _read_gz_table('tox21_dense_train.csv.gz').values
x_te_dense = _read_gz_table('tox21_dense_test.csv.gz').values

# Sparse features are kept sparse; CSC is the column-oriented layout.
x_tr_sparse = _read_sparse_csc('tox21_sparse_train.mtx.gz')
x_te_sparse = _read_sparse_csc('tox21_sparse_test.mtx.gz')

In [10]:
# Element-wise test for strictly positive entries in the sparse training
# matrix; the notebook displays the result as a boolean sparse matrix.
x_tr_sparse > 0

<12060x272776 sparse matrix of type '<class 'numpy.bool_'>'
	with 3913072 stored elements in Compressed Sparse Column format>

In [9]:
# Boolean mask over the sparse columns: True where more than 5% of the
# training rows hold a positive value in that column.
# np.asarray(...) replaces the `.A` shorthand, which only exists when the
# comparison returns an np.matrix; asarray yields the same ndarray and also
# accepts a plain ndarray, so the cell works in both cases.
sparse_col_idx = np.asarray((x_tr_sparse > 0).mean(0) > 0.05).ravel()
print(sparse_col_idx.shape)
sparse_col_idx

(272776,)


array([ True,  True,  True, ..., False, False, False])

In [4]:
# filter out very sparse features
# Keep only the sparse columns that hold a positive value in more than 5% of
# the training rows. The mask is computed from the training matrix alone and
# then applied to both splits, so train and test end up with the same columns.
# np.asarray(...) / .toarray() replace the `.A` shorthand: they return the same
# dense ndarrays but do not depend on the matrix-only `.A` attribute.
sparse_col_idx = np.asarray((x_tr_sparse > 0).mean(0) > 0.05).ravel()

# Final design matrices: dense features followed by the retained sparse
# columns, densified so they can be stacked side by side.
x_tr = np.hstack([x_tr_dense, x_tr_sparse[:, sparse_col_idx].toarray()])
x_te = np.hstack([x_te_dense, x_te_sparse[:, sparse_col_idx].toarray()])

In [5]:
# Build a random forest model for all twelve assays
# One independent classifier is trained per label column and scored by ROC AUC
# on the test split.

# Fixed seed so that Restart & Run All reproduces the same forests and AUCs;
# without random_state every run yields slightly different scores.
RF_SEED = 0

for target in y_tr.columns:
    # Label columns can contain NaN (e.g. 'NR.AhR' holds nan/0/1), so keep only
    # the rows with a finite label, separately for each split.
    rows_tr = np.isfinite(y_tr[target]).values
    rows_te = np.isfinite(y_te[target]).values

    rf = RandomForestClassifier(n_estimators=100, n_jobs=4, random_state=RF_SEED)
    rf.fit(x_tr[rows_tr], y_tr[target][rows_tr])

    # predict_proba returns one column per class; column 1 is used as the score
    # (presumably the probability of label 1 -- this assumes both classes occur
    # in the training rows for this target).
    p_te = rf.predict_proba(x_te[rows_te])
    auc_te = roc_auc_score(y_te[target][rows_te], p_te[:, 1])
    print("%15s: %3.5f" % (target, auc_te))

         NR.AhR: 0.89417
          NR.AR: 0.69302
      NR.AR.LBD: 0.56947
   NR.Aromatase: 0.78082
          NR.ER: 0.77124
      NR.ER.LBD: 0.80767
  NR.PPAR.gamma: 0.71732
         SR.ARE: 0.77111
       SR.ATAD5: 0.80937
         SR.HSE: 0.80175
         SR.MMP: 0.92110
         SR.p53: 0.80248


In [6]:
# Distinct label values present in the 'NR.AhR' training column.
y_tr.loc[:, 'NR.AhR'].unique()

array([nan,  0.,  1.])

In [7]:
# Dimensions of the dense training feature array.
x_tr_dense.shape

(12060, 801)

In [15]:
# Read the dense training table again, this time keeping the DataFrame itself
# (index and column names included) instead of only its `.values`.
raw_data = './data/raw/tox21/'
df = pd.read_csv(
    raw_data + 'tox21_dense_train.csv.gz',
    compression="gzip",
    index_col=0,
)
df

Unnamed: 0,AW,AWeight,Arto,BertzCT,Chi0,Chi1,Chi10,Chi2,Chi3,Chi3c,...,W3D,W3DH,WNSA1,WNSA2,WNSA3,WPSA1,WPSA2,WPSA3,grav,rygr
NCGC00178831-03,5.436720e+07,13.053,2.176,3.194,23.112,15.868,1.496,15.127,12.592,2.619,...,2687.469,9241.018,115.371,-915.496,-39.983,290.078,2301.941,59.492,88.147,3.708
NCGC00166114-03,1.268818e+07,22.123,2.065,3.137,21.033,13.718,1.937,13.187,11.951,2.502,...,2184.384,3234.199,194.740,-1029.609,-34.205,235.360,1244.323,82.906,134.852,4.131
NCGC00263563-01,3.076932e+06,13.085,2.154,3.207,46.896,29.958,3.806,30.105,25.569,7.819,...,13803.524,76582.899,238.004,-4358.946,-106.537,868.685,15909.444,135.335,216.852,5.075
NCGC00013058-02,7.168569e+07,12.832,2.029,3.380,51.086,32.045,1.806,29.090,21.603,5.222,...,13807.345,50498.175,226.312,-2785.555,-61.923,763.288,9394.859,125.509,238.265,4.640
NCGC00167516-01,7.989702e+06,12.936,2.124,3.573,70.295,46.402,3.604,42.132,32.570,7.002,...,43231.286,163659.229,850.869,-21136.699,-367.122,1798.703,44681.209,362.168,317.901,7.845
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NCGC00261292-01,1.428572e+07,14.255,2.000,2.628,9.259,6.309,0.157,5.468,4.484,0.758,...,306.364,1435.590,76.419,-190.192,-13.757,100.624,250.402,19.275,29.148,2.581
NCGC00261245-01,1.193182e+07,13.674,2.061,2.920,21.142,15.382,1.201,12.713,10.576,1.082,...,2528.642,12293.627,94.878,-595.491,-22.275,324.131,2034.439,49.446,93.636,3.666
NCGC00260828-01,1.081800e+01,12.374,2.045,3.128,33.242,20.457,0.806,19.711,14.799,4.733,...,9171.300,44070.070,267.400,-2656.568,-104.039,874.679,8689.849,144.294,91.670,8.054
NCGC00260687-01,3.229000e+00,12.543,2.267,2.700,10.251,7.381,0.587,6.455,5.857,0.810,...,391.790,1815.417,39.578,-105.234,-9.967,146.565,389.732,23.879,28.201,2.954


In [19]:
# Column names of the dense training table, as a plain Python list.
[column_name for column_name in df.columns]

['AW',
 'AWeight',
 'Arto',
 'BertzCT',
 'Chi0',
 'Chi1',
 'Chi10',
 'Chi2',
 'Chi3',
 'Chi3c',
 'Chi3ch',
 'Chi4',
 'Chi4c',
 'Chi4ch',
 'Chi4pc',
 'Chi5',
 'Chi5ch',
 'Chi6',
 'Chi6ch',
 'Chi7',
 'Chi8',
 'Chi9',
 'Chiv0',
 'Chiv1',
 'Chiv10',
 'Chiv2',
 'Chiv3',
 'Chiv3c',
 'Chiv3ch',
 'Chiv4',
 'Chiv4c',
 'Chiv4ch',
 'Chiv4pc',
 'Chiv5',
 'Chiv5ch',
 'Chiv6',
 'Chiv6ch',
 'Chiv7',
 'Chiv8',
 'Chiv9',
 'DZ',
 'EstateVSA0',
 'EstateVSA1',
 'EstateVSA10',
 'EstateVSA2',
 'EstateVSA3',
 'EstateVSA4',
 'EstateVSA5',
 'EstateVSA6',
 'EstateVSA7',
 'EstateVSA8',
 'EstateVSA9',
 'GATSe1',
 'GATSe2',
 'GATSe3',
 'GATSe4',
 'GATSe5',
 'GATSe6',
 'GATSe7',
 'GATSe8',
 'GATSm1',
 'GATSm2',
 'GATSm3',
 'GATSm4',
 'GATSm5',
 'GATSm6',
 'GATSm7',
 'GATSm8',
 'GATSp1',
 'GATSp2',
 'GATSp3',
 'GATSp4',
 'GATSp5',
 'GATSp6',
 'GATSp7',
 'GATSp8',
 'GATSv1',
 'GATSv2',
 'GATSv3',
 'GATSv4',
 'GATSv5',
 'GATSv6',
 'GATSv7',
 'GATSv8',
 'GMTI',
 'GMTIV',
 'Geto',
 'Getov',
 'Gravto',
 'Hato',
 'Hatov',