# Sample Solution: Random Forest

In [1]:
# make sure numpy, scipy, pandas, and scikit-learn are installed; otherwise run
# pip install numpy scipy pandas scikit-learn
import numpy as np
import pandas as pd
from scipy import io
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the Tox21 inputs: label tables, dense feature matrices and sparse
# feature matrices, each for the train and the test split.
raw_data = './data/raw/tox21/'

# Options shared by every CSV read: the first column is used as the row index
# and the files are gzip-compressed.
csv_opts = dict(index_col=0, compression="gzip")

# Label tables are kept as DataFrames (they are indexed by column name later).
y_tr = pd.read_csv(raw_data + 'tox21_labels_train.csv.gz', **csv_opts)
y_te = pd.read_csv(raw_data + 'tox21_labels_test.csv.gz', **csv_opts)

# Dense features are reduced to their underlying value arrays.
x_tr_dense = pd.read_csv(raw_data + 'tox21_dense_train.csv.gz', **csv_opts).values
x_te_dense = pd.read_csv(raw_data + 'tox21_dense_test.csv.gz', **csv_opts).values

# Sparse features are read from Matrix Market files and converted to CSC format.
x_tr_sparse = io.mmread(raw_data + 'tox21_sparse_train.mtx.gz').tocsc()
x_te_sparse = io.mmread(raw_data + 'tox21_sparse_test.mtx.gz').tocsc()

In [3]:
# filter out very sparse features: keep only the sparse columns that are
# non-zero (> 0) in more than 5% of the training rows. The selection is
# computed on the training matrix and then applied to both splits.
# np.asarray(...).ravel() yields a flat boolean mask whether .mean(0) returns
# an np.matrix or a plain ndarray.
sparse_col_idx = np.asarray((x_tr_sparse > 0).mean(0) > 0.05).ravel()
# .toarray() is the documented way to densify a sparse matrix; the legacy `.A`
# shorthand is not available on SciPy's newer sparse array types.
x_tr = np.hstack([x_tr_dense, x_tr_sparse[:, sparse_col_idx].toarray()])
x_te = np.hstack([x_te_dense, x_te_sparse[:, sparse_col_idx].toarray()])

In [4]:
# Build a random forest model for all twelve assays
# One independent classifier is fitted per label column and scored by ROC AUC
# on the test split. After the loop, `target`, `rf` and `rows_te` still hold
# the values of the last assay.
for target in y_tr.columns:
    # keep only the rows whose label for this assay is finite (drops NaN labels)
    rows_tr = np.isfinite(y_tr[target]).values
    rows_te = np.isfinite(y_te[target]).values
    # fixed random_state: without it every run grows different trees and the
    # printed AUCs are not reproducible
    rf = RandomForestClassifier(n_estimators=100, n_jobs=4, random_state=0)
    rf.fit(x_tr[rows_tr], y_tr[target][rows_tr])
    p_te = rf.predict_proba(x_te[rows_te])
    # column 1 of predict_proba is the second class in rf.classes_
    # (assumes binary 0/1 labels, so this is the positive class -- TODO confirm)
    auc_te = roc_auc_score(y_te[target][rows_te], p_te[:, 1])
    print("%15s: %3.5f" % (target, auc_te))

         NR.AhR: 0.90156
          NR.AR: 0.73193
      NR.AR.LBD: 0.66420
   NR.Aromatase: 0.77599
          NR.ER: 0.78895
      NR.ER.LBD: 0.73332
  NR.PPAR.gamma: 0.71327
         SR.ARE: 0.77014
       SR.ATAD5: 0.81653
         SR.HSE: 0.79723
         SR.MMP: 0.92850
         SR.p53: 0.79932


In [5]:
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

In [6]:
# Confusion matrix and threshold-based metrics for a single assay.
# NOTE: this cell relies on `target`, `rf` and `rows_te` as left behind by the
# final iteration of the training loop, i.e. it evaluates the last assay only.
# The labels are filtered with the very same mask (rows_te) as the feature rows
# handed to rf.predict, so y_testing and y_hat_testing are guaranteed to align.
y_testing = y_te[target][rows_te]
y_hat_testing = rf.predict(x_te[rows_te])
# legend for the confusion matrix layout (rows: true class, columns: predicted class)
print(np.array([['TN','FP'],['FN','TP']]))
print(confusion_matrix(y_testing, y_hat_testing))
print('f1:{0:0.5f}'.format(f1_score(y_testing, y_hat_testing)))
print('recall:{0:0.5f}'.format(recall_score(y_testing, y_hat_testing)))
print('precision:{0:0.5f}'.format(precision_score(y_testing, y_hat_testing)))

[['TN' 'FP']
 ['FN' 'TP']]
[[574   1]
 [ 39   2]]
f1:0.09091
recall:0.04878
precision:0.66667


# Exploring Source Data

In [7]:
# dimensions of the dense training feature array, as a (rows, columns) tuple
x_tr_dense.shape

(12060, 801)

In [8]:
# Re-read the dense training features, this time keeping the DataFrame (with
# its row index and column names) instead of only the value array.
raw_data = './data/raw/tox21/'
dense_train_path = raw_data + 'tox21_dense_train.csv.gz'
df = pd.read_csv(dense_train_path, index_col=0, compression="gzip")
# preview the first rows
df.head()

Unnamed: 0,AW,AWeight,Arto,BertzCT,Chi0,Chi1,Chi10,Chi2,Chi3,Chi3c,...,W3D,W3DH,WNSA1,WNSA2,WNSA3,WPSA1,WPSA2,WPSA3,grav,rygr
NCGC00178831-03,54367200.0,13.053,2.176,3.194,23.112,15.868,1.496,15.127,12.592,2.619,...,2687.469,9241.018,115.371,-915.496,-39.983,290.078,2301.941,59.492,88.147,3.708
NCGC00166114-03,12688180.0,22.123,2.065,3.137,21.033,13.718,1.937,13.187,11.951,2.502,...,2184.384,3234.199,194.74,-1029.609,-34.205,235.36,1244.323,82.906,134.852,4.131
NCGC00263563-01,3076932.0,13.085,2.154,3.207,46.896,29.958,3.806,30.105,25.569,7.819,...,13803.524,76582.899,238.004,-4358.946,-106.537,868.685,15909.444,135.335,216.852,5.075
NCGC00013058-02,71685690.0,12.832,2.029,3.38,51.086,32.045,1.806,29.09,21.603,5.222,...,13807.345,50498.175,226.312,-2785.555,-61.923,763.288,9394.859,125.509,238.265,4.64
NCGC00167516-01,7989702.0,12.936,2.124,3.573,70.295,46.402,3.604,42.132,32.57,7.002,...,43231.286,163659.229,850.869,-21136.699,-367.122,1798.703,44681.209,362.168,317.901,7.845


In [9]:
# first 20 column labels of df
df.columns[:20]

Index(['AW', 'AWeight', 'Arto', 'BertzCT', 'Chi0', 'Chi1', 'Chi10', 'Chi2',
       'Chi3', 'Chi3c', 'Chi3ch', 'Chi4', 'Chi4c', 'Chi4ch', 'Chi4pc', 'Chi5',
       'Chi5ch', 'Chi6', 'Chi6ch', 'Chi7'],
      dtype='object')

In [10]:
# Load the sparse feature column names; the first column of the file is used
# as the index of the resulting frame.
# NOTE(review): `df` is rebound here and no longer refers to the dense
# training frame loaded earlier.
# NOTE(review): read_csv treats the first line as a header by default -- if
# this file has no header row, its first name is consumed as the header and is
# missing from the rows; confirm, and consider header=None.
sparse_colnames_path = raw_data + 'tox21_sparse_colnames.txt.gz'
df = pd.read_csv(sparse_colnames_path, index_col=0, compression="gzip")
# preview the first rows
df.head()

ecfp2:-1473889692
ecfp2:-1473889681
ecfp2:-415800397
ecfp2:1028
ecfp2:1039
