# Sample Solution: Random Forest

In [1]:
# Requires numpy, scipy, pandas and scikit-learn. If any of them is missing, run:
#   pip install numpy scipy pandas scikit-learn
import numpy as np
import pandas as pd
from scipy import io
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Read the train/test splits from `raw_data`: label tables, dense feature
# tables and sparse feature matrices.
raw_data = './data/raw/tox21/'


def _read_indexed_csv(filename):
    """Read a gzip-compressed CSV under `raw_data`, using its first column as the index."""
    return pd.read_csv(raw_data + filename, index_col=0, compression="gzip")


def _read_sparse_csc(filename):
    """Read a Matrix Market file under `raw_data` and convert it to CSC format."""
    return io.mmread(raw_data + filename).tocsc()


# Labels are kept as DataFrames (their column names are used later on).
y_tr = _read_indexed_csv('tox21_labels_train.csv.gz')
y_te = _read_indexed_csv('tox21_labels_test.csv.gz')

# Dense features are reduced to plain NumPy arrays via `.values`.
x_tr_dense = _read_indexed_csv('tox21_dense_train.csv.gz').values
x_te_dense = _read_indexed_csv('tox21_dense_test.csv.gz').values

# Sparse features are held in CSC format, which suits the column selection
# performed in the next step.
x_tr_sparse = _read_sparse_csc('tox21_sparse_train.mtx.gz')
x_te_sparse = _read_sparse_csc('tox21_sparse_test.mtx.gz')

In [3]:
# Filter out very sparse features: keep only the sparse columns that are
# non-zero (> 0) in more than 5% of the training rows. The mask is computed
# from the training matrix only and then applied to both splits, so train and
# test end up with exactly the same columns.
#
# `np.asarray(...)` and `.toarray()` are used instead of the `.A` shorthand:
# `.A` is not available on sparse containers in newer SciPy releases, whereas
# these spellings give the same result on old and new versions alike.
sparse_col_idx = np.asarray((x_tr_sparse > 0).mean(0)).ravel() > 0.05

# Final design matrices: dense features followed by the retained sparse
# columns (densified), stacked side by side.
x_tr = np.hstack([x_tr_dense, x_tr_sparse[:, sparse_col_idx].toarray()])
x_te = np.hstack([x_te_dense, x_te_sparse[:, sparse_col_idx].toarray()])

In [4]:
# Build a random forest model for all twelve assays.
# Each label column of `y_tr` gets its own independent binary classifier,
# scored by ROC AUC on the test split.
#
# NOTE: `target`, `rf` and `rows_te` stay bound to the values of the final
# iteration after the loop ends; the metrics cell further down relies on that.
for target in y_tr.columns:
    # Rows whose label for this assay is not finite (e.g. NaN) are excluded
    # from both training and evaluation.
    rows_tr = np.isfinite(y_tr[target]).values
    rows_te = np.isfinite(y_te[target]).values

    # A fixed `random_state` makes the forest (bootstrap samples and feature
    # sub-sampling) deterministic, so the printed AUCs are reproducible on a
    # fresh kernel. Without it every run yields slightly different scores.
    rf = RandomForestClassifier(n_estimators=100, n_jobs=4, random_state=0)
    rf.fit(x_tr[rows_tr], y_tr[target][rows_tr])

    # Column 1 of `predict_proba` is used as the score for the AUC.
    p_te = rf.predict_proba(x_te[rows_te])
    auc_te = roc_auc_score(y_te[target][rows_te], p_te[:, 1])
    print(f"{target:>15s}: {auc_te:3.5f}")

         NR.AhR: 0.90156
          NR.AR: 0.73193
      NR.AR.LBD: 0.66420
   NR.Aromatase: 0.77599
          NR.ER: 0.78895
      NR.ER.LBD: 0.73332
  NR.PPAR.gamma: 0.71327
         SR.ARE: 0.77014
       SR.ATAD5: 0.81653
         SR.HSE: 0.79723
         SR.MMP: 0.92850
         SR.p53: 0.79932


In [5]:
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

In [6]:
# Threshold-based metrics (confusion matrix, F1, recall, precision) for a
# single assay.
#
# NOTE: `target`, `rf` and `rows_te` are the values left bound by the final
# iteration of the training loop above, so this cell evaluates only the model
# for the LAST column of `y_tr`.
#
# The true labels and the predicted rows are selected with the same
# finite-label mask (`rows_te`). Previously the labels were filtered with a
# separate `~np.isnan(...)` mask, which would fall out of step with `rows_te`
# (built with `np.isfinite`) if a label were ever +/-inf.
y_testing = y_te[target][rows_te]
y_hat_testing = rf.predict(x_te[rows_te])

# Legend for reading the confusion matrix printed right after it.
print(np.array([['TN', 'FP'], ['FN', 'TP']]))
print(confusion_matrix(y_testing, y_hat_testing))

print(f'f1:{f1_score(y_testing, y_hat_testing):0.5f}')
print(f'recall:{recall_score(y_testing, y_hat_testing):0.5f}')
print(f'precision:{precision_score(y_testing, y_hat_testing):0.5f}')

[['TN' 'FP']
 ['FN' 'TP']]
[[574   1]
 [ 39   2]]
f1:0.09091
recall:0.04878
precision:0.66667
