# Example: train ATF2 and make predictions for HepG2


----

In [1]:
import time
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Transcription factor whose motif and label files are loaded.
TF = "ATF2"

# Root directory of the HDF inputs (fold coverage, motif scores, labels).
DATA_DIR = "/mnt/vdisk/data/synapse/"

# Cell types by role; names are used both in file names and as label columns.
TRAIN_CELL_TYPES = [
    "GM12878",
    "H1-hESC",
]
VALID_CELL_TYPE = "MCF-7"
TEST_CELL_TYPE = "HepG2"

### Load data

In [2]:
# DNase fold-coverage tables: one HDF file per cell type, all sharing the
# same directory, file-name suffix and HDF key.
fold_cov_dir=DATA_DIR+'fold_cov_data/'
fold_cov_suffix='_dnase_fold_cov.hdf'

# One table per training cell type, in TRAIN_CELL_TYPES order.
fc_train=[]
for cl in TRAIN_CELL_TYPES:
    fc_train.append(pd.read_hdf(fold_cov_dir+cl+fold_cov_suffix,
                                'dnase_fold_cov',index_col=(0,1,2)))

valid_path=fold_cov_dir+VALID_CELL_TYPE+fold_cov_suffix
fc_valid=pd.read_hdf(valid_path,'dnase_fold_cov',index_col=(0,1,2))

test_path=fold_cov_dir+TEST_CELL_TYPE+fold_cov_suffix
fc_test=pd.read_hdf(test_path,'dnase_fold_cov',index_col=(0,1,2))

# Motif table for the selected transcription factor.
motif_path=DATA_DIR+'motif_data/'+TF+'_motif.hdf'
motif=pd.read_hdf(motif_path,'motif',index_col=(0,1,2))

# Label table for the selected transcription factor; it is indexed by
# cell-type name further down the notebook.
labels_path=DATA_DIR+'extended_labels/'+TF+'_labels.hdf'
labels=pd.read_hdf(labels_path,'labels')

Opening /mnt/vdisk/data/synapse/fold_cov_data/GM12878_dnase_fold_cov.hdf in read-only mode
Opening /mnt/vdisk/data/synapse/fold_cov_data/H1-hESC_dnase_fold_cov.hdf in read-only mode
Opening /mnt/vdisk/data/synapse/fold_cov_data/MCF-7_dnase_fold_cov.hdf in read-only mode
Opening /mnt/vdisk/data/synapse/fold_cov_data/HepG2_dnase_fold_cov.hdf in read-only mode
Opening /mnt/vdisk/data/synapse/motif_data/ATF2_motif.hdf in read-only mode
Opening /mnt/vdisk/data/synapse/extended_labels/ATF2_labels.hdf in read-only mode


### Create train, validation and test data

In [7]:
# Stacking the columns of the training matrix is slow (minutes); the
# validation and test matrices are built much faster.
start=time.time()

# Feature columns, left to right:
feature_blocks=[fc_train[0].values]      # cell line specific column
feature_blocks.append(motif.values[:,3:])  # motif scores
for other_fc in fc_train[1:]:            # other cell line fc values
    feature_blocks.append(other_fc.values)
for other_cell in TRAIN_CELL_TYPES[1:]:  # other cell line labels
    feature_blocks.append(labels[other_cell].values)

x_train=np.column_stack(feature_blocks)
# Target: labels of the first training cell type.
y_train=labels[TRAIN_CELL_TYPES[0]].values

elapsed_min=int((time.time()-start)/60)
print('%d min' % elapsed_min)

4 min


In [10]:
# Validation target: label column of the validation cell type.
y_valid=labels[VALID_CELL_TYPE].values

# Validation features: a copy of the training matrix whose first column is
# overwritten with the validation cell type's fold coverage (flattened to 1-D
# so it fits the column slice).
valid_fold_cov=fc_valid.values.flatten()
x_valid=np.array(x_train)
x_valid[:,0]=valid_fold_cov

In [None]:
# Test features: a copy of the training matrix whose first column is
# overwritten with the test cell type's fold coverage.
x_test=np.array(x_train)
# Flatten before assigning, as the validation cell does: a one-column 2-D
# array cannot be broadcast into the 1-D column slice x_test[:,0].
x_test[:,0]=fc_test.values.flatten()
# Test target: label column of the test cell type.
y_test=labels[TEST_CELL_TYPE].values

### Train

- Training crashed; 60 million data points is probably too much for the random forest classifier (rfc).

In [None]:
# Fit a random forest on the training matrix and report the wall-clock time.
start=time.time()

# Small forest: 5 trees, fitted with 12 parallel jobs.
forest_params={'n_estimators':5,'n_jobs':12}
clf=RandomForestClassifier(**forest_params)
clf.fit(x_train,y_train)

elapsed=time.time()-start
print('%s s' % elapsed)

### Evaluate

In [None]:
# Score the fitted classifier on the validation cell type (ROC AUC) and
# report the wall-clock time.
start=time.time()

# N (number of trailing rows to score) is not assigned in any visible cell,
# so this cell used to fail with a NameError. Honour N when it has been
# defined beforehand; otherwise score every validation row.
try:
    n_eval=N
except NameError:
    n_eval=len(y_valid)

# Column 1 of predict_proba is the probability of the second class in
# clf.classes_ (presumably the "bound" label -- confirm against the labels).
valid_scores=clf.predict_proba(x_valid[-n_eval:])[:,1]
print('auc: %s' % roc_auc_score(y_valid[-n_eval:],valid_scores))

print('%s s' % (time.time()-start))

### Predict

In [None]:
# Predict class probabilities for the test cell type and report the
# wall-clock time.
start=time.time()

# N (number of trailing rows to predict) is not assigned in any visible cell;
# honour it when defined beforehand, otherwise predict every test row.
try:
    n_pred=N
except NameError:
    n_pred=len(x_test)

# Predict on the test matrix: the original cell reused x_valid here, so
# y_test_pred actually held validation predictions.
y_test_pred=clf.predict_proba(x_test[-n_pred:])[:,1]

print('%s s' % (time.time()-start))