# Train MAX and make predictions for liver

---

Takes something like 10-20 minutes, and uses 50GB!

----

In [1]:
import time
import pandas as pd
import numpy as np
import subprocess
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import SGDClassifier

In [2]:
# Transcription factor whose binding is modelled in this notebook.
TF = 'MAX'

# Cell types used for training; the first entry supplies the training labels,
# the remaining ones contribute fold-coverage and label columns as features.
TRAIN_CELL_TYPES = [
    'A549',
    'GM12878',
    'H1-hESC',
    'HCT116',
    'HeLa-S3',
    'HepG2',
    'K562',
]

# Cell type used to validate the model, and the one predictions are made for.
VALID_CELL_TYPE = 'SK-N-SH'
TEST_CELL_TYPE = 'liver'

# Root directory of the input tables (note the trailing slash).
DATA_DIR = '/mnt/vdisk/data/synapse/'

### Load data

In [3]:
start=time.time()

def _read_fold_cov(cell_type):
    """Read the 'dnase_fold_cov' table stored for one cell type."""
    hdf_path=DATA_DIR+'fold_cov_data/'+cell_type+'_dnase_fold_cov.hdf'
    return pd.read_hdf(hdf_path,'dnase_fold_cov')

# fold coverage tables: one per training cell line (same order as TRAIN_CELL_TYPES),
# then the validation cell line, then the final submission cell line
fc_train=[_read_fold_cov(cell_type) for cell_type in TRAIN_CELL_TYPES]
fc_valid=_read_fold_cov(VALID_CELL_TYPE)
fc_test=_read_fold_cov(TEST_CELL_TYPE)

# motif table for the transcription factor
motif_path=DATA_DIR+'motif_data/'+TF+'_motif.hdf'
motif=pd.read_hdf(motif_path,'motif')

# labels for the transcription factor
labels_path=DATA_DIR+'extended_labels/'+TF+'_labels.hdf'
labels=pd.read_hdf(labels_path,'labels')

print (time.time()-start),'s'

Opening /mnt/vdisk/data/synapse/fold_cov_data/A549_dnase_fold_cov.hdf in read-only mode
Opening /mnt/vdisk/data/synapse/fold_cov_data/GM12878_dnase_fold_cov.hdf in read-only mode
Opening /mnt/vdisk/data/synapse/fold_cov_data/H1-hESC_dnase_fold_cov.hdf in read-only mode
Opening /mnt/vdisk/data/synapse/fold_cov_data/HCT116_dnase_fold_cov.hdf in read-only mode
Opening /mnt/vdisk/data/synapse/fold_cov_data/HeLa-S3_dnase_fold_cov.hdf in read-only mode
Opening /mnt/vdisk/data/synapse/fold_cov_data/HepG2_dnase_fold_cov.hdf in read-only mode
Opening /mnt/vdisk/data/synapse/fold_cov_data/K562_dnase_fold_cov.hdf in read-only mode
Opening /mnt/vdisk/data/synapse/fold_cov_data/SK-N-SH_dnase_fold_cov.hdf in read-only mode
Opening /mnt/vdisk/data/synapse/fold_cov_data/liver_dnase_fold_cov.hdf in read-only mode
Opening /mnt/vdisk/data/synapse/motif_data/MAX_motif.hdf in read-only mode
Opening /mnt/vdisk/data/synapse/extended_labels/MAX_labels.hdf in read-only mode
66.8543329239 s


### Create train,valid,test data

In [5]:
start=time.time()

# The first training cell type is the one being predicted; the remaining
# ones only contribute feature columns.
target_cell=TRAIN_CELL_TYPES[0]
other_cells=TRAIN_CELL_TYPES[1:]

# Column layout of the design matrix (order matters: later cells overwrite column 0):
#   1. fold coverage of the target cell line (cell line specific column)
#   2. motif scores
#   3. fold coverage of the other training cell lines
#   4. labels of the other training cell lines
feature_cols=[fc_train[0].values, motif.values]
feature_cols.extend(fc.values for fc in fc_train[1:])
feature_cols.extend(labels[cell].values for cell in other_cells)

x_train=np.column_stack(feature_cols)
y_train=labels[target_cell].values.astype('int')

print (time.time()-start),'s'

28.6939280033 s


In [6]:
start=time.time()

# Validation features: a full copy of the training matrix (np.array copies,
# so x_train stays untouched) in which only column 0 -- the cell line
# specific fold coverage -- is replaced by the validation cell line's values.
valid_fc=fc_valid.values.flatten()
x_valid=np.array(x_train)
x_valid[:,0]=valid_fc

# NOTE(review): unlike y_train these labels are not cast to int -- confirm this is intended.
y_valid=labels[VALID_CELL_TYPE].values

print (time.time()-start),'s'

69.1574690342 s


### Train SGD logistic reg
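- It doesn't seem to use the 12 cores most of the time (in scikit-learn, `n_jobs` only parallelizes the one-vs-all fits of multi-class problems, so a binary problem is fit on a single core)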

In [7]:
start=time.time()

# Logistic regression fitted with stochastic gradient descent.
#  - class_weight='balanced' reweights the classes inversely to their frequency in y_train.
#  - random_state is pinned because SGD shuffles the training data on every epoch;
#    without a fixed seed each run produces a slightly different model and AUC.
#  - n_jobs only parallelises the one-vs-all fits of multi-class problems, so it is
#    kept for compatibility but does not speed up a binary fit.
clf = SGDClassifier(loss='log', class_weight='balanced', n_jobs=12, random_state=0)
clf.fit(x_train,y_train)

print (time.time()-start),'s'

286.342453003 s


### Evaluate

In [8]:
start=time.time()

print 'auc:',roc_auc_score(y_valid,clf.predict_proba(x_valid)[:,1])

print (time.time()-start),'s'

auc: 0.901056582005
101.869518042 s


### Predict

In [9]:
start=time.time()

# Test features: another full copy of the training matrix in which only
# column 0 (the cell line specific fold coverage) is swapped for the
# values of the submission cell line.
test_fc=fc_test.values.flatten()
x_test=np.array(x_train)
x_test[:,0]=test_fc

print (time.time()-start),'s'

72.5808000565 s


In [10]:
start=time.time()

# Keep only the probability column of the second class for every test row.
test_proba=clf.predict_proba(x_test)
y_test_pred=test_proba[:,1]

print (time.time()-start),'s'

17.4699280262 s


### Annotate predictions with the test regions
- It has to be in the exact order of the test regions
    - https://www.synapse.org/#!Synapse:syn6131484/wiki/402044
- I missed the very first line when creating the tables, so I just add a 0 prediction there

In [11]:
start=time.time()

# Only the index of the test-regions table is needed: it provides the row
# labels that the predictions are attached to.
test_regions=pd.read_hdf(DATA_DIR+'/annotations/test_regions.hdf','test_regions')
idx=test_regions.index

print (time.time()-start),'s'

Opening /mnt/vdisk/data/synapse//annotations/test_regions.hdf in read-only mode
4.26892399788 s


In [12]:
start=time.time()

# Prepend a 0 prediction for the first test region, then label the
# predictions with the test-region index.
padded_pred=np.concatenate([[0],y_test_pred])
res_df=pd.DataFrame(padded_pred,index=idx)

print (time.time()-start),'s'

2.97968912125 s


### Writing the tsv is the slowest part of the whole process, so for now I just save the predictions as hdf

In [13]:
start=time.time()

# Writing the predictions as a gzipped, tab-separated file with
# res_df.to_csv(...) proved very slow, so they are stored in an HDF file
# under the key 'preds' instead.
preds_hdf=TF+'_'+TEST_CELL_TYPE+'_test.hdf'
res_df.to_hdf(preds_hdf,'preds')

print (time.time()-start),'s'

1.68503689766 s


### Join the results using shell commands
- Not too elegant but faster

In [14]:
start=time.time()

# Dump the predictions one per line as plain text, again with a leading 0
# for the first test region, so they can be pasted next to the region file.
preds_txt=TF+'_'+TEST_CELL_TYPE+'_test.txt'
padded_pred=np.concatenate([[0],y_test_pred])
np.savetxt(preds_txt,padded_pred)

print (time.time()-start),'s'

151.168292999 s


In [15]:
start=time.time()

cmd = ' paste '
cmd+= ' <(zcat '+DATA_DIR+'/annotations/test_regions.blacklistfiltered.bed.gz ) '
cmd+= TF+'_'+TEST_CELL_TYPE +'_test.txt '
cmd+= ' | gzip -c -1  > '+'F.'+TF+'.'+TEST_CELL_TYPE+'.tab.gz'
print cmd
subprocess.check_output(cmd,shell=True, executable='/bin/bash')

print (time.time()-start),'s'

 paste  <(zcat /mnt/vdisk/data/synapse//annotations/test_regions.blacklistfiltered.bed.gz ) MAX_liver_test.txt  | gzip -c -1  > F.MAX.liver.tab.gz
46.2804298401 s
