# Train FOXA2 and make predictions for liver

---

Takes something like 10-20 minutes, and uses 50GB!

----

In [1]:
import time
import pandas as pd
import numpy as np
import subprocess
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import SGDClassifier

In [2]:
# Transcription factor whose binding is modelled.
TF = 'FOXA2'
# Cell types used for training; the first entry supplies the training labels.
TRAIN_CELL_TYPES = ['HepG2']
# Cell type the final predictions are made for.
TEST_CELL_TYPE = 'liver'

# Root directory of the input tables. Paths are built by plain string
# concatenation, so the trailing slash matters.
DATA_DIR = '/mnt/vdisk/data/synapse/'

### Load data

In [3]:
start=time.time()

# One DNase fold-coverage table per training cell line, in TRAIN_CELL_TYPES order.
fc_train=[]
for cell_type in TRAIN_CELL_TYPES:
    fc_path=DATA_DIR+'fold_cov_data/'+cell_type+'_dnase_fold_cov.hdf'
    fc_train.append(pd.read_hdf(fc_path,'dnase_fold_cov'))

# Fold-coverage table of the cell line used for the final submission.
fc_test_path=DATA_DIR+'fold_cov_data/'+TEST_CELL_TYPE+'_dnase_fold_cov.hdf'
fc_test=pd.read_hdf(fc_test_path,'dnase_fold_cov')

# Motif table for the transcription factor.
motif_path=DATA_DIR+'motif_data/'+TF+'_motif.hdf'
motif=pd.read_hdf(motif_path,'motif')

# Labels for the transcription factor.
labels_path=DATA_DIR+'extended_labels/'+TF+'_labels.hdf'
labels=pd.read_hdf(labels_path,'labels')

print (time.time()-start),'s'

Opening /mnt/vdisk/data/synapse/fold_cov_data/HepG2_dnase_fold_cov.hdf in read-only mode
Opening /mnt/vdisk/data/synapse/fold_cov_data/liver_dnase_fold_cov.hdf in read-only mode
Opening /mnt/vdisk/data/synapse/motif_data/FOXA2_motif.hdf in read-only mode
Opening /mnt/vdisk/data/synapse/extended_labels/FOXA2_labels.hdf in read-only mode
48.0579001904 s


In [5]:
start=time.time()

# The motif table was stored without its index: the columns named
# 'chr1', '600' and '800' still sit among the data columns (presumably the
# region coordinates, with the first row read as the header -- confirm).
# Move them into the index so that motif.values holds only the other columns.
# The guard keeps this cell re-runnable: once the index is in place, a second
# set_index call would raise KeyError because those columns are gone.
motif_index_cols=['chr1','600','800']
if list(motif.index.names)!=motif_index_cols:
    motif.set_index(motif_index_cols,inplace=True)

print (time.time()-start),'s'

25.516919136 s


### Create train,valid,test data

In [6]:
start=time.time()

# Column layout of the training matrix, left to right:
#   fold coverage of the first train cell line | motif columns |
#   fold coverage of the remaining train cell lines |
#   labels of the remaining train cell lines
feature_blocks=[fc_train[0].values,motif.values]
feature_blocks.extend(other_fc.values for other_fc in fc_train[1:])
feature_blocks.extend(labels[cell_type].values
                      for cell_type in TRAIN_CELL_TYPES[1:])
x_train=np.column_stack(feature_blocks)

# Target: the label column of the first train cell line, cast to int.
y_train=labels[TRAIN_CELL_TYPES[0]].values.astype('int')

print (time.time()-start),'s'

6.75908899307 s


### Train SGD logistic reg
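- Training does not seem to use all 12 cores for most of the run (possibly because `n_jobs` in `SGDClassifier` only parallelises the one-vs-all fits of multi-class problems)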

In [7]:
start=time.time()

# Logistic regression fitted with SGD, using balanced class weights.
# random_state is fixed because SGD shuffles the training data: an unseeded
# fit yields different coefficients (and predictions) on every run.
# NOTE(review): scikit-learn documents n_jobs for SGDClassifier as
# parallelising only the one-vs-all fits of multi-class problems, which would
# explain idle cores here -- confirm against the installed version.
clf = SGDClassifier(loss='log', class_weight='balanced', n_jobs=12,
                    random_state=0)
clf.fit(x_train,y_train)

print (time.time()-start),'s'

167.779392958 s


### Predict

In [8]:
start=time.time()

# The test matrix starts as a copy of the training matrix; only column 0
# (the cell-line specific fold coverage) is replaced with the test cell
# line's values. All other columns are shared with training.
x_test=np.copy(x_train)
x_test[:,0]=fc_test.values.ravel()

print (time.time()-start),'s'

3.7090780735 s


In [9]:
start=time.time()

# predict_proba returns one column per class; keep column 1
# (presumably the positive/bound class -- verify against clf.classes_).
class_probs=clf.predict_proba(x_test)
y_test_pred=class_probs[:,1]

print (time.time()-start),'s'

3.8049929142 s


### Annotate predictions with the test regions
- It has to be in the exact order of the test regions
    - https://www.synapse.org/#!Synapse:syn6131484/wiki/402044
- I missed the very first line when creating the tables, so I just add a 0 for it here

In [10]:
start=time.time()

# Only the index of the test-regions table is needed; it labels the predictions.
# DATA_DIR already ends with '/', so the relative part has no leading slash.
# This matches the other loading cells and avoids '//' in the opened path.
idx=pd.read_hdf(DATA_DIR+'annotations/test_regions.hdf',
                'test_regions').index

print (time.time()-start),'s'

Opening /mnt/vdisk/data/synapse//annotations/test_regions.hdf in read-only mode
1.21383810043 s


In [11]:
start=time.time()
# Prepend a 0 score for the first test region, which is missing from the
# feature tables, so the predictions line up with idx.
padded_pred=np.concatenate([[0],y_test_pred])
res_df=pd.DataFrame(data=padded_pred,index=idx)
print (time.time()-start),'s'

0.283663988113 s


### Writing the TSV is the slowest part of the whole process, so for now I just store the result as HDF

In [12]:
start=time.time()

# Writing a gzip-compressed TSV with to_csv was far too slow, e.g.
#   res_df.to_csv(TF+'_test.h',sep='\t',header=False,compression='gzip')
# so the annotated predictions are stored as HDF under the key 'preds'.
hdf_path=TF+'_'+TEST_CELL_TYPE+'_test.hdf'
res_df.to_hdf(hdf_path,'preds')

print (time.time()-start),'s'

1.72001600266 s


### Join the results using shell commands
- Not too elegant but faster

In [13]:
start=time.time()
# Plain-text dump of the predictions with a leading 0 for the first region;
# this file is later pasted next to the region coordinates.
txt_path=TF+'_'+TEST_CELL_TYPE+'_test.txt'
padded_scores=np.concatenate([[0],y_test_pred])
np.savetxt(txt_path,padded_scores)
print (time.time()-start),'s'

148.453009129 s


In [14]:
start=time.time()

# Build the submission file: the region rows from the bed file pasted next to
# the prediction column, then gzipped.
# DATA_DIR already ends with '/', so the relative part has no leading slash.
regions_bed=DATA_DIR+'annotations/test_regions.blacklistfiltered.bed.gz'
preds_txt=TF+'_'+TEST_CELL_TYPE+'_test.txt'
out_gz='F.'+TF+'.'+TEST_CELL_TYPE+'.tab.gz'

# 'set -o pipefail' makes the pipeline fail when paste fails (e.g. the
# predictions file is missing). Without it only gzip's exit status is checked
# and a broken, near-empty .tab.gz would be written silently.
# Process substitution '<( ... )' requires bash, hence executable='/bin/bash'.
cmd = 'set -o pipefail; '
cmd+= 'paste <(zcat '+regions_bed+' ) '+preds_txt
cmd+= ' | gzip -c -1 > '+out_gz
print (cmd)
# check_call rather than check_output: stdout is redirected into the file,
# so there is nothing to capture. Raises CalledProcessError on failure.
subprocess.check_call(cmd,shell=True,executable='/bin/bash')

print (time.time()-start),'s'

 paste  <(zcat /mnt/vdisk/data/synapse//annotations/test_regions.blacklistfiltered.bed.gz ) FOXA2_liver_test.txt  | gzip -c -1  > F.FOXA2.liver.tab.gz
43.1568770409 s
