# Train CTCF and make predictions for PC-3

---

Takes something like 10-20 minutes, and uses 50GB!

----

In [1]:
import time
import pandas as pd
import numpy as np
import subprocess
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import SGDClassifier

In [2]:
# Transcription factor whose motif table and labels are loaded below.
TF = 'CTCF'

# Cell types whose fold-coverage tables and labels feed the feature matrix.
# The first entry also supplies the training target.
TRAIN_CELL_TYPES = [
    'A549',
    'H1-hESC',
    'HeLa-S3',
    'HepG2',
    'IMR-90',
    'K562',
]
# Cell type used to compute the validation AUC.
VALID_CELL_TYPE = 'MCF-7'
# Cell type for which the final predictions are produced.
TEST_CELL_TYPE = 'PC-3'

# Root directory of the input tables; paths are built by string
# concatenation, so the trailing slash matters.
DATA_DIR = '/mnt/vdisk/data/synapse/'

### Load data

In [3]:
start = time.time()


def _read_fold_cov(cell_type):
    """Read the DNase fold-coverage table stored for one cell type."""
    path = DATA_DIR+'fold_cov_data/'+cell_type+'_dnase_fold_cov.hdf'
    return pd.read_hdf(path, 'dnase_fold_cov')


# fold coverage: one table per training cell line, in TRAIN_CELL_TYPES order
fc_train = list(map(_read_fold_cov, TRAIN_CELL_TYPES))

# fold coverage for the validation cell line
fc_valid = _read_fold_cov(VALID_CELL_TYPE)

# fold coverage for the final submission cell line
fc_test = _read_fold_cov(TEST_CELL_TYPE)

# motif table for the transcription factor
motif = pd.read_hdf(DATA_DIR+'motif_data/'+TF+'_motif.hdf', 'motif')

# labels for the transcription factor
labels = pd.read_hdf(DATA_DIR+'extended_labels/'+TF+'_labels.hdf', 'labels')

print('%s s' % (time.time()-start))

Opening /mnt/vdisk/data/synapse/fold_cov_data/A549_dnase_fold_cov.hdf in read-only mode
Opening /mnt/vdisk/data/synapse/fold_cov_data/H1-hESC_dnase_fold_cov.hdf in read-only mode
Opening /mnt/vdisk/data/synapse/fold_cov_data/HeLa-S3_dnase_fold_cov.hdf in read-only mode
Opening /mnt/vdisk/data/synapse/fold_cov_data/HepG2_dnase_fold_cov.hdf in read-only mode
Opening /mnt/vdisk/data/synapse/fold_cov_data/IMR-90_dnase_fold_cov.hdf in read-only mode
Opening /mnt/vdisk/data/synapse/fold_cov_data/K562_dnase_fold_cov.hdf in read-only mode
Opening /mnt/vdisk/data/synapse/fold_cov_data/MCF-7_dnase_fold_cov.hdf in read-only mode
Opening /mnt/vdisk/data/synapse/fold_cov_data/PC-3_dnase_fold_cov.hdf in read-only mode
Opening /mnt/vdisk/data/synapse/motif_data/CTCF_motif.hdf in read-only mode
Opening /mnt/vdisk/data/synapse/extended_labels/CTCF_labels.hdf in read-only mode
39.6938438416 s


In [4]:
start=time.time()

# The motif table is loaded with its three region columns stored as ordinary
# columns named 'chr1', '600', '800' (presumably the first data row was
# consumed as the header when the table was created -- TODO confirm).
# Move them into the index.
MOTIF_INDEX_COLS=['chr1','600','800']

# Only set the index if it is not already in place: after the first run the
# columns no longer exist, so an unguarded second set_index would raise
# KeyError and make the cell impossible to re-run.  A table that genuinely
# lacks these columns still raises KeyError, as before.
if list(motif.index.names) != MOTIF_INDEX_COLS:
    motif.set_index(MOTIF_INDEX_COLS,inplace=True)

print('%s s' % (time.time()-start))

25.6841320992 s


### Create train,valid,test data

In [5]:
start = time.time()

target_cell = TRAIN_CELL_TYPES[0]
other_cells = TRAIN_CELL_TYPES[1:]

# Column groups of the training matrix, in this order:
#   1. fold coverage of the first training cell line
#   2. motif scores
#   3. fold coverage of the remaining training cell lines
#   4. labels of the remaining training cell lines
feature_columns = [fc_train[0].values, motif.values]
feature_columns.extend(fc.values for fc in fc_train[1:])
feature_columns.extend(labels[cell].values for cell in other_cells)

x_train = np.column_stack(feature_columns)
# Release the list so it does not keep the per-group arrays alive.
del feature_columns

# The target is the label column of the first training cell line.
y_train = labels[target_cell].values.astype('int')

print('%s s' % (time.time()-start))

30.3075699806 s


In [6]:
start = time.time()

# Validation features: a copy of the training matrix whose first column is
# overwritten with the validation cell line's fold coverage.
x_valid = np.copy(x_train)
x_valid[:, 0] = fc_valid.values.ravel()

# Validation target: the label column of the validation cell line.
y_valid = labels[VALID_CELL_TYPE].values

print('%s s' % (time.time()-start))

11.412348032 s


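### Train SGD logistic regression
- It doesn't seem to use the 12 cores most of the time (in scikit-learn, `n_jobs` on `SGDClassifier` only parallelises the one-vs-all fits of multiclass problems, so a binary fit runs on a single core)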

In [7]:
start=time.time()

# Fixed seed: SGD shuffles the training data between epochs, so without
# random_state the fitted weights (and the validation AUC) differ on every
# run of the notebook.
SGD_RANDOM_STATE=0

# Logistic regression fitted by stochastic gradient descent, with class
# weights balanced to compensate for label imbalance.
# NOTE(review): per the scikit-learn documentation, n_jobs is only used for
# the one-vs-all computation of multiclass problems, so for binary labels it
# presumably has no effect -- confirm that y_train is binary.
clf = SGDClassifier(loss='log', class_weight='balanced', n_jobs=12,
                    random_state=SGD_RANDOM_STATE)
clf.fit(x_train,y_train)

print('%s s' % (time.time()-start))

206.934782028 s


### Evaluate

In [8]:
start = time.time()

# Column 1 of predict_proba is the probability of the class listed second
# in clf.classes_; it is used as the ranking score for the ROC AUC.
valid_scores = clf.predict_proba(x_valid)[:, 1]
valid_auc = roc_auc_score(y_valid, valid_scores)
print('auc: %s' % valid_auc)

print('%s s' % (time.time()-start))

auc: 0.982476254558
41.2699642181 s


### Predict

In [9]:
start = time.time()

# Test features: a copy of the training matrix whose first column is
# overwritten with the test cell line's fold coverage.
x_test = np.copy(x_train)
x_test[:, 0] = fc_test.values.ravel()

print('%s s' % (time.time()-start))

15.7102320194 s


In [10]:
start = time.time()

# Keep only column 1 of the predicted class probabilities for the test matrix.
test_proba = clf.predict_proba(x_test)
y_test_pred = test_proba[:, 1]

print('%s s' % (time.time()-start))

5.61556887627 s


### Annotate predictions with the test regions
- It has to be in the exact order of the test regions
    - https://www.synapse.org/#!Synapse:syn6131484/wiki/402044
- I missed the very first line when creating the tables, so I just add a 0 there

In [11]:
start = time.time()

# Only the index of the test-region table is needed.
test_regions = pd.read_hdf(DATA_DIR+'/annotations/test_regions.hdf',
                           'test_regions')
idx = test_regions.index

print('%s s' % (time.time()-start))

Opening /mnt/vdisk/data/synapse//annotations/test_regions.hdf in read-only mode
1.17665290833 s


In [12]:
start = time.time()

# Prepend a 0 for the first region, which is missing from the feature
# tables, then attach the test-region index.
padded_pred = np.concatenate([[0], y_test_pred])
res_df = pd.DataFrame(padded_pred, index=idx)

print('%s s' % (time.time()-start))

0.391808986664 s


### The slowest part of the whole process is writing the TSV, so for now I just write it to HDF

In [13]:
start = time.time()

# Writing a gzipped TSV directly from pandas was very slow:
#   res_df.to_csv(TF+'_test.h',sep='\t',header=False,compression='gzip')
# so the predictions are stored in an HDF file under the key 'preds' instead.
hdf_path = TF+'_test.hdf'
res_df.to_hdf(hdf_path, 'preds')

print('%s s' % (time.time()-start))

1.76118803024 s


### Join the results using shell commands
- Not too elegant but faster

In [14]:
start = time.time()

# One prediction per line, with a leading 0 for the first region, so the
# file can be pasted next to the region list.
txt_path = TF+'_test.txt'
np.savetxt(txt_path, np.concatenate([[0], y_test_pred]))

print('%s s' % (time.time()-start))

142.592308998 s


In [15]:
start = time.time()

# Put the decompressed region columns next to the prediction column.
# "<( ... )" is bash process substitution, hence the explicit executable.
cmd = ''.join([
    ' paste ',
    ' <(zcat ', DATA_DIR, '/annotations/test_regions.blacklistfiltered.bed.gz ) ',
    TF, '_test.txt ',
    ' > ', TF, '_test.tsv',
])
print(cmd)
subprocess.check_output(cmd, shell=True, executable='/bin/bash')

print('%s s' % (time.time()-start))

 paste  <(zcat /mnt/vdisk/data/synapse//annotations/test_regions.blacklistfiltered.bed.gz ) CTCF_test.txt  > CTCF_test.tsv
41.1684250832 s


In [16]:
%%bash
# Preview the first ten lines of the joined region/prediction file.
head -n 10 CTCF_test.tsv

chr1	600	800	0.000000000000000000e+00
chr1	650	850	3.803902094166237668e-08
chr1	700	900	3.803902094166237668e-08
chr1	750	950	3.803902094166237668e-08
chr1	800	1000	3.803902094166237668e-08
chr1	850	1050	3.803902094166237668e-08
chr1	900	1100	3.803902094166237668e-08
chr1	950	1150	3.803902094166237668e-08
chr1	1000	1200	3.803902094166237668e-08
chr1	1050	1250	3.803902094166237668e-08
