# Train TAF1 and make predictions for HepG2

---

Takes a long time, and uses 50GB!

----

In [1]:
import time
import pandas as pd
import numpy as np
import subprocess
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import SGDClassifier

In [2]:
# Transcription factor whose motif and label tables are used.
TF = 'TAF1'

# Cell types that supply the training/validation rows, and the cell type
# that predictions are written for.
TRAIN_CELL_TYPES = [
    'GM12878',
    'H1-hESC',
    'HeLa-S3',
    'K562',
    'SK-N-SH',
]
TEST_CELL_TYPE = 'HepG2'

# Chromosome split used to select rows for training, validation and prediction.
TRAIN_CHRS = [
    'chr10', 'chr11', 'chr12', 'chr13', 'chr14',
    'chr15', 'chr16', 'chr17', 'chr18', 'chr19',
    'chr2', 'chr20', 'chr22', 'chr3', 'chr4',
]
VALID_CHRS = ['chr5', 'chr6', 'chr7', 'chr9', 'chrX']
# Order matters here: the predictions are later pasted line by line next to
# the regions file, so keep this sequence as it is.
TEST_CHRS = ['chr1', 'chr21', 'chr8']

# Root directory of the input tables (note the trailing slash).
DATA_DIR = '/mnt/vdisk/data/synapse/'

### Load data

In [3]:
start = time.time()

# every fold coverage table sits in its own HDF file, named after the cell type
fold_cov_prefix = DATA_DIR + 'fold_cov_data/'
fold_cov_suffix = '_dnase_fold_cov.hdf'

# fold coverage tables of the training cell lines, in TRAIN_CELL_TYPES order
fc_train = []
for cl in TRAIN_CELL_TYPES:
    fc_train.append(pd.read_hdf(fold_cov_prefix + cl + fold_cov_suffix,
                                'dnase_fold_cov'))

# fold coverage table of the cell line that predictions are made for
fc_test = pd.read_hdf(fold_cov_prefix + TEST_CELL_TYPE + fold_cov_suffix,
                      'dnase_fold_cov')

# motif table of the transcription factor
motif = pd.read_hdf(DATA_DIR + 'motif_data/' + TF + '_motif.hdf', 'motif')

# label table of the transcription factor
labels = pd.read_hdf(DATA_DIR + 'extended_labels/' + TF + '_labels.hdf',
                     'labels')

# elapsed wall-clock time of the whole cell
print('%s s' % (time.time() - start))

Opening /mnt/vdisk/data/synapse/fold_cov_data/GM12878_dnase_fold_cov.hdf in read-only mode
Opening /mnt/vdisk/data/synapse/fold_cov_data/H1-hESC_dnase_fold_cov.hdf in read-only mode
Opening /mnt/vdisk/data/synapse/fold_cov_data/HeLa-S3_dnase_fold_cov.hdf in read-only mode
Opening /mnt/vdisk/data/synapse/fold_cov_data/K562_dnase_fold_cov.hdf in read-only mode
Opening /mnt/vdisk/data/synapse/fold_cov_data/SK-N-SH_dnase_fold_cov.hdf in read-only mode
Opening /mnt/vdisk/data/synapse/fold_cov_data/HepG2_dnase_fold_cov.hdf in read-only mode
Opening /mnt/vdisk/data/synapse/motif_data/TAF1_motif.hdf in read-only mode
Opening /mnt/vdisk/data/synapse/extended_labels/TAF1_labels.hdf in read-only mode
27.2252309322 s


### Create training data
- Selecting the training, testing, and validation chromosomes takes a long time

In [4]:
start0=time.time()

X_train,y_train=[],[]
for i,cell_line_fc in enumerate(fc_train):
    print TRAIN_CELL_TYPES[i],
    start=time.time()
    
    #X
    #select the chromosomes
    cell_line_df=pd.concat([cell_line_fc.loc[TRAIN_CHRS],
                            motif.loc[TRAIN_CHRS]],axis=1)
    #rename dnase fc to uniform column name
    cell_line_df.rename(columns={cell_line_df.columns[0]:'dnase_fc'},
                        inplace=True)
    #add df to a list to be concatenated later
    X_train.append(cell_line_df)
    
    #y
    y_train.append(labels.loc[TRAIN_CHRS,TRAIN_CELL_TYPES[i]].values)
    
    print (time.time()-start),'s'
    
#concatenate all cell line dfs
X_train=pd.concat(X_train,ignore_index=True)
y_train=np.concatenate(y_train)

print (time.time()-start0),'s'

GM12878 103.727802992 s
H1-hESC 95.8695471287 s
HeLa-S3 96.44520998 s
K562 99.7958650589 s
SK-N-SH 98.3663249016 s
584.202143908 s


In [5]:
# display the first rows of the stacked training features as a quick sanity check
X_train.head()

Unnamed: 0,dnase_fc,TAF1_motif_mean,TAF1_motif_max,TAF1_motif_q99,TAF1_motif_q95,TAF1_motif_q90,TAF1_motif_q75,TAF1_motif_q50
0,0.0,20.018882,20.018882,20.018882,20.018882,20.018882,20.018882,20.018882
1,0.0,20.018882,20.018882,20.018882,20.018882,20.018882,20.018882,20.018882
2,0.0,20.018882,20.018882,20.018882,20.018882,20.018882,20.018882,20.018882
3,0.0,20.018882,20.018882,20.018882,20.018882,20.018882,20.018882,20.018882
4,0.0,20.018882,20.018882,20.018882,20.018882,20.018882,20.018882,20.018882


### Create validation data

In [6]:
start0=time.time()

X_valid,y_valid=[],[]
for i,cell_line_fc in enumerate(fc_train):
    print TRAIN_CELL_TYPES[i],
    start=time.time()
    
    #select the chromosomes
    cell_line_df=pd.concat([cell_line_fc.loc[VALID_CHRS],
                            motif.loc[VALID_CHRS]],axis=1)
    #rename dnase fc to uniform column name
    cell_line_df.rename(columns={cell_line_df.columns[0]:'dnase_fc'},
                        inplace=True)
    #add df to a list to be concatenated later
    X_valid.append(cell_line_df)
    
    #y
    y_valid.append(labels.loc[VALID_CHRS,TRAIN_CELL_TYPES[i]].values)
    
    print (time.time()-start),'s'
    
#concatenate all cell line dfs
X_valid=pd.concat(X_valid,ignore_index=True)
y_valid=np.concatenate(y_valid)

print (time.time()-start0),'s'

GM12878 39.3330061436 s
H1-hESC 34.5296738148 s
HeLa-S3 39.3249709606 s
K562 38.8769490719 s
SK-N-SH 36.0859560966 s
197.314650059 s


In [7]:
# display the first rows of the stacked validation features as a quick sanity check
X_valid.head()

Unnamed: 0,dnase_fc,TAF1_motif_mean,TAF1_motif_max,TAF1_motif_q99,TAF1_motif_q95,TAF1_motif_q90,TAF1_motif_q75,TAF1_motif_q50
0,0.0,20.018882,20.018882,20.018882,20.018882,20.018882,20.018882,20.018882
1,0.0,20.018882,20.018882,20.018882,20.018882,20.018882,20.018882,20.018882
2,0.0,20.018882,20.018882,20.018882,20.018882,20.018882,20.018882,20.018882
3,0.0,20.018882,20.018882,20.018882,20.018882,20.018882,20.018882,20.018882
4,0.0,20.018882,20.018882,20.018882,20.018882,20.018882,20.018882,20.018882


### Create test data

In [8]:
start0 = time.time()

# features of the held-out cell line: its fold coverage side by side with the
# motif columns, both cut down to the test chromosomes
X_test = pd.concat(
    [fc_test.loc[TEST_CHRS], motif.loc[TEST_CHRS]],
    axis=1)

# the first (fold coverage) column gets the uniform 'dnase_fc' name
dnase_column = X_test.columns[0]
X_test.rename(columns={dnase_column: 'dnase_fc'}, inplace=True)

# throw away the row index and replace it with a plain running number
X_test.reset_index(drop=True, inplace=True)

print('%s s' % (time.time() - start0))

17.0965480804 s


In [9]:
# display the first rows of the test features as a quick sanity check
X_test.head()

Unnamed: 0,dnase_fc,TAF1_motif_mean,TAF1_motif_max,TAF1_motif_q99,TAF1_motif_q95,TAF1_motif_q90,TAF1_motif_q75,TAF1_motif_q50
0,0.0,20.018882,20.018882,20.018882,20.018882,20.018882,20.018882,20.018882
1,0.0,20.018882,20.018882,20.018882,20.018882,20.018882,20.018882,20.018882
2,0.0,20.018882,20.018882,20.018882,20.018882,20.018882,20.018882,20.018882
3,0.0,20.018882,20.018882,20.018882,20.018882,20.018882,20.018882,20.018882
4,0.0,20.018882,20.018882,20.018882,20.018882,20.018882,20.018882,20.018882


### Train SGD logistic reg
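- It doesn't seem to use all 12 cores most of the time (in scikit-learn, `n_jobs` on `SGDClassifier` only parallelises the one-vs-all fits of multi-class problems)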

In [15]:
start = time.time()

# Logistic regression fitted with stochastic gradient descent on the stacked
# training rows; class_weight='balanced' reweights the classes.
# random_state is fixed so that re-running the cell yields the same model (and
# therefore the same validation AUC and predictions); without it the fit
# differs from run to run.
# NOTE(review): per the scikit-learn docs n_jobs only parallelises the
# one-vs-all fits of multi-class problems, so it presumably has no effect on
# this fit - confirm that the labels are binary.
clf = SGDClassifier(loss='log', class_weight='balanced', n_jobs=12,
                    random_state=0)
clf.fit(X_train, y_train)

# elapsed wall-clock time of the fit
print('%s s' % (time.time() - start))

542.100310087 s


### Evaluate

In [16]:
start = time.time()

# ROC AUC on the validation rows, scored with column 1 of predict_proba
valid_auc = roc_auc_score(y_valid, clf.predict_proba(X_valid)[:, 1])
print('auc: %s' % valid_auc)

# elapsed wall-clock time of the evaluation
print('%s s' % (time.time() - start))

auc: 0.908673791441
41.521982193 s


### Predict

In [17]:
start = time.time()

# keep only column 1 of predict_proba, one score per test row
test_proba = clf.predict_proba(X_test)
y_test_pred = test_proba[:, 1]

# elapsed wall-clock time of the prediction
print('%s s' % (time.time() - start))

0.860918998718 s


### Annotate predictions with the test regions
- It has to be in the exact order of the test regions
    - https://www.synapse.org/#!Synapse:syn6131484/wiki/402044
- I missed the very first line when creating the tables, so I just add a 0 there

### Join the results using shell commands
- Not too elegant but faster

In [19]:
start = time.time()

# text file with one value per line; a 0 is put in front of the predictions
# as a stand-in for the first region, which is missing from the feature tables
pred_file = TF + '_' + TEST_CELL_TYPE + '_test.txt'
np.savetxt(pred_file, np.concatenate(([0], y_test_pred)))

# elapsed wall-clock time of the write
print('%s s' % (time.time() - start))

21.9166607857 s


In [21]:
start = time.time()

# Shell pipeline: put the rows of the regions file (decompressed on the fly)
# and the prediction file side by side, line by line, and gzip the result.
cmd = ' paste '
cmd += ' <(zcat ' + DATA_DIR + '/annotations/ladder_regions.blacklistfiltered.bed.gz ) '
cmd += TF + '_' + TEST_CELL_TYPE + '_test.txt '
cmd += ' | gzip -c -1  > ' + 'L.' + TF + '.' + TEST_CELL_TYPE + '.tab.gz'
print(cmd)

# Run the command under bash (needed for the <( ) process substitution) with
# pipefail switched on. Without pipefail the exit status of the pipeline is
# that of the final gzip alone, so a failing paste (e.g. a missing prediction
# file) would pass unnoticed and leave a bogus output file behind; with it
# check_output raises CalledProcessError instead.
subprocess.check_output(['/bin/bash', '-o', 'pipefail', '-c', cmd])

# elapsed wall-clock time of the join
print('%s s' % (time.time() - start))

 paste  <(zcat /mnt/vdisk/data/synapse//annotations/ladder_regions.blacklistfiltered.bed.gz ) TAF1_HepG2_test.txt  | gzip -c -1  > L.TAF1.HepG2.tab.gz
6.8108549118 s
