# Train and test CTCF binding

---

An example notebook using the prepared data. It uses only a few training and validation chromosomes so that it finishes relatively fast.

----

In [1]:
import time
import pandas as pd
import numpy as np
import subprocess
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import SGDClassifier

### Define training and validation cell lines and chromosomes

- I just randomly picked one of the labelled (training-set) cell lines and held it out for validation
- I just randomly selected some chromosomes for training and validation

In [2]:
# transcription factor whose binding is modelled
TF = 'CTCF'

# cell lines the model is fitted on, and the one held out for validation
TRAIN_CELL_TYPES = [
    'A549',
    'H1-hESC',
    'HeLa-S3',
    'HepG2',
    'IMR-90',
    'K562',
]
VALID_CELL_TYPE = 'MCF-7'

# Larger chromosome split, kept for reference (uncomment to use it):
#TRAIN_CHRS=["chr10","chr11","chr12","chr13","chr14",
#            "chr15","chr16","chr17","chr18","chr19",
#            "chr2","chr20","chr22","chr3","chr4"]
#VALID_CHRS=["chr5","chr6","chr7","chr9","chrX"]

# Small split so that the notebook runs faster while testing
TRAIN_CHRS = ['chr10', 'chr11']
VALID_CHRS = ['chr9']

### Load the prepared data tables
- motifs, dnase fold coverage, and the labels

In [3]:
start = time.time()


def load_dnase_fold_cov(cell_type):
    """Read the DNase fold-coverage table stored for one cell line."""
    return pd.read_hdf(
        '/tables/dnase_fc/' + cell_type + '_dnase_fold_cov.hdf',
        'dnase_fold_cov')


# DNase fold coverage: one table per training cell line,
# in the same order as TRAIN_CELL_TYPES
fc_train = [load_dnase_fold_cov(cell_type) for cell_type in TRAIN_CELL_TYPES]

# DNase fold coverage of the validation cell line
fc_valid = load_dnase_fold_cov(VALID_CELL_TYPE)

# motif table of the transcription factor
motif = pd.read_hdf('/tables/motif/' + TF + '_motif.hdf', 'motif')

# label table of the transcription factor
labels = pd.read_hdf('/tables/labels/' + TF + '_labels.hdf', 'labels')

# single %-formatted argument: prints the same text as the Python 2
# statement "print x, 's'"
print('%s s' % (time.time() - start))

Opening /tables/dnase_fc/A549_dnase_fold_cov.hdf in read-only mode
Opening /tables/dnase_fc/H1-hESC_dnase_fold_cov.hdf in read-only mode
Opening /tables/dnase_fc/HeLa-S3_dnase_fold_cov.hdf in read-only mode
Opening /tables/dnase_fc/HepG2_dnase_fold_cov.hdf in read-only mode
Opening /tables/dnase_fc/IMR-90_dnase_fold_cov.hdf in read-only mode
Opening /tables/dnase_fc/K562_dnase_fold_cov.hdf in read-only mode
Opening /tables/dnase_fc/MCF-7_dnase_fold_cov.hdf in read-only mode
Opening /tables/motif/CTCF_motif.hdf in read-only mode
Opening /tables/labels/CTCF_labels.hdf in read-only mode
133.76041007 s


### Create training data

- Select the training chromosomes
- Stack different inputs into one input


- As I remember, the slowest part is the indexing, not the column stacking. I don't know why selecting rows through the index is that slow.

In [4]:
start0 = time.time()

# fc_train is positional: entry i must belong to TRAIN_CELL_TYPES[i].
# Fail early if the two went out of sync (e.g. the config cell was edited
# and re-run without reloading the tables).
assert len(fc_train) == len(TRAIN_CELL_TYPES), \
    'fc_train and TRAIN_CELL_TYPES differ in length; reload the tables'

# Selecting the training chromosomes from the motif and label tables does not
# depend on the cell line, so do that row selection once instead of once per
# cell line.
motif_train = motif.loc[TRAIN_CHRS]
labels_train = labels.loc[TRAIN_CHRS]

train_frames, train_labels = [], []
for cell_type, cl_fc in zip(TRAIN_CELL_TYPES, fc_train):
    # trailing comma: Python 2 print statement, keeps the cursor on this line
    # so that "Done: ..." is appended to it
    print('Adding: %s' % cell_type),
    start = time.time()

    # X: DNase fold coverage of this cell line next to the shared motif columns
    cl_df = pd.concat([cl_fc.loc[TRAIN_CHRS], motif_train], axis=1)
    # the first column is the DNase one; give it the same name for every cell line
    cl_df = cl_df.rename(columns={cl_df.columns[0]: 'dnase_fc'})
    train_frames.append(cl_df)

    # y: labels of this cell line on the training chromosomes
    train_labels.append(labels_train[cell_type].values)

    print('Done: %s s' % (time.time() - start))

# stack all cell lines on top of each other
X_train = pd.concat(train_frames, ignore_index=True)
y_train = np.concatenate(train_labels).astype(int)

print('%s s' % (time.time() - start0))

Adding: A549 Done: 15.2808690071 s
Adding: H1-hESC Done: 18.9058570862 s
Adding: HeLa-S3 Done: 87.4024350643 s
Adding: HepG2 Done: 82.5313339233 s
Adding: IMR-90 Done: 108.552444935 s
Adding: K562 Done: 51.8257849216 s
387.416275024 s


In [9]:
# last five of the first 10000 training rows
X_train.iloc[:10000].tail(5)

Unnamed: 0,dnase_fc,CTCF_motif_mean,CTCF_motif_max,CTCF_motif_q99,CTCF_motif_q95,CTCF_motif_q90,CTCF_motif_q75,CTCF_motif_q50
9995,0.0,6.099886,7.315357,7.259808,6.991489,6.836953,6.686817,6.452157
9996,0.0,6.175891,7.315357,7.259808,6.984213,6.847057,6.715857,6.484656
9997,0.044593,6.048253,7.315357,7.275012,6.955609,6.836365,6.703036,6.438557
9998,0.267558,6.032314,7.315357,7.141457,6.964345,6.840757,6.705417,6.478757
9999,0.55159,6.134696,7.315357,7.244215,6.987797,6.860341,6.705417,6.456557


### Create validation data

In [6]:
start0 = time.time()

# X: DNase fold coverage and motif columns of the validation chromosomes,
# placed side by side
fc_valid_chrs = fc_valid.loc[VALID_CHRS]
motif_valid_chrs = motif.loc[VALID_CHRS]
X_valid = pd.concat([fc_valid_chrs, motif_valid_chrs], axis=1)

# give the first (DNase) column its uniform name, then drop the index
X_valid = (X_valid
           .rename(columns={X_valid.columns[0]: 'dnase_fc'})
           .reset_index(drop=True))

# y: labels of the validation cell line on the same chromosomes, as integers
y_valid = labels.loc[VALID_CHRS, VALID_CELL_TYPE].values.astype(int)

print('%s s' % (time.time() - start0))

94.182956934 s


In [8]:
# last five of the first 10000 validation rows
X_valid.iloc[:10000].tail(5)

Unnamed: 0,dnase_fc,CTCF_motif_mean,CTCF_motif_max,CTCF_motif_q99,CTCF_motif_q95,CTCF_motif_q90,CTCF_motif_q75,CTCF_motif_q50
9995,0.250047,6.129298,7.440556,7.361156,7.008417,6.906833,6.663777,6.460157
9996,0.20876,5.981045,7.440556,7.382198,7.024705,6.929213,6.663777,6.480457
9997,0.166605,5.791557,7.440556,7.339114,6.996809,6.864453,6.614897,6.427657
9998,0.124433,5.782106,7.360157,7.332407,6.928401,6.800025,6.610097,6.412156
9999,0.12349,5.94834,7.360157,7.332407,6.989661,6.906292,6.694957,6.433157


### Train SGD logistic reg
- Actually, it doesn't seem to use all 12 cores most of the time

In [16]:
start = time.time()

# Logistic regression fitted by stochastic gradient descent, with class
# weights balanced against the label frequencies.
# The fit is stochastic (the training data is shuffled), so the seed is pinned
# to make the model, and the AUC computed from it, reproducible across runs.
# NOTE(review): according to the scikit-learn docs, n_jobs only parallelises
# the one-vs-all fits of multi-class problems, which would explain why the
# 12 cores are mostly idle for this two-class fit -- confirm.
clf = SGDClassifier(loss='log', class_weight='balanced', n_jobs=12,
                    random_state=0)
clf.fit(X_train, y_train)

print('%s s' % (time.time() - start))

117.571002007 s


### Evaluate

In [17]:
start = time.time()

# score of the second class (column 1 of predict_proba) for every validation row
valid_scores = clf.predict_proba(X_valid)[:, 1]
auc = roc_auc_score(y_valid, valid_scores)
print('auc: %s' % auc)

print('%s s' % (time.time() - start))

auc: 0.971938896536
1.24858117104 s
