# Very naive deep learning on the vector of surrounding bases

---

### Data

Naive feature vectors. The original sequences of the validation/test data and of the train data do not overlap! (Train data points can overlap with other train data points, and test/validation data points can overlap with other test/validation data points.) This overlap does not lead to unintentional label leakage!



### Notes


---

Instruct theano to use gpu

In [1]:
# THEANO_FLAGS is set immediately after importing os, ahead of every other
# import in this notebook, so the variable is already in the environment when
# the Theano backend gets loaded.
import os
os.environ['THEANO_FLAGS']='device=gpu'

# Make the project's helper modules importable. The relative path is resolved
# against the directory the notebook is started from: the import below runs
# before the os.chdir() call at the end of this cell.
import sys
sys.path.append('../my_modules')
from loading_utils import read_my_data

import subprocess
import time

import numpy as np
import pandas as pd

import os,subprocess
# All relative paths used further down ('../prepare_data/...',
# '../explore_data/...', the 'best_model' checkpoint) are resolved against
# this directory.
workdir='/mnt/Data1/ribli/methylation_code/modelling'
# Create the directory from Python instead of shelling out to `mkdir`:
# nothing is attempted (and nothing is printed) when it already exists, and a
# genuine failure raises OSError here instead of being silently ignored.
if not os.path.isdir(workdir):
    os.mkdir(workdir)
os.chdir(workdir)

Using gpu device 0: GeForce GTX 670 (CNMeM is disabled, CuDNN not available)


### Load data

In [2]:
# Feature-vector CSVs of the train and test splits; the relative paths are
# resolved against the current working directory.
train_csv='../prepare_data/big_train_feat_vect.csv'
test_csv='../prepare_data/big_test_feat_vect.csv'

# Each call is unpacked into three parts, named here as ids, features (x)
# and labels (y).
train_id,train_x,train_y = read_my_data(fname=train_csv)
test_id,test_x,test_y = read_my_data(fname=test_csv)

Loading data... 
Loading data... 


### Load annotation

In [3]:
# Annotation table: tab-separated and without a header row, so the column
# names are assigned by hand. Missing annotation values are replaced by 0.
annot=pd.read_csv('../explore_data/relevant_annotations.csv',sep='\t',header=None)
annot.columns=['id','Regulatory_Feature_Group','Relation_to_UCSC_CpG_Island',
    'Strand','Infinium_Design_Type','Random_Loci','Methyl27_Loci']
annot=annot.fillna(0)

# The index of the merged frames is used further down as a row position into
# train_x / test_x, so row i of a merged frame must describe sample i.
# A left join keeps every sample, in its original order; the default inner
# join would drop samples without annotation and shift all later positions.
# Samples without annotation get 0 in every annotation column.
train_merged=pd.DataFrame(train_id,columns=['id']).merge(
    annot,on=['id'],how='left').fillna(0)
test_merged=pd.DataFrame(test_id,columns=['id']).merge(
    annot,on=['id'],how='left').fillna(0)

# A left join only grows when an id occurs more than once in the annotation
# table; that would break the positional alignment as well, so fail loudly.
assert len(train_merged)==len(train_id),\
    'duplicate ids in the annotation table: train rows are no longer aligned'
assert len(test_merged)==len(test_id),\
    'duplicate ids in the annotation table: test rows are no longer aligned'

### Select indices for islands

In [4]:
# Boolean row masks over the (still 2-D) feature matrices.
#   cg_exl_idx : samples whose two positions 499 and 500 hold the codes 2 and 3
#                (presumably a C followed by a G - TODO confirm against the
#                encoding used by read_my_data)
#   annot_idx  : samples annotated as lying in a UCSC CpG 'Island'
#   *_idx      : both conditions; *_idx_0 / *_idx_1 additionally split by label

# --- train ---
cg_exl_idx=(train_x[:,499]==2) & (train_x[:,500]==3)
annot_idx=np.zeros(len(train_x),dtype=bool)
# the merged frame's index is used as the row position of the sample
annot_idx[train_merged.index[train_merged.Relation_to_UCSC_CpG_Island=='Island']]=True
train_idx=cg_exl_idx & annot_idx
train_idx_0=cg_exl_idx & annot_idx & (train_y==0)
train_idx_1=cg_exl_idx & annot_idx & (train_y==1)


# --- test ---
cg_exl_idx=(test_x[:,499]==2) & (test_x[:,500]==3)
annot_idx=np.zeros(len(test_x),dtype=bool)
annot_idx[test_merged.index[test_merged.Relation_to_UCSC_CpG_Island=='Island']]=True
test_idx=cg_exl_idx & annot_idx
test_idx_0=cg_exl_idx & annot_idx & (test_y==0)
test_idx_1=cg_exl_idx & annot_idx & (test_y==1)

### Reshape x data

In [5]:
# Make it image-like: every sample becomes a block of shape (1, 1000, 1),
# which is the layout the 2-D convolution layers are given as input_shape.
train_x=train_x.reshape((-1,1,1000,1))
test_x=test_x.reshape((-1,1,1000,1))

### Build Convnet

In [6]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D,MaxPooling2D

# Hyper-parameters of the network.
input_dim=train_x.shape[2]
activation='relu'
loss='binary_crossentropy'
optimizer='adadelta'
# NOTE: `init` is not passed to any layer, so every layer uses its own
# default initialisation.
init='uniform'
pool_size=(8,1)
window_size=5
dense_n=64


def build_convnet(input_dim,window_size=5,pool_size=(8,1),dense_n=64,
                  activation='relu',loss='binary_crossentropy',
                  optimizer='adadelta'):
    """Build and compile a fresh, untrained two-stage convnet.

    Architecture: (convolution -> activation -> max-pooling) twice, then a
    flattened dense layer and a single sigmoid output unit for binary
    classification. Dropout layers are not part of the network.

    Parameters
    ----------
    input_dim : length of the sequence axis; the input shape of one sample
        is (1, input_dim, 1).
    window_size : extent of the convolution kernels along the sequence axis
        (the kernels are window_size x 1).
    pool_size : pooling window of both max-pooling layers.
    dense_n : number of units in the dense layer.
    activation : activation used after both convolutions and in the dense
        layer.
    loss, optimizer : forwarded to model.compile.

    Returns
    -------
    The compiled Sequential model. Every call creates new layers, so the
    returned model shares no weights with models built earlier.
    """
    net = Sequential()

    #Convolution layer 1: 20 filters
    net.add(Convolution2D(20,window_size,1, border_mode='valid',
                          input_shape=(1,input_dim,1)))
    net.add(Activation(activation))
    net.add(MaxPooling2D(pool_size=pool_size))

    #Convolution layer 2: 50 filters
    net.add(Convolution2D(50,window_size,1, border_mode='valid'))
    net.add(Activation(activation))
    net.add(MaxPooling2D(pool_size=pool_size))

    #Dense layer
    net.add(Flatten())
    net.add(Dense(dense_n,activation=activation))

    #final layer
    net.add(Dense(1, activation='sigmoid'))

    #compile model
    net.compile(loss=loss,optimizer=optimizer,class_mode='binary')
    return net


model = build_convnet(input_dim,window_size=window_size,pool_size=pool_size,
                      dense_n=dense_n,activation=activation,loss=loss,
                      optimizer=optimizer)

Using Theano backend.


### Train and test model

In [16]:
from keras.callbacks import ModelCheckpoint,EarlyStopping

def fit_keras_model(model,train_x,train_y,test_x,test_y,validation_split=0.05,
                    nb_epoch=100,patience=7,checkpoint_path='best_model'):
    """Train `model` with checkpointing and early stopping, then score it.

    The model is fitted on (train_x, train_y); model.fit holds out
    `validation_split` of that data for validation. After training, the
    weights stored in the checkpoint file are loaded back into the model,
    and class predictions are made for the train and the test inputs.
    The fraction of correct predictions on both sets and the elapsed time
    are printed.

    Parameters
    ----------
    model : compiled Keras model; it is trained in place.
    train_x, train_y : training inputs and labels.
    test_x, test_y : inputs and labels used only for the final scoring.
    validation_split : forwarded to model.fit.
    nb_epoch : maximum number of training epochs.
    patience : forwarded to the EarlyStopping callback.
    checkpoint_path : file the ModelCheckpoint callback writes to (with
        save_best_only=True) and the weights are reloaded from. An existing
        file of that name is overwritten.

    Returns
    -------
    (train_pred, test_pred) : flattened arrays of predicted classes.
    """
    start=time.time()

    #callbacks
    best_model=ModelCheckpoint(checkpoint_path,save_best_only=True,verbose=1)
    early_stop=EarlyStopping(patience=patience,verbose=1)

    #train it
    model.fit(train_x,train_y,nb_epoch=nb_epoch,
              show_accuracy=True,verbose=1,
              validation_split=validation_split,
              callbacks=[best_model,early_stop])

    #predict with the checkpointed weights rather than the last-epoch ones
    model.load_weights(checkpoint_path)
    train_pred=model.predict_classes(train_x).ravel()
    test_pred=model.predict_classes(test_x).ravel()

    #check errors: fraction of correctly predicted labels
    #(single-argument print calls give the same output on Python 2 and 3)
    print('train score: %s' % np.mean(train_pred==train_y))
    print('test score: %s' % np.mean(test_pred==test_y))

    print('It took: %s' % (time.time()-start))
    return train_pred,test_pred

### All labels

In [9]:
# Sample counts of the complete (unfiltered) train and test sets.
N_train=train_x.shape[0]
N_test=test_x.shape[0]

# Fit and score on the selected island samples only, with the classes in
# their natural proportion.
train_pred,test_pred=fit_keras_model(model,
                                     train_x[train_idx],train_y[train_idx],
                                     test_x[test_idx],test_y[test_idx])

Train on 81434 samples, validate on 4287 samples
Epoch 1/100
Epoch 00000: val_loss improved from inf to 0.36029, saving model to best_model
Epoch 2/100
Epoch 00001: val_loss improved from 0.36029 to 0.29447, saving model to best_model
Epoch 3/100
Epoch 00002: val_loss improved from 0.29447 to 0.28997, saving model to best_model
Epoch 4/100
Epoch 00003: val_loss did not improve
Epoch 5/100
Epoch 00004: val_loss improved from 0.28997 to 0.26687, saving model to best_model
Epoch 6/100
Epoch 00005: val_loss did not improve
Epoch 7/100
Epoch 00006: val_loss improved from 0.26687 to 0.25854, saving model to best_model
Epoch 8/100
Epoch 00007: val_loss improved from 0.25854 to 0.21578, saving model to best_model
Epoch 9/100
Epoch 00008: val_loss did not improve
Epoch 10/100
Epoch 00009: val_loss did not improve
Epoch 11/100
Epoch 00010: val_loss improved from 0.21578 to 0.20582, saving model to best_model
Epoch 12/100
Epoch 00011: val_loss improved from 0.20582 to 0.20443, saving model to bes

In [10]:
# For 0/1 labels, 1 - mean is the share of class 0. Printed first for the true
# labels (train, test), then for the predictions (train, test).
print('\ndata and prediction balance:')
for labels in (train_y[train_idx],test_y[test_idx]):
    print(1-np.mean(labels))

for labels in (train_pred,test_pred):
    print(1-np.mean(labels))


data and prediction balance:
0.871303414566
0.871566632757
0.900479462442
0.903865717192


### Balance labels

In [26]:
# Keep the same number of samples from each class. Capping both classes at
# the size of the smaller one gives a 50/50 split whichever class is the
# majority (with class 0 as the majority this keeps every class-1 sample).
# NOTE(review): the samples kept from the larger class are simply the first
# ones in file order, not a random draw - confirm this is intended.
n_train_per_class=min(np.sum(train_idx_0),np.sum(train_idx_1))
n_test_per_class=min(np.sum(test_idx_0),np.sum(test_idx_1))

bal_train_x=np.concatenate([train_x[train_idx_0][:n_train_per_class],
                            train_x[train_idx_1][:n_train_per_class]])
bal_train_y=np.concatenate([train_y[train_idx_0][:n_train_per_class],
                            train_y[train_idx_1][:n_train_per_class]])

bal_test_x=np.concatenate([test_x[test_idx_0][:n_test_per_class],
                           test_x[test_idx_1][:n_test_per_class]])
bal_test_y=np.concatenate([test_y[test_idx_0][:n_test_per_class],
                           test_y[test_idx_1][:n_test_per_class]])


#shuffle them: after the concatenation all class-0 samples come first
#the seed makes the shuffling order reproducible
rng=np.random.RandomState(42)
new_idx=rng.permutation(len(bal_train_y))
bal_train_x=bal_train_x[new_idx]
bal_train_y=bal_train_y[new_idx]

new_idx=rng.permutation(len(bal_test_y))
bal_test_x=bal_test_x[new_idx]
bal_test_y=bal_test_y[new_idx]

#sanity check: both means should be 0.5
print(np.mean(bal_train_y))
print(np.mean(bal_test_y))

0.5
0.5


In [37]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D,MaxPooling2D

# Fresh, untrained model for the run on the balanced data: same
# hyper-parameters and layer sequence as the network built above.
input_dim=train_x.shape[2]
activation,loss,optimizer,init='relu','binary_crossentropy','adadelta','uniform'
pool_size,window_size,dense_n=(8,1),5,64

# Layers in the order they are stacked. The dropout layers (0.25 after each
# pooling stage, 0.5 after the dense layer) are left out.
layer_stack=[
    #Convolution layer 1
    Convolution2D(20,window_size,1, border_mode='valid',input_shape=(1,input_dim,1)),
    Activation(activation),
    MaxPooling2D(pool_size=pool_size),
    #Convolution layer 2
    Convolution2D(50,window_size,1, border_mode='valid'),
    Activation(activation),
    MaxPooling2D(pool_size=pool_size),
    #Dense layer
    Flatten(),
    Dense(dense_n,activation=activation),
    #final layer
    Dense(1, activation='sigmoid'),
]

model = Sequential()
for layer in layer_stack:
    model.add(layer)

#compile model
model.compile(loss=loss,optimizer=optimizer,class_mode='binary')

In [38]:
# Fit and score on the class-balanced sets; 20% of the balanced training
# data is handed to model.fit as the validation split.
train_pred,test_pred=fit_keras_model(model,
                                     bal_train_x,bal_train_y,
                                     bal_test_x,bal_test_y,
                                     validation_split=0.2)

# Mean of the predicted classes, i.e. the share of samples predicted as 1.
print('\nprediction balance:')
for pred in (train_pred,test_pred):
    print(np.mean(pred))

Train on 17651 samples, validate on 4413 samples
Epoch 1/100
Epoch 00000: val_loss improved from inf to 0.69360, saving model to best_model
Epoch 2/100
Epoch 00001: val_loss improved from 0.69360 to 0.63448, saving model to best_model
Epoch 3/100
Epoch 00002: val_loss improved from 0.63448 to 0.58496, saving model to best_model
Epoch 4/100
Epoch 00003: val_loss did not improve
Epoch 5/100
Epoch 00004: val_loss improved from 0.58496 to 0.55746, saving model to best_model
Epoch 6/100
Epoch 00005: val_loss did not improve
Epoch 7/100
Epoch 00006: val_loss improved from 0.55746 to 0.55508, saving model to best_model
Epoch 8/100
Epoch 00007: val_loss improved from 0.55508 to 0.51213, saving model to best_model
Epoch 9/100
Epoch 00008: val_loss improved from 0.51213 to 0.50228, saving model to best_model
Epoch 10/100
Epoch 00009: val_loss did not improve
Epoch 11/100
Epoch 00010: val_loss improved from 0.50228 to 0.49315, saving model to best_model
Epoch 12/100
Epoch 00011: val_loss did not 