In [4]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt

import innvestigate.utils as iutils
import innvestigate.utils.tests.networks.imagenet
import innvestigate.utils.visualizations as ivis
import sys



from keras.models import Model
from keras.layers import Input
from keras.layers.merge import Multiply
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution1D, AveragePooling1D

# Default size (width, height), in inches, for matplotlib figures created
# after this line.
plt.rcParams["figure.figsize"] = (17, 20)




In [26]:
def PREPROCESS(lines):
    """One-hot encode the sequences in ``lines`` and collect their second field.

    Parameters
    ----------
    lines : list of str
        Text rows. ``lines[0]`` is treated as a header and skipped. Every
        later row is split on whitespace: field 0 is the nucleotide
        sequence (only its first 34 characters are read) and field 1 is
        parsed with ``float``.

    Returns
    -------
    SEQ : numpy.ndarray, shape (n, 34, 4), dtype int
        One-hot encoding with channel order A, C, G, T (upper or lower
        case). A position holding any other character stays all-zero.
    CA : numpy.ndarray, shape (n, 1), dtype float
        The second field of every data row.
        NOTE(review): presumably the chromatin-accessibility value or the
        training target -- confirm against the layout of the input file.

    ``n`` is ``len(lines) - 1``; an empty ``lines`` yields empty arrays.

    Raises
    ------
    IndexError
        If a data row has fewer than two fields or its sequence is shorter
        than 34 characters.
    ValueError
        If the second field of a data row is not a valid float.
    """
    # Channel index for each accepted nucleotide letter.
    channel_of = {"A": 0, "a": 0,
                  "C": 1, "c": 1,
                  "G": 2, "g": 2,
                  "T": 3, "t": 3}

    # Clamp at zero so an empty input does not ask numpy for a negative dimension.
    data_n = max(len(lines) - 1, 0)
    # The file only does `import numpy as np`, so the array constructor must
    # be reached through `np`; a bare `zeros` is undefined on a fresh kernel.
    SEQ = np.zeros((data_n, 34, 4), dtype=int)
    CA = np.zeros((data_n, 1), dtype=float)

    # lines[0] is the header; `row` is the zero-based index of each data row.
    for row, line in enumerate(lines[1:]):
        data = line.split()
        seq = data[0]
        for i in range(34):
            channel = channel_of.get(seq[i])
            if channel is not None:
                SEQ[row, i, channel] = 1
        CA[row, 0] = float(data[1])

    return SEQ, CA

In [29]:
# Usage banner: the input-file description, one entry per output line.
# The last entry carries its own "\n" so a blank line follows the group.
print("\n".join([
    "Usage: python DeepCpf1.py input.txt output.txt",
    "input.txt must include 3 columns with single header row",
    "\t1st column: sequence index",
    "\t2nd column: 34bp target sequence",
    "\t3rd column: binary chromain information of the target sequence\n",
]))

# Requirements and availability notes, again closed by a blank line.
print("\n".join([
    "DeepCpf1 currently requires python=2.7.12, theano=0.7.0, keras=0.3.3",
    "DeepCpf1 available on GitHub requires pre-obtained binary chromatin information (DNase-seq narraow peak data from ENCODE)",
    "DeepCpf1 web tool, available at http://data.snu.ac.kr/DeepCpf1, provides entire pipeline including binary chromatin accessibility for 125 cell lines\n",
]))

# Exit unless sys.argv holds at least three entries.
# NOTE(review): nothing in the visible cells reads sys.argv[1] or sys.argv[2]
# afterwards -- confirm whether this guard is still wanted in the notebook.
if not len(sys.argv) >= 3:
    sys.exit()

# Sequence-only model: takes a (34, 4) input per sample and produces a single
# linear output.
print("Building models")
Seq_deepCpf1_Input_SEQ = Input(shape=(34,4))
# One 1-D convolution (80 filters, kernel size 5, ReLU) followed by average
# pooling with pool size 2, then flattened for the dense layers.
Seq_deepCpf1_C1 = Convolution1D(80, 5, activation='relu')(Seq_deepCpf1_Input_SEQ)
Seq_deepCpf1_P1 = AveragePooling1D(2)(Seq_deepCpf1_C1)
Seq_deepCpf1_F = Flatten()(Seq_deepCpf1_P1)
# Three ReLU dense layers (80, 40, 40 units), each preceded by dropout at
# rate 0.3.
Seq_deepCpf1_DO1= Dropout(0.3)(Seq_deepCpf1_F)
Seq_deepCpf1_D1 = Dense(80, activation='relu')(Seq_deepCpf1_DO1)
Seq_deepCpf1_DO2= Dropout(0.3)(Seq_deepCpf1_D1)
Seq_deepCpf1_D2 = Dense(40, activation='relu')(Seq_deepCpf1_DO2)
Seq_deepCpf1_DO3= Dropout(0.3)(Seq_deepCpf1_D2)
Seq_deepCpf1_D3 = Dense(40, activation='relu')(Seq_deepCpf1_DO3)
# Final dropout and a single linear unit as the output.
Seq_deepCpf1_DO4= Dropout(0.3)(Seq_deepCpf1_D3)
Seq_deepCpf1_Output = Dense(1, activation='linear')(Seq_deepCpf1_DO4)
Seq_deepCpf1 = Model(inputs=[Seq_deepCpf1_Input_SEQ], outputs=[Seq_deepCpf1_Output])

# Two-input model. The sequence branch below uses the same layer stack as the
# sequence-only model up to its last 40-unit dense layer, but with its own
# (unshared) layers.
DeepCpf1_Input_SEQ = Input(shape=(34,4))
DeepCpf1_C1 = Convolution1D(80, 5, activation='relu')(DeepCpf1_Input_SEQ)
DeepCpf1_P1 = AveragePooling1D(2)(DeepCpf1_C1)
DeepCpf1_F = Flatten()(DeepCpf1_P1)
DeepCpf1_DO1= Dropout(0.3)(DeepCpf1_F)
DeepCpf1_D1 = Dense(80, activation='relu')(DeepCpf1_DO1)
DeepCpf1_DO2= Dropout(0.3)(DeepCpf1_D1)
DeepCpf1_D2 = Dense(40, activation='relu')(DeepCpf1_DO2)
DeepCpf1_DO3= Dropout(0.3)(DeepCpf1_D2)
DeepCpf1_D3_SEQ = Dense(40, activation='relu')(DeepCpf1_DO3)

# Second branch: a single scalar input per sample, expanded to 40 units so it
# can be multiplied element-wise with the 40-unit sequence features.
# NOTE(review): the "CA" name and the printed usage text suggest this is the
# chromatin-accessibility flag -- confirm.
DeepCpf1_Input_CA = Input(shape=(1,))
DeepCpf1_D3_CA = Dense(40, activation='relu')(DeepCpf1_Input_CA)
DeepCpf1_M = Multiply()([DeepCpf1_D3_SEQ, DeepCpf1_D3_CA])

# Dropout on the combined features, then a single linear output unit.
DeepCpf1_DO4= Dropout(0.3)(DeepCpf1_M)
DeepCpf1_Output = Dense(1, activation='linear')(DeepCpf1_DO4)
DeepCpf1 = Model(inputs=[DeepCpf1_Input_SEQ, DeepCpf1_Input_CA], outputs=[DeepCpf1_Output])

#print("Loading weights for the models")
#Seq_deepCpf1.load_weights('weights/Seq_deepCpf1_weights.h5')
#DeepCpf1.load_weights('weights/DeepCpf1_weights.h5')

print("Loading training data")
# Read all rows of training.txt. The context manager closes the file even if
# reading raises, which the former manual open()/close() pair did not
# guarantee. FILE stays bound (as a closed file object) after the block.
with open("training.txt", "r") as FILE:
    data = FILE.readlines()
# Encode the rows: SEQ is the one-hot sequence array, CA the per-row float
# taken from the second field.
SEQ, CA = PREPROCESS(data)

print("Compiling Model")
# Only the sequence-only model is compiled in this cell; DeepCpf1 is built
# above but not compiled here.
Seq_deepCpf1.compile(optimizer='adam', loss='mean_squared_error')

#print("Training on training data")
#Seq_deepCpf1.fit(SEQ, CA, batch_size=50, verbose=0)
#DeepCpf1_SCORE = DeepCpf1.predict([SEQ, CA], batch_size=50, verbose=0) * 3

Usage: python DeepCpf1.py input.txt output.txt
input.txt must include 3 columns with single header row
	1st column: sequence index
	2nd column: 34bp target sequence
	3rd column: binary chromain information of the target sequence

DeepCpf1 currently requires python=2.7.12, theano=0.7.0, keras=0.3.3
DeepCpf1 available on GitHub requires pre-obtained binary chromatin information (DNase-seq narraow peak data from ENCODE)
DeepCpf1 web tool, available at http://data.snu.ac.kr/DeepCpf1, provides entire pipeline including binary chromatin accessibility for 125 cell lines

Building models


Loading training data
Compiling Model
Training on training data




<keras.callbacks.History at 0x261542e8>

In [46]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import svm

# Hold out 10% of the encoded samples as a validation set; the fixed
# random_state makes the shuffled split reproducible across runs.
SEQ_train, SEQ_validation, CA_train, CA_validation = train_test_split(SEQ, CA, test_size=0.10, random_state=0)

# Disabled experiment: 5-fold cross-validation of a linear SVM on the
# training split.
#clf = svm.SVC(kernel='linear', C=1)
#scores = cross_val_score(clf, SEQ_train, CA_train, cv=5)
#print(scores)

# NOTE(review): the message below is printed but no training runs in this
# cell, because the fit call is commented out. That call also indexes with
# `validation`, a name not defined in any visible cell -- it presumably should
# use SEQ_train / CA_train; confirm before re-enabling.
print("Training on training data")
#Seq_deepCpf1.fit(SEQ[validation], CA[validation], batch_size=50, verbose=0)

Training on training data
