In [1]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Fix NumPy's global RNG so the shuffle and the synthetic arrays created
# further down are repeatable across kernel restarts.
np.random.seed(seed=12)

## Load dna2vec

In [3]:
# MultiKModel is the dna2vec loader used to look up k-mer vectors; it is
# accessed later through ``mk_model.vector`` inside create_input().
from dna2vec.multi_k_model import MultiKModel

# Relative path to the saved embedding file.
# NOTE(review): the file name suggests k-mers of length 3..8 and 100-d vectors
# (create_input() reshapes to 100) — confirm against the dna2vec release used.
filepath = 'dna2vec-20161219-0153-k3to8-100d-10c-29320Mbp-sliding-Xat.w2v'
# Kept as a module-level name because create_input() reads it as a global.
mk_model = MultiKModel(filepath)

In [4]:
# Read the training frame (id, sequence, label) and the test frame
# (id, sequence) from CSV files given by relative path, in that order.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

## Shuffle data

In [5]:
train = (
    train
    .sample(frac=1)           # draw every row once, in random order
    .reset_index(drop=True)   # renumber 0..n-1 and discard the old index
)

In [6]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,id,sequence,label
0,369,GGCGGTGGAGGCTC,1
1,1342,TTTTTGTATTTTTA,0
2,520,CCCGAAAACAGGAC,1
3,1636,CCAGCTAATTTTTG,0
4,1982,CATGATTGGGTTCA,0


In [7]:
# Preview the first five rows of the test frame.
test.head(n=5)

Unnamed: 0,id,sequence
0,0,GCGGGGCGAGCCTC
1,1,CCAGAGTCCGATTG
2,2,GATCCTGGCTGGGA
3,3,ATCCTCCCACCTCA
4,4,TGTAGGTGATGTGC


In [8]:
def split_input(data, kmer=4):
    """Split each DNA sequence into consecutive, non-overlapping k-mers.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``id`` and ``sequence`` columns. A ``label`` column is
        carried over when present and left out otherwise, so the helper works
        for both labelled and unlabelled frames.
    kmer : int, default 4
        Chunk length. The last chunk of a sequence is shorter when the
        sequence length is not a multiple of ``kmer``.

    Returns
    -------
    pandas.DataFrame
        ``id``, then one ``sequence_<i>`` column per chunk position (rows with
        fewer chunks hold missing values in the trailing columns), then
        ``label`` if the input had one. The input's index is preserved.
    """
    chunks = [
        [seq[start:start + kmer] for start in range(0, len(seq), kmer)]
        for seq in data['sequence']
    ]
    # Build the chunk frame on the caller's index: pd.concat(axis=1) aligns on
    # index labels, so a fresh 0..n-1 index would misalign rows whenever
    # ``data`` carries any other index.
    pieces = pd.DataFrame(chunks, index=data.index).add_prefix('sequence_')
    parts = [data['id'], pieces]
    if 'label' in data.columns:
        parts.append(data['label'])
    return pd.concat(parts, axis=1)

In [9]:
def create_input(data):
    """Embed every cell of ``data`` and return a 3-D float64 array.

    Each cell is passed to ``mk_model.vector`` and the resulting vectors are
    arranged as ``(n_rows, n_cols, 100)``, rows and columns in the same order
    as ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Every cell must be a value ``mk_model.vector`` accepts.
        NOTE(review): presumably only the ``sequence_*`` k-mer columns should
        be passed in (not ``id``/``label``) — confirm at the call site.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(data.shape[0], data.shape[1], 100)``.
    """
    n_rows, n_cols = data.shape
    # Row-major flattening visits cells in the same order as iterating rows
    # and then the values within each row.
    vectors = [mk_model.vector(cell) for cell in data.values.ravel()]
    if not vectors:
        # Nothing to embed: return an empty array of the right rank.
        return np.zeros((n_rows, n_cols, 100))
    # Build the array once; appending with np.concatenate per cell re-copies
    # everything accumulated so far and is quadratic in the number of cells.
    return np.asarray(vectors, dtype=np.float64).reshape(n_rows, n_cols, 100)

In [10]:
# Labels as a NumPy array. ``.values`` is used because ``Series.as_matrix()``
# was deprecated in pandas 0.23 and removed in pandas 1.0.
y_train = train['label'].values

In [11]:
def dna_model(steps=2, data_dim=32):
    """Build and compile a stacked-LSTM binary classifier.

    Three LSTM layers of 32 units each (the first two return full sequences)
    feed a single sigmoid output unit. The model is compiled with binary
    cross-entropy loss, the Adam optimizer and an accuracy metric.

    Parameters
    ----------
    steps : int, default 2
        Number of time steps per sample (first entry of ``input_shape``).
    data_dim : int, default 32
        Number of features per time step (second entry of ``input_shape``).

    Returns
    -------
    keras.models.Sequential
        The compiled model.
    """
    layers = [
        LSTM(32, return_sequences=True, input_shape=(steps, data_dim)),
        LSTM(32, return_sequences=True),
        LSTM(32),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layers)
    network.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

In [12]:

# Synthetic training set: 1000 samples of shape (2, 32) with uniform values
# in [0, 1) and random 0/1 labels.
# NOTE: this rebinds y_train, replacing the labels taken from train['label']
# in the earlier cell.
x_train = np.random.random(size=(1000, 2, 32))
y_train = np.random.choice(2, size=1000)

# Synthetic validation set: 100 samples drawn the same way.
x_val = np.random.random(size=(100, 2, 32))
y_val = np.random.choice(2, size=100)

In [13]:
# Wrap the Keras model factory so scikit-learn utilities can drive it:
# 5 epochs per fit, mini-batches of 32, no training log output.
estimator = KerasClassifier(
    build_fn=dna_model,
    epochs=5,
    batch_size=32,
    verbose=0,
)

In [14]:
# 10-fold splitter; samples are shuffled with a fixed seed so the fold
# assignment is the same on every run.
kfold = KFold(
    n_splits=10,
    shuffle=True,
    random_state=12,
)

In [15]:
# Score the wrapped model on each of the folds, then report the mean and
# standard deviation of the per-fold scores as percentages.
# NOTE: x_train / y_train are the randomly generated arrays from the cell
# above, so the score reflects random inputs and random labels.
results = cross_val_score(estimator=estimator, X=x_train, y=y_train, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (100 * results.mean(), 100 * results.std()))

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Baseline: 49.30% (3.52%)
