In [None]:
!pip install -q biopython

[K     |████████████████████████████████| 2.3MB 5.4MB/s 
[?25h

In [None]:
from google.colab import drive

# Folder on Google Drive that holds the ChIP-seq FASTA files read later on.
DATA_DIR = '/content/drive/My Drive/data/ChIP-seq/'

# Drive must be mounted into the Colab VM before anything under DATA_DIR is readable.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!cp -r '/content/drive/My Drive/dna_nn' .
!ls dna_nn

download.py  load.py


In [None]:
import gc

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.data import Dataset

from dna_nn.load import encode, encoded_shape, gen_from_fasta

# Re-import changed modules automatically before each cell executes, so edits
# to the copied `dna_nn` sources are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Conv2d

In [None]:
# generators
# Encoder settings: both values are handed to `encoded_shape` and `encode` below.
word_size, region_size = 3, 0
# Number of leading records taken from each FASTA stream (train and test);
# the validation records are the ones that follow the training subset.
subset_size = 139 * 690

def cut(x, length=60):
    """Return the first `length` items of `x`.

    Args:
        x: Any sliceable sequence (presumably one sequence read from the FASTA
            file -- TODO confirm against `gen_from_fasta`).
        length: Number of leading items to keep. Defaults to 60, the value that
            used to be hard-coded, so existing one-argument calls are unchanged.

    Returns:
        `x[:length]`. Slicing never pads, so an input shorter than `length`
        comes back with all of its items.

    Note:
        The `x_shape` computation in this cell also assumes 60 items; keep the
        two in sync if a different `length` is ever used.
    """
    return x[:length]

# Shape reported by the encoder helper for a 101-item input; only its trailing
# dimensions are reused below.
full_length_shape = encoded_shape(range(101), word_size, region_size)
encode_func = encode(word_size, region_size)

# `cut` keeps 60 items per record, so the leading dimension is recomputed as
# 60 - word_size + 1 instead of using the value for the 101-item input.
x_shape = (60 - word_size + 1,) + full_length_shape[1:]

# One generator per FASTA file; each is given the same [cut, encode_func]
# transform list (presumably applied in that order -- confirm in gen_from_fasta).
train_file = DATA_DIR + 'motif_discovery_train.fasta'
test_file = DATA_DIR + 'motif_discovery_test.fasta'
train_gen, test_gen = (
    gen_from_fasta(fasta_path, [cut, encode_func])
    for fasta_path in (train_file, test_file)
)

# datasets
batch_size = 512
# Misspelled legacy name, kept as an alias so any cell that still refers to it
# keeps working.
bacth_size = batch_size
# Number of records held out for validation; they are read immediately after
# the first `subset_size` training records.
val_size = 690 * 10
# NOTE(review): the shuffle buffer is smaller than one batch, so every batch is
# drawn from a narrow sliding window of the file. Consider a larger buffer if
# the FASTA records are ordered (e.g. by experiment or label).
shuffle_buffer_size = 500
prefetch = tf.data.experimental.AUTOTUNE

# Each element is an (encoded sequence, scalar label) pair.
output_shapes = (x_shape, ())
output_types = (tf.float32, tf.float32)

train_ds = Dataset.from_generator(
    train_gen, output_types=output_types, output_shapes=output_shapes)
# takes about 30 seconds to skip the training data
val_ds = train_ds.skip(subset_size).take(val_size)
train_ds = (
    train_ds.take(subset_size)
    .shuffle(shuffle_buffer_size)
    .batch(batch_size)
    .prefetch(prefetch)
)

test_ds = Dataset.from_generator(
    test_gen, output_types=output_types, output_shapes=output_shapes)
test_ds = test_ds.take(subset_size).batch(batch_size).prefetch(prefetch)

print('x shape:', train_ds.element_spec[0].shape)
print('y shape:', train_ds.element_spec[1].shape)

x shape: (None, 58, 64, 1)
y shape: (None,)


In [None]:
%%time
x_val, y_val = [], []
for d in val_ds:
    x_val.append(d[0])
    y_val.append(d[1])
x_val = tf.convert_to_tensor(x_val)
y_val = tf.convert_to_tensor(y_val)
validation_data = (x_val, y_val)

CPU times: user 36.1 s, sys: 2.62 s, total: 38.7 s
Wall time: 42.1 s


In [None]:
# Reset Keras' global state so re-running this cell starts from a fresh graph
# and the auto-generated layer names do not keep incrementing.
keras.backend.clear_session()

# Single Conv2D block followed by a small dense head with a 2-way softmax.
# NOTE(review): the two Dropout(0.5) layers are separated only by Flatten (a
# reshape), so their effects compound on the same activations -- confirm this
# is intended.
model = keras.Sequential([
    keras.layers.Conv2D(64, 3, activation='relu', input_shape=x_shape),
    keras.layers.MaxPooling2D(),
    keras.layers.Dropout(0.5),
    keras.layers.Flatten(),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(2, activation='softmax'),
])
# `metrics` is documented as a list; a bare string is rejected by older TF 2.x
# releases, so the list form is used for portability.
# The sparse loss expects class-index labels (presumably 0/1 here -- confirm
# against the values yielded by the FASTA generator).
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 56, 62, 64)        640       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 28, 31, 64)        0         
_________________________________________________________________
dropout (Dropout)            (None, 28, 31, 64)        0         
_________________________________________________________________
flatten (Flatten)            (None, 55552)             0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 55552)             0         
_________________________________________________________________
dense (Dense)                (None, 128)               7110784   
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 2

In [None]:
# Train for 15 epochs on the batched training dataset; the in-memory validation
# tensors are scored at the end of every epoch.
history = model.fit(x=train_ds, epochs=15, validation_data=validation_data)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [None]:
# Score the trained model on the test batches; as the cell's last expression,
# the returned [loss, accuracy] list is displayed.
model.evaluate(x=test_ds)



[0.6759299635887146, 0.5937753915786743]