<a href="https://colab.research.google.com/github/solislemuslab/dna-nn-theory/blob/master/scripts/ChIP_seq_toy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Version history:
- v1: baseline model with a single Conv2D layer

# Setup

In [None]:
# Install Biopython quietly (-q). Presumably required by `dna_nn.load` for FASTA parsing — TODO confirm.
!pip install -q biopython

[?25l[K     |▏                               | 10kB 18.2MB/s eta 0:00:01[K     |▎                               | 20kB 4.6MB/s eta 0:00:01[K     |▍                               | 30kB 6.0MB/s eta 0:00:01[K     |▋                               | 40kB 5.8MB/s eta 0:00:01[K     |▊                               | 51kB 5.1MB/s eta 0:00:01[K     |▉                               | 61kB 5.6MB/s eta 0:00:01[K     |█                               | 71kB 6.2MB/s eta 0:00:01[K     |█▏                              | 81kB 6.7MB/s eta 0:00:01[K     |█▎                              | 92kB 7.0MB/s eta 0:00:01[K     |█▌                              | 102kB 7.0MB/s eta 0:00:01[K     |█▋                              | 112kB 7.0MB/s eta 0:00:01[K     |█▊                              | 122kB 7.0MB/s eta 0:00:01[K     |█▉                              | 133kB 7.0MB/s eta 0:00:01[K     |██                              | 143kB 7.0MB/s eta 0:00:01[K     |██▏                       

In [None]:
# Mount Google Drive at /content/drive (Colab-only) so the paths below resolve.
from google.colab import drive
drive.mount('/content/drive')
# Directory holding the train/valid/test FASTA files read when the datasets are created.
DATA_DIR = '/content/drive/My Drive/data/ChIP-seq-toy/'
# Output directory for logs / saved models; only referenced by the (currently
# commented-out) training callbacks.
LOG_DIR = '/content/drive/My Drive/dna-nn/ChIP-seq-toy/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Copy the project-local `dna_nn` package from Drive into the working directory
# so that `from dna_nn.load import ...` works, then list its contents as a sanity check.
!cp -r '/content/drive/My Drive/dna_nn' .
!ls dna_nn

download.py  load.py


In [None]:
import gc

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.data import Dataset

from dna_nn.load import encode, encoded_shape, gen_from_fasta

%load_ext autoreload
%autoreload 2

# Create dataset

In [None]:
# generators
word_size = 1
region_size = 0

def cut(x, length=50):
    """Truncate a sequence to its first `length` items.

    Args:
        x: Any sliceable sequence (e.g. a string or list).
        length: Maximum number of leading items to keep. Defaults to 50,
            the value that was previously hard-coded.

    Returns:
        ``x[:length]``; sequences shorter than `length` are returned whole.
    """
    return x[:length]

# Shape of one encoded example as reported by `encoded_shape` for a
# 101-item dummy sequence; only the trailing dimensions are kept below.
x_shape = encoded_shape(range(101), word_size, region_size)
encode_func = encode(word_size, region_size)

# Sequences are truncated to 50 items by `cut` before encoding, so the leading
# dimension is overridden accordingly (keep this 50 in sync with `cut`).
x_shape = (50 - word_size + 1, ) + x_shape[1:]

# Each generator applies `cut` and then `encode_func` to the records of its file.
train_file = DATA_DIR + 'train.fasta'
val_file = DATA_DIR + 'valid.fasta'
test_file = DATA_DIR + 'test.fasta'
train_gen = gen_from_fasta(train_file, [cut, encode_func])
val_gen = gen_from_fasta(val_file, [cut, encode_func])
test_gen = gen_from_fasta(test_file, [cut, encode_func])

# datasets
batch_size = 512
train_size = 5000  # number of training records used for this toy experiment
prefetch = tf.data.experimental.AUTOTUNE

# Every element is an (encoded sequence, scalar label) pair.
output_shapes = (x_shape, ())
output_types = (tf.float32, tf.float32)

train_ds = Dataset.from_generator(train_gen, output_types, output_shapes)
# The shuffle buffer spans the whole training subset so that shuffling is
# uniform; a smaller buffer only shuffles locally and would leave batches
# skewed if the FASTA file is ordered (e.g. by label).
train_ds = (
    train_ds
    .take(train_size)
    .shuffle(train_size)
    .batch(batch_size)
    .prefetch(prefetch)
)

# Left unbatched on purpose: the validation set is materialised element by
# element in the next cell.
val_ds = Dataset.from_generator(val_gen, output_types, output_shapes)

test_ds = Dataset.from_generator(test_gen, output_types, output_shapes)
test_ds = test_ds.batch(batch_size).prefetch(prefetch)

print('x shape:', train_ds.element_spec[0].shape)
print('y shape:', train_ds.element_spec[1].shape)

x shape: (None, 50, 4, 1)
y shape: (None,)


In [None]:
%%time
# Materialise the unbatched validation set in memory once, so Keras can reuse
# the same tensors at the end of every epoch.
val_examples = list(val_ds)
x_val = tf.convert_to_tensor([features for features, _ in val_examples])
y_val = tf.convert_to_tensor([label for _, label in val_examples])
validation_data = (x_val, y_val)

CPU times: user 745 ms, sys: 83.6 ms, total: 828 ms
Wall time: 635 ms


# Build model

In [None]:
# Reset Keras' global state before (re)building the model in this cell.
keras.backend.clear_session()

# v1 architecture: one Conv2D block followed by a small dense classifier
# with a 2-way softmax output.
model = keras.Sequential()
model.add(keras.layers.Conv2D(128, 3, activation='relu', input_shape=x_shape))
model.add(keras.layers.MaxPooling2D())
model.add(keras.layers.Flatten())
model.add(keras.layers.Dense(64, activation='relu'))
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Dense(2, activation='softmax'))

optimizer = keras.optimizers.Adam()
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 48, 2, 128)        1280      
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 24, 1, 128)        0         
_________________________________________________________________
flatten (Flatten)            (None, 3072)              0         
_________________________________________________________________
dense (Dense)                (None, 64)                196672    
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 130       
Total params: 198,082
Trainable params: 198,082
Non-trainable params: 0
__________________________________________________

In [None]:
version = 'v1-base'

def scheduler(epoch, lr):
    """Step-decay learning-rate schedule: divide `lr` by 10 every 10 epochs.

    Intended for `keras.callbacks.LearningRateScheduler`, which passes the
    0-based epoch index and the current learning rate.

    Args:
        epoch: 0-based index of the epoch about to start.
        lr: Current learning rate.

    Returns:
        `lr / 10` at epochs 10, 20, 30, ...; otherwise `lr` unchanged.
    """
    # Epoch 0 is excluded explicitly: `0 % 10 == 0` would otherwise shrink the
    # initial learning rate before any training step has run.
    if epoch > 0 and epoch % 10 == 0:
        return lr / 10
    return lr

# Optional training callbacks. All of them are currently disabled, so the
# list is empty; uncomment individual entries to enable CSV / TensorBoard
# logging, per-epoch garbage collection with a final model save, the
# learning-rate schedule, or early stopping.
callbacks = [
    # keras.callbacks.CSVLogger(LOG_DIR + f'{version}.csv'),
    # keras.callbacks.TensorBoard(log_dir=LOG_DIR + f'log/{version}/'),
    # keras.callbacks.LambdaCallback(
    #     on_epoch_end=lambda epoch, logs: gc.collect(),
    #     on_train_end=lambda logs: model.save(LOG_DIR + f'{version}.h5')
    # ),
    # keras.callbacks.LearningRateScheduler(scheduler),
    # keras.callbacks.EarlyStopping(monitor="accuracy", min_delta=0.01 ,patience=5)
]

# Train for 30 epochs, evaluating on the in-memory validation tensors after
# each epoch; `history` keeps the per-epoch metrics.
history = model.fit(
    train_ds,
    epochs=30,
    validation_data=validation_data,
    callbacks=callbacks,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [None]:
# Evaluate on the held-out test set; returns [loss, accuracy] as configured in `model.compile`.
model.evaluate(test_ds)



[0.5559330582618713, 0.7122902274131775]

In [None]:
m = keras.metrics.AUC()
# `predict` accepts the batched dataset directly and uses only the inputs of
# each (x, y) batch, so no hand-written generator is needed.
y_pred = model.predict(test_ds)
# Gather the labels batch by batch into a single array. `test_ds` is not
# shuffled, so the order matches the rows of `y_pred`.
y_true = np.concatenate([labels.numpy() for _, labels in test_ds])

In [None]:
m.reset_states()
# AUC must be computed from continuous scores, not hard class decisions:
# feeding `argmax` output collapses the ROC curve to a single operating point.
# Use the softmax probability of class 1 (second output column) as the score.
# NOTE(review): assumes labels are 0/1 with 1 as the positive class — confirm.
m.update_state(y_true, y_pred[:, 1])
m.result()

<tf.Tensor: shape=(), dtype=float32, numpy=0.71232647>