In [1]:
# Install Biopython, which provides the `Bio` package imported further down
# (-q keeps pip's output short).
!pip install -q biopython

[?25l[K     |▏                               | 10kB 29.7MB/s eta 0:00:01[K     |▎                               | 20kB 33.3MB/s eta 0:00:01[K     |▍                               | 30kB 22.6MB/s eta 0:00:01[K     |▋                               | 40kB 18.0MB/s eta 0:00:01[K     |▊                               | 51kB 14.1MB/s eta 0:00:01[K     |▉                               | 61kB 12.2MB/s eta 0:00:01[K     |█                               | 71kB 11.1MB/s eta 0:00:01[K     |█▏                              | 81kB 11.7MB/s eta 0:00:01[K     |█▎                              | 92kB 11.1MB/s eta 0:00:01[K     |█▌                              | 102kB 11.1MB/s eta 0:00:01[K     |█▋                              | 112kB 11.1MB/s eta 0:00:01[K     |█▊                              | 122kB 11.1MB/s eta 0:00:01[K     |█▉                              | 133kB 11.1MB/s eta 0:00:01[K     |██                              | 143kB 11.1MB/s eta 0:00:01[K     |██▏          

In [2]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem; the first run in a session
# prompts for an authorization code.
drive.mount('/content/drive')
# Drive folder holding the ChIP-seq FASTA files that later cells read.
DATA_DIR = '/content/drive/My Drive/data/ChIP-seq/'

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [3]:
# Copy the project's dna_nn package from Drive into the working directory so
# that `from dna_nn.load import ...` resolves, then list what was copied.
!cp -r '/content/drive/My Drive/dna_nn' .
!ls dna_nn

download.py  load.py


In [4]:
from itertools import product

from Bio import SeqIO
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.data import Dataset

from dna_nn.load import gen_from_fasta

In [5]:
# Vocabulary: every length-1 string over the DNA alphabet, i.e. the four bases.
vocab = [' '.join(kmer) for kmer in product('ACGT', repeat=1)]

# The standardizer appends a space after every character so that the layer's
# whitespace splitting yields one token per base; ngrams=1 keeps unigrams only.
vectorize_layer = keras.layers.experimental.preprocessing.TextVectorization(
    standardize=lambda seq: tf.strings.regex_replace(seq, '(.)', '\\1 '),
    ngrams=1,
)

# Learn the token -> integer id mapping from the vocabulary itself.
vectorize_layer.adapt(vocab)

In [6]:
def vectorize_text(x, y):
    """Replace the raw sequence text with its token ids; pass the label through.

    Args:
        x: string tensor (a single sequence or a batch of them) fed to the
            module-level ``vectorize_layer``.
        y: label tensor, returned untouched.

    Returns:
        Tuple ``(vectorize_layer(x), y)``.
    """
    return vectorize_layer(x), y

In [7]:
# generators
# Number of leading records of the training FASTA used for fitting.
subset_size = 690 * 190
# Number of records, taken right after the training subset, held out for validation.
val_size = 690 * 10

train_file = DATA_DIR + 'motif_discovery_train.fasta'
test_file = DATA_DIR + 'motif_discovery_test.fasta'
# NOTE(review): gen_from_fasta presumably returns a zero-argument callable that
# yields (sequence, label) pairs, as Dataset.from_generator requires; the meaning
# of the second argument (None) is not visible here -- confirm in dna_nn/load.py.
train_gen = gen_from_fasta(train_file, None)
test_gen = gen_from_fasta(test_file, None)

# datasets
batch_size = 512
bacth_size = batch_size  # misspelled legacy name, kept so any other cell using it still works
prefetch = tf.data.experimental.AUTOTUNE

# Each element is a (scalar string, scalar float32) pair.
output_shapes = ((), ())
output_types = (tf.string, tf.float32)

raw_train_ds = Dataset.from_generator(train_gen, output_types, output_shapes)

# Validation split: the val_size records following the training subset.
# Left unbatched on purpose -- the next cell collects it element by element.
val_ds = (
    raw_train_ds
    .skip(subset_size)
    .take(val_size)
    .map(vectorize_text)
    .prefetch(prefetch)
)

# Training split: shuffle individual records, then batch, then vectorize per batch.
train_ds = (
    raw_train_ds
    .take(subset_size)
    .shuffle(500)
    .batch(batch_size)
    .map(vectorize_text)
    .prefetch(prefetch)
)

# Test split: same batching and vectorization, no shuffling.
test_ds = (
    Dataset.from_generator(test_gen, output_types, output_shapes)
    .take(subset_size)
    .batch(batch_size)
    .map(vectorize_text)
    .prefetch(prefetch)
)

print('x shape:', train_ds.element_spec[0].shape)
print('y shape:', train_ds.element_spec[1].shape)

x shape: (None, None)
y shape: (None,)


In [8]:
%%time
x_val, y_val = [], []
for d in val_ds:
    x_val.append(d[0])
    y_val.append(d[1])
x_val = tf.convert_to_tensor(x_val)
y_val = tf.convert_to_tensor(y_val)
validation_data = (x_val, y_val)

CPU times: user 30.6 s, sys: 3.11 s, total: 33.7 s
Wall time: 36.9 s


In [11]:
# Drop any graph/layer state left over from earlier model-building runs so that
# layer names (embedding, conv2d, ...) start from scratch.
keras.backend.clear_session()

# Embedding -> 2D convolution -> dense classifier over 101-token sequences,
# ending in a 2-way softmax.
#
# NOTE(review): Embedding.input_dim is the vocabulary size. The vectorizer was
# adapted on the four bases only, so far fewer than 101 ids can occur; 101 is
# kept here so the parameter counts match the recorded summary -- consider
# shrinking it.
# NOTE(review): after the Reshape the tensor is (101, 128, 1), so the (1, 31)
# kernel and the (1, 31) pooling window slide along the 128-wide embedding axis,
# one sequence position at a time. Confirm this is intended rather than a
# convolution along the sequence axis.
model = keras.Sequential([
    keras.Input(shape=(101,)),  # one-element tuple; `(101)` is just the int 101
    keras.layers.Embedding(input_dim=101, output_dim=128),
    keras.layers.Reshape((101, 128, 1)),
    keras.layers.Convolution2D(128, (1, 31), padding='same', activation='relu'),
    keras.layers.MaxPooling2D(pool_size=(1, 31)),
    keras.layers.Flatten(),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(2, activation='softmax')
])
# Labels are class indices (not one-hot), hence the sparse loss; metrics is
# passed as a list, the documented form.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 101, 128)          12928     
_________________________________________________________________
reshape (Reshape)            (None, 101, 128, 1)       0         
_________________________________________________________________
conv2d (Conv2D)              (None, 101, 128, 128)     4096      
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 101, 4, 128)       0         
_________________________________________________________________
flatten (Flatten)            (None, 51712)             0         
_________________________________________________________________
dropout (Dropout)            (None, 51712)             0         
_________________________________________________________________
dense (Dense)                (None, 32)                1

In [12]:
# Train for 5 epochs and evaluate on the held-out validation tensors after each
# epoch; `validation_data` is the (x_val, y_val) pair built in an earlier cell
# and was previously never used.
history = model.fit(train_ds, validation_data=validation_data, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
