In [None]:
!pip install -q biopython

%load_ext autoreload
%autoreload 2

from google.colab import drive
drive.mount('/content/drive')

[?25l[K     |▏                               | 10kB 11.7MB/s eta 0:00:01[K     |▎                               | 20kB 17.4MB/s eta 0:00:01[K     |▍                               | 30kB 21.6MB/s eta 0:00:01[K     |▋                               | 40kB 18.7MB/s eta 0:00:01[K     |▊                               | 51kB 17.8MB/s eta 0:00:01[K     |▉                               | 61kB 16.1MB/s eta 0:00:01[K     |█                               | 71kB 14.0MB/s eta 0:00:01[K     |█▏                              | 81kB 12.4MB/s eta 0:00:01[K     |█▎                              | 92kB 12.0MB/s eta 0:00:01[K     |█▌                              | 102kB 11.9MB/s eta 0:00:01[K     |█▋                              | 112kB 11.9MB/s eta 0:00:01[K     |█▊                              | 122kB 11.9MB/s eta 0:00:01[K     |██                              | 133kB 11.9MB/s eta 0:00:01[K     |██                              | 143kB 11.9MB/s eta 0:00:01[K     |██▏          

In [None]:
# Both directories sit under the Drive mount point (/content/drive).
# DATA_DIR: folder the dataset files are read from (e.g. 'splice.data').
# LOG_DIR: folder that receives the training logs, checkpoints and result CSVs.
DATA_DIR = '/content/drive/MyDrive/data/uci/'
LOG_DIR = '/content/drive/MyDrive/dna-nn/results/'

In [None]:
# Copy the project's `dna_nn` package from Drive into the working directory
# (it is imported from there below), then list its contents to confirm the copy.
!cp -r '/content/drive/MyDrive/dna-nn/dna_nn' .
!ls dna_nn

dataset.py  download.py  load.py  model.py


In [None]:
import gc

import matplotlib.pyplot as plt
import pandas as pd
from tensorflow import keras

from dna_nn.dataset import splice, splice_raw
from dna_nn.model import models, evaluate

# 2D features

In [None]:
# Load the splice dataset using the loader's default arguments; it returns the
# input shape together with the train/test features and labels.
splice_path = DATA_DIR + 'splice.data'
x_shape, x_train, x_test, y_train, y_test = splice(splice_path)

In [None]:
# Display `models` (imported from dna_nn.model); its keys are the names that
# can be assigned to `key` when building a model below.
models

In [None]:
# Architecture to build (a key of `models`) and the dataset tag; both are
# reused later to name the log and checkpoint files.
key = 'cnn_zeng_4_conv2d_l2'
dataset = 'splice'

# Clear Keras' global state so that re-running this cell starts from scratch.
keras.backend.clear_session()
# NOTE(review): the second argument (3) is presumably the number of output
# classes — confirm against dna_nn.model.
model = models[key](x_shape, 3)
# `metrics` is passed as a list, the form documented by `Model.compile`;
# a bare string is rejected by newer Keras releases.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

In [None]:
# Output files for this run, named after the model key and the dataset tag.
csv_path = LOG_DIR + f'{key}-{dataset}-dynamics.csv'
model_path = LOG_DIR + f'{key}-{dataset}.h5'

# Write the per-epoch metrics to `csv_path`.
epoch_logger = keras.callbacks.CSVLogger(csv_path)
# Run a garbage-collection pass at the end of every epoch.
gc_after_epoch = keras.callbacks.LambdaCallback(
    on_epoch_end=lambda epoch, logs: gc.collect()
)
# Keep only the best checkpoint at `model_path`; the final model is not
# saved separately at the end of training.
best_checkpoint = keras.callbacks.ModelCheckpoint(model_path, save_best_only=True)

callbacks = [epoch_logger, gc_after_epoch, best_checkpoint]

# Hold out 15% of the training data for validation.
history = model.fit(
    x_train,
    y_train,
    epochs=50,
    validation_split=0.15,
    callbacks=callbacks,
    verbose=3,
)

In [None]:
# Replace the in-memory model with the checkpoint saved during training and
# score it on the held-out test split.
model = keras.models.load_model(model_path)
test_loss, test_accuracy = model.evaluate(x_test, y_test, verbose=3)

# Model outputs for the test inputs, and the test labels as plain Python ints.
y_score = model.predict(x_test)
y_true = list(map(int, y_test))

# Hand everything to the project's `evaluate` helper (from dna_nn.model).
evaluate(
    model, history, test_accuracy, y_score, y_true,
    LOG_DIR, key, dataset, 3,
)

In [None]:
from sklearn.metrics import roc_curve, precision_recall_curve
import numpy as np

# Export one-vs-rest precision-recall curves for an already-trained model.
# Note: this cell overwrites `key`, `model_path` and `model` from the cells
# above, so it evaluates the checkpoint named below rather than the model
# that was just trained.
dataset = 'splice'
key = 'cnn_zeng_4_conv2d'
model_path = LOG_DIR + f'{key}-{dataset}.h5'

model = keras.models.load_model(model_path)
test_loss, test_accuracy = model.evaluate(x_test, y_test, verbose=3)

y_score = model.predict(x_test)
y_true = [int(y) for y in y_test]

# Build one frame per class and concatenate once at the end. Growing a frame
# with pd.concat inside the loop re-copies all accumulated rows every
# iteration, and concatenating onto an empty DataFrame is deprecated
# behaviour in recent pandas.
pr_frames = []
for cls in range(3):
    # Treat `cls` as the positive label and use its score column.
    precision, recall, thresholds = precision_recall_curve(y_true, y_score[:, cls], pos_label=cls)
    pr_frames.append(pd.DataFrame({
        'ovr': cls,
        'precision': precision,
        'recall': recall,
        # `thresholds` has one element fewer than precision/recall; pad with
        # NaN so that all columns have the same length.
        'thresholds': np.append(thresholds, np.nan)
    }))
pr = pd.concat(pr_frames)
pr.to_csv(LOG_DIR + f'{key}-{dataset}-pr.csv', index=False)

# 1D features

In [None]:
# Reload the splice dataset for the 1D-feature models, replacing the arrays
# loaded earlier. The three extra positional arguments are forwarded to
# `splice`; their meaning is defined in dna_nn.dataset (not visible here).
splice_path = DATA_DIR + 'splice.data'
x_shape, x_train, x_test, y_train, y_test = splice(splice_path, 3, 2, False)

In [None]:
# Display `models` (imported from dna_nn.model); its keys are the names that
# can be assigned to `key` when building a model below.
models

In [None]:
# Architecture to build (a key of `models`) and the dataset tag; both are
# reused later to name the log and checkpoint files.
key = 'deepram_recurrent_onehot'
dataset = 'splice'

# Clear Keras' global state so that re-running this cell starts from scratch.
keras.backend.clear_session()
# NOTE(review): the second argument (3) is presumably the number of output
# classes — confirm against dna_nn.model.
model = models[key](x_shape, 3)
# `metrics` is passed as a list, the form documented by `Model.compile`;
# a bare string is rejected by newer Keras releases.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

In [None]:
# Output files for this run, named after the model key and the dataset tag.
csv_path = LOG_DIR + f'{key}-{dataset}-dynamics.csv'
model_path = LOG_DIR + f'{key}-{dataset}.h5'

# Write the per-epoch metrics to `csv_path`.
epoch_logger = keras.callbacks.CSVLogger(csv_path)
# Run a garbage-collection pass at the end of every epoch.
gc_after_epoch = keras.callbacks.LambdaCallback(
    on_epoch_end=lambda epoch, logs: gc.collect()
)
# Keep only the best checkpoint at `model_path`; the final model is not
# saved separately at the end of training.
best_checkpoint = keras.callbacks.ModelCheckpoint(model_path, save_best_only=True)

callbacks = [epoch_logger, gc_after_epoch, best_checkpoint]

# Hold out 15% of the training data for validation.
history = model.fit(
    x_train,
    y_train,
    epochs=50,
    validation_split=0.15,
    callbacks=callbacks,
    verbose=3,
)

In [None]:
# Replace the in-memory model with the checkpoint saved during training and
# score it on the held-out test split.
model = keras.models.load_model(model_path)
test_loss, test_accuracy = model.evaluate(x_test, y_test, verbose=3)

# Model outputs for the test inputs, and the test labels as plain Python ints.
y_score = model.predict(x_test)
y_true = list(map(int, y_test))

# Hand everything to the project's `evaluate` helper (from dna_nn.model).
evaluate(
    model, history, test_accuracy, y_score, y_true,
    LOG_DIR, key, dataset, 3,
)

# Raw features

In [None]:
# Reload the splice dataset through the `splice_raw` loader, replacing the
# arrays loaded earlier.
splice_path = DATA_DIR + 'splice.data'
x_shape, x_train, x_test, y_train, y_test = splice_raw(splice_path)

In [None]:
# Display `models`: a dict mapping architecture names to the builder functions
# in dna_nn.model. Its keys are the values accepted for `key` below.
models

{'cnn_deepdbp': <function dna_nn.model.cnn_deepdbp>,
 'cnn_nguyen_2_conv2d': <function dna_nn.model.cnn_nguyen_2_conv2d>,
 'cnn_nguyen_conv1d_2_conv2d': <function dna_nn.model.cnn_nguyen_conv1d_2_conv2d>,
 'cnn_zeng_2_conv2d': <function dna_nn.model.cnn_zeng_2_conv2d>,
 'cnn_zeng_3_conv2d': <function dna_nn.model.cnn_zeng_3_conv2d>,
 'cnn_zeng_4_conv2d': <function dna_nn.model.cnn_zeng_4_conv2d>,
 'cnn_zeng_4_conv2d_l2': <function dna_nn.model.cnn_zeng_4_conv2d_l2>,
 'deepram_conv1d_embed': <function dna_nn.model.deepram_conv1d_embed>,
 'deepram_conv1d_onehot': <function dna_nn.model.deepram_conv1d_onehot>,
 'deepram_conv1d_recurrent_embed': <function dna_nn.model.deepram_conv1d_recurrent_embed>,
 'deepram_conv1d_recurrent_onehot': <function dna_nn.model.deepram_conv1d_recurrent_onehot>,
 'deepram_recurrent_embed': <function dna_nn.model.deepram_recurrent_embed>,
 'deepram_recurrent_onehot': <function dna_nn.model.deepram_recurrent_onehot>}

In [None]:
# Architecture to build (a key of `models`) and the dataset tag; both are
# reused later to name the log and checkpoint files.
key = 'cnn_deepdbp'
dataset = 'splice'

# Clear Keras' global state so that re-running this cell starts from scratch.
keras.backend.clear_session()
# NOTE(review): the second argument (3) is presumably the number of output
# classes — confirm against dna_nn.model.
model = models[key](x_shape, 3)
# `metrics` is passed as a list, the form documented by `Model.compile`;
# a bare string is rejected by newer Keras releases.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 60, 128)           7680      
_________________________________________________________________
reshape (Reshape)            (None, 60, 128, 1)        0         
_________________________________________________________________
conv2d (Conv2D)              (None, 60, 128, 128)      4096      
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 60, 4, 128)        0         
_________________________________________________________________
flatten (Flatten)            (None, 30720)             0         
_________________________________________________________________
dropout (Dropout)            (None, 30720)             0         
_________________________________________________________________
dense (Dense)                (None, 32)                9

In [None]:
# Output files for this run, named after the model key and the dataset tag.
csv_path = LOG_DIR + f'{key}-{dataset}-dynamics.csv'
model_path = LOG_DIR + f'{key}-{dataset}.h5'

# Write the per-epoch metrics to `csv_path`.
epoch_logger = keras.callbacks.CSVLogger(csv_path)
# Run a garbage-collection pass at the end of every epoch.
gc_after_epoch = keras.callbacks.LambdaCallback(
    on_epoch_end=lambda epoch, logs: gc.collect()
)
# Keep only the best checkpoint at `model_path`; the final model is not
# saved separately at the end of training.
best_checkpoint = keras.callbacks.ModelCheckpoint(model_path, save_best_only=True)

callbacks = [epoch_logger, gc_after_epoch, best_checkpoint]

# Hold out 15% of the training data for validation.
history = model.fit(
    x_train,
    y_train,
    epochs=50,
    validation_split=0.15,
    callbacks=callbacks,
    verbose=3,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
# Replace the in-memory model with the checkpoint saved during training and
# score it on the held-out test split.
model = keras.models.load_model(model_path)
test_loss, test_accuracy = model.evaluate(x_test, y_test, verbose=3)

# Model outputs for the test inputs, and the test labels as plain Python ints.
y_score = model.predict(x_test)
y_true = list(map(int, y_test))

# Hand everything to the project's `evaluate` helper (from dna_nn.model).
evaluate(
    model, history, test_accuracy, y_score, y_true,
    LOG_DIR, key, dataset, 3,
)