In [None]:
!pip install -q biopython

%load_ext autoreload
%autoreload 2

from google.colab import drive
drive.mount('/content/drive')

[?25l[K     |▏                               | 10kB 21.9MB/s eta 0:00:01[K     |▎                               | 20kB 25.9MB/s eta 0:00:01[K     |▍                               | 30kB 12.4MB/s eta 0:00:01[K     |▋                               | 40kB 9.7MB/s eta 0:00:01[K     |▊                               | 51kB 9.8MB/s eta 0:00:01[K     |▉                               | 61kB 9.9MB/s eta 0:00:01[K     |█                               | 71kB 9.1MB/s eta 0:00:01[K     |█▏                              | 81kB 9.5MB/s eta 0:00:01[K     |█▎                              | 92kB 8.9MB/s eta 0:00:01[K     |█▌                              | 102kB 9.0MB/s eta 0:00:01[K     |█▋                              | 112kB 9.0MB/s eta 0:00:01[K     |█▊                              | 122kB 9.0MB/s eta 0:00:01[K     |██                              | 133kB 9.0MB/s eta 0:00:01[K     |██                              | 143kB 9.0MB/s eta 0:00:01[K     |██▏                     

In [None]:
# Drive locations used throughout the notebook, assigned together so the two
# roots are visible side by side.
DATA_DIR, LOG_DIR = (
    '/content/drive/MyDrive/data/histone/',    # folder containing H3.fasta
    '/content/drive/MyDrive/dna-nn/results/',  # destination for CSV logs and saved models
)

In [None]:
# Copy the project's dna_nn package from Drive into the current working
# directory so the `from dna_nn...` imports in the next cell can resolve,
# then list the copied files as a quick sanity check.
!cp -r '/content/drive/MyDrive/dna-nn/dna_nn' .
!ls dna_nn

dataset.py  download.py  load.py  model.py


In [None]:
import gc

import matplotlib.pyplot as plt
import pandas as pd
from tensorflow import keras

from dna_nn.dataset import h3, h3_raw
from dna_nn.model import models, evaluate

# 2D features

In [None]:
# Load the H3 FASTA file through the project loader; it returns the input
# shape plus the train / validation / test datasets used by the cells below.
fasta_path = DATA_DIR + 'H3.fasta'
x_shape, train_ds, validation_data, test_ds = h3(fasta_path)

14963 samples loaded


In [None]:
# Display the `models` registry imported from dna_nn.model; the next cell
# selects a builder from it with `models[key]`.
models

In [None]:
# Registry key of the architecture to train; also used in output file names.
key = 'cnn_zeng_4_conv2d_l2'

# Reset Keras' global state so a previously built model does not linger.
keras.backend.clear_session()
# Look up the builder for `key` and instantiate it for the loaded input shape.
model = models[key](x_shape)
# `metrics` is passed as a list, the documented form; the bare-string form
# ('accuracy') is not accepted by every Keras version.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Output files for this run: per-epoch training metrics and the saved model.
csv_path = LOG_DIR + f'{key}-histone-dynamics.csv'
model_path = LOG_DIR + f'{key}-histone.h5'

# Stream each epoch's metrics to the CSV file.
epoch_logger = keras.callbacks.CSVLogger(csv_path)
# Force a garbage-collection pass at the end of every epoch.
# (Saving is handled by the checkpoint callback below, not by on_train_end.)
gc_callback = keras.callbacks.LambdaCallback(
    on_epoch_end=lambda epoch, logs: gc.collect(),
)
# Keep only the best model seen so far at `model_path`.
checkpoint = keras.callbacks.ModelCheckpoint(model_path, save_best_only=True)
callbacks = [epoch_logger, gc_callback, checkpoint]

history = model.fit(
    train_ds,
    epochs=50,
    validation_data=validation_data,
    callbacks=callbacks,
    verbose=3,
)

In [None]:
# Reload the checkpoint written during training and score it on the test split.
model = keras.models.load_model(model_path)
test_metrics = model.evaluate(test_ds, verbose=3)
test_loss, test_accuracy = test_metrics

# Model outputs and the matching integer labels from the same dataset.
# NOTE(review): assumes test_ds yields samples in the same order on every
# pass, otherwise y_score and y_true would be misaligned — confirm.
y_score = model.predict(test_ds)
y_true = [int(label) for _, label in test_ds.unbatch()]

# Hand everything to the project's evaluate helper (imported from dna_nn.model).
evaluate(
    model, history, test_accuracy, y_score, y_true,
    LOG_DIR, key, 'histone',
)

In [None]:
from sklearn.metrics import roc_curve, precision_recall_curve
import numpy as np

# Export the precision-recall curve of an already-trained model to CSV.
dataset = 'histone'
key = 'cnn_zeng_4_conv2d'
model_path = LOG_DIR + f'{key}-histone.h5'

# Load the dataset FIRST so that evaluate() and predict() below both run on
# the same, freshly loaded test split instead of whatever `test_ds` happened
# to be left in the kernel by an earlier cell.
x_shape, train_ds, validation_data, test_ds = h3(DATA_DIR + 'H3.fasta')

model = keras.models.load_model(model_path)
test_loss, test_accuracy = model.evaluate(test_ds, verbose=3)
y_score = model.predict(test_ds)
# NOTE(review): assumes test_ds yields samples in the same order on every
# pass, otherwise y_score and y_true would be misaligned — confirm.
y_true = [int(y) for x, y in test_ds.unbatch()]

precision, recall, thresholds = precision_recall_curve(y_true, y_score)
pr = pd.DataFrame({
    'precision': precision,
    'recall': recall,
    # precision/recall have one more entry than thresholds; pad with NaN so
    # all three columns have equal length.
    'thresholds': np.append(thresholds, np.nan)
})
pr.to_csv(LOG_DIR + f'{key}-{dataset}-pr.csv', index=False)

# 1D features

In [None]:
# Reload the H3 data with non-default loader options for the 1D-feature models.
# NOTE(review): 3, 2 and False are passed positionally; their meaning is
# defined by dna_nn.dataset.h3 — confirm, and consider keyword arguments.
fasta_path = DATA_DIR + 'H3.fasta'
x_shape, train_ds, validation_data, test_ds = h3(fasta_path, 3, 2, False)

14963 samples loaded


In [None]:
# Display the `models` registry imported from dna_nn.model; the next cell
# selects a builder from it with `models[key]`.
models

In [None]:
# Registry key of the architecture to train; also used in output file names.
key = 'deepram_recurrent_onehot'

# Reset Keras' global state so a previously built model does not linger.
keras.backend.clear_session()
# Look up the builder for `key` and instantiate it for the loaded input shape.
model = models[key](x_shape)
# `metrics` is passed as a list, the documented form; the bare-string form
# ('accuracy') is not accepted by every Keras version.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Output files for this run: per-epoch training metrics and the saved model.
csv_path = LOG_DIR + f'{key}-histone-dynamics.csv'
model_path = LOG_DIR + f'{key}-histone.h5'

# Stream each epoch's metrics to the CSV file.
epoch_logger = keras.callbacks.CSVLogger(csv_path)
# Force a garbage-collection pass at the end of every epoch.
# (Saving is handled by the checkpoint callback below, not by on_train_end.)
gc_callback = keras.callbacks.LambdaCallback(
    on_epoch_end=lambda epoch, logs: gc.collect(),
)
# Keep only the best model seen so far at `model_path`.
checkpoint = keras.callbacks.ModelCheckpoint(model_path, save_best_only=True)
callbacks = [epoch_logger, gc_callback, checkpoint]

history = model.fit(
    train_ds,
    epochs=50,
    validation_data=validation_data,
    callbacks=callbacks,
    verbose=3,
)

In [None]:
# Reload the checkpoint written during training and score it on the test split.
model = keras.models.load_model(model_path)
test_metrics = model.evaluate(test_ds, verbose=3)
test_loss, test_accuracy = test_metrics

# Model outputs and the matching integer labels from the same dataset.
# NOTE(review): assumes test_ds yields samples in the same order on every
# pass, otherwise y_score and y_true would be misaligned — confirm.
y_score = model.predict(test_ds)
y_true = [int(label) for _, label in test_ds.unbatch()]

# Hand everything to the project's evaluate helper (imported from dna_nn.model).
evaluate(
    model, history, test_accuracy, y_score, y_true,
    LOG_DIR, key, 'histone',
)

# Raw features

In [None]:
# Load the H3 FASTA file through the raw-feature loader; it returns the input
# shape plus the train / validation / test datasets used by the cells below.
fasta_path = DATA_DIR + 'H3.fasta'
x_shape, train_ds, validation_data, test_ds = h3_raw(fasta_path)

14963 samples loaded


In [None]:
# Display the `models` registry imported from dna_nn.model; the next cell
# selects a builder from it with `models[key]`.
models

In [None]:
# Registry key of the architecture to train; also used in output file names.
key = 'deepram_recurrent_embed'

# Reset Keras' global state so a previously built model does not linger.
keras.backend.clear_session()
# Look up the builder for `key` and instantiate it for the loaded input shape.
model = models[key](x_shape)
# `metrics` is passed as a list, the documented form; the bare-string form
# ('accuracy') is not accepted by every Keras version.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Output files for this run: per-epoch training metrics and the saved model.
csv_path = LOG_DIR + f'{key}-histone-dynamics.csv'
model_path = LOG_DIR + f'{key}-histone.h5'

# Stream each epoch's metrics to the CSV file.
epoch_logger = keras.callbacks.CSVLogger(csv_path)
# Force a garbage-collection pass at the end of every epoch.
# (Saving is handled by the checkpoint callback below, not by on_train_end.)
gc_callback = keras.callbacks.LambdaCallback(
    on_epoch_end=lambda epoch, logs: gc.collect(),
)
# Keep only the best model seen so far at `model_path`.
checkpoint = keras.callbacks.ModelCheckpoint(model_path, save_best_only=True)
callbacks = [epoch_logger, gc_callback, checkpoint]

history = model.fit(
    train_ds,
    epochs=50,
    validation_data=validation_data,
    callbacks=callbacks,
    verbose=3,
)

In [None]:
# Reload the checkpoint written during training and score it on the test split.
model = keras.models.load_model(model_path)
test_metrics = model.evaluate(test_ds, verbose=3)
test_loss, test_accuracy = test_metrics

# Model outputs and the matching integer labels from the same dataset.
# NOTE(review): assumes test_ds yields samples in the same order on every
# pass, otherwise y_score and y_true would be misaligned — confirm.
y_score = model.predict(test_ds)
y_true = [int(label) for _, label in test_ds.unbatch()]

# Hand everything to the project's evaluate helper (imported from dna_nn.model).
evaluate(
    model, history, test_accuracy, y_score, y_true,
    LOG_DIR, key, 'histone',
)