In [None]:
import h5py
import pandas as pd
import numpy as np
from itertools import product, islice
import gc
import random

from sklearn.metrics import precision_recall_curve, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

import tensorflow as tf
from tensorflow.data import Dataset
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Conv1D, Conv2D, Embedding, Dense, LSTM
from tensorflow.keras.layers import GlobalMaxPooling1D, GlobalMaxPooling2D, MaxPooling1D, MaxPooling2D
from tensorflow.keras.layers import BatchNormalization, Dropout, Flatten, Input, Reshape

In [None]:
# Show the TensorFlow version in use for this run.
print(tf.__version__)

# Load Data

In [None]:
# If run in Google Colab: mount Google Drive and make it the working directory.
# Outside Colab `google.colab` cannot be imported; in that case the mount is
# skipped instead of aborting "Run All" with an ImportError.
import os  # also used later when selecting the CUDA device

try:
    from google.colab import drive
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True
    drive.mount('/content/drive')
    os.chdir('/content/drive/')

In [None]:
# Features, x
# The file is opened in a context manager so the HDF5 handle is released as
# soon as the dataset has been copied into an in-memory NumPy array.
# Colab alternative path: "My Drive/DNA-NN/features.jld"
with h5py.File("../../../data2/features.jld", "r") as feature_raw:
    features = np.array(feature_raw["features"])

# Remove the cols that all samples have the same site.
features = features[:, ~np.all(features[1:] == features[:-1], axis=0)]

In [None]:
# Labels, y: 0 for False, 1 for True. Rows containing missing values are dropped.
# NOTE(review): rows dropped here are not removed from `features` — confirm the
# two stay aligned row-for-row.
response_raw = pd.read_csv("../../../data2/responses.csv").dropna()
# response_raw = pd.read_csv("My Drive/DNA-NN/responses.csv").dropna()
carb_tf, toby_tf = (np.array(response_raw[column]) for column in ("carb", "toby"))
print(len(carb_tf))

# Split train, val, test and One hot coding

In [None]:
# One hot encoding, need to append to 10 for conv2d and 20 to conv
# Result has shape (samples, sites, 5): the site code (0-4) selects which of
# the 5 channels is set to 1.
n_samples, n_sites = len(features), len(features[0])
x = np.zeros((n_samples, n_sites, 5))
x[np.arange(n_samples)[:, None], np.arange(n_sites), features] = 1

# # If embeded, not used
# x = np.array(features)

In [None]:
# Binary labels from the "carb" response: 1 where the entry is truthy, else 0.
y = np.array([int(bool(flag)) for flag in carb_tf])

In [None]:
# Sample positions of the true (1) and false (0) class, as plain Python lists.
index_1 = np.flatnonzero(y == 1).tolist()
index_0 = np.flatnonzero(y == 0).tolist()
print(len(index_1), len(index_0))

In [None]:
# Split x and y
# Shuffle with a dedicated, seeded generator so that train/val/test membership
# is identical on every "Restart & Run All". Sorting first makes the cell
# idempotent: re-running it reproduces the same order instead of shuffling the
# result of the previous shuffle.
SPLIT_SEED = 0
split_rng = random.Random(SPLIT_SEED)
index_1.sort()
index_0.sort()
split_rng.shuffle(index_1)
split_rng.shuffle(index_0)
train_ratio = 0.7
test_val_ratio = 0.15

# Ratio for true:false class after augmentation
# Used in training for a balanced dataset
# and use the whole dataset for val and test
# With augmentation to 4 * sample sizes

ratio_1_0 = len(index_1) / len(index_0) * 4
index_0_shorten = index_0[:int(len(index_0)*ratio_1_0)]

# Cut points inside each (shuffled) class list: [0, train_end) -> train,
# [train_end, test_end) -> test, [test_end, end) -> val.
train_end_1 = int(train_ratio * len(index_1))
test_end_1 = int((train_ratio + test_val_ratio) * len(index_1))
train_end_0 = int(train_ratio * len(index_0))
test_end_0 = int((train_ratio + test_val_ratio) * len(index_0))

# Training uses the shortened false-class list; test and val use the full one.
train_index = index_1[:train_end_1] + \
    index_0_shorten[:int(train_ratio * len(index_0_shorten))]
test_index = index_1[train_end_1:test_end_1] + index_0[train_end_0:test_end_0]
val_index = index_1[test_end_1:] + index_0[test_end_0:]


In [None]:
# Size and members of each split.
print(f"train: {len(train_index)} {train_index}")
print(f"val: {len(val_index)} {val_index}")
print(f"test: {len(test_index)} {test_index}")

In [None]:
# Materialise the three splits. The training split is converted to nested
# Python lists; test and val stay NumPy arrays.
x_train = x[train_index].tolist()
y_train = y[train_index].tolist()
x_test = x[test_index]
y_test = y[test_index]
x_val = x[val_index]
y_val = y[val_index]
print(len(x_train), len(x_test), len(x_val))

In [None]:
# Augment the train index for true class (class 1)
def aug_reverse_complement(x_train, y_train, index_1, train_ratio, features):
  """Append the reverse complement of every true-class training sequence.

  Args:
    x_train: one-hot training samples, array-like of shape (n, sites, 5).
    y_train: training labels, array-like of length n.
    index_1: row indices into `features` of the true-class samples; the first
      `int(train_ratio * len(index_1))` entries are treated as the training part.
    train_ratio: fraction of `index_1` that belongs to the training split.
    features: 2-D integer array of site codes, one row per sample.

  Returns:
    (x_train, y_train) with one extra one-hot sample and one extra label `1`
    per true-class training sequence. When there is nothing to add, the
    inputs are returned unchanged.
  """
  train_index_1 = index_1[:int(train_ratio * len(index_1))]
  features_train_1 = features[train_index_1]
  if len(features_train_1) == 0:
    return x_train, y_train

  # 1 - A, 2 - C, 3 - G, 4 - T, 0 - missing
  dic_complement = {0: 0, 1: 4, 2: 3, 3: 2, 4: 1}

  # Reverse each sequence and complement every site code.
  rev_comp = np.array(
      [[dic_complement[site] for site in np.flip(feature)] for feature in features_train_1],
      dtype=np.intp)

  # One hot encoding of all new sequences at once.
  n_new, n_sites = rev_comp.shape
  encoded = np.zeros((n_new, n_sites, 5))
  encoded[np.arange(n_new)[:, None], np.arange(n_sites), rev_comp] = 1

  # Append to training set in a single step; appending inside the per-sample
  # loop would copy the whole training tensor once per new sample.
  y_train = np.append(y_train, [1] * n_new, axis=0)  # class for true
  x_train = np.append(x_train, encoded, axis=0)

  return x_train, y_train

In [None]:
def aug_bootstrap(x_train, y_train):
  """Append a site-shuffled copy of every true-class sample.

  One random permutation of the site axis (axis 1) is drawn and applied to
  all true-class samples, so each copy keeps its per-site encoding intact but
  sees the sites in a new order. The permutation is applied along axis 1
  explicitly: transposing a 3-D one-hot tensor and shuffling its first axis
  would permute the encoding channels rather than the sites.

  Args:
    x_train: training samples, array-like of shape (n, sites, ...).
    y_train: training labels, array-like of length n; class 1 is augmented.

  Returns:
    (x_train, y_train) as NumPy arrays with the shuffled copies and their
    labels appended after the original samples.
  """
  x_train = np.asarray(x_train)
  y_train = np.asarray(y_train)  # lists would not support `== 1` / fancy indexing
  index_1 = np.flatnonzero(y_train == 1)

  # Randomly shuffle the site columns once (same order for every sample).
  x_pos = x_train[index_1]
  site_order = np.random.permutation(x_pos.shape[1])
  x_shuffled = x_pos[:, site_order]

  x_train = np.append(x_train, x_shuffled, axis=0)
  y_train = np.append(y_train, y_train[index_1], axis=0)

  return x_train, y_train

In [None]:
# Reverse-complement augmentation first, then the shuffle augmentation on its output.
x_train_new, y_train_new = aug_bootstrap(
    *aug_reverse_complement(x_train, y_train, index_1, train_ratio, features)
)

In [None]:
# Quick look at the augmented training set: number of samples, length of one
# sample (index 30), number of labels, and the labels themselves.
len(x_train_new), len(x_train_new[30]), len(y_train_new), y_train_new

In [None]:
# Not used
# x_train, x_testVal, y_train, y_testVal = train_test_split(x, y, test_size=0.3)
# x_test, x_val, y_test, y_val = train_test_split(x_testVal, y_testVal, test_size=0.5)

In [None]:
# Sizes of the (un-augmented) train, test and validation splits.
print(f"train, test, val len: {len(x_train)} {len(x_test)} {len(x_val)}")

# Fitting models
From https://github.com/solislemuslab/dna-nn-theory/blob/master/cnn/dna_nn/model.py

Crashes during one-hot encoding when sites are combined into words of 3 sites.

## Deepram

In [None]:
def deepram_conv1d_recurrent_onehot(x_shape, classes=2):
    """Build an (uncompiled) Conv1D + LSTM Sequential model.

    Args:
        x_shape: shape of one input sample, passed to `Input(shape=...)`.
        classes: number of classes; fewer than 3 yields a single sigmoid
            output unit, otherwise a softmax over `classes` units.

    Returns:
        The `keras.Sequential` model.
    """
    layers = [
        Input(shape=x_shape),
        Dropout(0.5),
        # Two convolution + pooling stages ahead of the recurrent stack.
        Conv1D(64, 3, activation='relu'),
        MaxPooling1D(),
        Conv1D(128, 3, activation='relu'),
        MaxPooling1D(),
        LSTM(64, return_sequences=True),
        LSTM(128),
        Dense(128),
        Dropout(0.5),
    ]
    if classes < 3:
        layers.append(Dense(1, activation='sigmoid'))
    else:
        layers.append(Dense(classes, activation='softmax'))
    return keras.Sequential(layers)

In [None]:
def deepram_recurrent_onehot(x_shape, classes=2):
    """Build an (uncompiled) two-layer LSTM Sequential model.

    Args:
        x_shape: shape of one input sample, passed to `Input(shape=...)`.
        classes: number of classes; fewer than 3 yields a single sigmoid
            output unit, otherwise a softmax over `classes` units.

    Returns:
        The `keras.Sequential` model.
    """
    layers = [
        Input(shape=x_shape),
        Dropout(0.5),
        LSTM(16, return_sequences=True),
        LSTM(32),
        Dense(32),
        Dropout(0.5),
    ]
    if classes < 3:
        layers.append(Dense(1, activation='sigmoid'))
    else:
        layers.append(Dense(classes, activation='softmax'))
    return keras.Sequential(layers)

In [None]:
def deepram_recurrent_embed(x_shape, classes=2, vocab_size=5):
    """Build an (uncompiled) Embedding + LSTM Sequential model for integer-coded sequences.

    Args:
        x_shape: sequence length, either as an int or as a 1-element
            tuple/list.
        classes: number of classes; fewer than 3 yields a single sigmoid
            output unit, otherwise a softmax over `classes` units.
        vocab_size: number of distinct site codes, used as the Embedding
            `input_dim`. Defaults to 5, matching the five codes (0-4) used by
            the one-hot encoding elsewhere in this notebook.

    Returns:
        The `keras.Sequential` model.
    """
    seq_len = x_shape[0] if isinstance(x_shape, (tuple, list)) else x_shape
    model = keras.Sequential([
        # `shape` must be a tuple; `(seq_len)` without the comma is just an int.
        Input(shape=(seq_len,)),
        # input_dim is the vocabulary size, not the sequence length.
        Embedding(vocab_size, 3),
        Dropout(0.5),
        LSTM(8, return_sequences=True),
        LSTM(16),
        Dense(16),
        Dropout(0.5),
        Dense(1, activation='sigmoid') if classes < 3 else Dense(classes, activation='softmax')
    ])
    return model

In [None]:
# Drop any previous model and reset Keras state before building a new one.
model = None
keras.backend.clear_session()
# Shape of one one-hot training sample. It has to be defined here: on a fresh
# kernel no earlier cell assigns `x_shape`.
x_shape = (len(x_train[0]), len(x_train[0][0]))
# model = deepram_recurrent_onehot(x_shape)
# model = deepram_recurrent_embed(len(x_train[0]))
model = deepram_conv1d_recurrent_onehot(x_shape)
# model = cnn_nguyen_conv1d_2_conv2d(x_shape)
# `metrics` is given as a list, the form accepted by every Keras version.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Output locations for the per-epoch log and the best checkpoint.
LOG_DIR = "My Drive/DNA-NN/"
csv_path = LOG_DIR + 'DeepRam-dynamics.csv'
model_path = LOG_DIR + 'DeepRam.h5'

# Checkpoint the best model, log every epoch to CSV, and run the garbage
# collector at the end of each epoch.
callbacks = [
    keras.callbacks.ModelCheckpoint(model_path, save_best_only=True),
    keras.callbacks.CSVLogger(csv_path),
    keras.callbacks.LambdaCallback(
        # on_train_end=lambda logs: model.save(model_path)
        on_epoch_end=lambda epoch, logs: gc.collect(),
    ),
]

# Currently takes 20 - 30 min per epoch, will convert to Python script to run on CHTC.
# NOTE(review): this fits on x_train / y_train, not on the augmented
# x_train_new / y_train_new — confirm that is intended.
history = model.fit(
    np.array(x_train),
    np.array(y_train),
    epochs=50,
    validation_data=(np.array(x_val), np.array(y_val)),
    callbacks=callbacks,
    verbose=1,
    batch_size=4,
)

## CNN Nguyen
Currently overfitting

In [None]:
# Try using dataset generator to avoid crashing or OOM
def encoded_shape(x_len, word_size=3, region_size=0, onehot=True, expand=True, alphabet='01234'):
    '''Return the shape of one encoded sample for a sequence of length `x_len`.

    The first dimension is the number of length-`word_size` windows. Unless
    both `onehot` and `region_size` are falsy, a second dimension follows:
    `len(alphabet) ** word_size` (or 1 without one-hot) times
    `region_size + 1`. With `expand` a trailing dimension of size 1 is added.
    '''
    n_windows = x_len - word_size + 1
    n_regions = region_size + 1

    if not onehot and not region_size:
        shape = (n_windows,)
    else:
        per_window = len(alphabet) ** word_size if onehot else 1
        shape = (n_windows, per_window * n_regions)

    if expand:
        shape = shape + (1,)
    return shape

def gen_from_arrays(features, labels, word_size=3, region_size=0, onehot=True, expand=True, alphabet='01234'):
    """Return a generator function yielding word-level one-hot encodings.

    Each sequence is cut into overlapping windows of `word_size` sites; every
    window is mapped to the index of its word among all `word_size`-long words
    over `alphabet`, and that index is one-hot encoded.

    Args:
        features: iterable of sequences whose items stringify to symbols of
            `alphabet` (e.g. ints 0-4 for the default alphabet).
        labels: iterable of labels, paired with `features` by position.
        word_size: number of consecutive sites per word.
        region_size, onehot, expand: accepted for signature compatibility but
            not used by the encoding (the output is always one-hot with a
            trailing axis of size 1).
        alphabet: symbols a site can take.

    Returns:
        A zero-argument function; each call returns a fresh generator of
        `(encoded, label)` pairs where `encoded` has shape
        `(len(sequence) - word_size + 1, number of words, 1)`.

    Raises:
        KeyError: (while iterating) if a window is not a word over `alphabet`.
    """
    words = [''.join(p) for p in product(alphabet, repeat=word_size)]
    word_to_idx = {word: i for i, word in enumerate(words)}
    n_words = len(word_to_idx)

    def gen():
        for seq, label in zip(features, labels):
            n_windows = len(seq) - word_size + 1
            # Platform index dtype: an 8-bit index would overflow as soon as
            # there are more than 127 words (e.g. word_size=4).
            idx = np.array(
                [word_to_idx[''.join(map(str, seq[i:i + word_size]))] for i in range(n_windows)],
                dtype=np.intp)
            processed_x = np.zeros((len(idx), n_words))
            processed_x[np.arange(len(idx)), idx] = 1
            processed_x = np.expand_dims(processed_x, axis=-1)
            yield processed_x, label

    return gen

# The generators build words from integer site codes (symbols of the alphabet),
# so they are fed the integer-coded rows of `features` for each split rather
# than the already one-hot encoded x_train / x_val / x_test tensors.
train_gen = gen_from_arrays(features[train_index], y_train)
val_gen = gen_from_arrays(features[val_index], y_val)
test_gen = gen_from_arrays(features[test_index], y_test)

In [None]:
# datasets
batch_size = 4
prefetch = tf.data.experimental.AUTOTUNE

# Shape/dtype of one (encoded sample, label) pair produced by the generators.
x_shape = encoded_shape(len(features[0]))
output_shapes = (x_shape, ())
output_types = (tf.float32, tf.float32)

train_ds = Dataset.from_generator(train_gen, output_types, output_shapes)
train_ds = train_ds.shuffle(500).batch(batch_size).prefetch(prefetch)

test_ds = Dataset.from_generator(test_gen, output_types, output_shapes)
test_ds = test_ds.batch(batch_size).prefetch(prefetch)

# Batch the validation generator itself; chaining from train_ds would yield
# (re-batched) training data under the validation name.
val_ds = Dataset.from_generator(val_gen, output_types, output_shapes)
val_ds = val_ds.batch(batch_size).prefetch(prefetch)

# Fully materialised validation set. Loop variables are named so they do not
# overwrite the global `x` / `y` arrays.
x_val_encode, y_val_encode = [], []
for x_encoded, y_label in val_gen():
    x_val_encode.append(x_encoded)
    y_val_encode.append(y_label)
x_val_encode = np.array(x_val_encode)
y_val_encode = np.array(y_val_encode)

In [None]:
def cnn_nguyen_conv1d_2_conv2d(x_shape, classes=2):
    """Build an (uncompiled) three-stage Conv2D Sequential model.

    The first convolution's kernel spans the surplus of the longer of the
    first two input dimensions over the shorter one (plus one) and has size 1
    along the other dimension.

    Args:
        x_shape: shape of one input sample; the first two entries are used to
            size the first kernel and the whole tuple is the `input_shape`.
        classes: number of classes; fewer than 3 yields a single sigmoid
            output unit, otherwise a softmax over `classes` units.

    Returns:
        The `keras.Sequential` model.
    """
    rows, cols = x_shape[0], x_shape[1]
    if rows > cols:
        first_kernel = (rows - cols + 1, 1)
    else:
        first_kernel = (1, cols - rows + 1)

    layers = [
        Conv2D(16, first_kernel, activation='relu', input_shape=x_shape),
        MaxPooling2D(),
        Conv2D(16, 3, activation='relu'),
        MaxPooling2D(),
        Conv2D(32, 3, activation='relu'),
        MaxPooling2D(),
        Flatten(),
        Dense(32, activation='relu'),
        Dropout(0.5),
    ]
    if classes < 3:
        layers.append(Dense(1, activation='sigmoid'))
    else:
        layers.append(Dense(classes, activation='softmax'))
    return keras.Sequential(layers)

In [None]:
# Config tf for InternalError,
# Failed to call ThenRnnForward with model config:InternalError
# Expose only GPU 0, let TensorFlow grow its GPU memory on demand, and cap the
# per-process share at 80%.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
config = tf.compat.v1.ConfigProto(
    gpu_options=tf.compat.v1.GPUOptions(
        allow_growth=True,
        per_process_gpu_memory_fraction=0.8,
    )
)
tf.compat.v1.keras.backend.set_session(tf.compat.v1.Session(config=config))

In [None]:
# Drop any previous model and reset Keras state before building the CNN.
model = None
keras.backend.clear_session()
# x_shape = (len(x_train[0]), len(x_train[0][0]))
model = cnn_nguyen_conv1d_2_conv2d(x_shape)
# `metrics` is given as a list, the form accepted by every Keras version.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
LOG_DIR = "My Drive/DNA-NN/"

# Dedicated artifact names for this model, so the run does not overwrite the
# DeepRam log and checkpoint.
csv_path = LOG_DIR + 'CNN-Nguyen-dynamics.csv'
model_path = LOG_DIR + 'CNN-Nguyen.h5'

callbacks = [
    keras.callbacks.ModelCheckpoint(model_path, save_best_only=True),
    keras.callbacks.CSVLogger(csv_path),
    keras.callbacks.LambdaCallback(
        on_epoch_end=lambda epoch, logs: gc.collect(),
        # on_train_end=lambda logs: model.save(model_path)
    )
]

# Use tf dataset to generate data separatedly for one hot encoding
# after encode sites into words of 3 sites, still crash
# train_ds is already batched, and `fit` rejects an explicit `batch_size`
# when given a tf.data.Dataset, so none is passed here.
history = model.fit(train_ds, epochs=50, validation_data=(x_val_encode, y_val_encode),
                    callbacks=callbacks, verbose=1)

# Fit raw data without encoding sites into words, overfitting.
# NOTE(review): this feeds the raw one-hot x_train, while `model` is built from
# the current `x_shape` — confirm the model is rebuilt for the raw input shape
# before running this alternative.
history = model.fit(np.array(x_train), np.array(y_train), epochs=50, validation_data=( np.array(x_val), np.array(y_val) ),
                    callbacks=callbacks, verbose=1, batch_size=4)