This notebook contains functionality to perform the following:

Define a dataset for training and an out-of-domain dataset for validation. To test transfer-learning success, train a feed-forward, convolutional, or recurrent neural network on the training dataset, then evaluate the trained model on the out-of-domain validation dataset. 

In [None]:
# Mount Google Drive inside the Colab runtime (the google.colab import only exists on Colab).

from google.colab import drive
drive.mount('/content/drive')

# Switch the working directory to the project folder on the shared drive so that the
# relative "data/..." and "experiments/..." paths used by later cells resolve.
%cd '/content/drive/Shareddrives/NRC_Amii_Agronomics_Project/nrc-ml-plant-genomics/'

Mounted at /content/drive
/content/drive/Shareddrives/NRC_Amii_Agronomics_Project/nrc-ml-plant-genomics


In [None]:
import argparse
import keras
import warnings, logging
import numpy as np
import pandas as pd
import datetime, time, os
import json
import random
import tensorflow as tf
import math

from keras.models import Sequential, load_model, model_from_json
from keras.layers import Input, Dense, Conv1D, MaxPooling2D, Dropout, Flatten, BatchNormalization
from tensorflow.keras.optimizers import Adam  # https://stackoverflow.com/questions/62707558/importerror-cannot-import-name-adam-from-keras-optimizers
from keras.callbacks import ModelCheckpoint, EarlyStopping  # https://machinelearningmastery.com/how-to-stop-training-deep-neural-networks-at-the-right-time-using-early-stopping/
from collections import Counter

from sklearn.metrics import r2_score, accuracy_score
from scipy.stats import spearmanr  # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.spearmanr.html

# Silence all Python warnings and every logging call of level <= 1000
# (i.e. all standard levels, CRITICAL being 50).
warnings.filterwarnings('ignore')
logging.disable(1000)

# TensorFlow / numpy seeding is currently disabled; only Python's `random` module is seeded below.
# tf.random.set_seed(1202)  # https://www.tensorflow.org/api_docs/python/tf/random/set_seed
# from numpy.random import seed
# seed(1202)

# Seeds the stdlib `random` generator (used by the optional sequence shuffling in later cells).
random.seed(1234)

nts = ["A", "T", "C", "G"]  # list of single nucleotides; also the base alphabet for di-/tri-nucleotide features

def Spearman(y_true, y_pred):
    """Keras-compatible metric wrapping ``scipy.stats.spearmanr``.

    Both tensors are cast to float32 and passed (predictions first, targets
    second) to ``spearmanr`` through ``tf.py_function``, which is asked for a
    single float32 output.
    """
    predictions = tf.cast(y_pred, tf.float32)
    targets = tf.cast(y_true, tf.float32)
    return tf.py_function(spearmanr, [predictions, targets], Tout=tf.float32)


In [None]:
# load in datasets
# validation = pd.read_csv("data/processed/validation.csv")
# arabidopsis = pd.read_csv("data/processed/arabidopsis_processed_every16_padded.csv")

# validation = pd.read_csv("data/processed/validation_padded.csv")
# arabidopsis = pd.read_csv("data/processed/drosophila_processed.csv")

# train on ATAC data, test on tobacco
# validation = pd.read_csv("data/processed/validation_padded.csv")
# arabidopsis = pd.read_csv("data/processed/napus_processed_every16.csv")

# train on arabidopsis genome data, test on arabidopsis bac data (both 153 bp length)
# validation = pd.read_csv("data/processed/arabidopsis_bac.csv")
# arabidopsis = pd.read_csv("data/processed/arabidopsis_processed_every16_padded.csv")

# # train on drosophila genome data, test on arabidopsis bac data (bac padded to 249 bp)
# validation = pd.read_csv("data/processed/arabidopsis_bac_padded.csv")
# arabidopsis = pd.read_csv("data/processed/drosophila_processed.csv")

# train on arabidopsis bac data, test on tobacco data (no padding)
# NOTE: whatever the species, `arabidopsis` is the frame handed to train_test_val()
# (training / val / test splits) and `validation` is the out-of-domain frame handed to
# prepare_validation(). If several pairs in this cell are uncommented, the last one wins.
validation = pd.read_csv("data/processed/validation.csv")
arabidopsis = pd.read_csv("data/processed/arabidopsis_bac_10chunks.csv")

# # train on arabidopsis genome data, test on tobacco data (arabidopsis padded to 153 bp)
# validation = pd.read_csv("data/processed/validation.csv")
# arabidopsis = pd.read_csv("data/processed/arabidopsis_processed_every16_padded.csv")

# # train on arabidopsis bac data, test on athal genome data (genome padded to 153)
# arabidopsis = pd.read_csv("data/processed/arabidopsis_bac_10chunks.csv")
# validation = pd.read_csv("data/processed/arabidopsis_processed_every16_padded.csv")

# # train on arabidopsis genome data, test on athal bac data (genome padded to 153)
# validation = pd.read_csv("data/processed/arabidopsis_bac_10chunks.csv")
# arabidopsis = pd.read_csv("data/processed/arabidopsis_processed_every16_padded.csv")

# # train and validate on the same napus binary dataset (napus_processed_every4_binary.csv)
# validation = pd.read_csv("data/processed/napus_processed_every4_binary.csv")
# arabidopsis = pd.read_csv("data/processed/napus_processed_every4_binary.csv")

## Frequency-Based Model

In [None]:
# Configuration for the frequency-based (feed-forward) model.
# include_*_freq flags select which k-mer count features are built (1 = include).
# layer_N_size == 0 disables that optional layer in get_model().
args = {"target_name":"target",
        "include_mononuc_freq":1,
        "include_dinuc_freq":0,
        "include_trinuc_freq":1,
        "layer_1_size":64,
        "layer_1_activation":"relu",
        "layer_2_size":24,
        "layer_2_activation":"relu",
        "layer_3_size":0,
        "layer_3_activation":"relu",  # was a duplicated "layer_2_activation" key; get_model() reads this one when layer_3_size > 0
        "output_layer_size":1,
        "output_layer_activation":"sigmoid",  # change to change ["linear", "sigmoid"]
        "loss":"binary_crossentropy",  # change to change ["mean_squared_error", "binary_crossentropy"]
        "extra_metric":"accuracy",  # change to change [Spearman, "accuracy"]
        'learning_rate':0.002,
        'batch_size':512,
        'num_epochs':100,
        'patience':20,
        'max_batch_steps':-1,
        'optimizer':'adam',
        'verbose_training':1}

In [None]:
def get_model(args, in_dim):
    """Build the feed-forward network described by ``args``.

    The first Dense layer is always created (to test plain linear regression,
    set ``layer_1_size`` to 1 and ``layer_1_activation`` to "linear"). The
    second, third and output layers are appended, in that order, only when
    their configured ``*_size`` is positive.

    Args:
        args: configuration dict with ``layer_N_size`` / ``layer_N_activation``
            and ``output_layer_size`` / ``output_layer_activation`` entries.
        in_dim: number of input features.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    mdl = Sequential()
    mdl.add(Dense(args["layer_1_size"], input_dim=in_dim, activation=args["layer_1_activation"]))

    # optional layers: the activation is only looked up when the layer is actually added
    for prefix in ("layer_2", "layer_3", "output_layer"):
        units = args[prefix + "_size"]
        if units > 0:
            mdl.add(Dense(units, activation=args[prefix + "_activation"]))

    return mdl


In [None]:
def _kmer_feature_names(args):
    """Return the mono-/di-/tri-nucleotide strings enabled in ``args``, in feature-column order."""
    names = []  # captures all sequences we are including as input features
    if args["include_mononuc_freq"] == 1:
        names += nts
    if args["include_dinuc_freq"] == 1:
        names += [nt1 + nt2 for nt1 in nts for nt2 in nts]
    if args["include_trinuc_freq"] == 1:
        names += [nt1 + nt2 + nt3 for nt1 in nts for nt2 in nts for nt3 in nts]
    return names


def _kmer_count_frame(df, names):
    """Count every k-mer in ``names`` within ``df.sequence``; returns a new frame, ``df`` is left untouched.

    NOTE(review): ``Series.str.count`` counts non-overlapping matches, so e.g.
    "AAAA" contributes 1 (not 2) to "AAA" -- confirm this is the intended feature.
    """
    counts = {}
    for item in names:
        print("including", item)
        # plain arrays avoid any index alignment when the frame is assembled
        counts[item] = df["sequence"].str.count(item).to_numpy()
    return pd.DataFrame(counts, index=df.index)


def train_test_val(args, df):
    """Build k-mer count features and split them by the ``set`` column of ``df``.

    Args:
        args: configuration dict; the ``include_*_freq`` flags select the
            features, ``target_name`` (default "target") selects the label column.
        df: frame with ``sequence``, ``set`` ("train" / "val" / "test") and
            label columns. It is not modified.

    Returns:
        ``(X_train, y_train, X_val, y_val, X_test, y_test)`` as numpy arrays.
    """
    names = _kmer_feature_names(args)
    features = _kmer_count_frame(df, names)
    # honour the configured label column, falling back to the previously hard-coded name
    target = df[args.get("target_name", "target")]

    pieces = []
    for split in ("train", "val", "test"):
        mask = (df["set"] == split).to_numpy()
        pieces.append(np.array(features[mask]))
        pieces.append(np.array(target[mask].tolist()))
    return tuple(pieces)


def prepare_validation(args, df):
    """Build the same k-mer count features for a whole (out-of-domain) frame.

    Args:
        args: configuration dict (``include_*_freq`` flags).
        df: frame with ``sequence`` and ``target`` columns. It is not modified.

    Returns:
        ``(X_test, y_test)`` as numpy arrays covering every row of ``df``.
    """
    names = _kmer_feature_names(args)
    X_test = np.array(_kmer_count_frame(df, names))
    # the label column is fixed to "target" here, as in the other prepare_validation helpers of this notebook
    y_test = np.array(df["target"].tolist())

    return X_test, y_test


In [None]:
# Count features for the training frame (split by its `set` column) ...
X_train, y_train, X_val, y_val, X_test, y_test = train_test_val(args, arabidopsis)
# ... and for the out-of-domain frame, which is used as a whole.
X_real, y_real = prepare_validation(args, validation)

including A
including T
including C
including G
including AAA
including AAT
including AAC
including AAG
including ATA
including ATT
including ATC
including ATG
including ACA
including ACT
including ACC
including ACG
including AGA
including AGT
including AGC
including AGG
including TAA
including TAT
including TAC
including TAG
including TTA
including TTT
including TTC
including TTG
including TCA
including TCT
including TCC
including TCG
including TGA
including TGT
including TGC
including TGG
including CAA
including CAT
including CAC
including CAG
including CTA
including CTT
including CTC
including CTG
including CCA
including CCT
including CCC
including CCG
including CGA
including CGT
including CGC
including CGG
including GAA
including GAT
including GAC
including GAG
including GTA
including GTT
including GTC
including GTG
including GCA
including GCT
including GCC
including GCG
including GGA
including GGT
including GGC
including GGG
including A
including T
including C
including G
includin

In [None]:
model = get_model(args, X_train.shape[1])  # initialize model; input width = number of count features

# inner_path = "validation_results/arabidopsis/train_on_athal_bac_val_on_athal_genome"
# inner_path = "validation_results/arabidopsis/train_on_athal_genome_val_on_athal_bac/freq"
inner_path = "accessibility_task/2_percent_binary"

# create path to folder with results (timestamp + the main hyper-parameters)
dir_path = ("experiments/"+inner_path+"/nucfreq"
            +"_"+datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
            +"_"+args["target_name"]
            +"_nuc"
            +str(args["include_mononuc_freq"])
            +str(args["include_dinuc_freq"])
            +str(args["include_trinuc_freq"])
            +"_lay"+str(args["layer_1_size"])
            +"-"+str(args["layer_2_size"])
            +"-"+str(args["layer_3_size"])
            +"-"+str(args["output_layer_size"])
            +"_lr"+str(args["learning_rate"])
            +"_bs"+str(args["batch_size"])
            +'_rep5')
# later cells write training_history.csv / results.csv here, so do not rely on a callback creating it
os.makedirs(dir_path, exist_ok=True)


# for binary task
# `learning_rate` replaces the deprecated `lr` keyword, which newer Keras releases reject
model.compile(optimizer=Adam(learning_rate=args["learning_rate"]),  # CHANGE IF WE WANT TO CHANGE OPTIM
              loss=args["loss"],
              metrics=[args["extra_metric"]])

In [None]:
# TensorBoard logs live in a "logs" sub-folder of the results directory
logdir = os.path.join(dir_path, "logs")
tensorboard_callback = keras.callbacks.TensorBoard(
    log_dir=logdir,
    histogram_freq=1,
)  # https://stackoverflow.com/questions/59894720/keras-and-tensorboard-attributeerror-sequential-object-has-no-attribute-g

# stop once val_loss has not improved for `patience` epochs and roll back to the best weights
es_callback = EarlyStopping(
    monitor='val_loss',
    patience=args["patience"],
    restore_best_weights=True,
    verbose=1,
)

# keep only the checkpoint with the lowest val_loss
mc_callback = ModelCheckpoint(
    filepath=dir_path+'/best_model.h5',
    monitor='val_loss',
    save_best_only=True,
)

In [None]:
# train on the training split, monitoring the in-domain validation split
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=args["batch_size"],
    epochs=args["num_epochs"],
    callbacks=[tensorboard_callback, es_callback, mc_callback],
)

# persist the per-epoch metrics recorded by fit()
hist_df = pd.DataFrame(history.history)
hist_df.to_csv(dir_path+'/training_history.csv')

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 44: early stopping


In [None]:
# Choose the model to evaluate. Reloading the checkpoint is disabled below, so the
# in-memory `model` is used as it stands after fit(); the EarlyStopping callback was
# created with restore_best_weights=True.
# NOTE(review): if training can run to the last epoch without early stopping triggering,
# confirm the in-memory weights are the best-val_loss ones, or re-enable the reload.

# saved_model = load_model(dir_path+'/best_model.h5')
saved_model = model

# with open(dir_path+"/results.csv", "w") as f:
#   f.write(",R2,spearman\n")
#   f.write("train,"+str(r2_score(y_train, saved_model.predict(X_train).reshape(1, -1)[0]))+","+str(spearmanr(y_train, saved_model.predict(X_train).reshape(1, -1)[0])[0])+"\n")
#   f.write("val,"+str(r2_score(y_val, saved_model.predict(X_val).reshape(1, -1)[0]))+","+str(spearmanr(y_val, saved_model.predict(X_val).reshape(1, -1)[0])[0])+"\n")
#   f.write("test,"+str(r2_score(y_test, saved_model.predict(X_test).reshape(1, -1)[0]))+","+str(spearmanr(y_test, saved_model.predict(X_test).reshape(1, -1)[0])[0])+"\n")
#   f.write("real,"+str(r2_score(y_real, saved_model.predict(X_real).reshape(1, -1)[0]))+","+str(spearmanr(y_real, saved_model.predict(X_real).reshape(1, -1)[0])[0]))

# # for real_valued
# with open(dir_path+"/results.csv", "w") as f:
#   f.write(",train,val,test,real\n")
#   f.write("r2,"+str(r2_score(y_train, saved_model.predict(X_train).reshape(1, -1)[0]))+","+str(r2_score(y_val, saved_model.predict(X_val).reshape(1, -1)[0]))+","+str(r2_score(y_test, saved_model.predict(X_test).reshape(1, -1)[0]))+","+str(r2_score(y_real, saved_model.predict(X_real).reshape(1, -1)[0]))+"\n")
#   f.write("spearman,"+str(spearmanr(y_train, saved_model.predict(X_train).reshape(1, -1)[0])[0])+","+str(spearmanr(y_val, saved_model.predict(X_val).reshape(1, -1)[0])[0])+","+str(spearmanr(y_test, saved_model.predict(X_test).reshape(1, -1)[0])[0])+","+str(spearmanr(y_real, saved_model.predict(X_real).reshape(1, -1)[0])[0])+"\n")

# for binary
# Sigmoid outputs are thresholded at 0.5 to obtain class labels, see
# https://stackoverflow.com/questions/68836551/keras-attributeerror-sequential-object-has-no-attribute-predict-classes
with open(dir_path+"/results.csv", "w") as f:
  f.write(",train,val,test\n")
  # one accuracy per split, in header order
  f.write("accuracy," + ",".join(
      str(accuracy_score(y_split, (saved_model.predict(X_split) > 0.5).astype("int32")))
      for X_split, y_split in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
  ) + "\n")


## Convolution-Based Model
Train from scratch

In [None]:
from keras.models import Sequential, load_model, model_from_json
from keras.layers import Input, Dense, Conv1D, MaxPooling2D, Dropout, Flatten, BatchNormalization
# from keras.optimizers import Adam
from tensorflow.keras.optimizers import Adam  # https://stackoverflow.com/questions/62707558/importerror-cannot-import-name-adam-from-keras-optimizers
from keras.callbacks import ModelCheckpoint, EarlyStopping  # https://machinelearningmastery.com/how-to-stop-training-deep-neural-networks-at-the-right-time-using-early-stopping/

from sklearn.metrics import r2_score
from scipy.stats import spearmanr  # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.spearmanr.html

# Silence warnings and all standard logging levels (same settings as the first setup cell).
warnings.filterwarnings('ignore')
logging.disable(1000)

# Re-seed Python's `random` module (used by the optional sequence shuffling in train_test_val).
random.seed(1234)

# One-hot encoding per nucleotide; column order is A, C, G, T. "X" maps to an all-zero
# row (presumably the padding symbol -- confirm against the padded datasets).
mapping = {"A": [1, 0, 0, 0], "T": [0, 0, 0, 1], "C": [0, 1, 0, 0], "G": [0, 0, 1, 0], "X":[0, 0, 0, 0]}  # cross referenced with kipoi data loader

In [None]:
# Configuration for the convolutional model.
# conv_*_set: get_model() marks a conv block as trainable unless its value is 2.
# last_conv_layer: 1 keeps the third conv block, anything else drops it.
# linear_mapping: 1 inserts a frozen 12-unit linear Dense layer before the output.
# shuffle: 1 shuffles the nucleotides inside every sequence before encoding.
# NOTE(review): conv_one_set / conv_two_set are 2, i.e. the first two conv blocks are
# frozen, yet no pretrained weights are loaded in this section ("Train from scratch");
# model_path / weights_path are not read by any code in this section -- confirm this is intended.
args = {"input_sequence_length":153,  # CHANGE WITH DATASET
        "target_name":"target",
        "number_of_outputs":1,
        "conv_one_set":2,
        "conv_two_set":2,
        "conv_three_set":0,
        "model_path":"models/model_genome.json",
        "weights_path":"models/weights_genome.h5",
        "linear_mapping":0,
        "last_conv_layer":1,
        "shuffle":0,
        'learning_rate':0.002,
        'batch_size':512,
        'num_epochs':100,
        'patience':20,
        'max_batch_steps':-1,
        'optimizer':'adam',
        'verbose_training':1}

In [None]:
def train_test_val(args, df):
    """Split ``df`` by its ``set`` column and one-hot encode the sequences.

    When ``args["shuffle"] == 1`` the nucleotides inside every sequence are
    shuffled first; this overwrites the ``sequence`` column of ``df`` in place.

    Returns:
        ``(X_train, y_train, X_val, y_val, X_test, y_test)`` as numpy arrays,
        with labels taken from the ``args["target_name"]`` column.
    """
    if args["shuffle"] == 1:
        shuffled = []
        for seq in df["sequence"]:
            shuffled.append(''.join(random.sample(seq, len(seq))))
        df.loc[:, "sequence"] = shuffled

    def _encode_split(split_name):
        # rows of one split -> (encoded sequences, labels)
        subset = df[df.set == split_name]
        encoded = np.array([get_ohe(seq) for seq in subset["sequence"]])
        labels = np.array(subset[args["target_name"]].tolist())
        return encoded, labels

    X_train, y_train = _encode_split("train")
    X_val, y_val = _encode_split("val")
    X_test, y_test = _encode_split("test")

    return X_train, y_train, X_val, y_val, X_test, y_test


def prepare_validation(df):
    """One-hot encode every sequence of ``df`` and return it with the ``target`` column.

    Returns:
        ``(X_test, y_test)`` as numpy arrays covering all rows of ``df``.
    """
    encoded = [get_ohe(sqnc) for sqnc in df["sequence"]]
    targets = df["target"].tolist()
    return np.array(encoded), np.array(targets)


def get_ohe(sequence):
    """One-hot encode a nucleotide string using the module-level ``mapping``.

    Returns an array with one row per character and four columns; a character
    that is missing from ``mapping`` raises ``KeyError``.
    """
    rows = []
    for nt in sequence:
        rows.append(mapping[nt])
    return np.array(rows)


def get_model(args):
    """Build the 1D-convolutional regression network described by ``args``.

    Architecture: two (optionally three) blocks of Conv1D(120 filters, width 5,
    relu) + BatchNormalization + Dropout(0.1), a Flatten, an optional frozen
    12-unit linear Dense layer, and a linear output layer.

    Args:
        args: configuration dict. ``conv_one_set`` / ``conv_two_set`` /
            ``conv_three_set`` freeze their block when equal to 2;
            ``last_conv_layer == 1`` keeps the third block;
            ``linear_mapping == 1`` adds the frozen Dense(12) layer;
            ``input_sequence_length`` fixes the input shape; the optional
            ``number_of_outputs`` (default 1) sets the output width.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    mdl = Sequential()

    def _add_conv_block(idx, set_key, **conv_kwargs):
        # one Conv1D + BatchNorm + Dropout block; the layer names must stay stable
        trainable = args[set_key] != 2  # True if conv layer should be trained
        mdl.add(Conv1D(120, 5, activation='relu', name="1DConv_%d" % idx, trainable=trainable, **conv_kwargs))
        mdl.add(BatchNormalization(name="batchNorm%d" % idx, trainable=trainable))
        mdl.add(Dropout(0.1, name="drop%d" % idx))

    _add_conv_block(1, "conv_one_set", input_shape=(args["input_sequence_length"], 4))
    _add_conv_block(2, "conv_two_set")
    if args["last_conv_layer"] == 1:  # if we are not removing last conv layer for simplicity
        _add_conv_block(3, "conv_three_set")

    mdl.add(Flatten(name="flat"))

    if args["linear_mapping"] == 1:
        mdl.add(Dense(12, activation='linear', name="dense1", trainable=False))

    # output layer: width comes from the config instead of a hard-coded 1
    mdl.add(Dense(args.get("number_of_outputs", 1), activation='linear', name="dense2"))

    return mdl


In [None]:
# One-hot encode the training frame (split by its `set` column) ...
X_train, y_train, X_val, y_val, X_test, y_test = train_test_val(args, arabidopsis)
# ... and the whole out-of-domain frame.
X_real, y_real = prepare_validation(validation)

In [None]:
model = get_model(args)  # initialize model

# create path to folder with results (timestamp + architecture flags + main hyper-parameters)
date = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
arch_settings = str(args["conv_one_set"])+str(args["conv_two_set"])+str(args["conv_three_set"])+str(args["linear_mapping"])
dir_path = "experiments/conv_validation_"+date+"_"+args["target_name"]+"_"+arch_settings+"_lr"+str(args['learning_rate'])+"_bs"+str(args['batch_size'])+"_ep"+str(args['num_epochs'])
# later cells write training_history.csv / results.csv here, so do not rely on a callback creating it
os.makedirs(dir_path, exist_ok=True)

# `learning_rate` replaces the deprecated `lr` keyword, which newer Keras releases reject
model.compile(optimizer=Adam(learning_rate=args["learning_rate"]),  # CHANGE IF WE WANT TO CHANGE OPTIM
              loss='mean_squared_error',
              metrics=[Spearman])

In [None]:
# TensorBoard logs live in a "logs" sub-folder of the results directory
logdir = os.path.join(dir_path, "logs")
tensorboard_callback = keras.callbacks.TensorBoard(
    log_dir=logdir,
    histogram_freq=1,
)  # https://stackoverflow.com/questions/59894720/keras-and-tensorboard-attributeerror-sequential-object-has-no-attribute-g

# stop once val_loss has not improved for `patience` epochs and roll back to the best weights
es_callback = EarlyStopping(
    monitor='val_loss',
    patience=args["patience"],
    restore_best_weights=True,
    verbose=1,
)

# keep only the checkpoint with the lowest val_loss
mc_callback = ModelCheckpoint(
    filepath=dir_path+'/best_model.h5',
    monitor='val_loss',
    save_best_only=True,
)

In [None]:
# train on the training split, monitoring the in-domain validation split
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=args["batch_size"],
    epochs=args["num_epochs"],
    callbacks=[tensorboard_callback, es_callback, mc_callback],
)

# persist the per-epoch metrics recorded by fit()
hist_df = pd.DataFrame(history.history)
hist_df.to_csv(dir_path+'/training_history.csv')

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
# Drop the raw DataFrames; the encoded arrays built from them are kept.
# NOTE(review): the "Recurrent-Based Model" section calls train_test_val(args, arabidopsis)
# and prepare_validation(validation) again, so the data-loading cell must be re-run
# before that section or those calls raise NameError.
del arabidopsis
del validation

In [None]:
# Evaluate the in-memory model as it stands after fit(); the EarlyStopping callback was
# created with restore_best_weights=True. Reloading the checkpoint is disabled.
# NOTE(review): re-enabling load_model would likely need
# custom_objects={"Spearman": Spearman} (or compile=False) because of the custom metric.

# saved_model = load_model(dir_path+'/best_model.h5')
saved_model = model

# Predict once per split and reuse the flattened predictions for both metrics
# (previously every split was predicted twice). Order matches the CSV header.
eval_sets = [(y_train, X_train), (y_val, X_val), (y_test, X_test), (y_real, X_real)]
eval_preds = [(y_split, saved_model.predict(X_split).reshape(1, -1)[0]) for y_split, X_split in eval_sets]

with open(dir_path+"/results.csv", "w") as f:
  f.write(",train,val,test,real\n")
  f.write("r2,"+",".join(str(r2_score(y_split, y_hat)) for y_split, y_hat in eval_preds)+"\n")
  f.write("spearman,"+",".join(str(spearmanr(y_split, y_hat)[0]) for y_split, y_hat in eval_preds)+"\n")
  # f.write("spearman2,"+str(Spearman(y_train, saved_model.predict(X_train).reshape(1, -1)[0])[0])+","+str(Spearman(y_val, saved_model.predict(X_val).reshape(1, -1)[0])[0])+","+str(Spearman(y_test, saved_model.predict(X_test).reshape(1, -1)[0])[0])+","+str(Spearman(y_real, saved_model.predict(X_real).reshape(1, -1)[0])[0])+"\n")

## Recurrent-Based Model

In [None]:
from keras.models import Sequential, load_model, model_from_json
from keras.callbacks import ModelCheckpoint, EarlyStopping  # https://machinelearningmastery.com/how-to-stop-training-deep-neural-networks-at-the-right-time-using-early-stopping/

from keras import layers
from keras import activations

from sklearn.metrics import r2_score
from scipy.stats import spearmanr # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.spearmanr.html

# Silence warnings and all standard logging levels (same settings as the first setup cell).
warnings.filterwarnings('ignore')
logging.disable(1000)

# Seed TensorFlow's global random generator for this section.
tf.random.set_seed(1202)  # https://www.tensorflow.org/api_docs/python/tf/random/set_seed

# Float version of the one-hot table (column order A, C, G, T; "X" is all zeros);
# the integer version used by the convolutional section is kept below for reference.
# mapping = {"A": [1, 0, 0, 0], "T": [0, 0, 0, 1], "C": [0, 1, 0, 0], "G": [0, 0, 1, 0], "X":[0, 0, 0, 0]}  # cross referenced with kipoi data loader
mapping = {"A": [1.0, 0.0, 0.0, 0.0], "T": [0.0, 0.0, 0.0, 1.0], "C": [0.0, 1.0, 0.0, 0.0], "G": [0.0, 0.0, 1.0, 0.0], "X":[0.0, 0.0, 0.0, 0.0]}  # cross referenced with kipoi data loader

In [None]:
# Configuration for the recurrent (LSTM) model.
# target_name selects the label column for the train/val/test splits; prepare_validation()
# always reads the "target" column of the out-of-domain frame.
# shuffle: 1 shuffles the nucleotides inside every sequence before encoding.
# NOTE(review): input_sequence_length is not read by this section's get_model() -- confirm
# whether the LSTM input shape should be fixed from it.
args = {"input_sequence_length":249,
        "target_name":"dev_target",
        "num_outs":1,
        "shuffle":0,
        'learning_rate':0.002,
        'batch_size':512,
        'num_epochs':100,
        'patience':20,
        'max_batch_steps':-1,
        'optimizer':'adam',
        'verbose_training':1}

In [None]:
def get_model(args):
    """Build the recurrent network: one 128-unit LSTM followed by a linear Dense readout.

    Args:
        args: configuration dict; ``num_outs`` sets the width of the output layer.

    Returns:
        An uncompiled ``keras.Sequential`` model (no input shape is declared).
    """
    net = keras.Sequential()

    recurrent = layers.LSTM(128)  # 128 internal units
    net.add(recurrent)

    readout = layers.Dense(args["num_outs"], activation=activations.linear)
    net.add(readout)

    return net


def train_test_val(args, df):
    """Split ``df`` by its ``set`` column and one-hot encode the sequences.

    When ``args["shuffle"] == 1`` the nucleotides inside every sequence are
    shuffled first; this overwrites the ``sequence`` column of ``df`` in place.

    Returns:
        ``(X_train, y_train, X_val, y_val, X_test, y_test)`` as numpy arrays,
        with labels taken from the ``args["target_name"]`` column.
    """
    if args["shuffle"] == 1:
        df.loc[:, "sequence"] = [
            ''.join(random.sample(seq, len(seq)))
            for seq in df["sequence"]
        ]

    arrays = []
    for split_name in ("train", "val", "test"):
        subset = df[df.set == split_name]
        encoded = np.array([get_ohe(seq) for seq in subset["sequence"]])
        labels = np.array(subset[args["target_name"]].tolist())
        arrays.extend((encoded, labels))

    return tuple(arrays)


def prepare_validation(df):
    """Encode every sequence of ``df`` and pair it with the ``target`` column.

    Returns:
        ``(X_test, y_test)`` as numpy arrays covering all rows of ``df``.
    """
    features = np.array([get_ohe(sqnc) for sqnc in df["sequence"]])
    labels = np.array(df["target"].tolist())
    return features, labels


def get_ohe(sequence):
    """One-hot encode a nucleotide string using the module-level ``mapping``.

    Returns an array with one row per character and four columns; a character
    that is missing from ``mapping`` raises ``KeyError``.
    """
    encoded_rows = [mapping[symbol] for symbol in sequence]
    return np.array(encoded_rows)

In [None]:
# One-hot encode the training frame (split by its `set` column) and the whole
# out-of-domain frame.
# NOTE(review): `arabidopsis` and `validation` are removed with `del` at the end of the
# convolutional section; re-run the data-loading cell (with a dataset that has the
# configured target column) before this cell.
X_train, y_train, X_val, y_val, X_test, y_test = train_test_val(args, arabidopsis)
X_real, y_real = prepare_validation(validation)

In [None]:
model = get_model(args)  # initialize model

# create path to folder with results (timestamp + main hyper-parameters)
date = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
dir_path = "experiments/recur_validation_"+date+"_"+args["target_name"]+"_lr"+str(args['learning_rate'])+"_bs"+str(args['batch_size'])+"_ep"+str(args['num_epochs'])
# later cells write training_history.csv / results.csv here, so do not rely on a callback creating it
os.makedirs(dir_path, exist_ok=True)

# `learning_rate` replaces the deprecated `lr` keyword, which newer Keras releases reject
model.compile(optimizer=Adam(learning_rate=args["learning_rate"]),  # CHANGE IF WE WANT TO CHANGE OPTIM
              loss='mean_squared_error',
              metrics=[Spearman])

In [None]:
# TensorBoard logs live in a "logs" sub-folder of the results directory
logdir = os.path.join(dir_path, "logs")
tensorboard_callback = keras.callbacks.TensorBoard(logdir, histogram_freq=1)  # https://stackoverflow.com/questions/59894720/keras-and-tensorboard-attributeerror-sequential-object-has-no-attribute-g
# patience now comes from the config (as in the other sections) instead of a hard-coded 20
es_callback = EarlyStopping(monitor='val_loss', verbose=1, patience=args["patience"], restore_best_weights=True)
# keep only the checkpoint with the lowest val_loss
mc_callback = ModelCheckpoint(dir_path+'/best_model.h5', monitor='val_loss', save_best_only=True)

In [None]:
# train on the training split, monitoring the in-domain validation split
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=args["batch_size"],
    epochs=args["num_epochs"],
    callbacks=[tensorboard_callback, es_callback, mc_callback],
)

# persist the per-epoch metrics recorded by fit()
hist_df = pd.DataFrame(history.history)
hist_df.to_csv(dir_path+'/training_history.csv')

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 59: early stopping


In [None]:
# Evaluate the in-memory model as it stands after fit(); the EarlyStopping callback was
# created with restore_best_weights=True. Reloading the checkpoint is disabled.
# NOTE(review): re-enabling load_model would likely need
# custom_objects={"Spearman": Spearman} (or compile=False) because of the custom metric.

# saved_model = load_model(dir_path+'/best_model.h5')
saved_model = model

# Predict once per split and reuse the flattened predictions for both metrics
# (previously every split was predicted twice). Order matches the CSV header.
eval_sets = [(y_train, X_train), (y_val, X_val), (y_test, X_test), (y_real, X_real)]
eval_preds = [(y_split, saved_model.predict(X_split).reshape(1, -1)[0]) for y_split, X_split in eval_sets]

with open(dir_path+"/results.csv", "w") as f:
  f.write(",train,val,test,real\n")
  f.write("r2,"+",".join(str(r2_score(y_split, y_hat)) for y_split, y_hat in eval_preds)+"\n")
  f.write("spearman,"+",".join(str(spearmanr(y_split, y_hat)[0]) for y_split, y_hat in eval_preds)+"\n")