name: new_dataset_tests_more_epochs.ipynb

This notebook is very similar to new_dataset_tests.ipynb, but runs slightly different experiments.

Please note that this notebook is not set up to write its results to the current "experiments" folder. I do not recommend running it; please use `MLME-v2.0-conv_model.py` instead.

---

See https://github.com/tensorflow/tensorflow/issues/38012 for the Keras "no module" import error.

In [None]:
# mount google drive
# NOTE(review): `google.colab` is only importable inside a Colab runtime. The
# `pwd` output recorded further down shows a local path, so this cell looks
# stale for local runs — confirm whether it is still needed.

from google.colab import drive
drive.mount('/content/drive')

%cd '/content/drive/Shareddrives/NRC_Amii_Agronomics_Project/nrc-ml-plant-genomics/'

In [None]:
import keras
import warnings, logging
import json
import numpy as np
import pandas as pd
import datetime, os

from models.base_model import BaseModel
from keras.models import Sequential, load_model, model_from_json
from keras.layers import Input, Dense, Conv1D, MaxPooling2D, Dropout, Flatten, BatchNormalization
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, EarlyStopping  # https://machinelearningmastery.com/how-to-stop-training-deep-neural-networks-at-the-right-time-using-early-stopping/
import tensorflow as tf

from sklearn.metrics import r2_score
from scipy.stats import spearmanr # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.spearmanr.html

# Silence library chatter so the cell outputs stay readable.
# NOTE: this hides *all* Python warnings and disables every logging record
# (1000 is above logging.CRITICAL), so genuine problems will not be reported.
warnings.filterwarnings('ignore')
logging.disable(1000)

# Seed NumPy and TensorFlow so weight initialisation, dropout and shuffling are
# repeatable after "Restart Kernel -> Run All". Without this, every run of the
# experiments below produces different metrics.
# NOTE(review): some GPU kernels may still be non-deterministic — confirm if
# bit-exact reproducibility is required.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

%load_ext tensorboard

In [None]:
pwd

'/Users/sarahdavis/Documents/internship/project_files/MPRA-DragoNN'

In [None]:
ls

LICENSE                                 kipoi_playground.ipynb
README.md                               linear_mapping.ipynb
__init__.py                             [34mlogs[m[m/
commands_for_setup.txt                  main.py
create_final_dataset.ipynb              [34mmodels[m[m/
cross_val_linear_mapping.ipynb          [34mnew_data[m[m/
[34mdata[m[m/                                   new_dataset_tests.ipynb
data_exploration.ipynb                  new_dataset_tests_default_params.ipynb
[34mdata_loader[m[m/                            new_dataset_tests_duplicate.ipynb
[34mevaluator[m[m/                              new_dataset_tests_more_epochs.ipynb
[34mexample[m[m/                                [34mnew_models[m[m/
generate_data_format.ipynb              predict_on_NRC_data.ipynb
keras_model_loading.ipynb               requirements.txt
keras_retraining.ipynb                  requirements_exact.txt
keras_training.ipynb                    [34mtraine

In [None]:
# load in model as it is from kipoi

# The architecture is stored as a JSON string; read it in full.
with open('models/model.json', 'r') as json_file:
    json_savedModel = json_file.read()
    
# Rebuild the network from its JSON description, then load the pre-trained
# weights into it. `pretrained_model` is read as a global by set_weights().
pretrained_model = model_from_json(json_savedModel)
pretrained_model.load_weights('models/pretrained.hdf5')

In [None]:
pretrained_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 141, 120)          2520      
_________________________________________________________________
batch_normalization_1 (Batch (None, 141, 120)          480       
_________________________________________________________________
dropout_1 (Dropout)          (None, 141, 120)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 137, 120)          72120     
_________________________________________________________________
batch_normalization_2 (Batch (None, 137, 120)          480       
_________________________________________________________________
dropout_2 (Dropout)          (None, 137, 120)          0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 133, 120)         

In [None]:
# Nucleotide -> one-hot row (cross referenced with kipoi data loader).
mapping = {
    "A": [1, 0, 0, 0],
    "T": [0, 0, 0, 1],
    "C": [0, 1, 0, 0],
    "G": [0, 0, 1, 0],
}


def get_ohe(sequence):
    """One-hot encode a nucleotide string into the format the model consumes.

    Args:
        sequence: iterable of single-character nucleotides; each must be a key
            of ``mapping`` (upper-case A/T/C/G).

    Returns:
        ``np.ndarray`` with one 4-element row per nucleotide.

    Raises:
        KeyError: if a character is not present in ``mapping``.
    """
    encoded_rows = []
    for nucleotide in sequence:
        encoded_rows.append(mapping[nucleotide])
    return np.array(encoded_rows)

def get_model(treatment_dict):
    """Build the three-block Conv1D regression network (no weights loaded here).

    Each block is Conv1D(120, 5, relu) -> BatchNormalization -> Dropout(0.1).
    The blocks are followed by Flatten and a single linear output unit.

    Args:
        treatment_dict: maps "conv1", "conv2" and "conv3" to a treatment code.
            A block's Conv1D and BatchNormalization layers are frozen
            (``trainable=False``) only when its code is 2.

    Returns:
        The uncompiled ``Sequential`` model.
    """
    network = Sequential()

    for block_no in (1, 2, 3):
        is_trainable = treatment_dict[f"conv{block_no}"] != 2  # bool
        conv_kwargs = {
            "activation": "relu",
            "name": f"1DConv_{block_no}",
            "trainable": is_trainable,
        }
        if block_no == 1:
            # Only the first layer declares the input: 145 positions x 4 channels.
            conv_kwargs["input_shape"] = (145, 4)

        network.add(Conv1D(120, 5, **conv_kwargs))
        network.add(BatchNormalization(name=f"batchNorm{block_no}", trainable=is_trainable))
        network.add(Dropout(0.1, name=f"drop{block_no}"))

    network.add(Flatten(name="flat"))
    network.add(Dense(1, activation='linear', name="dense2"))

    return network

def set_weights(treatment_dict, mdl):
    """Copy pre-trained weights into the conv blocks selected by ``treatment_dict``.

    A block whose treatment code is not 0 has its three layers (Conv1D,
    BatchNormalization, Dropout) overwritten with the weights of the layers at
    the same indices in the global ``pretrained_model``. Each layer is printed
    as it is updated.

    Args:
        treatment_dict: maps "conv1", "conv2" and "conv3" to a treatment code.
        mdl: model produced by ``get_model``; modified in place.

    Returns:
        The same ``mdl`` object.
    """
    # (treatment key, layer indices belonging to that block)
    block_layout = (
        ("conv1", (0, 1, 2)),
        ("conv2", (3, 4, 5)),
        ("conv3", (6, 7, 8)),
    )
    layers_to_set = [
        layer_idx
        for block_key, block_indices in block_layout
        if treatment_dict[block_key] != 0
        for layer_idx in block_indices
    ]

    for layer_idx in layers_to_set:
        target_layer = mdl.layers[layer_idx]
        print(target_layer)
        target_layer.set_weights(pretrained_model.layers[layer_idx].get_weights())

    return mdl

# NOT USED
def return_spearman(true, pred):
    """Return the Spearman correlation of ``true`` and ``pred`` as a TF tensor.

    The coefficient is computed by SciPy; the p-value is discarded.
    """
    rho, _pvalue = spearmanr(true, pred)
    return tf.convert_to_tensor(rho)

# NOT USED
def correlation(x, y):  # https://www.kaggle.com/carlolepelaars/understanding-the-metric-spearman-s-rho
    """Correlation coefficient of ``x`` and ``y`` built from TensorFlow ops.

    Computes mean(centred_x * centred_y) / (std(centred_x) * std(centred_y)).
    No ranking is applied here, so on raw values this is a Pearson-style
    correlation. The result is undefined (division by zero) when either input
    is constant.
    """
    x_centred = x - tf.math.reduce_mean(x)
    y_centred = y - tf.math.reduce_mean(y)
    covariance = tf.math.reduce_mean(tf.multiply(x_centred, y_centred))
    spread = tf.math.reduce_std(x_centred) * tf.math.reduce_std(y_centred)
    return covariance / spread

In [None]:
# loading in data
df = pd.read_csv("data/processed/hidra_chloroplast_15.csv")

In [None]:
df.head()

Unnamed: 0,organelle,start_coords,end_coords,sequence,control_raw_coverage,treatment_raw_coverage,control_norm_coverage,treatment_norm_coverage,target,A,T,C,G,set
0,NC_016734.1,0,145,AATCATAATAACTTGGTCCCGGGCATCACGGGCGAACGACGGGAAT...,998,316,338.31,405.52,0.26,39,40,40,26,test
1,NC_016734.1,5,150,TAATAACTTGGTCCCGGGCATCACGGGCGAACGACGGGAATTGAAC...,998,318,338.31,408.08,0.27,40,40,39,26,test
2,NC_016734.1,10,155,ACTTGGTCCCGGGCATCACGGGCGAACGACGGGAATTGAACCCGCG...,998,318,338.31,408.08,0.27,38,38,42,27,test
3,NC_016734.1,15,160,GTCCCGGGCATCACGGGCGAACGACGGGAATTGAACCCGCGATGGT...,998,318,338.31,408.08,0.27,40,36,42,27,test
4,NC_016734.1,20,165,GGGCATCACGGGCGAACGACGGGAATTGAACCCGCGATGGTGAATT...,998,318,338.31,408.08,0.27,41,39,39,26,test


In [None]:
# split into different datasets

def _split_xy(split_name):
    """Return (rows, one-hot inputs, targets) for the rows of ``df`` whose "set" equals ``split_name``."""
    subset = df[df["set"] == split_name]
    features = np.array([get_ohe(seq) for seq in subset["sequence"]])
    targets = np.array(subset["target"].tolist())
    return subset, features, targets


train_df, X_train, y_train = _split_xy("train")
val_df, X_val, y_val = _split_xy("val")
test_df, X_test, y_test = _split_xy("test")

In [None]:
print(X_train.shape, X_val.shape, X_test.shape)
print(y_train.shape, y_val.shape, y_test.shape)

(23458, 145, 4) (2927, 145, 4) (2927, 145, 4)
(23458,) (2927,) (2927,)


### Re-train from scratch

In [None]:
# 0 for init random weights & train
# 1 for load pre-trained weights and train
# 2 for load pre-trained weights and freeze

layer_treatment = {"conv1":0, 
                   "conv2":0, 
                   "conv3":0}

model = get_model(layer_treatment)
model = set_weights(layer_treatment, model)

In [None]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
1DConv_1 (Conv1D)            (None, 141, 120)          2520      
_________________________________________________________________
batchNorm1 (BatchNormalizati (None, 141, 120)          480       
_________________________________________________________________
drop1 (Dropout)              (None, 141, 120)          0         
_________________________________________________________________
1DConv_2 (Conv1D)            (None, 137, 120)          72120     
_________________________________________________________________
batchNorm2 (BatchNormalizati (None, 137, 120)          480       
_________________________________________________________________
drop2 (Dropout)              (None, 137, 120)          0         
_________________________________________________________________
1DConv_3 (Conv1D)            (None, 133, 120)          7

In [None]:
rm -rf ./logs/

In [None]:
model.compile(optimizer='adam', 
              loss='mean_squared_error') 

In [None]:
# Each execution logs to its own timestamped sub-directory of logs/.
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir, histogram_freq=1)  # https://stackoverflow.com/questions/59894720/keras-and-tensorboard-attributeerror-sequential-object-has-no-attribute-g

# Keep only the model with the best val_loss on disk. The same file path is
# used by every experiment in this notebook, so each run overwrites the last.
mc_callback = ModelCheckpoint(filepath='new_models/best_model.h5', monitor='val_loss', save_best_only=True)
# Give up after 20 epochs without a val_loss improvement.
es_callback = EarlyStopping(monitor='val_loss', patience=20, verbose=1)

In [None]:
history = model.fit(X_train, y_train, 
                    epochs=100, 
                    validation_data=(X_val, y_val),
                    callbacks=[tensorboard_callback, es_callback, mc_callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 00035: early stopping


In [None]:
%tensorboard --logdir logs

In [None]:
saved_model = load_model('new_models/best_model.h5')

In [None]:
# R^2 of the checkpointed model on each split
for _label, _features, _targets in (("Train", X_train, y_train),
                                    ("Val", X_val, y_val),
                                    ("Test", X_test, y_test)):
    print(_label, r2_score(_targets, saved_model.predict(_features).ravel()))

Train 0.9550730485166761
Val 0.21636633526044513
Test 0.11513153614742633


In [None]:
# Spearman rank correlation (coefficient and p-value) of the checkpointed model on each split
for _label, _features, _targets in (("Train", X_train, y_train),
                                    ("Val", X_val, y_val),
                                    ("Test", X_test, y_test)):
    print(_label, spearmanr(_targets, saved_model.predict(_features).ravel()))

Train SpearmanrResult(correlation=0.986926687846542, pvalue=0.0)
Val SpearmanrResult(correlation=0.46815244878318235, pvalue=2.3145440504455403e-159)
Test SpearmanrResult(correlation=0.3302793981577226, pvalue=1.9260677958479014e-75)


In [None]:
ls

LICENSE                                 linear_mapping.ipynb
README.md                               [34mlogs[m[m/
__init__.py                             main.py
commands_for_setup.txt                  [34mmodels[m[m/
create_final_dataset.ipynb              [34mnew_data[m[m/
cross_val_linear_mapping.ipynb          new_dataset_tests.ipynb
[34mdata[m[m/                                   new_dataset_tests_default_params.ipynb
data_exploration.ipynb                  new_dataset_tests_duplicate.ipynb
[34mdata_loader[m[m/                            new_dataset_tests_more_epochs.ipynb
[34mevaluator[m[m/                              [34mnew_models[m[m/
[34mexample[m[m/                                [34mnew_scripts[m[m/
generate_data_format.ipynb              predict_on_NRC_data.ipynb
keras_model_loading.ipynb               requirements.txt
keras_retraining.ipynb                  requirements_exact.txt
keras_training.ipynb                    [34mtrain

In [None]:
r2_score(y_train, saved_model.predict(X_train).reshape(1, -1)[0])

0.24124043437203846

In [None]:
# Persist R^2 and Spearman's rho for every split as a three-column CSV.
#
# Only the correlation coefficient is written. str() of the full spearmanr
# result contains a comma ("correlation=..., pvalue=..."), which would add a
# fourth field to each row and no longer match the ",R2,spearman" header.
# Predictions are computed once per split and reused for both metrics.
with open("results.csv", "w") as f:
    f.write(",R2,spearman\n")
    for split_name, X_split, y_split in (("train", X_train, y_train),
                                         ("val", X_val, y_val),
                                         ("test", X_test, y_test)):
        y_pred = saved_model.predict(X_split).ravel()
        rho, _pvalue = spearmanr(y_split, y_pred)
        # Every row, including the last, ends with a newline.
        f.write(f"{split_name},{r2_score(y_split, y_pred)},{rho}\n")

IndexError: invalid index to scalar variable.

### Re-train from pre-trained starting point

In [None]:
# 0 for init random weights & train
# 1 for load pre-trained weights and train
# 2 for load pre-trained weights and freeze

layer_treatment = {"conv1":1, 
                   "conv2":1, 
                   "conv3":1}

model = get_model(layer_treatment)
model = set_weights(layer_treatment, model)

<keras.layers.convolutional.Conv1D object at 0x7f9065597250>
<keras.layers.normalization_v2.BatchNormalization object at 0x7f90653ad550>
<keras.layers.core.Dropout object at 0x7f90655b0e50>
<keras.layers.convolutional.Conv1D object at 0x7f9080c9d4f0>
<keras.layers.normalization_v2.BatchNormalization object at 0x7f906569ea90>
<keras.layers.core.Dropout object at 0x7f906530e2e0>
<keras.layers.convolutional.Conv1D object at 0x7f9062cafa00>
<keras.layers.normalization_v2.BatchNormalization object at 0x7f90819e89a0>
<keras.layers.core.Dropout object at 0x7f9062eb5dc0>


In [None]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
1DConv_1 (Conv1D)            (None, 141, 120)          2520      
_________________________________________________________________
batchNorm1 (BatchNormalizati (None, 141, 120)          480       
_________________________________________________________________
drop1 (Dropout)              (None, 141, 120)          0         
_________________________________________________________________
1DConv_2 (Conv1D)            (None, 137, 120)          72120     
_________________________________________________________________
batchNorm2 (BatchNormalizati (None, 137, 120)          480       
_________________________________________________________________
drop2 (Dropout)              (None, 137, 120)          0         
_________________________________________________________________
1DConv_3 (Conv1D)            (None, 133, 120)         

In [None]:
rm -rf ./logs/

In [None]:
model.compile(optimizer='adam', 
              loss='mean_squared_error') 

In [None]:
# Each execution logs to its own timestamped sub-directory of logs/.
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir, histogram_freq=1)  # https://stackoverflow.com/questions/59894720/keras-and-tensorboard-attributeerror-sequential-object-has-no-attribute-g

# Keep only the model with the best val_loss on disk. The same file path is
# used by every experiment in this notebook, so each run overwrites the last.
mc_callback = ModelCheckpoint(filepath='new_models/best_model.h5', monitor='val_loss', save_best_only=True)
# Give up after 20 epochs without a val_loss improvement.
es_callback = EarlyStopping(monitor='val_loss', patience=20, verbose=1)

In [None]:
history = model.fit(X_train, y_train, 
                    epochs=100, 
                    validation_data=(X_val, y_val),
                    callbacks=[tensorboard_callback, es_callback, mc_callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 00025: early stopping


In [None]:
%tensorboard --logdir logs

Reusing TensorBoard on port 6006 (pid 18270), started 0:14:48 ago. (Use '!kill 18270' to kill it.)

In [None]:
saved_model = load_model('new_models/best_model.h5')

In [None]:
# R^2 of the checkpointed model on each split
for _label, _features, _targets in (("Train", X_train, y_train),
                                    ("Val", X_val, y_val),
                                    ("Test", X_test, y_test)):
    print(_label, r2_score(_targets, saved_model.predict(_features).ravel()))

Train 0.9733272884534682
Val 0.2439009604016521
Test 0.2109779255973303


In [None]:
# Spearman rank correlation (coefficient and p-value) of the checkpointed model on each split
for _label, _features, _targets in (("Train", X_train, y_train),
                                    ("Val", X_val, y_val),
                                    ("Test", X_test, y_test)):
    print(_label, spearmanr(_targets, saved_model.predict(_features).ravel()))

Train SpearmanrResult(correlation=0.9899601158838035, pvalue=0.0)
Val SpearmanrResult(correlation=0.48257317344159556, pvalue=1.2588307389014187e-170)
Test SpearmanrResult(correlation=0.43229423636585723, pvalue=1.3642874908293733e-133)


### Train only output layer

In [None]:
# 0 for init random weights & train
# 1 for load pre-trained weights and train
# 2 for load pre-trained weights and freeze

layer_treatment = {"conv1":2, 
                   "conv2":2, 
                   "conv3":2}

model = get_model(layer_treatment)
model = set_weights(layer_treatment, model)

<keras.layers.convolutional.Conv1D object at 0x7f9065a81c40>
<keras.layers.normalization_v2.BatchNormalization object at 0x7f9065a88ee0>
<keras.layers.core.Dropout object at 0x7f9065a8e1f0>
<keras.layers.convolutional.Conv1D object at 0x7f9065a8e6a0>
<keras.layers.normalization_v2.BatchNormalization object at 0x7f9065a88100>
<keras.layers.core.Dropout object at 0x7f9065a9de80>
<keras.layers.convolutional.Conv1D object at 0x7f9065a9dee0>
<keras.layers.normalization_v2.BatchNormalization object at 0x7f9065713a60>
<keras.layers.core.Dropout object at 0x7f90659a80a0>


In [None]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
1DConv_1 (Conv1D)            (None, 141, 120)          2520      
_________________________________________________________________
batchNorm1 (BatchNormalizati (None, 141, 120)          480       
_________________________________________________________________
drop1 (Dropout)              (None, 141, 120)          0         
_________________________________________________________________
1DConv_2 (Conv1D)            (None, 137, 120)          72120     
_________________________________________________________________
batchNorm2 (BatchNormalizati (None, 137, 120)          480       
_________________________________________________________________
drop2 (Dropout)              (None, 137, 120)          0         
_________________________________________________________________
1DConv_3 (Conv1D)            (None, 133, 120)         

In [None]:
rm -rf ./logs/

In [None]:
model.compile(optimizer='adam', 
              loss='mean_squared_error') 

In [None]:
# Each execution logs to its own timestamped sub-directory of logs/.
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir, histogram_freq=1)  # https://stackoverflow.com/questions/59894720/keras-and-tensorboard-attributeerror-sequential-object-has-no-attribute-g

# Keep only the model with the best val_loss on disk. The same file path is
# used by every experiment in this notebook, so each run overwrites the last.
mc_callback = ModelCheckpoint(filepath='new_models/best_model.h5', monitor='val_loss', save_best_only=True)
# Give up after 20 epochs without a val_loss improvement.
es_callback = EarlyStopping(monitor='val_loss', patience=20, verbose=1)

In [None]:
history = model.fit(X_train, y_train, 
                    epochs=100, 
                    validation_data=(X_val, y_val),
                    callbacks=[tensorboard_callback, es_callback, mc_callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 00021: early stopping


In [None]:
%tensorboard --logdir logs

Reusing TensorBoard on port 6006 (pid 18270), started 0:18:03 ago. (Use '!kill 18270' to kill it.)

In [None]:
saved_model = load_model('new_models/best_model.h5')

In [None]:
# R^2 of the checkpointed model on each split
for _label, _features, _targets in (("Train", X_train, y_train),
                                    ("Val", X_val, y_val),
                                    ("Test", X_test, y_test)):
    print(_label, r2_score(_targets, saved_model.predict(_features).ravel()))

Train 0.36138759014684463
Val 0.2518556098884309
Test 0.1538455057442818


In [None]:
# Spearman rank correlation (coefficient and p-value) of the checkpointed model on each split
for _label, _features, _targets in (("Train", X_train, y_train),
                                    ("Val", X_val, y_val),
                                    ("Test", X_test, y_test)):
    print(_label, spearmanr(_targets, saved_model.predict(_features).ravel()))

Train SpearmanrResult(correlation=0.6142231015764236, pvalue=0.0)
Val SpearmanrResult(correlation=0.5186443300541888, pvalue=2.75254299500723e-201)
Test SpearmanrResult(correlation=0.4242647171542155, pvalue=3.110732663352062e-128)


### Freeze Conv 1, 2, Train Conv 3 from Scratch

In [None]:
# 0 for init random weights & train
# 1 for load pre-trained weights and train
# 2 for load pre-trained weights and freeze

layer_treatment = {"conv1":2, 
                   "conv2":2, 
                   "conv3":0}

model = get_model(layer_treatment)
model = set_weights(layer_treatment, model)

<keras.layers.convolutional.Conv1D object at 0x7f90655e7340>
<keras.layers.normalization_v2.BatchNormalization object at 0x7f9062c37a60>
<keras.layers.core.Dropout object at 0x7f906567b100>
<keras.layers.convolutional.Conv1D object at 0x7f9062d1d340>
<keras.layers.normalization_v2.BatchNormalization object at 0x7f906567bb50>
<keras.layers.core.Dropout object at 0x7f90652d33d0>


In [None]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
1DConv_1 (Conv1D)            (None, 141, 120)          2520      
_________________________________________________________________
batchNorm1 (BatchNormalizati (None, 141, 120)          480       
_________________________________________________________________
drop1 (Dropout)              (None, 141, 120)          0         
_________________________________________________________________
1DConv_2 (Conv1D)            (None, 137, 120)          72120     
_________________________________________________________________
batchNorm2 (BatchNormalizati (None, 137, 120)          480       
_________________________________________________________________
drop2 (Dropout)              (None, 137, 120)          0         
_________________________________________________________________
1DConv_3 (Conv1D)            (None, 133, 120)         

In [None]:
rm -rf ./logs/

In [None]:
model.compile(optimizer='adam', 
              loss='mean_squared_error') 

In [None]:
# Each execution logs to its own timestamped sub-directory of logs/.
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir, histogram_freq=1)  # https://stackoverflow.com/questions/59894720/keras-and-tensorboard-attributeerror-sequential-object-has-no-attribute-g

# Keep only the model with the best val_loss on disk. The same file path is
# used by every experiment in this notebook, so each run overwrites the last.
mc_callback = ModelCheckpoint(filepath='new_models/best_model.h5', monitor='val_loss', save_best_only=True)
# Give up after 20 epochs without a val_loss improvement.
es_callback = EarlyStopping(monitor='val_loss', patience=20, verbose=1)

In [None]:
history = model.fit(X_train, y_train, 
                    epochs=100, 
                    validation_data=(X_val, y_val),
                    callbacks=[tensorboard_callback, es_callback, mc_callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 00050: early stopping


In [None]:
%tensorboard --logdir logs

Reusing TensorBoard on port 6006 (pid 18270), started 0:34:55 ago. (Use '!kill 18270' to kill it.)

In [None]:
saved_model = load_model('new_models/best_model.h5')

In [None]:
# R^2 of the checkpointed model on each split
for _label, _features, _targets in (("Train", X_train, y_train),
                                    ("Val", X_val, y_val),
                                    ("Test", X_test, y_test)):
    print(_label, r2_score(_targets, saved_model.predict(_features).ravel()))

Train 0.9808997557103227
Val 0.10790987926415563
Test 0.057492984290018145


In [None]:
# Spearman rank correlation (coefficient and p-value) of the checkpointed model on each split
for _label, _features, _targets in (("Train", X_train, y_train),
                                    ("Val", X_val, y_val),
                                    ("Test", X_test, y_test)):
    print(_label, spearmanr(_targets, saved_model.predict(_features).ravel()))

Train SpearmanrResult(correlation=0.9929224186001404, pvalue=0.0)
Val SpearmanrResult(correlation=0.3145536060830988, pvalue=3.1210976408112577e-68)
Test SpearmanrResult(correlation=0.22313449752737607, pvalue=2.395036362764694e-34)


### Freeze Conv 1, 2, Train Conv 3 from Pre-trained Init

In [None]:
# 0 for init random weights & train
# 1 for load pre-trained weights and train
# 2 for load pre-trained weights and freeze

layer_treatment = {"conv1":2, 
                   "conv2":2, 
                   "conv3":1}

model = get_model(layer_treatment)
model = set_weights(layer_treatment, model)

<keras.layers.convolutional.Conv1D object at 0x7f90653bc7c0>
<keras.layers.normalization_v2.BatchNormalization object at 0x7f90567492b0>
<keras.layers.core.Dropout object at 0x7f90819fa580>
<keras.layers.convolutional.Conv1D object at 0x7f905672e2b0>
<keras.layers.normalization_v2.BatchNormalization object at 0x7f908085d0a0>
<keras.layers.core.Dropout object at 0x7f905671e0d0>
<keras.layers.convolutional.Conv1D object at 0x7f905671e070>
<keras.layers.normalization_v2.BatchNormalization object at 0x7f905671e430>
<keras.layers.core.Dropout object at 0x7f90638a9c70>


In [None]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
1DConv_1 (Conv1D)            (None, 141, 120)          2520      
_________________________________________________________________
batchNorm1 (BatchNormalizati (None, 141, 120)          480       
_________________________________________________________________
drop1 (Dropout)              (None, 141, 120)          0         
_________________________________________________________________
1DConv_2 (Conv1D)            (None, 137, 120)          72120     
_________________________________________________________________
batchNorm2 (BatchNormalizati (None, 137, 120)          480       
_________________________________________________________________
drop2 (Dropout)              (None, 137, 120)          0         
_________________________________________________________________
1DConv_3 (Conv1D)            (None, 133, 120)         

In [None]:
rm -rf ./logs/

In [None]:
model.compile(optimizer='adam', 
              loss='mean_squared_error') 

In [None]:
# Each execution logs to its own timestamped sub-directory of logs/.
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir, histogram_freq=1)  # https://stackoverflow.com/questions/59894720/keras-and-tensorboard-attributeerror-sequential-object-has-no-attribute-g

# Keep only the model with the best val_loss on disk. The same file path is
# used by every experiment in this notebook, so each run overwrites the last.
mc_callback = ModelCheckpoint(filepath='new_models/best_model.h5', monitor='val_loss', save_best_only=True)
# Give up after 20 epochs without a val_loss improvement.
es_callback = EarlyStopping(monitor='val_loss', patience=20, verbose=1)

In [None]:
history = model.fit(X_train, y_train, 
                    epochs=100, 
                    validation_data=(X_val, y_val),
                    callbacks=[tensorboard_callback, es_callback, mc_callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 00038: early stopping


In [None]:
%tensorboard --logdir logs

Reusing TensorBoard on port 6006 (pid 18270), started 0:46:52 ago. (Use '!kill 18270' to kill it.)

In [None]:
saved_model = load_model('new_models/best_model.h5')

In [None]:
# R^2 of the checkpointed model on each split
for _label, _features, _targets in (("Train", X_train, y_train),
                                    ("Val", X_val, y_val),
                                    ("Test", X_test, y_test)):
    print(_label, r2_score(_targets, saved_model.predict(_features).ravel()))

Train 0.9629402427068685
Val 0.19670003919546697
Test 0.09247332760955385


In [None]:
# Spearman rank correlation (coefficient and p-value) of the checkpointed model on each split
for _label, _features, _targets in (("Train", X_train, y_train),
                                    ("Val", X_val, y_val),
                                    ("Test", X_test, y_test)):
    print(_label, spearmanr(_targets, saved_model.predict(_features).ravel()))

Train SpearmanrResult(correlation=0.992360968151913, pvalue=0.0)
Val SpearmanrResult(correlation=0.43533615454640867, pvalue=1.1665585659417078e-135)
Test SpearmanrResult(correlation=0.3156834718041059, pvalue=9.786192991418566e-69)


---
### Spearman Results

0 for init random weights & train  
1 for load pre-trained weights and train  
2 for load pre-trained weights and freeze  

First go:
000: val = 0.47753801170363896, test = 0.2866754020651326  
111: val = 0.47855750422978954, test = 0.279400514316224  
222: val = 0.5858565342434344, test = 0.3741217612323338  
220: val = 0.41782505305420903, test = 0.23175390117095385  
221: val = 0.4622724735247824, test = 0.26885707718958146  
2220: val = 0.5984249204802108, test = 0.32715495828441904

Second go:
000: val = 0.44123205768583007, test = 0.17504952705223212  
111: val = 0.4053507361644881, test = 0.2592776174804178  
222: val = 0.5290568130486669, test = 0.3636127380447914  
220: val = 0.392220547475502, test = 0.24043305401824872  
221: val = 0.40291026906804733, test = 0.22825701637010898  
2220: val = 0.5988624237048076, test = 0.3301200533685432

### Train only output layer (pre-trained 12-unit dense layer kept frozen)

In [None]:
def get_model(treatment_dict):
    """Build the conv network with a frozen 12-unit dense layer and a new output unit.

    NOTE: this redefinition shadows the ``get_model`` declared earlier in the
    notebook. Re-running earlier experiment cells after this one will use this
    architecture instead.

    The three blocks are Conv1D(120, 5, relu) -> BatchNormalization ->
    Dropout(0.1). They are followed by Flatten, a non-trainable linear
    Dense(12) ("dense2") and a trainable linear Dense(1) ("dense3").

    Args:
        treatment_dict: maps "conv1", "conv2" and "conv3" to a treatment code.
            A block's Conv1D and BatchNormalization layers are frozen only when
            its code is 2.

    Returns:
        The uncompiled ``Sequential`` model.
    """
    network = Sequential()

    for block_no in (1, 2, 3):
        is_trainable = treatment_dict[f"conv{block_no}"] != 2  # bool
        conv_kwargs = {
            "activation": "relu",
            "name": f"1DConv_{block_no}",
            "trainable": is_trainable,
        }
        if block_no == 1:
            # Only the first layer declares the input: 145 positions x 4 channels.
            conv_kwargs["input_shape"] = (145, 4)

        network.add(Conv1D(120, 5, **conv_kwargs))
        network.add(BatchNormalization(name=f"batchNorm{block_no}", trainable=is_trainable))
        network.add(Dropout(0.1, name=f"drop{block_no}"))

    network.add(Flatten(name="flat"))
    network.add(Dense(12, activation='linear', name="dense2", trainable=False))
    network.add(Dense(1, activation='linear', name="dense3"))

    return network

def set_weights(treatment_dict, mdl):  # sets appropriate model weights from pretrained
    """Copy weights from the global ``pretrained_model`` into ``mdl``.

    The treatment codes are honoured per conv block: code 0 ("init random
    weights & train") leaves that block's layers at their fresh initialisation,
    while codes 1 and 2 copy the pre-trained weights. Previously
    ``treatment_dict`` was ignored and every layer was overwritten, so a
    "from scratch" run silently started from the pre-trained weights.

    Args:
        treatment_dict: mapping with optional keys "conv1", "conv2", "conv3".
            A missing key (or a falsy ``treatment_dict``) means "copy", which
            matches the old behaviour.
        mdl: model whose layers 0-10 correspond index-for-index to
            ``pretrained_model.layers``.

    Returns:
        ``mdl`` itself, modified in place.
    """
    treatments = treatment_dict or {}
    # Layer indices per block, in the order get_model adds them:
    # Conv1D, BatchNormalization, Dropout.
    block_layers = {"conv1": (0, 1, 2), "conv2": (3, 4, 5), "conv3": (6, 7, 8)}
    keep_random = {
        layer_idx
        for block, layer_indices in block_layers.items()
        if treatments.get(block, 1) == 0
        for layer_idx in layer_indices
    }

    # Indices 9 and 10 (Flatten and dense2 in get_model) have no treatment
    # code, so they are always copied, as before.
    for i in range(11):
        if i in keep_random:
            continue
        print(mdl.layers[i])
        pretrained_layer_weights = pretrained_model.layers[i].get_weights()
        mdl.layers[i].set_weights(pretrained_layer_weights)

    return mdl

In [None]:
# Treatment code per conv block:
#   0 -> random initial weights, trained
#   1 -> pre-trained weights loaded, then trained
#   2 -> pre-trained weights loaded and frozen
layer_treatment = {"conv1": 2, "conv2": 2, "conv3": 2}

# Build the architecture first, then hand it to set_weights.
model = get_model(layer_treatment)
model = set_weights(layer_treatment, model)

<keras.layers.convolutional.Conv1D object at 0x7f90568e8fa0>
<keras.layers.normalization_v2.BatchNormalization object at 0x7f9056821d90>
<keras.layers.core.Dropout object at 0x7f90567eb610>
<keras.layers.convolutional.Conv1D object at 0x7f90568087c0>
<keras.layers.normalization_v2.BatchNormalization object at 0x7f9056808670>
<keras.layers.core.Dropout object at 0x7f90577d0ca0>
<keras.layers.convolutional.Conv1D object at 0x7f905779bf10>
<keras.layers.normalization_v2.BatchNormalization object at 0x7f905671ca00>
<keras.layers.core.Dropout object at 0x7f9057a38d00>
<keras.layers.core.Flatten object at 0x7f90567f74f0>
<keras.layers.core.Dense object at 0x7f9057a1d1f0>


In [None]:
# Print the layer table (names, output shapes, parameter counts) of the current model.
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
1DConv_1 (Conv1D)            (None, 141, 120)          2520      
_________________________________________________________________
batchNorm1 (BatchNormalizati (None, 141, 120)          480       
_________________________________________________________________
drop1 (Dropout)              (None, 141, 120)          0         
_________________________________________________________________
1DConv_2 (Conv1D)            (None, 137, 120)          72120     
_________________________________________________________________
batchNorm2 (BatchNormalizati (None, 137, 120)          480       
_________________________________________________________________
drop2 (Dropout)              (None, 137, 120)          0         
_________________________________________________________________
1DConv_3 (Conv1D)            (None, 133, 120)         

In [None]:
rm -rf ./logs/

In [None]:
# Configure training: Adam optimizer minimising mean squared error.
model.compile(loss='mean_squared_error', optimizer='adam')

In [None]:
# Give each run its own timestamped TensorBoard directory under logs/.
_run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join("logs", _run_stamp)
# Reference: https://stackoverflow.com/questions/59894720/keras-and-tensorboard-attributeerror-sequential-object-has-no-attribute-g
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir, histogram_freq=1)

# Stop once val_loss has failed to improve for 20 consecutive epochs.
es_callback = EarlyStopping(monitor='val_loss', patience=20, verbose=1)
# Save the model only when val_loss improves. Every experiment in this notebook
# reuses this path, so an earlier run's checkpoint gets overwritten.
mc_callback = ModelCheckpoint(filepath='new_models/best_model.h5', monitor='val_loss', save_best_only=True)

In [None]:
# Fit for at most 100 epochs, evaluating on the validation split after each
# one; the callbacks handle TensorBoard logging, early stopping and checkpointing.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    callbacks=[tensorboard_callback, es_callback, mc_callback],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 00055: early stopping


In [None]:
%tensorboard --logdir logs

Reusing TensorBoard on port 6006 (pid 18270), started 0:59:21 ago. (Use '!kill 18270' to kill it.)

In [None]:
# Reload the model file that the ModelCheckpoint callback wrote during training;
# the metric cells below evaluate this reloaded model.
saved_model = load_model('new_models/best_model.h5')

In [None]:
# R^2
# Score the reloaded checkpoint on each split; predictions are flattened to 1-D first.
for _split, _inputs, _targets in (
    ("Train", X_train, y_train),
    ("Val", X_val, y_val),
    ("Test", X_test, y_test),
):
    print(_split, r2_score(_targets, saved_model.predict(_inputs).reshape(1, -1)[0]))

Train 0.2618717956334292
Val 0.26729943961851377
Test 0.1886281739966693


In [None]:
# Spearman
# Rank correlation between targets and the reloaded checkpoint's flattened predictions, per split.
for _split, _inputs, _targets in (
    ("Train", X_train, y_train),
    ("Val", X_val, y_val),
    ("Test", X_test, y_test),
):
    print(_split, spearmanr(_targets, saved_model.predict(_inputs).reshape(1, -1)[0]))

Train SpearmanrResult(correlation=0.5198213241496843, pvalue=0.0)
Val SpearmanrResult(correlation=0.51172420709206, pvalue=4.081904428015674e-195)
Test SpearmanrResult(correlation=0.44716308158447066, pvalue=6.649003706919414e-144)


---
# Dataset: `hidra_chloroplast_40`

In [None]:
# Load the hidra_chloroplast_40 CSV; the cells below rely on its
# `sequence`, `target` and `set` columns.
df = pd.read_csv("new_data/hidra_chloroplast_40.csv")

In [None]:
# Preview the first rows to check the columns that were loaded.
df.head()

Unnamed: 0,organelle,start_coords,end_coords,sequence,control_raw_coverage,treatment_raw_coverage,control_norm_coverage,treatment_norm_coverage,target,A,T,C,G,set
0,NC_016734.1,0,145,AATCATAATAACTTGGTCCCGGGCATCACGGGCGAACGACGGGAAT...,998,316,338.31,405.52,0.26,39,40,40,26,test
1,NC_016734.1,5,150,TAATAACTTGGTCCCGGGCATCACGGGCGAACGACGGGAATTGAAC...,998,318,338.31,408.08,0.27,40,40,39,26,test
2,NC_016734.1,10,155,ACTTGGTCCCGGGCATCACGGGCGAACGACGGGAATTGAACCCGCG...,998,318,338.31,408.08,0.27,38,38,42,27,test
3,NC_016734.1,15,160,GTCCCGGGCATCACGGGCGAACGACGGGAATTGAACCCGCGATGGT...,998,318,338.31,408.08,0.27,40,36,42,27,test
4,NC_016734.1,20,165,GGGCATCACGGGCGAACGACGGGAATTGAACCCGCGATGGTGAATT...,998,318,338.31,408.08,0.27,41,39,39,26,test


In [None]:
# Split into train / val / test using the pre-assigned `set` column.
train_df = df[df["set"] == "train"]
val_df = df[df["set"] == "val"]
test_df = df[df["set"] == "test"]

# Inputs: every sequence passed through get_ohe, stacked into one array per split.
X_train = np.array(list(map(get_ohe, train_df["sequence"])))
X_val = np.array(list(map(get_ohe, val_df["sequence"])))
X_test = np.array(list(map(get_ohe, test_df["sequence"])))

# Targets: the `target` column as a 1-D array per split.
y_train = np.array(train_df["target"].tolist())
y_val = np.array(val_df["target"].tolist())
y_test = np.array(test_df["target"].tolist())

In [None]:
# Sanity check: input shapes on the first line, target shapes on the second.
print(*(arr.shape for arr in (X_train, X_val, X_test)))
print(*(arr.shape for arr in (y_train, y_val, y_test)))

(21844, 145, 4) (2684, 145, 4) (2684, 145, 4)
(21844,) (2684,) (2684,)


### Re-train from scratch

In [None]:
# Treatment code per conv block:
#   0 -> random initial weights, trained
#   1 -> pre-trained weights loaded, then trained
#   2 -> pre-trained weights loaded and frozen
layer_treatment = {"conv1": 0, "conv2": 0, "conv3": 0}

# Build the architecture first, then hand it to set_weights.
model = get_model(layer_treatment)
model = set_weights(layer_treatment, model)

<keras.layers.convolutional.Conv1D object at 0x7f9062ff41c0>
<keras.layers.normalization_v2.BatchNormalization object at 0x7f90629693d0>
<keras.layers.core.Dropout object at 0x7f906571f6a0>
<keras.layers.convolutional.Conv1D object at 0x7f906296f6d0>
<keras.layers.normalization_v2.BatchNormalization object at 0x7f9065465340>
<keras.layers.core.Dropout object at 0x7f9062fa8c40>
<keras.layers.convolutional.Conv1D object at 0x7f906544a880>
<keras.layers.normalization_v2.BatchNormalization object at 0x7f9062fa8460>
<keras.layers.core.Dropout object at 0x7f9062ff4100>
<keras.layers.core.Flatten object at 0x7f906544a040>
<keras.layers.core.Dense object at 0x7f90590dc970>


In [None]:
# Print the layer table (names, output shapes, parameter counts) of the current model.
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
1DConv_1 (Conv1D)            (None, 141, 120)          2520      
_________________________________________________________________
batchNorm1 (BatchNormalizati (None, 141, 120)          480       
_________________________________________________________________
drop1 (Dropout)              (None, 141, 120)          0         
_________________________________________________________________
1DConv_2 (Conv1D)            (None, 137, 120)          72120     
_________________________________________________________________
batchNorm2 (BatchNormalizati (None, 137, 120)          480       
_________________________________________________________________
drop2 (Dropout)              (None, 137, 120)          0         
_________________________________________________________________
1DConv_3 (Conv1D)            (None, 133, 120)         

In [None]:
rm -rf ./logs/

In [None]:
# Configure training: Adam optimizer minimising mean squared error.
model.compile(loss='mean_squared_error', optimizer='adam')

In [None]:
# Give each run its own timestamped TensorBoard directory under logs/.
_run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join("logs", _run_stamp)
# Reference: https://stackoverflow.com/questions/59894720/keras-and-tensorboard-attributeerror-sequential-object-has-no-attribute-g
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir, histogram_freq=1)

# Stop once val_loss has failed to improve for 20 consecutive epochs.
es_callback = EarlyStopping(monitor='val_loss', patience=20, verbose=1)
# Save the model only when val_loss improves. Every experiment in this notebook
# reuses this path, so an earlier run's checkpoint gets overwritten.
mc_callback = ModelCheckpoint(filepath='new_models/best_model.h5', monitor='val_loss', save_best_only=True)

In [None]:
# Fit for at most 100 epochs, evaluating on the validation split after each
# one; the callbacks handle TensorBoard logging, early stopping and checkpointing.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    callbacks=[tensorboard_callback, es_callback, mc_callback],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 00024: early stopping


In [None]:
%tensorboard --logdir logs

In [None]:
# Reload the model file that the ModelCheckpoint callback wrote during training;
# the metric cells below evaluate this reloaded model.
saved_model = load_model('new_models/best_model.h5')

In [None]:
# R^2
# Score the reloaded checkpoint on each split; predictions are flattened to 1-D first.
for _split, _inputs, _targets in (
    ("Train", X_train, y_train),
    ("Val", X_val, y_val),
    ("Test", X_test, y_test),
):
    print(_split, r2_score(_targets, saved_model.predict(_inputs).reshape(1, -1)[0]))

Train 0.9672868646098323
Val 0.37122746384963246
Test 0.21161039778814295


In [None]:
# Spearman
# Rank correlation between targets and the reloaded checkpoint's flattened predictions, per split.
for _split, _inputs, _targets in (
    ("Train", X_train, y_train),
    ("Val", X_val, y_val),
    ("Test", X_test, y_test),
):
    print(_split, spearmanr(_targets, saved_model.predict(_inputs).reshape(1, -1)[0]))

Train SpearmanrResult(correlation=0.9860352600373611, pvalue=0.0)
Val SpearmanrResult(correlation=0.615802985507954, pvalue=5.416410286351154e-280)
Test SpearmanrResult(correlation=0.4683716368913346, pvalue=1.9234648943495221e-146)


### Re-train from pre-trained weights

In [None]:
# Treatment code per conv block:
#   0 -> random initial weights, trained
#   1 -> pre-trained weights loaded, then trained
#   2 -> pre-trained weights loaded and frozen
layer_treatment = {"conv1": 1, "conv2": 1, "conv3": 1}

# Build the architecture first, then hand it to set_weights.
model = get_model(layer_treatment)
model = set_weights(layer_treatment, model)

<keras.layers.convolutional.Conv1D object at 0x7f90656341f0>
<keras.layers.normalization_v2.BatchNormalization object at 0x7f9055c5b3d0>
<keras.layers.core.Dropout object at 0x7f90656a3760>
<keras.layers.convolutional.Conv1D object at 0x7f90658d2f40>
<keras.layers.normalization_v2.BatchNormalization object at 0x7f905672e6d0>
<keras.layers.core.Dropout object at 0x7f90657cfcd0>
<keras.layers.convolutional.Conv1D object at 0x7f90656f5730>
<keras.layers.normalization_v2.BatchNormalization object at 0x7f905672e0d0>
<keras.layers.core.Dropout object at 0x7f90654eb9d0>
<keras.layers.core.Flatten object at 0x7f9065a01520>
<keras.layers.core.Dense object at 0x7f90569d9ee0>


In [None]:
# Print the layer table (names, output shapes, parameter counts) of the current model.
model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
1DConv_1 (Conv1D)            (None, 141, 120)          2520      
_________________________________________________________________
batchNorm1 (BatchNormalizati (None, 141, 120)          480       
_________________________________________________________________
drop1 (Dropout)              (None, 141, 120)          0         
_________________________________________________________________
1DConv_2 (Conv1D)            (None, 137, 120)          72120     
_________________________________________________________________
batchNorm2 (BatchNormalizati (None, 137, 120)          480       
_________________________________________________________________
drop2 (Dropout)              (None, 137, 120)          0         
_________________________________________________________________
1DConv_3 (Conv1D)            (None, 133, 120)         

In [None]:
rm -rf ./logs/

In [None]:
# Configure training: Adam optimizer minimising mean squared error.
model.compile(loss='mean_squared_error', optimizer='adam')

In [None]:
# Give each run its own timestamped TensorBoard directory under logs/.
_run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join("logs", _run_stamp)
# Reference: https://stackoverflow.com/questions/59894720/keras-and-tensorboard-attributeerror-sequential-object-has-no-attribute-g
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir, histogram_freq=1)

# Stop once val_loss has failed to improve for 20 consecutive epochs.
es_callback = EarlyStopping(monitor='val_loss', patience=20, verbose=1)
# Save the model only when val_loss improves. Every experiment in this notebook
# reuses this path, so an earlier run's checkpoint gets overwritten.
mc_callback = ModelCheckpoint(filepath='new_models/best_model.h5', monitor='val_loss', save_best_only=True)

In [None]:
# Fit for at most 100 epochs, evaluating on the validation split after each
# one; the callbacks handle TensorBoard logging, early stopping and checkpointing.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    callbacks=[tensorboard_callback, es_callback, mc_callback],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 00027: early stopping


In [None]:
%tensorboard --logdir logs

Reusing TensorBoard on port 6006 (pid 30228), started 0:18:38 ago. (Use '!kill 30228' to kill it.)

In [None]:
# Reload the model file that the ModelCheckpoint callback wrote during training;
# the metric cells below evaluate this reloaded model.
saved_model = load_model('new_models/best_model.h5')

In [None]:
# R^2
# Score the reloaded checkpoint on each split; predictions are flattened to 1-D first.
for _split, _inputs, _targets in (
    ("Train", X_train, y_train),
    ("Val", X_val, y_val),
    ("Test", X_test, y_test),
):
    print(_split, r2_score(_targets, saved_model.predict(_inputs).reshape(1, -1)[0]))

Train 0.9757126934753173
Val 0.37685782764857967
Test 0.20896743975473753


In [None]:
# Spearman
# Rank correlation between targets and the reloaded checkpoint's flattened predictions, per split.
for _split, _inputs, _targets in (
    ("Train", X_train, y_train),
    ("Val", X_val, y_val),
    ("Test", X_test, y_test),
):
    print(_split, spearmanr(_targets, saved_model.predict(_inputs).reshape(1, -1)[0]))

Train SpearmanrResult(correlation=0.9908803104883138, pvalue=0.0)
Val SpearmanrResult(correlation=0.6222976264542276, pvalue=1.3684450046212899e-287)
Test SpearmanrResult(correlation=0.4372889250980363, pvalue=8.823394655917018e-126)


### Train only output layer

In [None]:
# Treatment code per conv block:
#   0 -> random initial weights, trained
#   1 -> pre-trained weights loaded, then trained
#   2 -> pre-trained weights loaded and frozen
layer_treatment = {"conv1": 2, "conv2": 2, "conv3": 2}

# Build the architecture first, then hand it to set_weights.
model = get_model(layer_treatment)
model = set_weights(layer_treatment, model)

<keras.layers.convolutional.Conv1D object at 0x7f904a354dc0>
<keras.layers.normalization_v2.BatchNormalization object at 0x7f904a0481c0>
<keras.layers.core.Dropout object at 0x7f904a4bc3a0>
<keras.layers.convolutional.Conv1D object at 0x7f904a23ccd0>
<keras.layers.normalization_v2.BatchNormalization object at 0x7f904a5a0dc0>
<keras.layers.core.Dropout object at 0x7f904a5a0df0>
<keras.layers.convolutional.Conv1D object at 0x7f904a3548e0>
<keras.layers.normalization_v2.BatchNormalization object at 0x7f904a4c5bb0>
<keras.layers.core.Dropout object at 0x7f904a048c10>
<keras.layers.core.Flatten object at 0x7f904a4ded00>
<keras.layers.core.Dense object at 0x7f904a4c5ee0>


In [None]:
# Print the layer table (names, output shapes, parameter counts) of the current model.
model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
1DConv_1 (Conv1D)            (None, 141, 120)          2520      
_________________________________________________________________
batchNorm1 (BatchNormalizati (None, 141, 120)          480       
_________________________________________________________________
drop1 (Dropout)              (None, 141, 120)          0         
_________________________________________________________________
1DConv_2 (Conv1D)            (None, 137, 120)          72120     
_________________________________________________________________
batchNorm2 (BatchNormalizati (None, 137, 120)          480       
_________________________________________________________________
drop2 (Dropout)              (None, 137, 120)          0         
_________________________________________________________________
1DConv_3 (Conv1D)            (None, 133, 120)         

In [None]:
rm -rf ./logs/

In [None]:
# Configure training: Adam optimizer minimising mean squared error.
model.compile(loss='mean_squared_error', optimizer='adam')

In [None]:
# Give each run its own timestamped TensorBoard directory under logs/.
_run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join("logs", _run_stamp)
# Reference: https://stackoverflow.com/questions/59894720/keras-and-tensorboard-attributeerror-sequential-object-has-no-attribute-g
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir, histogram_freq=1)

# Stop once val_loss has failed to improve for 20 consecutive epochs.
es_callback = EarlyStopping(monitor='val_loss', patience=20, verbose=1)
# Save the model only when val_loss improves. Every experiment in this notebook
# reuses this path, so an earlier run's checkpoint gets overwritten.
mc_callback = ModelCheckpoint(filepath='new_models/best_model.h5', monitor='val_loss', save_best_only=True)

In [None]:
# Fit for at most 100 epochs, evaluating on the validation split after each
# one; the callbacks handle TensorBoard logging, early stopping and checkpointing.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    callbacks=[tensorboard_callback, es_callback, mc_callback],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 00065: early stopping


In [None]:
%tensorboard --logdir logs

Reusing TensorBoard on port 6006 (pid 30228), started 0:27:58 ago. (Use '!kill 30228' to kill it.)

In [None]:
# Reload the model file that the ModelCheckpoint callback wrote during training;
# the metric cells below evaluate this reloaded model.
saved_model = load_model('new_models/best_model.h5')

In [None]:
# R^2
# Score the reloaded checkpoint on each split; predictions are flattened to 1-D first.
for _split, _inputs, _targets in (
    ("Train", X_train, y_train),
    ("Val", X_val, y_val),
    ("Test", X_test, y_test),
):
    print(_split, r2_score(_targets, saved_model.predict(_inputs).reshape(1, -1)[0]))

Train 0.24092483509832452
Val 0.31761735410061576
Test 0.2555477413486553


In [None]:
# Spearman
# Rank correlation between targets and the reloaded checkpoint's flattened predictions, per split.
for _split, _inputs, _targets in (
    ("Train", X_train, y_train),
    ("Val", X_val, y_val),
    ("Test", X_test, y_test),
):
    print(_split, spearmanr(_targets, saved_model.predict(_inputs).reshape(1, -1)[0]))

Train SpearmanrResult(correlation=0.49743165674928225, pvalue=0.0)
Val SpearmanrResult(correlation=0.5505672205007068, pvalue=1.3116075375682348e-212)
Test SpearmanrResult(correlation=0.5506971933641037, pvalue=9.955655059416527e-213)


### Freeze Conv 1, 2, Train Conv 3 from Scratch

In [None]:
# Treatment code per conv block:
#   0 -> random initial weights, trained
#   1 -> pre-trained weights loaded, then trained
#   2 -> pre-trained weights loaded and frozen
layer_treatment = {"conv1": 2, "conv2": 2, "conv3": 0}

# Build the architecture first, then hand it to set_weights.
model = get_model(layer_treatment)
model = set_weights(layer_treatment, model)

<keras.layers.convolutional.Conv1D object at 0x7f906565e7f0>
<keras.layers.normalization_v2.BatchNormalization object at 0x7f9062cd7940>
<keras.layers.core.Dropout object at 0x7f904a4de6d0>
<keras.layers.convolutional.Conv1D object at 0x7f9065521a90>
<keras.layers.normalization_v2.BatchNormalization object at 0x7f90656c49a0>
<keras.layers.core.Dropout object at 0x7f9065722c40>
<keras.layers.convolutional.Conv1D object at 0x7f90653b3ca0>
<keras.layers.normalization_v2.BatchNormalization object at 0x7f90629e3ca0>
<keras.layers.core.Dropout object at 0x7f90656b4eb0>
<keras.layers.core.Flatten object at 0x7f90629e38e0>
<keras.layers.core.Dense object at 0x7f9062973040>


In [None]:
# Print the layer table (names, output shapes, parameter counts) of the current model.
model.summary()

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
1DConv_1 (Conv1D)            (None, 141, 120)          2520      
_________________________________________________________________
batchNorm1 (BatchNormalizati (None, 141, 120)          480       
_________________________________________________________________
drop1 (Dropout)              (None, 141, 120)          0         
_________________________________________________________________
1DConv_2 (Conv1D)            (None, 137, 120)          72120     
_________________________________________________________________
batchNorm2 (BatchNormalizati (None, 137, 120)          480       
_________________________________________________________________
drop2 (Dropout)              (None, 137, 120)          0         
_________________________________________________________________
1DConv_3 (Conv1D)            (None, 133, 120)        

In [None]:
rm -rf ./logs/

In [None]:
# Configure training: Adam optimizer minimising mean squared error.
model.compile(loss='mean_squared_error', optimizer='adam')

In [None]:
# Give each run its own timestamped TensorBoard directory under logs/.
_run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join("logs", _run_stamp)
# Reference: https://stackoverflow.com/questions/59894720/keras-and-tensorboard-attributeerror-sequential-object-has-no-attribute-g
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir, histogram_freq=1)

# Stop once val_loss has failed to improve for 20 consecutive epochs.
es_callback = EarlyStopping(monitor='val_loss', patience=20, verbose=1)
# Save the model only when val_loss improves. Every experiment in this notebook
# reuses this path, so an earlier run's checkpoint gets overwritten.
mc_callback = ModelCheckpoint(filepath='new_models/best_model.h5', monitor='val_loss', save_best_only=True)

In [None]:
# Fit for at most 100 epochs, evaluating on the validation split after each
# one; the callbacks handle TensorBoard logging, early stopping and checkpointing.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    callbacks=[tensorboard_callback, es_callback, mc_callback],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 00036: early stopping


In [None]:
%tensorboard --logdir logs

Reusing TensorBoard on port 6006 (pid 30228), started 0:40:08 ago. (Use '!kill 30228' to kill it.)

In [None]:
# Reload the model file that the ModelCheckpoint callback wrote during training;
# the metric cells below evaluate this reloaded model.
saved_model = load_model('new_models/best_model.h5')

In [None]:
# R^2
# Score the reloaded checkpoint on each split; predictions are flattened to 1-D first.
for _split, _inputs, _targets in (
    ("Train", X_train, y_train),
    ("Val", X_val, y_val),
    ("Test", X_test, y_test),
):
    print(_split, r2_score(_targets, saved_model.predict(_inputs).reshape(1, -1)[0]))

Train 0.9737901624385901
Val 0.34615371424875574
Test 0.18750562898925638


In [None]:
# Spearman
# Rank correlation between targets and the reloaded checkpoint's flattened predictions, per split.
for _split, _inputs, _targets in (
    ("Train", X_train, y_train),
    ("Val", X_val, y_val),
    ("Test", X_test, y_test),
):
    print(_split, spearmanr(_targets, saved_model.predict(_inputs).reshape(1, -1)[0]))

Train SpearmanrResult(correlation=0.9912703598654496, pvalue=0.0)
Val SpearmanrResult(correlation=0.5988764861697347, pvalue=5.269946749173147e-261)
Test SpearmanrResult(correlation=0.4366262612574109, pvalue=2.3075557715239193e-125)


### Freeze Conv 1, 2, Train Conv 3 from Pre-trained Weights

In [None]:
# Treatment code per conv block:
#   0 -> random initial weights, trained
#   1 -> pre-trained weights loaded, then trained
#   2 -> pre-trained weights loaded and frozen
layer_treatment = {"conv1": 2, "conv2": 2, "conv3": 1}

# Build the architecture first, then hand it to set_weights.
model = get_model(layer_treatment)
model = set_weights(layer_treatment, model)

<keras.layers.convolutional.Conv1D object at 0x7f9057a1d6d0>
<keras.layers.normalization_v2.BatchNormalization object at 0x7f90655b0670>
<keras.layers.core.Dropout object at 0x7f90567f7be0>
<keras.layers.convolutional.Conv1D object at 0x7f9057a38ca0>
<keras.layers.normalization_v2.BatchNormalization object at 0x7f9078864dc0>
<keras.layers.core.Dropout object at 0x7f90808736d0>
<keras.layers.convolutional.Conv1D object at 0x7f9080873850>
<keras.layers.normalization_v2.BatchNormalization object at 0x7f9065699400>
<keras.layers.core.Dropout object at 0x7f9057d3df70>
<keras.layers.core.Flatten object at 0x7f9065a81850>
<keras.layers.core.Dense object at 0x7f9065a81cd0>


In [None]:
# Print the layer table (names, output shapes, parameter counts) of the current model.
model.summary()

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
1DConv_1 (Conv1D)            (None, 141, 120)          2520      
_________________________________________________________________
batchNorm1 (BatchNormalizati (None, 141, 120)          480       
_________________________________________________________________
drop1 (Dropout)              (None, 141, 120)          0         
_________________________________________________________________
1DConv_2 (Conv1D)            (None, 137, 120)          72120     
_________________________________________________________________
batchNorm2 (BatchNormalizati (None, 137, 120)          480       
_________________________________________________________________
drop2 (Dropout)              (None, 137, 120)          0         
_________________________________________________________________
1DConv_3 (Conv1D)            (None, 133, 120)        

In [None]:
rm -rf ./logs/

In [None]:
# Configure training: Adam optimizer minimising mean squared error.
model.compile(loss='mean_squared_error', optimizer='adam')

In [None]:
# Give each run its own timestamped TensorBoard directory under logs/.
_run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join("logs", _run_stamp)
# Reference: https://stackoverflow.com/questions/59894720/keras-and-tensorboard-attributeerror-sequential-object-has-no-attribute-g
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir, histogram_freq=1)

# Stop once val_loss has failed to improve for 20 consecutive epochs.
es_callback = EarlyStopping(monitor='val_loss', patience=20, verbose=1)
# Save the model only when val_loss improves. Every experiment in this notebook
# reuses this path, so an earlier run's checkpoint gets overwritten.
mc_callback = ModelCheckpoint(filepath='new_models/best_model.h5', monitor='val_loss', save_best_only=True)

In [None]:
# Fit for at most 100 epochs, evaluating on the validation split after each
# one; the callbacks handle TensorBoard logging, early stopping and checkpointing.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    callbacks=[tensorboard_callback, es_callback, mc_callback],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 00037: early stopping


In [None]:
%tensorboard --logdir logs

Reusing TensorBoard on port 6006 (pid 30228), started 11:55:00 ago. (Use '!kill 30228' to kill it.)

In [None]:
# Reload the model file that the ModelCheckpoint callback wrote during training;
# the metric cells below evaluate this reloaded model.
saved_model = load_model('new_models/best_model.h5')

In [None]:
# R^2
# Score the reloaded checkpoint on each split; predictions are flattened to 1-D first.
for _split, _inputs, _targets in (
    ("Train", X_train, y_train),
    ("Val", X_val, y_val),
    ("Test", X_test, y_test),
):
    print(_split, r2_score(_targets, saved_model.predict(_inputs).reshape(1, -1)[0]))

Train 0.9794030053637053
Val 0.28301586737718565
Test 0.16496629836058974


In [None]:
# Spearman
# Rank correlation between targets and the reloaded checkpoint's flattened predictions, per split.
for _split, _inputs, _targets in (
    ("Train", X_train, y_train),
    ("Val", X_val, y_val),
    ("Test", X_test, y_test),
):
    print(_split, spearmanr(_targets, saved_model.predict(_inputs).reshape(1, -1)[0]))

Train SpearmanrResult(correlation=0.991383239088942, pvalue=0.0)
Val SpearmanrResult(correlation=0.5281418799925951, pvalue=1.0212364752360976e-192)
Test SpearmanrResult(correlation=0.3888156681004329, pvalue=1.3815857403885453e-97)


---
### Spearman Results

0 for init random weights & train  
1 for load pre-trained weights and train  
2 for load pre-trained weights and freeze  

First go:
000: val = 0.47753801170363896, test = 0.2866754020651326  
111: val = 0.47855750422978954, test = 0.279400514316224  
222: val = 0.5858565342434344, test = 0.3741217612323338  
220: val = 0.41782505305420903, test = 0.23175390117095385  
221: val = 0.4622724735247824, test = 0.26885707718958146  
2220: val = 0.5984249204802108, test = 0.32715495828441904

Second go:
000: val = 0.44123205768583007, test = 0.17504952705223212  
111: val = 0.4053507361644881, test = 0.2592776174804178  
222: val = 0.5290568130486669, test = 0.3636127380447914  
220: val = 0.392220547475502, test = 0.24043305401824872  
221: val = 0.40291026906804733, test = 0.22825701637010898  
2220: val = 0.5988624237048076, test = 0.3301200533685432

### Train only output layer

In [None]:
def get_model(treatment_dict):  # initializes model architecture
    """Assemble the (uncompiled) Sequential network used by these experiments.

    Three Conv1D -> BatchNormalization -> Dropout blocks are followed by a
    Flatten layer, a 12-unit linear Dense layer that is hard-coded as
    non-trainable, and a 1-unit linear Dense output layer.

    Args:
        treatment_dict: must contain the keys "conv1", "conv2" and "conv3".
            A value of 2 marks that block's Conv1D and BatchNormalization
            layers as non-trainable; any other value leaves them trainable.

    Returns:
        The Sequential model. No weights are loaded here.
    """
    # (treatment key, conv layer name, batch-norm layer name, dropout layer name)
    block_specs = (
        ("conv1", "1DConv_1", "batchNorm1", "drop1"),
        ("conv2", "1DConv_2", "batchNorm2", "drop2"),
        ("conv3", "1DConv_3", "batchNorm3", "drop3"),
    )

    network = Sequential()
    for position, (key, conv_name, norm_name, drop_name) in enumerate(block_specs):
        is_trainable = treatment_dict[key] != 2  # bool
        conv_options = {"activation": 'relu', "name": conv_name, "trainable": is_trainable}
        if position == 0:
            # Only the first layer declares the input shape.
            conv_options["input_shape"] = (145, 4)
        network.add(Conv1D(120, 5, **conv_options))
        network.add(BatchNormalization(name=norm_name, trainable=is_trainable))
        network.add(Dropout(0.1, name=drop_name))

    network.add(Flatten(name="flat"))
    # dense2 is frozen regardless of treatment_dict; dense3 keeps the default (trainable).
    network.add(Dense(12, activation='linear', name="dense2", trainable=False))
    network.add(Dense(1, activation='linear', name="dense3"))

    return network

def set_weights(treatment_dict, mdl):
    """Copy pretrained weights into `mdl`, honoring the per-block treatment codes.

    Treatment codes: 0 = keep random initial weights, 1 = load pretrained
    weights and train, 2 = load pretrained weights and freeze. Previously the
    codes were ignored and every one of the first 11 layers was overwritten,
    so a block marked 0 silently received pretrained weights anyway.

    Args:
        treatment_dict: mapping from "conv1"/"conv2"/"conv3" to a treatment
            code. A block whose key is missing is treated as "load pretrained",
            which matches the earlier behaviour.
        mdl: model whose first 11 layers line up index-for-index with the
            layers of the notebook-level `pretrained_model`.

    Returns:
        The same `mdl` object, modified in place.
    """
    # Layer indices of each conv block (Conv1D, BatchNormalization, Dropout).
    # NOTE(review): assumes the layer order produced by the accompanying get_model.
    block_layers = (
        ("conv1", (0, 1, 2)),
        ("conv2", (3, 4, 5)),
        ("conv3", (6, 7, 8)),
    )

    layers_to_set = []
    for block_name, layer_indices in block_layers:
        # Code 0 means "random init": leave that block's fresh weights alone.
        if treatment_dict.get(block_name) != 0:
            layers_to_set.extend(layer_indices)

    # Flatten (9) and the 12-unit dense layer (10) have no treatment code and
    # are always taken from the pretrained model.
    layers_to_set.extend((9, 10))

    for i in layers_to_set:
        print(mdl.layers[i])
        pretrained_layer_weights = pretrained_model.layers[i].get_weights()
        mdl.layers[i].set_weights(pretrained_layer_weights)

    return mdl

In [None]:
# Treatment code for each conv block:
#   0 -> init random weights & train
#   1 -> load pre-trained weights and train
#   2 -> load pre-trained weights and freeze
layer_treatment = dict(conv1=2, conv2=2, conv3=2)

# Build the architecture first, then copy the pretrained weights into it.
model = get_model(treatment_dict=layer_treatment)
model = set_weights(treatment_dict=layer_treatment, mdl=model)

<keras.layers.convolutional.Conv1D object at 0x7f9057b29130>
<keras.layers.normalization_v2.BatchNormalization object at 0x7f905891dca0>
<keras.layers.core.Dropout object at 0x7f90577d7820>
<keras.layers.convolutional.Conv1D object at 0x7f9058937070>
<keras.layers.normalization_v2.BatchNormalization object at 0x7f9058937fd0>
<keras.layers.core.Dropout object at 0x7f9057d1a730>
<keras.layers.convolutional.Conv1D object at 0x7f905891cfa0>
<keras.layers.normalization_v2.BatchNormalization object at 0x7f9058947ca0>
<keras.layers.core.Dropout object at 0x7f904af6b3a0>
<keras.layers.core.Flatten object at 0x7f9058935850>
<keras.layers.core.Dense object at 0x7f9058920c70>


In [None]:
# Print each layer's name, output shape and parameter count as a sanity check.
model.summary()

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
1DConv_1 (Conv1D)            (None, 141, 120)          2520      
_________________________________________________________________
batchNorm1 (BatchNormalizati (None, 141, 120)          480       
_________________________________________________________________
drop1 (Dropout)              (None, 141, 120)          0         
_________________________________________________________________
1DConv_2 (Conv1D)            (None, 137, 120)          72120     
_________________________________________________________________
batchNorm2 (BatchNormalizati (None, 137, 120)          480       
_________________________________________________________________
drop2 (Dropout)              (None, 137, 120)          0         
_________________________________________________________________
1DConv_3 (Conv1D)            (None, 133, 120)        

In [None]:
rm -rf ./logs/

In [None]:
# Regression objective: minimize mean squared error with the 'adam' optimizer.
model.compile(loss='mean_squared_error', optimizer='adam')

In [None]:
# Each run logs to its own timestamped sub-directory of logs/.
logdir = os.path.join("logs", f"{datetime.datetime.now():%Y%m%d-%H%M%S}")
# https://stackoverflow.com/questions/59894720/keras-and-tensorboard-attributeerror-sequential-object-has-no-attribute-g
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir, histogram_freq=1)

# Stop once val_loss has gone 20 epochs without improving.
es_callback = EarlyStopping(
    monitor='val_loss',
    patience=20,
    verbose=1,
)
# Keep only the checkpoint with the best val_loss seen so far.
mc_callback = ModelCheckpoint(
    filepath='new_models/best_model.h5',
    monitor='val_loss',
    save_best_only=True,
)

In [None]:
# Train for at most 100 epochs; the callbacks handle logging, early stopping
# and saving the best checkpoint.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    callbacks=[tensorboard_callback, es_callback, mc_callback],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 00056: early stopping


In [None]:
# Open TensorBoard inline on the logs/ directory (magic provided by `%load_ext tensorboard`).
%tensorboard --logdir logs

Reusing TensorBoard on port 6006 (pid 30228), started 14:14:56 ago. (Use '!kill 30228' to kill it.)

In [None]:
# Load the model saved at 'new_models/best_model.h5' for evaluation
# (presumably the best-val_loss checkpoint written during training — confirm).
saved_model = load_model('new_models/best_model.h5')

In [None]:
# R^2 of the reloaded model on each data split
for split_name, X_split, y_split in (
    ("Train", X_train, y_train),
    ("Val", X_val, y_val),
    ("Test", X_test, y_test),
):
    # Flatten the predictions to 1-D before scoring.
    print(split_name, r2_score(y_split, saved_model.predict(X_split).ravel()))

Train 0.24124043437203846
Val 0.3174910057945999
Test 0.25752724468248067


In [None]:
# Spearman rank correlation of the reloaded model on each data split
for split_name, X_split, y_split in (
    ("Train", X_train, y_train),
    ("Val", X_val, y_val),
    ("Test", X_test, y_test),
):
    # Flatten the predictions to 1-D before scoring.
    print(split_name, spearmanr(y_split, saved_model.predict(X_split).ravel()))

Train SpearmanrResult(correlation=0.49736508184825123, pvalue=0.0)
Val SpearmanrResult(correlation=0.5506824122889169, pvalue=1.0272808113293604e-212)
Test SpearmanrResult(correlation=0.5514594370946222, pvalue=1.9715563393393002e-213)
