name: keras_retraining.ipynb

This notebook prototypes code for controlling how each layer of the MPRA-DragoNN model is treated during training. A layer can be (1) initialized randomly, (2) initialized from pretrained weights and then trained further (fine-tuned), or (3) initialized from pretrained weights and frozen so that it is not updated during training.

---

https://keras.io/guides/transfer_learning/

In [None]:
# mount google drive

from google.colab import drive
drive.mount('/content/drive')

%cd '/content/drive/Shareddrives/NRC_Amii_Agronomics_Project/nrc-ml-plant-genomics/'

In [None]:
import keras
import warnings, logging
import json
import numpy as np
import pandas as pd

from models.base_model import BaseModel
from keras.models import Sequential
from keras.layers import Input, Dense, Conv1D, MaxPooling2D, Dropout, Flatten, BatchNormalization
from keras.optimizers import Adam
from keras.models import model_from_json
import tensorflow as tf

from sklearn.metrics import r2_score
from scipy.stats import spearmanr # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.spearmanr.html

warnings.filterwarnings('ignore')
logging.disable(1000)

Using TensorFlow backend.


In [None]:
# load in model as it is
# The architecture is rebuilt from its saved JSON description, then the
# weights are restored separately from the HDF5 checkpoint.

with open('models/model.json', 'r') as json_file:
    json_savedModel = json_file.read()  # architecture description as a JSON string

pretrained_model = model_from_json(json_savedModel)
pretrained_model.load_weights('models/pretrained.hdf5')

In [None]:
pretrained_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 141, 120)          2520      
_________________________________________________________________
batch_normalization_1 (Batch (None, 141, 120)          480       
_________________________________________________________________
dropout_1 (Dropout)          (None, 141, 120)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 137, 120)          72120     
_________________________________________________________________
batch_normalization_2 (Batch (None, 137, 120)          480       
_________________________________________________________________
dropout_2 (Dropout)          (None, 137, 120)          0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 133, 120)          72120     
__________

In [None]:
# one-hot encode sequence
# One-hot channel order is A, C, G, T (cross referenced with kipoi data loader).
mapping = {"A":[1, 0, 0, 0], "T":[0, 0, 0, 1], "C":[0, 1, 0, 0], "G":[0, 0, 1, 0]}

def get_ohe(sequence):
    """One-hot encode a nucleotide sequence.

    Args:
        sequence: iterable of single-character nucleotide strings (typically a
            ``str``). Letters are matched case-insensitively against
            ``mapping``, so "acgt" encodes the same as "ACGT".

    Returns:
        Integer array of shape ``(len(sequence), 4)`` -- e.g. ``(145, 4)`` for
        a 145-nt sequence. An empty sequence yields shape ``(0, 4)``.

    Raises:
        KeyError: if a character other than A/C/G/T (any case) is encountered.
    """
    encoded = np.array([mapping[nt.upper()] for nt in sequence], dtype=int)
    # reshape keeps the trailing channel axis even when the sequence is empty
    return encoded.reshape(-1, 4)

# def create_model(input_sequence_length, number_of_outputs):
#     model = Sequential()

#     model.add(Conv1D(120, 5, activation='relu', input_shape=(input_sequence_length, 4), name="1DConv_1", trainable=False))
#     model.add(BatchNormalization(name="batchNorm1", trainable=False))
#     model.add(Dropout(0.1, name="drop1"))

#     model.add(Conv1D(120, 5, activation='relu', name="1DConv_2", trainable=False))
#     model.add(BatchNormalization(name="batchNorm2", trainable=False))
#     model.add(Dropout(0.1, name="drop2"))

#     model.add(Conv1D(120, 5, activation='relu', name="1DConv_3", trainable=False))
#     model.add(BatchNormalization(name="batchNorm3", trainable=False))
#     model.add(Dropout(0.1, name="drop3"))

#     model.add(Flatten(name="flat"))
#     model.add(Dense(number_of_outputs, activation='linear', name="dense1"))
    
#     return model

---
### Chloroplast w/ Extra Layer

In [None]:
def create_model(input_sequence_length, number_of_outputs):
    """Build the "extra layer" model: frozen stack plus one new dense output.

    Three Conv1D(120 filters, width 5, ReLU) -> BatchNormalization ->
    Dropout(0.1) stages are followed by Flatten and a 12-unit linear Dense
    layer. Every conv, batch-norm and the 12-unit dense layer is created with
    ``trainable=False``. A final linear Dense layer with
    ``number_of_outputs`` units is the only layer left trainable.

    Args:
        input_sequence_length: number of positions per one-hot encoded input;
            the input shape is ``(input_sequence_length, 4)``.
        number_of_outputs: number of units in the new output layer.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    frozen = {"trainable": False}

    layer_stack = [
        # first conv layer
        Conv1D(120, 5, activation='relu', input_shape=(input_sequence_length, 4),
               name="1DConv_1", **frozen),
        BatchNormalization(name="batchNorm1", **frozen),
        Dropout(0.1, name="drop1"),
        # second conv layer
        Conv1D(120, 5, activation='relu', name="1DConv_2", **frozen),
        BatchNormalization(name="batchNorm2", **frozen),
        Dropout(0.1, name="drop2"),
        # third conv layer
        Conv1D(120, 5, activation='relu', name="1DConv_3", **frozen),
        BatchNormalization(name="batchNorm3", **frozen),
        Dropout(0.1, name="drop3"),
        # flatten and previous dense output
        Flatten(name="flat"),
        Dense(12, activation='linear', name="dense1", **frozen),
        # new dense output
        Dense(number_of_outputs, activation='linear', name="dense2"),
    ]

    return Sequential(layer_stack)

In [None]:
model = create_model(145, 1)

In [None]:
model.summary()#line_length=None, positions=None, print_fn=None)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
1DConv_1 (Conv1D)            (None, 141, 120)          2520      
_________________________________________________________________
batchNorm1 (BatchNormalizati (None, 141, 120)          480       
_________________________________________________________________
drop1 (Dropout)              (None, 141, 120)          0         
_________________________________________________________________
1DConv_2 (Conv1D)            (None, 137, 120)          72120     
_________________________________________________________________
batchNorm2 (BatchNormalizati (None, 137, 120)          480       
_________________________________________________________________
drop2 (Dropout)              (None, 137, 120)          0         
_________________________________________________________________
1DConv_3 (Conv1D)            (None, 133, 120)          72120     
__________

In [None]:
# set the weights that we want (tested and works)
# Weights are copied by layer position from pretrained_model into every layer
# of `model` except the last one, which keeps its fresh initialization.
# Assumes both models list their layers in the same order -- TODO confirm.

for i in range(len(model.layers)-1):  # -1 leaves out the final layer
    pretrained_layer_weights = pretrained_model.layers[i].get_weights()
    model.layers[i].set_weights(pretrained_layer_weights)

In [None]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
1DConv_1 (Conv1D)            (None, 141, 120)          2520      
_________________________________________________________________
batchNorm1 (BatchNormalizati (None, 141, 120)          480       
_________________________________________________________________
drop1 (Dropout)              (None, 141, 120)          0         
_________________________________________________________________
1DConv_2 (Conv1D)            (None, 137, 120)          72120     
_________________________________________________________________
batchNorm2 (BatchNormalizati (None, 137, 120)          480       
_________________________________________________________________
drop2 (Dropout)              (None, 137, 120)          0         
_________________________________________________________________
1DConv_3 (Conv1D)            (None, 133, 120)          72120     
__________

In [None]:
# load in the data
# header=None: no row is treated as a header, so columns are addressed by integer position.
df = pd.read_csv("data/raw/hidra.tsv", sep="\t", header=None)

# make separate dataframes for chloroplast data and mitochondrion data
# Rows are selected by the identifier in column 0; only columns 3 and 8 are
# kept (used downstream as the sequence and the regression target).
df_chloro = df[df[0] == "NC_016734.1"][[3,8]]
df_mito = df[df[0] == "NC_008285.1"][[3,8]]

In [None]:
# One-hot encode every sequence in column 3 and collect the targets from column 8.
X_chloro = np.array(list(map(get_ohe, df_chloro[3])))
y_chloro = np.asarray(df_chloro[8].tolist())

In [None]:
X_chloro.shape, y_chloro.shape

((30544, 145, 4), (30544,))

In [None]:
# create train/test/val sequentially:
#   first tenth of the rows -> test, second tenth -> validation, rest -> train
# NOTE(review): rows are taken in file order with no shuffling -- confirm that
# this ordering is intended for the evaluation.

idx = X_chloro.shape[0] // 10

X_test, y_test = X_chloro[:idx], y_chloro[:idx]
X_val, y_val = X_chloro[idx:2*idx], y_chloro[idx:2*idx]
X_train, y_train = X_chloro[2*idx:], y_chloro[2*idx:]

In [None]:
# train new model
# Compile with the 'adam' optimizer and a mean-squared-error loss, then fit
# for 10 epochs while evaluating on the validation split after each epoch.
# NOTE(review): no random seed is set in the visible cells, so results may
# vary between runs.

model.compile(optimizer='adam',
            loss="mean_squared_error")

history = model.fit(X_train, y_train, epochs=10, 
                    validation_data=(X_val, y_val))

Train on 24436 samples, validate on 3054 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# R^2 for each split; predictions are flattened to 1-D before scoring
print("Train", r2_score(y_train, model.predict(X_train).ravel()))
print("Val", r2_score(y_val, model.predict(X_val).ravel()))
print("Test", r2_score(y_test, model.predict(X_test).ravel()))

Train 0.025520088173720423
Val 0.14674706940165294
Test -1.993770569559346


In [None]:
# Spearman rank correlation for each split; predictions are flattened to 1-D
print("Train", spearmanr(y_train, model.predict(X_train).ravel()))
print("Val", spearmanr(y_val, model.predict(X_val).ravel()))
print("Test", spearmanr(y_test, model.predict(X_test).ravel()))

Train SpearmanrResult(correlation=0.5221309934494401, pvalue=0.0)
Val SpearmanrResult(correlation=0.4450641571374498, pvalue=1.6324941583393092e-148)
Test SpearmanrResult(correlation=0.3936982832739136, pvalue=8.878059168448573e-114)


---
### Chloroplast w/ Replaced Layer

In [None]:
def create_model(input_sequence_length, number_of_outputs):
    """Build the "replaced layer" model: frozen conv stack with a new dense output.

    Three Conv1D(120 filters, width 5, ReLU) -> BatchNormalization ->
    Dropout(0.1) stages, with every conv and batch-norm layer created with
    ``trainable=False``, are followed by Flatten and a single linear Dense
    layer with ``number_of_outputs`` units. No 12-unit dense layer is
    included in this variant.

    Args:
        input_sequence_length: number of positions per one-hot encoded input;
            the input shape is ``(input_sequence_length, 4)``.
        number_of_outputs: number of units in the output layer.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    frozen = {"trainable": False}

    layer_stack = [
        Conv1D(120, 5, activation='relu', input_shape=(input_sequence_length, 4),
               name="1DConv_1", **frozen),
        BatchNormalization(name="batchNorm1", **frozen),
        Dropout(0.1, name="drop1"),

        Conv1D(120, 5, activation='relu', name="1DConv_2", **frozen),
        BatchNormalization(name="batchNorm2", **frozen),
        Dropout(0.1, name="drop2"),

        Conv1D(120, 5, activation='relu', name="1DConv_3", **frozen),
        BatchNormalization(name="batchNorm3", **frozen),
        Dropout(0.1, name="drop3"),

        Flatten(name="flat"),
        Dense(number_of_outputs, activation='linear', name="dense2"),
    ]

    return Sequential(layer_stack)

In [None]:
model = create_model(145, 1)

In [None]:
model.summary()#line_length=None, positions=None, print_fn=None)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
1DConv_1 (Conv1D)            (None, 141, 120)          2520      
_________________________________________________________________
batchNorm1 (BatchNormalizati (None, 141, 120)          480       
_________________________________________________________________
drop1 (Dropout)              (None, 141, 120)          0         
_________________________________________________________________
1DConv_2 (Conv1D)            (None, 137, 120)          72120     
_________________________________________________________________
batchNorm2 (BatchNormalizati (None, 137, 120)          480       
_________________________________________________________________
drop2 (Dropout)              (None, 137, 120)          0         
_________________________________________________________________
1DConv_3 (Conv1D)            (None, 133, 120)          72120     
__________

In [None]:
# set the weights that we want (tested and works)
# Weights are copied by layer position from pretrained_model into every layer
# of `model` except the last one, which keeps its fresh initialization.
# Assumes both models list their layers in the same order -- TODO confirm.

for i in range(len(model.layers)-1):  # -1 leaves out the final layer
    pretrained_layer_weights = pretrained_model.layers[i].get_weights()
    model.layers[i].set_weights(pretrained_layer_weights)

In [None]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
1DConv_1 (Conv1D)            (None, 141, 120)          2520      
_________________________________________________________________
batchNorm1 (BatchNormalizati (None, 141, 120)          480       
_________________________________________________________________
drop1 (Dropout)              (None, 141, 120)          0         
_________________________________________________________________
1DConv_2 (Conv1D)            (None, 137, 120)          72120     
_________________________________________________________________
batchNorm2 (BatchNormalizati (None, 137, 120)          480       
_________________________________________________________________
drop2 (Dropout)              (None, 137, 120)          0         
_________________________________________________________________
1DConv_3 (Conv1D)            (None, 133, 120)          72120     
__________

In [None]:
# load in the data
# header=None: no row is treated as a header, so columns are addressed by integer position.
df = pd.read_csv("data/raw/hidra.tsv", sep="\t", header=None)

# TODO: standardize data

# Rows are selected by the identifier in column 0; only columns 3 and 8 are
# kept (used downstream as the sequence and the regression target).
df_chloro = df[df[0] == "NC_016734.1"][[3,8]]
df_mito = df[df[0] == "NC_008285.1"][[3,8]]

In [None]:
# One-hot encode every sequence in column 3 and collect the targets from column 8.
X_chloro = np.array(list(map(get_ohe, df_chloro[3])))
y_chloro = np.asarray(df_chloro[8].tolist())

In [None]:
X_chloro.shape, y_chloro.shape

((30544, 145, 4), (30544,))

In [None]:
# Sequential split: first tenth of the rows -> test, second tenth ->
# validation, rest -> train (file order, no shuffling).
idx = X_chloro.shape[0] // 10

X_test, y_test = X_chloro[:idx], y_chloro[:idx]
X_val, y_val = X_chloro[idx:2*idx], y_chloro[idx:2*idx]
X_train, y_train = X_chloro[2*idx:], y_chloro[2*idx:]

In [None]:
# train new model
# Compile with the 'adam' optimizer and a mean-squared-error loss, then fit
# for 10 epochs while evaluating on the validation split after each epoch.

model.compile(optimizer='adam',
            loss="mean_squared_error")

history = model.fit(X_train, y_train, epochs=10, 
                    validation_data=(X_val, y_val))

Train on 24436 samples, validate on 3054 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# R^2 for each split; predictions are flattened to 1-D before scoring
print("Train", r2_score(y_train, model.predict(X_train).ravel()))
print("Val", r2_score(y_val, model.predict(X_val).ravel()))
print("Test", r2_score(y_test, model.predict(X_test).ravel()))

Train 0.305479799583078
Val -0.04177023622108811
Test -2.0856430092333516


In [None]:
# Spearman rank correlation for each split; predictions are flattened to 1-D
print("Train", spearmanr(y_train, model.predict(X_train).ravel()))
print("Val", spearmanr(y_val, model.predict(X_val).ravel()))
print("Test", spearmanr(y_test, model.predict(X_test).ravel()))

Train SpearmanrResult(correlation=0.7238977464110093, pvalue=0.0)
Val SpearmanrResult(correlation=0.29193534866384485, pvalue=4.5447566658393835e-61)
Test SpearmanrResult(correlation=0.3175957726749138, pvalue=1.5550787606399824e-72)


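## C3, C4, C5

Legend for the per-layer settings in the headings below, as implemented in the cells that follow: **F** = initialized from the pretrained weights and fine-tuned (`trainable=True`), **Z** = initialized from the pretrained weights and frozen (`trainable=False`), **R** = randomly initialized (no pretrained weights copied) and trained.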

### C3 Chloroplast: L1 = F, L2 = F, L3 = F

In [None]:
def create_model(input_sequence_length, number_of_outputs):
    """Build the C3 model: all three conv stages trainable, new dense output.

    Three Conv1D(120 filters, width 5, ReLU) -> BatchNormalization ->
    Dropout(0.1) stages, each conv and batch-norm layer created with
    ``trainable=True``, are followed by Flatten and a linear Dense layer with
    ``number_of_outputs`` units.

    Args:
        input_sequence_length: number of positions per one-hot encoded input;
            the input shape is ``(input_sequence_length, 4)``.
        number_of_outputs: number of units in the output layer.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    tunable = {"trainable": True}

    layer_stack = [
        Conv1D(120, 5, activation='relu', input_shape=(input_sequence_length, 4),
               name="1DConv_1", **tunable),
        BatchNormalization(name="batchNorm1", **tunable),
        Dropout(0.1, name="drop1"),

        Conv1D(120, 5, activation='relu', name="1DConv_2", **tunable),
        BatchNormalization(name="batchNorm2", **tunable),
        Dropout(0.1, name="drop2"),

        Conv1D(120, 5, activation='relu', name="1DConv_3", **tunable),
        BatchNormalization(name="batchNorm3", **tunable),
        Dropout(0.1, name="drop3"),

        Flatten(name="flat"),
        Dense(number_of_outputs, activation='linear', name="dense2"),
    ]

    return Sequential(layer_stack)

In [None]:
model = create_model(145, 1)

In [None]:
model.summary()#line_length=None, positions=None, print_fn=None)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
1DConv_1 (Conv1D)            (None, 141, 120)          2520      
_________________________________________________________________
batchNorm1 (BatchNormalizati (None, 141, 120)          480       
_________________________________________________________________
drop1 (Dropout)              (None, 141, 120)          0         
_________________________________________________________________
1DConv_2 (Conv1D)            (None, 137, 120)          72120     
_________________________________________________________________
batchNorm2 (BatchNormalizati (None, 137, 120)          480       
_________________________________________________________________
drop2 (Dropout)              (None, 137, 120)          0         
_________________________________________________________________
1DConv_3 (Conv1D)            (None, 133, 120)          72120     
__________

In [None]:
# set the weights that we want (tested and works)
# Weights are copied by layer position from pretrained_model into every layer
# of `model` except the last one, which keeps its fresh initialization.
# Assumes both models list their layers in the same order -- TODO confirm.

for i in range(len(model.layers)-1):  # -1 leaves out the final layer
    print(model.layers[i])  # show which layer is about to be overwritten
    pretrained_layer_weights = pretrained_model.layers[i].get_weights()
    model.layers[i].set_weights(pretrained_layer_weights)

<keras.layers.convolutional.Conv1D object at 0x7ff64eec2250>
<keras.layers.normalization.BatchNormalization object at 0x7ff64eea79d0>
<keras.layers.core.Dropout object at 0x7ff64eda7610>
<keras.layers.convolutional.Conv1D object at 0x7ff64eda7950>
<keras.layers.normalization.BatchNormalization object at 0x7ff64ee04b50>
<keras.layers.core.Dropout object at 0x7ff64f3a4850>
<keras.layers.convolutional.Conv1D object at 0x7ff64f775750>
<keras.layers.normalization.BatchNormalization object at 0x7ff64f7a5910>
<keras.layers.core.Dropout object at 0x7ff64f8137d0>
<keras.layers.core.Flatten object at 0x7ff64f92e050>


In [None]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
1DConv_1 (Conv1D)            (None, 141, 120)          2520      
_________________________________________________________________
batchNorm1 (BatchNormalizati (None, 141, 120)          480       
_________________________________________________________________
drop1 (Dropout)              (None, 141, 120)          0         
_________________________________________________________________
1DConv_2 (Conv1D)            (None, 137, 120)          72120     
_________________________________________________________________
batchNorm2 (BatchNormalizati (None, 137, 120)          480       
_________________________________________________________________
drop2 (Dropout)              (None, 137, 120)          0         
_________________________________________________________________
1DConv_3 (Conv1D)            (None, 133, 120)          72120     
__________

In [None]:
# load in the data
# header=None: no row is treated as a header, so columns are addressed by integer position.
df = pd.read_csv("data/raw/hidra.tsv", sep="\t", header=None)

# TODO: standardize data

# Rows are selected by the identifier in column 0; only columns 3 and 8 are
# kept (used downstream as the sequence and the regression target).
df_chloro = df[df[0] == "NC_016734.1"][[3,8]]
# df_mito = df[df[0] == "NC_008285.1"][[3,8]]

In [None]:
# One-hot encode every sequence in column 3 and collect the targets from column 8.
X_chloro = np.array(list(map(get_ohe, df_chloro[3])))
y_chloro = np.asarray(df_chloro[8].tolist())

In [None]:
X_chloro.shape, y_chloro.shape

((30544, 145, 4), (30544,))

In [None]:
# Sequential split: first tenth of the rows -> test, second tenth ->
# validation, rest -> train (file order, no shuffling).
idx = X_chloro.shape[0] // 10

X_test, y_test = X_chloro[:idx], y_chloro[:idx]
X_val, y_val = X_chloro[idx:2*idx], y_chloro[idx:2*idx]
X_train, y_train = X_chloro[2*idx:], y_chloro[2*idx:]

In [None]:
# train new model
# Compile with the 'adam' optimizer and a mean-squared-error loss, then fit
# for 10 epochs while evaluating on the validation split after each epoch.

model.compile(optimizer='adam',
            loss="mean_squared_error")

history = model.fit(X_train, y_train, epochs=10, 
                    validation_data=(X_val, y_val))

Train on 24436 samples, validate on 3054 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# R^2 for each split; predictions are flattened to 1-D before scoring
print("Train", r2_score(y_train, model.predict(X_train).ravel()))
print("Val", r2_score(y_val, model.predict(X_val).ravel()))
print("Test", r2_score(y_test, model.predict(X_test).ravel()))

Train 0.9799335241758428
Val -0.4995104242444577
Test -5.636248290009296


In [None]:
# Spearman rank correlation for each split; predictions are flattened to 1-D
print("Train", spearmanr(y_train, model.predict(X_train).ravel()))
print("Val", spearmanr(y_val, model.predict(X_val).ravel()))
print("Test", spearmanr(y_test, model.predict(X_test).ravel()))

Train SpearmanrResult(correlation=0.9922069131053126, pvalue=0.0)
Val SpearmanrResult(correlation=0.33976088308434715, pvalue=2.1632926427376007e-83)
Test SpearmanrResult(correlation=0.19787366531644104, pvalue=2.451821229188447e-28)


---
### C4 Chloroplast: L1 = Z, L2 = Z, L3 = F

In [None]:
def create_model(input_sequence_length, number_of_outputs):
    """Build the C4 model: conv stages 1-2 frozen, stage 3 trainable.

    Three Conv1D(120 filters, width 5, ReLU) -> BatchNormalization ->
    Dropout(0.1) stages are followed by Flatten and a linear Dense layer with
    ``number_of_outputs`` units. The conv and batch-norm layers of the first
    two stages are created with ``trainable=False``; those of the third stage
    use ``trainable=True``.

    Args:
        input_sequence_length: number of positions per one-hot encoded input;
            the input shape is ``(input_sequence_length, 4)``.
        number_of_outputs: number of units in the output layer.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    frozen = {"trainable": False}
    tunable = {"trainable": True}

    layer_stack = [
        Conv1D(120, 5, activation='relu', input_shape=(input_sequence_length, 4),
               name="1DConv_1", **frozen),
        BatchNormalization(name="batchNorm1", **frozen),
        Dropout(0.1, name="drop1"),

        Conv1D(120, 5, activation='relu', name="1DConv_2", **frozen),
        BatchNormalization(name="batchNorm2", **frozen),
        Dropout(0.1, name="drop2"),

        Conv1D(120, 5, activation='relu', name="1DConv_3", **tunable),
        BatchNormalization(name="batchNorm3", **tunable),
        Dropout(0.1, name="drop3"),

        Flatten(name="flat"),
        Dense(number_of_outputs, activation='linear', name="dense2"),
    ]

    return Sequential(layer_stack)

In [None]:
model = create_model(145, 1)

In [None]:
model.summary()#line_length=None, positions=None, print_fn=None)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
1DConv_1 (Conv1D)            (None, 141, 120)          2520      
_________________________________________________________________
batchNorm1 (BatchNormalizati (None, 141, 120)          480       
_________________________________________________________________
drop1 (Dropout)              (None, 141, 120)          0         
_________________________________________________________________
1DConv_2 (Conv1D)            (None, 137, 120)          72120     
_________________________________________________________________
batchNorm2 (BatchNormalizati (None, 137, 120)          480       
_________________________________________________________________
drop2 (Dropout)              (None, 137, 120)          0         
_________________________________________________________________
1DConv_3 (Conv1D)            (None, 133, 120)          72120     
__________

In [None]:
# set the weights that we want (tested and works)
# Weights are copied by layer position from pretrained_model into every layer
# of `model` except the last one, which keeps its fresh initialization.
# Assumes both models list their layers in the same order -- TODO confirm.

for i in range(len(model.layers)-1):  # -1 leaves out the final layer
    print(model.layers[i])  # show which layer is about to be overwritten
    pretrained_layer_weights = pretrained_model.layers[i].get_weights()
    model.layers[i].set_weights(pretrained_layer_weights)

<keras.layers.convolutional.Conv1D object at 0x7ff62c70cd90>
<keras.layers.normalization.BatchNormalization object at 0x7ff62c70ce10>
<keras.layers.core.Dropout object at 0x7ff62c70c050>
<keras.layers.convolutional.Conv1D object at 0x7ff62c62d950>
<keras.layers.normalization.BatchNormalization object at 0x7ff62c9adc10>
<keras.layers.core.Dropout object at 0x7ff62cd6aa10>
<keras.layers.convolutional.Conv1D object at 0x7ff62da23090>
<keras.layers.normalization.BatchNormalization object at 0x7ff62da98e90>
<keras.layers.core.Dropout object at 0x7ff62e0bdf10>
<keras.layers.core.Flatten object at 0x7ff62e0bd910>


In [None]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
1DConv_1 (Conv1D)            (None, 141, 120)          2520      
_________________________________________________________________
batchNorm1 (BatchNormalizati (None, 141, 120)          480       
_________________________________________________________________
drop1 (Dropout)              (None, 141, 120)          0         
_________________________________________________________________
1DConv_2 (Conv1D)            (None, 137, 120)          72120     
_________________________________________________________________
batchNorm2 (BatchNormalizati (None, 137, 120)          480       
_________________________________________________________________
drop2 (Dropout)              (None, 137, 120)          0         
_________________________________________________________________
1DConv_3 (Conv1D)            (None, 133, 120)          72120     
__________

In [None]:
# load in the data
# header=None: no row is treated as a header, so columns are addressed by integer position.
df = pd.read_csv("data/raw/hidra.tsv", sep="\t", header=None)

# TODO: standardize data

# Rows are selected by the identifier in column 0; only columns 3 and 8 are
# kept (used downstream as the sequence and the regression target).
df_chloro = df[df[0] == "NC_016734.1"][[3,8]]
df_mito = df[df[0] == "NC_008285.1"][[3,8]]

In [None]:
# One-hot encode every sequence in column 3 and collect the targets from column 8.
X_chloro = np.array(list(map(get_ohe, df_chloro[3])))
y_chloro = np.asarray(df_chloro[8].tolist())

In [None]:
X_chloro.shape, y_chloro.shape

((30544, 145, 4), (30544,))

In [None]:
# Sequential split: first tenth of the rows -> test, second tenth ->
# validation, rest -> train (file order, no shuffling).
idx = X_chloro.shape[0] // 10

X_test, y_test = X_chloro[:idx], y_chloro[:idx]
X_val, y_val = X_chloro[idx:2*idx], y_chloro[idx:2*idx]
X_train, y_train = X_chloro[2*idx:], y_chloro[2*idx:]

In [None]:
# train new model
# Compile with the 'adam' optimizer and a mean-squared-error loss, then fit
# for 10 epochs while evaluating on the validation split after each epoch.

model.compile(optimizer='adam',
            loss="mean_squared_error")

history = model.fit(X_train, y_train, epochs=10, 
                    validation_data=(X_val, y_val))

Train on 24436 samples, validate on 3054 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# R^2 for each split; predictions are flattened to 1-D before scoring
print("Train", r2_score(y_train, model.predict(X_train).ravel()))
print("Val", r2_score(y_val, model.predict(X_val).ravel()))
print("Test", r2_score(y_test, model.predict(X_test).ravel()))

Train 0.8871313556493472
Val -0.12134065629052238
Test -3.498811411763822


In [None]:
# Spearman rank correlation for each split; predictions are flattened to 1-D
print("Train", spearmanr(y_train, model.predict(X_train).ravel()))
print("Val", spearmanr(y_val, model.predict(X_val).ravel()))
print("Test", spearmanr(y_test, model.predict(X_test).ravel()))

Train SpearmanrResult(correlation=0.9863863021497022, pvalue=0.0)
Val SpearmanrResult(correlation=0.30289889295843436, pvalue=7.943610210692869e-66)
Test SpearmanrResult(correlation=0.19439020557687545, pvalue=2.1821821628929946e-27)


---
### C5 Chloroplast: L1 = Z, L2 = Z, L3 = R

In [None]:
def create_model(input_sequence_length, number_of_outputs):
    """Build the model with conv stages 1-2 frozen and stage 3 trainable.

    Three Conv1D(120 filters, width 5, ReLU) -> BatchNormalization ->
    Dropout(0.1) stages are followed by Flatten and a linear Dense layer with
    ``number_of_outputs`` units. The conv and batch-norm layers of the first
    two stages are created with ``trainable=False``; those of the third stage
    use ``trainable=True``. Which layers start from pretrained weights is
    decided outside this function.

    Args:
        input_sequence_length: number of positions per one-hot encoded input;
            the input shape is ``(input_sequence_length, 4)``.
        number_of_outputs: number of units in the output layer.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    frozen = {"trainable": False}
    tunable = {"trainable": True}

    layer_stack = [
        Conv1D(120, 5, activation='relu', input_shape=(input_sequence_length, 4),
               name="1DConv_1", **frozen),
        BatchNormalization(name="batchNorm1", **frozen),
        Dropout(0.1, name="drop1"),

        Conv1D(120, 5, activation='relu', name="1DConv_2", **frozen),
        BatchNormalization(name="batchNorm2", **frozen),
        Dropout(0.1, name="drop2"),

        Conv1D(120, 5, activation='relu', name="1DConv_3", **tunable),
        BatchNormalization(name="batchNorm3", **tunable),
        Dropout(0.1, name="drop3"),

        Flatten(name="flat"),
        Dense(number_of_outputs, activation='linear', name="dense2"),
    ]

    return Sequential(layer_stack)

In [None]:
model = create_model(145, 1)

In [None]:
model.summary()#line_length=None, positions=None, print_fn=None)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
1DConv_1 (Conv1D)            (None, 141, 120)          2520      
_________________________________________________________________
batchNorm1 (BatchNormalizati (None, 141, 120)          480       
_________________________________________________________________
drop1 (Dropout)              (None, 141, 120)          0         
_________________________________________________________________
1DConv_2 (Conv1D)            (None, 137, 120)          72120     
_________________________________________________________________
batchNorm2 (BatchNormalizati (None, 137, 120)          480       
_________________________________________________________________
drop2 (Dropout)              (None, 137, 120)          0         
_________________________________________________________________
1DConv_3 (Conv1D)            (None, 133, 120)          72120     
__________

In [None]:
# set the weights that we want (tested and works)
# Only layer indices 0-5 (the first two Conv1D / BatchNormalization / Dropout
# stages) receive pretrained weights, copied by position. Every later layer,
# including the third conv stage, keeps its initial weights.

for i in range(6):  # 6 = two stages x three layers each
    print(model.layers[i])  # show which layer is about to be overwritten
    pretrained_layer_weights = pretrained_model.layers[i].get_weights()
    model.layers[i].set_weights(pretrained_layer_weights)

<keras.layers.convolutional.Conv1D object at 0x7ff61befbf50>
<keras.layers.normalization.BatchNormalization object at 0x7ff61bf58910>
<keras.layers.core.Dropout object at 0x7ff61bf7bc50>
<keras.layers.convolutional.Conv1D object at 0x7ff61c1141d0>
<keras.layers.normalization.BatchNormalization object at 0x7ff61c114f10>
<keras.layers.core.Dropout object at 0x7ff61bf58e90>


In [None]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
1DConv_1 (Conv1D)            (None, 141, 120)          2520      
_________________________________________________________________
batchNorm1 (BatchNormalizati (None, 141, 120)          480       
_________________________________________________________________
drop1 (Dropout)              (None, 141, 120)          0         
_________________________________________________________________
1DConv_2 (Conv1D)            (None, 137, 120)          72120     
_________________________________________________________________
batchNorm2 (BatchNormalizati (None, 137, 120)          480       
_________________________________________________________________
drop2 (Dropout)              (None, 137, 120)          0         
_________________________________________________________________
1DConv_3 (Conv1D)            (None, 133, 120)          72120     
__________

In [None]:
# load in the data
# header=None: no row is treated as a header, so columns are addressed by integer position.
df = pd.read_csv("data/raw/hidra.tsv", sep="\t", header=None)

# TODO: standardize data

# Rows are selected by the identifier in column 0; only columns 3 and 8 are
# kept (used downstream as the sequence and the regression target).
df_chloro = df[df[0] == "NC_016734.1"][[3,8]]
df_mito = df[df[0] == "NC_008285.1"][[3,8]]

In [None]:
# One-hot encode every sequence in column 3 and collect the targets from column 8.
X_chloro = np.array(list(map(get_ohe, df_chloro[3])))
y_chloro = np.asarray(df_chloro[8].tolist())

In [None]:
X_chloro.shape, y_chloro.shape

((30544, 145, 4), (30544,))

In [None]:
# Sequential split: first tenth of the rows -> test, second tenth ->
# validation, rest -> train (file order, no shuffling).
idx = X_chloro.shape[0] // 10

X_test, y_test = X_chloro[:idx], y_chloro[:idx]
X_val, y_val = X_chloro[idx:2*idx], y_chloro[idx:2*idx]
X_train, y_train = X_chloro[2*idx:], y_chloro[2*idx:]

In [None]:
# train new model
# Compile with the 'adam' optimizer and a mean-squared-error loss, then fit
# for 10 epochs while evaluating on the validation split after each epoch.

model.compile(optimizer='adam',
            loss="mean_squared_error")

history = model.fit(X_train, y_train, epochs=10, 
                    validation_data=(X_val, y_val))

Train on 24436 samples, validate on 3054 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# R^2 for each split; predictions are flattened to 1-D before scoring
print("Train", r2_score(y_train, model.predict(X_train).ravel()))
print("Val", r2_score(y_val, model.predict(X_val).ravel()))
print("Test", r2_score(y_test, model.predict(X_test).ravel()))

Train 0.9358380801416339
Val -0.254193614432878
Test -3.7695818839165547


In [None]:
# Spearman rank correlation for each split; predictions are flattened to 1-D
print("Train", spearmanr(y_train, model.predict(X_train).ravel()))
print("Val", spearmanr(y_val, model.predict(X_val).ravel()))
print("Test", spearmanr(y_test, model.predict(X_test).ravel()))

Train SpearmanrResult(correlation=0.9797530730630867, pvalue=0.0)
Val SpearmanrResult(correlation=0.27975972285828044, pvalue=5.008215429418851e-56)
Test SpearmanrResult(correlation=0.20483495827101586, pvalue=2.7433648175799054e-30)


### C3 Mitochondrion: L1 = F, L2 = F, L3 = F

In [None]:
def create_model(input_sequence_length, number_of_outputs, trainable_blocks=(True, True, True)):
    """Build the three-block 1D convolutional regressor used in this section.

    Each block is Conv1D(120 filters, kernel size 5, ReLU) -> BatchNormalization
    -> Dropout(0.1). The blocks are followed by Flatten and a linear Dense layer.

    Args:
        input_sequence_length: Number of positions per input sequence; the model
            input shape is (input_sequence_length, 4).
        number_of_outputs: Number of units in the final linear Dense layer.
        trainable_blocks: Three flags, one per convolutional block. A falsy flag
            marks that block's Conv1D and BatchNormalization layers as
            non-trainable. The default keeps all three blocks trainable, which
            is the configuration of this section.

    Returns:
        An uncompiled Sequential model.

    Raises:
        ValueError: If ``trainable_blocks`` does not hold exactly three flags.
    """
    block_flags = tuple(trainable_blocks)
    if len(block_flags) != 3:
        raise ValueError(
            "trainable_blocks must contain exactly 3 flags, got %d" % len(block_flags)
        )

    model = Sequential()

    for block_number, flag in enumerate(block_flags, start=1):
        is_trainable = bool(flag)
        # Only the very first layer of a Sequential model declares the input shape.
        first_layer_kwargs = {"input_shape": (input_sequence_length, 4)} if block_number == 1 else {}
        model.add(Conv1D(120, 5, activation='relu', name="1DConv_%d" % block_number,
                         trainable=is_trainable, **first_layer_kwargs))
        model.add(BatchNormalization(name="batchNorm%d" % block_number, trainable=is_trainable))
        model.add(Dropout(0.1, name="drop%d" % block_number))

    model.add(Flatten(name="flat"))
    # NOTE: a frozen 12-unit linear "dense1" layer was prototyped at this position and is left disabled.

    model.add(Dense(number_of_outputs, activation='linear', name="dense2"))

    return model

In [None]:
model = create_model(145, 1)

In [None]:
model.summary()  # called with the default line_length, positions and print_fn

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
1DConv_1 (Conv1D)            (None, 141, 120)          2520      
_________________________________________________________________
batchNorm1 (BatchNormalizati (None, 141, 120)          480       
_________________________________________________________________
drop1 (Dropout)              (None, 141, 120)          0         
_________________________________________________________________
1DConv_2 (Conv1D)            (None, 137, 120)          72120     
_________________________________________________________________
batchNorm2 (BatchNormalizati (None, 137, 120)          480       
_________________________________________________________________
drop2 (Dropout)              (None, 137, 120)          0         
_________________________________________________________________
1DConv_3 (Conv1D)            (None, 133, 120)          72120     
__________

In [None]:
# set the weights that we want (tested and works)
# Every layer except the last one (the new "dense2" head) receives the weights of the
# pretrained layer at the same index, so both models must share the same layer order.
for i, target_layer in enumerate(model.layers[:-1]):
    print(target_layer)
    pretrained_layer_weights = pretrained_model.layers[i].get_weights()
    target_layer.set_weights(pretrained_layer_weights)

<keras.layers.convolutional.Conv1D object at 0x7fd48b7307d0>
<keras.layers.normalization.BatchNormalization object at 0x7fd48b73fad0>
<keras.layers.core.Dropout object at 0x7fd48b644b50>
<keras.layers.convolutional.Conv1D object at 0x7fd48b6449d0>
<keras.layers.normalization.BatchNormalization object at 0x7fd48b690110>
<keras.layers.core.Dropout object at 0x7fd48bfdaed0>
<keras.layers.convolutional.Conv1D object at 0x7fd48c1d4c50>
<keras.layers.normalization.BatchNormalization object at 0x7fd48c1f65d0>
<keras.layers.core.Dropout object at 0x7fd48c1c5990>
<keras.layers.core.Flatten object at 0x7fd48c37ed90>


In [None]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
1DConv_1 (Conv1D)            (None, 141, 120)          2520      
_________________________________________________________________
batchNorm1 (BatchNormalizati (None, 141, 120)          480       
_________________________________________________________________
drop1 (Dropout)              (None, 141, 120)          0         
_________________________________________________________________
1DConv_2 (Conv1D)            (None, 137, 120)          72120     
_________________________________________________________________
batchNorm2 (BatchNormalizati (None, 137, 120)          480       
_________________________________________________________________
drop2 (Dropout)              (None, 137, 120)          0         
_________________________________________________________________
1DConv_3 (Conv1D)            (None, 133, 120)          72120     
__________

In [None]:
# Read data/raw/hidra.tsv; the file has no header row, so columns are addressed by position.
df = pd.read_csv("data/raw/hidra.tsv", header=None, sep="\t")

# TODO: standardize data

# NOTE: the variable keeps the `df_chloro` name, but this section selects accession
# NC_008285.1 (the mitochondrion experiment of the section heading) instead of NC_016734.1.
df_chloro = df.loc[df[0].eq("NC_008285.1"), [3, 8]]

In [None]:
# Encode every sequence in column 3 with get_ohe and stack the results into one array.
# (The `_chloro` names are reused here for the rows selected in the previous cell.)
X_chloro = np.array(list(map(get_ohe, df_chloro[3])))
# Column 8 is the regression target.
y_chloro = np.asarray(df_chloro[8].tolist())

In [None]:
X_chloro.shape, y_chloro.shape

((44342, 145, 4), (44342,))

In [None]:
# Positional split without shuffling: first tenth -> test, second tenth -> validation,
# everything after that -> train.
idx = len(X_chloro) // 10

X_test, X_val, X_train = np.split(X_chloro, [idx, 2 * idx])
y_test, y_val, y_train = np.split(y_chloro, [idx, 2 * idx])

In [None]:
# train new model
# Adam optimizer, mean-squared-error loss.
model.compile(loss="mean_squared_error", optimizer='adam')

# Ten epochs; (X_val, y_val) is passed as validation data.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=10,
)

Train on 35474 samples, validate on 4434 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# R^2
# Predictions are flattened to 1-D before being scored against the targets of each split.
for split_label, split_X, split_y in (("Train", X_train, y_train),
                                      ("Val", X_val, y_val),
                                      ("Test", X_test, y_test)):
    print(split_label, r2_score(split_y, model.predict(split_X).ravel()))

Train 0.960030166856412
Val -0.2553586740139038
Test -0.0474876591929847


In [None]:
# Spearman
# Rank correlation between targets and flattened predictions, reported per split.
for split_label, split_X, split_y in (("Train", X_train, y_train),
                                      ("Val", X_val, y_val),
                                      ("Test", X_test, y_test)):
    print(split_label, spearmanr(split_y, model.predict(split_X).ravel()))

Train SpearmanrResult(correlation=0.9831030515052255, pvalue=0.0)
Val SpearmanrResult(correlation=0.05786917496219279, pvalue=0.00011546788033856874)
Test SpearmanrResult(correlation=0.12038642119929258, pvalue=8.742740315960421e-16)


---
### C4 Mitochondrion: L1 = Z, L2 = Z, L3 = F

In [None]:
def create_model(input_sequence_length, number_of_outputs, trainable_blocks=(False, False, True)):
    """Build the three-block 1D convolutional regressor used in this section.

    Each block is Conv1D(120 filters, kernel size 5, ReLU) -> BatchNormalization
    -> Dropout(0.1). The blocks are followed by Flatten and a linear Dense layer.

    Args:
        input_sequence_length: Number of positions per input sequence; the model
            input shape is (input_sequence_length, 4).
        number_of_outputs: Number of units in the final linear Dense layer.
        trainable_blocks: Three flags, one per convolutional block. A falsy flag
            marks that block's Conv1D and BatchNormalization layers as
            non-trainable. The default freezes blocks 1 and 2 and leaves block 3
            trainable, which is the configuration of this section.

    Returns:
        An uncompiled Sequential model.

    Raises:
        ValueError: If ``trainable_blocks`` does not hold exactly three flags.
    """
    block_flags = tuple(trainable_blocks)
    if len(block_flags) != 3:
        raise ValueError(
            "trainable_blocks must contain exactly 3 flags, got %d" % len(block_flags)
        )

    model = Sequential()

    for block_number, flag in enumerate(block_flags, start=1):
        is_trainable = bool(flag)
        # Only the very first layer of a Sequential model declares the input shape.
        first_layer_kwargs = {"input_shape": (input_sequence_length, 4)} if block_number == 1 else {}
        model.add(Conv1D(120, 5, activation='relu', name="1DConv_%d" % block_number,
                         trainable=is_trainable, **first_layer_kwargs))
        model.add(BatchNormalization(name="batchNorm%d" % block_number, trainable=is_trainable))
        model.add(Dropout(0.1, name="drop%d" % block_number))

    model.add(Flatten(name="flat"))
    # NOTE: a frozen 12-unit linear "dense1" layer was prototyped at this position and is left disabled.

    model.add(Dense(number_of_outputs, activation='linear', name="dense2"))

    return model

In [None]:
model = create_model(145, 1)

In [None]:
model.summary()  # called with the default line_length, positions and print_fn

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
1DConv_1 (Conv1D)            (None, 141, 120)          2520      
_________________________________________________________________
batchNorm1 (BatchNormalizati (None, 141, 120)          480       
_________________________________________________________________
drop1 (Dropout)              (None, 141, 120)          0         
_________________________________________________________________
1DConv_2 (Conv1D)            (None, 137, 120)          72120     
_________________________________________________________________
batchNorm2 (BatchNormalizati (None, 137, 120)          480       
_________________________________________________________________
drop2 (Dropout)              (None, 137, 120)          0         
_________________________________________________________________
1DConv_3 (Conv1D)            (None, 133, 120)          72120     
__________

In [None]:
# set the weights that we want (tested and works)
# Every layer except the last one (the new "dense2" head) receives the weights of the
# pretrained layer at the same index, so both models must share the same layer order.
for i, target_layer in enumerate(model.layers[:-1]):
    print(target_layer)
    pretrained_layer_weights = pretrained_model.layers[i].get_weights()
    target_layer.set_weights(pretrained_layer_weights)

<keras.layers.convolutional.Conv1D object at 0x7fd46c455410>
<keras.layers.normalization.BatchNormalization object at 0x7fd46c467a10>
<keras.layers.core.Dropout object at 0x7fd46c467850>
<keras.layers.convolutional.Conv1D object at 0x7fd46c347950>
<keras.layers.normalization.BatchNormalization object at 0x7fd46ca43fd0>
<keras.layers.core.Dropout object at 0x7fd46ca62850>
<keras.layers.convolutional.Conv1D object at 0x7fd46d2c4f50>
<keras.layers.normalization.BatchNormalization object at 0x7fd46d643a10>
<keras.layers.core.Dropout object at 0x7fd46d2ac510>
<keras.layers.core.Flatten object at 0x7fd46d85fe90>


In [None]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
1DConv_1 (Conv1D)            (None, 141, 120)          2520      
_________________________________________________________________
batchNorm1 (BatchNormalizati (None, 141, 120)          480       
_________________________________________________________________
drop1 (Dropout)              (None, 141, 120)          0         
_________________________________________________________________
1DConv_2 (Conv1D)            (None, 137, 120)          72120     
_________________________________________________________________
batchNorm2 (BatchNormalizati (None, 137, 120)          480       
_________________________________________________________________
drop2 (Dropout)              (None, 137, 120)          0         
_________________________________________________________________
1DConv_3 (Conv1D)            (None, 133, 120)          72120     
__________

In [None]:
# Read data/raw/hidra.tsv; the file has no header row, so columns are addressed by position.
df = pd.read_csv("data/raw/hidra.tsv", header=None, sep="\t")

# TODO: standardize data

# NOTE: the variable keeps the `df_chloro` name, but this section selects accession
# NC_008285.1 (the mitochondrion experiment of the section heading) instead of NC_016734.1.
df_chloro = df.loc[df[0].eq("NC_008285.1"), [3, 8]]

In [None]:
# Encode every sequence in column 3 with get_ohe and stack the results into one array.
# (The `_chloro` names are reused here for the rows selected in the previous cell.)
X_chloro = np.array(list(map(get_ohe, df_chloro[3])))
# Column 8 is the regression target.
y_chloro = np.asarray(df_chloro[8].tolist())

In [None]:
X_chloro.shape, y_chloro.shape

((44342, 145, 4), (44342,))

In [None]:
# Positional split without shuffling: first tenth -> test, second tenth -> validation,
# everything after that -> train.
idx = len(X_chloro) // 10

X_test, X_val, X_train = np.split(X_chloro, [idx, 2 * idx])
y_test, y_val, y_train = np.split(y_chloro, [idx, 2 * idx])

In [None]:
# train new model
# Adam optimizer, mean-squared-error loss.
model.compile(loss="mean_squared_error", optimizer='adam')

# Ten epochs; (X_val, y_val) is passed as validation data.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=10,
)

Train on 35474 samples, validate on 4434 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# R^2
# Predictions are flattened to 1-D before being scored against the targets of each split.
for split_label, split_X, split_y in (("Train", X_train, y_train),
                                      ("Val", X_val, y_val),
                                      ("Test", X_test, y_test)):
    print(split_label, r2_score(split_y, model.predict(split_X).ravel()))

Train 0.9415857396748929
Val -0.22931888773526987
Test -0.20688776875565962


In [None]:
# Spearman
# Rank correlation between targets and flattened predictions, reported per split.
for split_label, split_X, split_y in (("Train", X_train, y_train),
                                      ("Val", X_val, y_val),
                                      ("Test", X_test, y_test)):
    print(split_label, spearmanr(split_y, model.predict(split_X).ravel()))

Train SpearmanrResult(correlation=0.9741109089254318, pvalue=0.0)
Val SpearmanrResult(correlation=0.07419364169473011, pvalue=7.581153582165634e-07)
Test SpearmanrResult(correlation=-0.021965243001119066, pvalue=0.14363383603781982)


---
### C5 Mitochondrion: L1 = Z, L2 = Z, L3 = R

In [None]:
def create_model(input_sequence_length, number_of_outputs, trainable_blocks=(False, False, True)):
    """Build the three-block 1D convolutional regressor used in this section.

    Each block is Conv1D(120 filters, kernel size 5, ReLU) -> BatchNormalization
    -> Dropout(0.1). The blocks are followed by Flatten and a linear Dense layer.

    Args:
        input_sequence_length: Number of positions per input sequence; the model
            input shape is (input_sequence_length, 4).
        number_of_outputs: Number of units in the final linear Dense layer.
        trainable_blocks: Three flags, one per convolutional block. A falsy flag
            marks that block's Conv1D and BatchNormalization layers as
            non-trainable. The default freezes blocks 1 and 2 and leaves block 3
            trainable, which is the configuration of this section.

    Returns:
        An uncompiled Sequential model.

    Raises:
        ValueError: If ``trainable_blocks`` does not hold exactly three flags.
    """
    block_flags = tuple(trainable_blocks)
    if len(block_flags) != 3:
        raise ValueError(
            "trainable_blocks must contain exactly 3 flags, got %d" % len(block_flags)
        )

    model = Sequential()

    for block_number, flag in enumerate(block_flags, start=1):
        is_trainable = bool(flag)
        # Only the very first layer of a Sequential model declares the input shape.
        first_layer_kwargs = {"input_shape": (input_sequence_length, 4)} if block_number == 1 else {}
        model.add(Conv1D(120, 5, activation='relu', name="1DConv_%d" % block_number,
                         trainable=is_trainable, **first_layer_kwargs))
        model.add(BatchNormalization(name="batchNorm%d" % block_number, trainable=is_trainable))
        model.add(Dropout(0.1, name="drop%d" % block_number))

    model.add(Flatten(name="flat"))
    # NOTE: a frozen 12-unit linear "dense1" layer was prototyped at this position and is left disabled.

    model.add(Dense(number_of_outputs, activation='linear', name="dense2"))

    return model

In [None]:
model = create_model(145, 1)

In [None]:
model.summary()  # called with the default line_length, positions and print_fn

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
1DConv_1 (Conv1D)            (None, 141, 120)          2520      
_________________________________________________________________
batchNorm1 (BatchNormalizati (None, 141, 120)          480       
_________________________________________________________________
drop1 (Dropout)              (None, 141, 120)          0         
_________________________________________________________________
1DConv_2 (Conv1D)            (None, 137, 120)          72120     
_________________________________________________________________
batchNorm2 (BatchNormalizati (None, 137, 120)          480       
_________________________________________________________________
drop2 (Dropout)              (None, 137, 120)          0         
_________________________________________________________________
1DConv_3 (Conv1D)            (None, 133, 120)          72120     
__________

In [None]:
# set the weights that we want (tested and works)
# Only indices 0-5 (conv, batch-norm and dropout of blocks 1 and 2) are copied from the
# pretrained model; the third block and the dense head keep their fresh initialisation.
for i, target_layer in enumerate(model.layers[:6]):
    print(target_layer)
    pretrained_layer_weights = pretrained_model.layers[i].get_weights()
    target_layer.set_weights(pretrained_layer_weights)

<keras.layers.convolutional.Conv1D object at 0x7fd45ba7f950>
<keras.layers.normalization.BatchNormalization object at 0x7fd45ba7f2d0>
<keras.layers.core.Dropout object at 0x7fd45bd5ac10>
<keras.layers.convolutional.Conv1D object at 0x7fd45ba989d0>
<keras.layers.normalization.BatchNormalization object at 0x7fd45c24ced0>
<keras.layers.core.Dropout object at 0x7fd45bc21f50>


In [None]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
1DConv_1 (Conv1D)            (None, 141, 120)          2520      
_________________________________________________________________
batchNorm1 (BatchNormalizati (None, 141, 120)          480       
_________________________________________________________________
drop1 (Dropout)              (None, 141, 120)          0         
_________________________________________________________________
1DConv_2 (Conv1D)            (None, 137, 120)          72120     
_________________________________________________________________
batchNorm2 (BatchNormalizati (None, 137, 120)          480       
_________________________________________________________________
drop2 (Dropout)              (None, 137, 120)          0         
_________________________________________________________________
1DConv_3 (Conv1D)            (None, 133, 120)          72120     
__________

In [None]:
# Read data/raw/hidra.tsv; the file has no header row, so columns are addressed by position.
df = pd.read_csv("data/raw/hidra.tsv", header=None, sep="\t")

# TODO: standardize data

# NOTE: the variable keeps the `df_chloro` name, but this section selects accession
# NC_008285.1 (the mitochondrion experiment of the section heading) instead of NC_016734.1.
df_chloro = df.loc[df[0].eq("NC_008285.1"), [3, 8]]

In [None]:
# Encode every sequence in column 3 with get_ohe and stack the results into one array.
# (The `_chloro` names are reused here for the rows selected in the previous cell.)
X_chloro = np.array(list(map(get_ohe, df_chloro[3])))
# Column 8 is the regression target.
y_chloro = np.asarray(df_chloro[8].tolist())

In [None]:
X_chloro.shape, y_chloro.shape

((44342, 145, 4), (44342,))

In [None]:
# Positional split without shuffling: first tenth -> test, second tenth -> validation,
# everything after that -> train.
idx = len(X_chloro) // 10

X_test, X_val, X_train = np.split(X_chloro, [idx, 2 * idx])
y_test, y_val, y_train = np.split(y_chloro, [idx, 2 * idx])

In [None]:
# train new model
# Adam optimizer, mean-squared-error loss.
model.compile(loss="mean_squared_error", optimizer='adam')

# Ten epochs; (X_val, y_val) is passed as validation data.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=10,
)

Train on 35474 samples, validate on 4434 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# R^2
# Predictions are flattened to 1-D before being scored against the targets of each split.
for split_label, split_X, split_y in (("Train", X_train, y_train),
                                      ("Val", X_val, y_val),
                                      ("Test", X_test, y_test)):
    print(split_label, r2_score(split_y, model.predict(split_X).ravel()))

Train 0.9489978382498858
Val -0.30910819657873745
Test -0.15048862029150722


In [None]:
# Spearman
# Rank correlation between targets and flattened predictions, reported per split.
for split_label, split_X, split_y in (("Train", X_train, y_train),
                                      ("Val", X_val, y_val),
                                      ("Test", X_test, y_test)):
    print(split_label, spearmanr(split_y, model.predict(split_X).ravel()))

Train SpearmanrResult(correlation=0.96592136407161, pvalue=0.0)
Val SpearmanrResult(correlation=0.012712260445004118, pvalue=0.3973947908291495)
Test SpearmanrResult(correlation=0.021540618220178786, pvalue=0.15153949320643711)
