name: keras_training.ipynb

This notebook initializes our own version of MPRA-DragoNN's model and trains it from scratch. We vary several settings (training organelle, held-out organelle, and target standardization) to see the effect each has on our metrics.

---

In [None]:
# Mount Google Drive (Colab only) and switch to the project root so the relative paths used below resolve

from google.colab import drive
drive.mount('/content/drive')

%cd '/content/drive/Shareddrives/NRC_Amii_Agronomics_Project/nrc-ml-plant-genomics/'

In [None]:
import keras
import warnings, logging
import numpy as np
import pandas as pd

from models.conv_model import ConvModel as Model
from utils.dirs import create_dirs
from utils.fetch_args import fetch_args

from models.base_model import BaseModel
from keras.models import Sequential
from keras.layers import Input, Dense, Conv1D, MaxPooling2D, Dropout, Flatten, BatchNormalization
from keras.optimizers import Adam
import tensorflow as tf

from sklearn.metrics import r2_score
from scipy.stats import spearmanr # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.spearmanr.html

warnings.filterwarnings('ignore')
logging.disable(1000)

Using TensorFlow backend.


In [None]:
# one-hot encode sequence
# Each nucleotide maps to a 4-element indicator row (cross referenced with kipoi data loader)
mapping = {
    "A": [1, 0, 0, 0],
    "T": [0, 0, 0, 1],
    "C": [0, 1, 0, 0],
    "G": [0, 0, 1, 0],
}


def get_output(spec_model, sequence):
    """Return ``spec_model.predict`` for a single sequence.

    The sequence is one-hot encoded and wrapped in a leading axis of size 1
    so the model receives a batch containing exactly one example.
    """
    single_batch = np.array([get_ohe(sequence)])
    return spec_model.predict(single_batch)


def get_ohe(sequence):
    """One-hot encode ``sequence`` into an array of shape (len(sequence), 4).

    Every character is looked up in ``mapping``; a character that is not a
    key there (anything other than "A", "T", "C", "G") raises ``KeyError``.
    """
    rows = [mapping[base] for base in sequence]
    return np.array(rows)

In [None]:
# load in model as it is
# The architecture is rebuilt from its JSON description and the stored weights
# are then attached (presumably the published MPRA-DragoNN weights -- confirm).

from keras.models import model_from_json
import json  # NOTE(review): not referenced in the visible cells

with open('models/model.json', 'r') as json_file:
    json_savedModel = json_file.read()  # raw JSON text, handed to model_from_json below
    
model = model_from_json(json_savedModel)
model.load_weights('models/pretrained.hdf5')

In [None]:
model.summary(line_length=None, positions=None, print_fn=None)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 141, 120)          2520      
_________________________________________________________________
batch_normalization_1 (Batch (None, 141, 120)          480       
_________________________________________________________________
dropout_1 (Dropout)          (None, 141, 120)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 137, 120)          72120     
_________________________________________________________________
batch_normalization_2 (Batch (None, 137, 120)          480       
_________________________________________________________________
dropout_2 (Dropout)          (None, 137, 120)          0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 133, 120)          72120     
__________

In [None]:
# define new model architecture

def create_model(input_sequence_length, number_of_outputs,
                 num_conv_blocks=3, filters=120, kernel_size=5, dropout_rate=0.1):
    """Build an untrained (uncompiled) 1D-convolutional regression network.

    The network is a stack of ``num_conv_blocks`` identical blocks
    (Conv1D with ReLU -> BatchNormalization -> Dropout), followed by a
    Flatten layer and a linear Dense output layer. With the default
    arguments this reproduces the original hard-coded architecture: three
    blocks of Conv1D(120, 5) with Dropout(0.1), and the same layer names
    ("1DConv_<i>", "batchNorm<i>", "drop<i>", "flat", "dense1").

    Args:
        input_sequence_length: number of positions in each one-hot encoded
            input; the model input shape is (input_sequence_length, 4).
        number_of_outputs: number of units in the final linear Dense layer.
        num_conv_blocks: how many Conv1D/BatchNorm/Dropout blocks to stack.
        filters: number of filters in every Conv1D layer.
        kernel_size: width of every Conv1D kernel.
        dropout_rate: rate passed to every Dropout layer.

    Returns:
        A ``Sequential`` model; the caller is responsible for compiling it.

    Raises:
        ValueError: if ``num_conv_blocks`` is smaller than 1.
    """
    if num_conv_blocks < 1:
        raise ValueError("num_conv_blocks must be at least 1")

    model = Sequential()

    for block in range(1, num_conv_blocks + 1):
        # Only the first layer of a Sequential model declares the input shape;
        # 4 is the width of the one-hot nucleotide encoding.
        first_layer_kwargs = {"input_shape": (input_sequence_length, 4)} if block == 1 else {}
        model.add(Conv1D(filters, kernel_size, activation='relu',
                         name="1DConv_{}".format(block), **first_layer_kwargs))
        model.add(BatchNormalization(name="batchNorm{}".format(block)))
        model.add(Dropout(dropout_rate, name="drop{}".format(block)))

    # output layer: flatten the feature map and project to number_of_outputs values
    model.add(Flatten(name="flat"))
    model.add(Dense(number_of_outputs, activation='linear', name="dense1"))

    return model
    
mdl = create_model(145, 1)

In [None]:
mdl.summary(line_length=None, positions=None, print_fn=None)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
1DConv_1 (Conv1D)            (None, 141, 120)          2520      
_________________________________________________________________
batchNorm1 (BatchNormalizati (None, 141, 120)          480       
_________________________________________________________________
drop1 (Dropout)              (None, 141, 120)          0         
_________________________________________________________________
1DConv_2 (Conv1D)            (None, 137, 120)          72120     
_________________________________________________________________
batchNorm2 (BatchNormalizati (None, 137, 120)          480       
_________________________________________________________________
drop2 (Dropout)              (None, 137, 120)          0         
_________________________________________________________________
1DConv_3 (Conv1D)            (None, 133, 120)          72120     
__________

In [None]:
# load in the data (headerless TSV, so columns are addressed by position)
df = pd.read_csv("data/raw/hidra.tsv", header=None, sep="\t")

# TODO: standardize data

# Column 0 holds the sequence accession; keep only the sequence (column 3)
# and the regression target (column 8) for each organelle.
df_chloro = df.loc[df[0] == "NC_016734.1", [3, 8]]
df_mito = df.loc[df[0] == "NC_008285.1", [3, 8]]

In [None]:
# chloroplast X: one one-hot matrix per sequence in column 3
X_chloro = np.array(list(map(get_ohe, df_chloro[3])))
# chloroplast y: column 8 as a 1-D array
y_chloro = np.asarray(df_chloro[8].tolist())

In [None]:
X_chloro.shape, y_chloro.shape

((30544, 145, 4), (30544,))

In [None]:
# create different sets through indexing
# Contiguous split in row order (no shuffling): first 10% -> test,
# next 10% -> validation, remaining rows -> train.

idx = X_chloro.shape[0] // 10

X_test, X_val, X_train = np.split(X_chloro, [idx, 2 * idx])
y_test, y_val, y_train = np.split(y_chloro, [idx, 2 * idx])

In [None]:
# train new model
# Adam on a mean-squared-error objective for 10 epochs; the validation split
# is scored after every epoch.

mdl.compile(loss="mean_squared_error", optimizer='adam')

history = mdl.fit(X_train, y_train,
                  validation_data=(X_val, y_val),
                  epochs=10)

Train on 24436 samples, validate on 3054 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# R^2 per split; predictions are flattened to 1-D to line up with the targets
for split_name, features, targets in (("Train", X_train, y_train),
                                      ("Val", X_val, y_val),
                                      ("Test", X_test, y_test)):
    print(split_name, r2_score(targets, mdl.predict(features).ravel()))

Train 0.9590174112337179
Val -0.5884829547777926
Test -4.965272603670951


In [None]:
# Spearman rank correlation per split; predictions are flattened to 1-D first
for split_name, features, targets in (("Train", X_train, y_train),
                                      ("Val", X_val, y_val),
                                      ("Test", X_test, y_test)):
    print(split_name, spearmanr(targets, mdl.predict(features).ravel()))

Train SpearmanrResult(correlation=0.9794102599531422, pvalue=0.0)
Val SpearmanrResult(correlation=0.21649076552267812, pvalue=1.0202527807036954e-33)
Test SpearmanrResult(correlation=0.26511640101141287, pvalue=2.7223944009329857e-50)


---
### Mitochondria Data

In [None]:
# build a fresh, untrained model so no weights carry over from the previous experiment
    
mdl = create_model(145, 1)

In [None]:
# mitochondria X: one one-hot matrix per sequence in column 3
X_mito = np.array(list(map(get_ohe, df_mito[3])))
# mitochondria y: column 8 as a 1-D array
y_mito = np.asarray(df_mito[8].tolist())

In [None]:
X_mito.shape, y_mito.shape

((44342, 145, 4), (44342,))

In [None]:
# Contiguous split in row order (no shuffling): first 10% -> test,
# next 10% -> validation, remaining rows -> train.
idx = X_mito.shape[0] // 10

X_test, X_val, X_train = np.split(X_mito, [idx, 2 * idx])
y_test, y_val, y_train = np.split(y_mito, [idx, 2 * idx])

In [None]:
# train new model
# Adam on a mean-squared-error objective for 10 epochs; the validation split
# is scored after every epoch.

mdl.compile(loss="mean_squared_error", optimizer='adam')

history = mdl.fit(X_train, y_train,
                  validation_data=(X_val, y_val),
                  epochs=10)

Train on 35474 samples, validate on 4434 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# R^2 per split; predictions are flattened to 1-D to line up with the targets
for split_name, features, targets in (("Train", X_train, y_train),
                                      ("Val", X_val, y_val),
                                      ("Test", X_test, y_test)):
    print(split_name, r2_score(targets, mdl.predict(features).ravel()))

Train 0.9434088137532982
Val -0.4230217426463865
Test -0.21026188013430214


In [None]:
# Spearman rank correlation per split; predictions are flattened to 1-D first
for split_name, features, targets in (("Train", X_train, y_train),
                                      ("Val", X_val, y_val),
                                      ("Test", X_test, y_test)):
    print(split_name, spearmanr(targets, mdl.predict(features).ravel()))

Train SpearmanrResult(correlation=0.9615425749725413, pvalue=0.0)
Val SpearmanrResult(correlation=-0.07428158475379147, pvalue=7.355636301332377e-07)
Test SpearmanrResult(correlation=-0.04403222466025, pvalue=0.0033610846116716277)


---
### Both Organelles Combined

In [None]:
# build a fresh, untrained model so no weights carry over from the previous experiment

mdl = create_model(145, 1)

In [None]:
# all rows of df (both organelles): one-hot sequences and their targets
X = np.array(list(map(get_ohe, df[3])))
y = np.asarray(df[8].tolist())

In [None]:
X.shape, y.shape

((74886, 145, 4), (74886,))

In [None]:
# Contiguous split in row order (no shuffling): first 10% -> test,
# next 10% -> validation, remaining rows -> train.
idx = X.shape[0] // 10

X_test, X_val, X_train = np.split(X, [idx, 2 * idx])
y_test, y_val, y_train = np.split(y, [idx, 2 * idx])

In [None]:
# train new model
# Adam on a mean-squared-error objective for 10 epochs; the validation split
# is scored after every epoch.

mdl.compile(loss="mean_squared_error", optimizer='adam')

history = mdl.fit(X_train, y_train,
                  validation_data=(X_val, y_val),
                  epochs=10)

Train on 59910 samples, validate on 7488 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# R^2 per split; predictions are flattened to 1-D to line up with the targets
for split_name, features, targets in (("Train", X_train, y_train),
                                      ("Val", X_val, y_val),
                                      ("Test", X_test, y_test)):
    print(split_name, r2_score(targets, mdl.predict(features).ravel()))

Train 0.947173902237862
Val -0.5940821534956884
Test -0.6880373838211193


In [None]:
# Spearman rank correlation per split; predictions are flattened to 1-D first
for split_name, features, targets in (("Train", X_train, y_train),
                                      ("Val", X_val, y_val),
                                      ("Test", X_test, y_test)):
    print(split_name, spearmanr(targets, mdl.predict(features).ravel()))

Train SpearmanrResult(correlation=0.9761082996169309, pvalue=0.0)
Val SpearmanrResult(correlation=-0.011469546021019145, pvalue=0.3210200838002757)
Test SpearmanrResult(correlation=0.0061536295228367555, pvalue=0.5944423480757881)


---
### Train on Chloroplast, Test on Mitochondria

In [None]:
# build a fresh, untrained model so no weights carry over from the previous experiment

mdl = create_model(145, 1)

In [None]:
# train on every chloroplast row
X_train = np.array(list(map(get_ohe, df_chloro[3])))
y_train = np.asarray(df_chloro[8].tolist())

# hold out every mitochondria row
X_test = np.array(list(map(get_ohe, df_mito[3])))
y_test = np.asarray(df_mito[8].tolist())

In [None]:
# train new model
# Adam on a mean-squared-error objective for 10 epochs. The held-out organelle
# is passed as validation data, so it is scored after every epoch.

mdl.compile(loss="mean_squared_error", optimizer='adam')

history = mdl.fit(X_train, y_train,
                  validation_data=(X_test, y_test),
                  epochs=10)

Train on 30544 samples, validate on 44342 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# R^2 on the training organelle and on the held-out organelle
for split_name, features, targets in (("Train (chloro)", X_train, y_train),
                                      ("Test (mito)", X_test, y_test)):
    print(split_name, r2_score(targets, mdl.predict(features).ravel()))

Train (chloro) 0.9582699270793779
Test (mito) -0.952098390488433


In [None]:
# Spearman rank correlation on the training organelle and on the held-out organelle
for split_name, features, targets in (("Train (chloro)", X_train, y_train),
                                      ("Test (mito)", X_test, y_test)):
    print(split_name, spearmanr(targets, mdl.predict(features).ravel()))

Train (chloro) SpearmanrResult(correlation=0.9800406286843023, pvalue=0.0)
Test (mito) SpearmanrResult(correlation=0.08974406717675475, pvalue=5.80757893890165e-80)


---
### Train on Mitochondria, Test on Chloroplast

In [None]:
# build a fresh, untrained model so no weights carry over from the previous experiment

mdl = create_model(145, 1)

In [None]:
# hold out every chloroplast row
X_test = np.array(list(map(get_ohe, df_chloro[3])))
y_test = np.asarray(df_chloro[8].tolist())

# train on every mitochondria row
X_train = np.array(list(map(get_ohe, df_mito[3])))
y_train = np.asarray(df_mito[8].tolist())

In [None]:
# train new model
# Adam on a mean-squared-error objective for 10 epochs. The held-out organelle
# is passed as validation data, so it is scored after every epoch.

mdl.compile(loss="mean_squared_error", optimizer='adam')

history = mdl.fit(X_train, y_train,
                  validation_data=(X_test, y_test),
                  epochs=10)

Train on 44342 samples, validate on 30544 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# R^2 on the training organelle and on the held-out organelle
for split_name, features, targets in (("Train (mito)", X_train, y_train),
                                      ("Test (chloro)", X_test, y_test)):
    print(split_name, r2_score(targets, mdl.predict(features).ravel()))

Train (mito) 0.9488984015280464
Test (chloro) -0.4415901464557095


In [None]:
# Spearman rank correlation (not Pearson) on the training and held-out organelles
for split_name, features, targets in (("Train (mito)", X_train, y_train),
                                      ("Test (chloro)", X_test, y_test)):
    print(split_name, spearmanr(targets, mdl.predict(features).ravel()))

Train (mito) SpearmanrResult(correlation=0.9680841000607434, pvalue=0.0)
Test (chloro) SpearmanrResult(correlation=0.08197976218219209, pvalue=1.0494133661475483e-46)


---
### Test Standardization

This was just a test to see what would happen if I standardized the target variable. Nothing of note really came from this.

In [None]:
# load in the data (headerless TSV, so columns are addressed by position)
df = pd.read_csv("data/raw/hidra.tsv", header=None, sep="\t")

# z-score the target (column 8) over all rows pooled, before splitting by
# organelle; ddof=0 gives the population standard deviation
df.iloc[:, 8] = (df[8] - df[8].mean()) / df[8].std(ddof=0)

df_chloro = df.loc[df[0] == "NC_016734.1", [3, 8]]
df_mito = df.loc[df[0] == "NC_008285.1", [3, 8]]

In [None]:
# build a fresh, untrained model so no weights carry over from the previous experiment

mdl = create_model(145, 1)

In [None]:
# hold out every chloroplast row
X_test = np.array(list(map(get_ohe, df_chloro[3])))
y_test = np.asarray(df_chloro[8].tolist())

# train on every mitochondria row
X_train = np.array(list(map(get_ohe, df_mito[3])))
y_train = np.asarray(df_mito[8].tolist())

In [None]:
# train new model
# Adam on a mean-squared-error objective for 10 epochs. The held-out organelle
# is passed as validation data, so it is scored after every epoch.

mdl.compile(loss="mean_squared_error", optimizer='adam')

history = mdl.fit(X_train, y_train,
                  validation_data=(X_test, y_test),
                  epochs=10)

Train on 44342 samples, validate on 30544 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# R^2 on the training organelle and on the held-out organelle
for split_name, features, targets in (("Train (mito)", X_train, y_train),
                                      ("Test (chloro)", X_test, y_test)):
    print(split_name, r2_score(targets, mdl.predict(features).ravel()))

Train (mito) 0.9465854465104836
Test (chloro) -0.3444262085197689


In [None]:
# Spearman rank correlation on the training organelle and on the held-out organelle
for split_name, features, targets in (("Train (mito)", X_train, y_train),
                                      ("test (chloro)", X_test, y_test)):
    print(split_name, spearmanr(targets, mdl.predict(features).ravel()))

Train (mito) SpearmanrResult(correlation=0.9718708714426192, pvalue=0.0)
test (chloro) SpearmanrResult(correlation=0.10268415076719577, pvalue=2.2190665838296173e-72)


---
### Standardize Individually

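Same experiment as the previous section (train on mitochondria, test on chloroplast), except that the target variable is standardized separately within each organelle instead of over the pooled data. See the previous section's description for the outcome.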

In [None]:
# load in the data (headerless TSV, so columns are addressed by position)
df = pd.read_csv("data/raw/hidra.tsv", sep="\t", header=None)

# Standardize the target (column 8) separately within each organelle, using
# that organelle's own mean and population standard deviation (np.std, ddof=0).
# .copy() makes each subset an independent frame, so the in-place assignment
# below is not a write into a slice of `df` (which triggers pandas'
# SettingWithCopyWarning and relies on view-vs-copy behaviour).
df_chloro = df[df[0] == "NC_016734.1"][[3,8]].copy()
df_chloro.loc[:,8] = (df_chloro[8] - np.mean(df_chloro[8]))/np.std(df_chloro[8])

df_mito = df[df[0] == "NC_008285.1"][[3,8]].copy()
df_mito.loc[:,8] = (df_mito[8] - np.mean(df_mito[8]))/np.std(df_mito[8])

In [None]:
# build a fresh, untrained model so no weights carry over from the previous experiment

mdl = create_model(145, 1)

In [None]:
# hold out every chloroplast row
X_test = np.array(list(map(get_ohe, df_chloro[3])))
y_test = np.asarray(df_chloro[8].tolist())

# train on every mitochondria row
X_train = np.array(list(map(get_ohe, df_mito[3])))
y_train = np.asarray(df_mito[8].tolist())

In [None]:
# train new model
# Adam on a mean-squared-error objective for 10 epochs. The held-out organelle
# is passed as validation data, so it is scored after every epoch.

mdl.compile(loss="mean_squared_error", optimizer='adam')

history = mdl.fit(X_train, y_train,
                  validation_data=(X_test, y_test),
                  epochs=10)

Train on 44342 samples, validate on 30544 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# R^2 on the training organelle and on the held-out organelle
for split_name, features, targets in (("Train (mito)", X_train, y_train),
                                      ("Test (chloro)", X_test, y_test)):
    print(split_name, r2_score(targets, mdl.predict(features).ravel()))

Train (mito) 0.9540165363225646
Test (chloro) -0.08505520640107611


In [None]:
# Spearman rank correlation on the training organelle and on the held-out organelle
for split_name, features, targets in (("Train (mito)", X_train, y_train),
                                      ("test (chloro)", X_test, y_test)):
    print(split_name, spearmanr(targets, mdl.predict(features).ravel()))

Train (mito) SpearmanrResult(correlation=0.9710171124923995, pvalue=0.0)
test (chloro) SpearmanrResult(correlation=0.09415262227452184, pvalue=4.2673737531514166e-61)
