This notebook contains functionality to perform the following:

Explore technical aspects of using the MPRA-DragoNN model, including loading in a pre-defined architecture, initializing models with pre-defined weights, accessing individual layers, and using hdf5 data. A Keras 12-element representation is created and saved for future use in linear mapping.

References: 

* https://stackoverflow.com/questions/35074549/how-to-load-a-model-from-an-hdf5-file-in-keras (when using, remember to import specific layer types from keras)
* https://machinelearningmastery.com/how-to-use-transfer-learning-when-developing-convolutional-neural-network-models/ CNN transfer learning tutorial


In [None]:
# mount google drive

from google.colab import drive
drive.mount('/content/drive')

%cd '/content/drive/Shareddrives/NRC_Amii_Agronomics_Project/nrc-ml-plant-genomics/'

In [None]:
import keras
import warnings, logging
import numpy as np

from models.conv_model import ConvModel as Model
from utils.dirs import create_dirs
from utils.fetch_args import fetch_args

warnings.filterwarnings('ignore')
logging.disable(1000)

Using TensorFlow backend.


In [None]:
%ls

LICENSE                     [34mkipoi[m[m/
README.md                   kipoi_playground.ipynb
__init__.py                 linear_mapping.ipynb
commands_for_setup.txt      main.py
[34mdata[m[m/                       [34mmodels[m[m/
data_exploration.ipynb      [34mnew_data[m[m/
[34mdata_loader[m[m/                predict_on_NRC_data.ipynb
[34mevaluator[m[m/                  requirements.txt
[34mexample[m[m/                    requirements_exact.txt
generate_data_format.ipynb  [34mtrainers[m[m/
keras_model_loading.ipynb   [34mutils[m[m/


In [None]:
from keras.models import model_from_json
import json

# Rebuild the network from its JSON architecture description, then load the
# pre-trained weights into it. Later cells rely on the resulting `model`.
with open('models/model.json', 'r') as json_file:
    json_savedModel = json_file.read()   # raw JSON text describing the architecture
    
model = model_from_json(json_savedModel)  # instantiate the model from the JSON description
model.load_weights('models/pretrained.hdf5')   # fill the layers with the pre-trained weights

In [None]:
# Print the layer-by-layer overview (output shapes and parameter counts) with
# the default formatting.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 141, 120)          2520      
_________________________________________________________________
batch_normalization_1 (Batch (None, 141, 120)          480       
_________________________________________________________________
dropout_1 (Dropout)          (None, 141, 120)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 137, 120)          72120     
_________________________________________________________________
batch_normalization_2 (Batch (None, 137, 120)          480       
_________________________________________________________________
dropout_2 (Dropout)          (None, 137, 120)          0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 133, 120)          72120     
__________

In [None]:
model.layers

[<keras.layers.convolutional.Conv1D at 0x7fe1f6b1ef50>,
 <keras.layers.normalization.BatchNormalization at 0x7fe1f6b1ee50>,
 <keras.layers.core.Dropout at 0x7fe1f6b5e9d0>,
 <keras.layers.convolutional.Conv1D at 0x7fe1f6b5e1d0>,
 <keras.layers.normalization.BatchNormalization at 0x7fe1f6b7dbd0>,
 <keras.layers.core.Dropout at 0x7fe1f6dd8990>,
 <keras.layers.convolutional.Conv1D at 0x7fe1f6e72f90>,
 <keras.layers.normalization.BatchNormalization at 0x7fe1f6ea8510>,
 <keras.layers.core.Dropout at 0x7fe1f6f90c50>,
 <keras.layers.core.Flatten at 0x7fe1f71c7a50>,
 <keras.layers.core.Dense at 0x7fe1f72ef490>]

In [None]:
# Walk the network layer by layer and show each layer's parameters.
for layer in model.layers:
    layer_weights = layer.get_weights()  # list of numpy arrays
    print(layer)
    # Report one shape per weight array. Wrapping the whole list in np.array()
    # only yields the uninformative (n_arrays,) when the arrays differ in shape
    # (e.g. a conv kernel and its bias), and recent NumPy versions refuse to
    # build such ragged arrays at all.
    print([w.shape for w in layer_weights], "\n")
    print(layer_weights, "\n")

<keras.layers.convolutional.Conv1D object at 0x7fe1f6b1ef50>
(2,) 

[array([[[-0.20759815, -0.27741235,  0.20513098, ..., -0.5075552 ,
          0.02815552, -0.2771745 ],
        [-0.16587065, -0.2860154 , -0.11723913, ...,  0.08329422,
          0.20837966, -0.08525781],
        [ 0.06763158,  0.2358079 , -0.27557042, ...,  0.10912888,
         -0.18372208,  0.16274358],
        [ 0.19182613, -0.06025316,  0.11166722, ..., -0.16255344,
         -0.06013646, -0.24308987]],

       [[-0.1286703 ,  0.06272014, -0.41996783, ...,  0.15120022,
         -0.24819784,  0.09262886],
        [ 0.18137728,  0.04991252,  0.12473465, ..., -0.23988877,
          0.03044431, -0.28737277],
        [-0.26420784, -0.3068193 ,  0.11479288, ..., -0.07273901,
         -0.20195907, -0.01112762],
        [-0.00097122, -0.00742353, -0.04638174, ..., -0.11749103,
          0.1505862 ,  0.11840549]],

       [[ 0.04204793, -0.04697714, -0.07359485, ...,  0.14347391,
          0.14307939,  0.14285913],
        [

In [None]:
# Dummy input for a smoke test of the loaded model: a batch holding a single
# sequence of 145 positions, each one-hot encoded as [1, 0, 0, 0], i.e. an
# array of shape (1, 145, 4).
# NOTE(review): 145 looks like the input length the model expects (a width-5
# kernel producing the 141-long first conv output in the summary) - confirm.
X = np.array([[[1, 0, 0, 0]]*145])
X

array([[[1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1

In [None]:
model.predict(X)

array([[ 1.1350813 ,  1.619066  ,  1.6189165 ,  0.5772135 ,  0.19718614,
         0.47316065,  0.9922271 ,  0.03419602,  0.646011  , -0.6097446 ,
         0.03506389, -0.35607833]], dtype=float32)

In [None]:
model.layers[0].weights

[<tf.Variable 'conv1d_1/kernel:0' shape=(5, 4, 120) dtype=float32_ref>,
 <tf.Variable 'conv1d_1/bias:0' shape=(120,) dtype=float32_ref>]

In [None]:
model.layers[0].get_weights()

[array([[[-0.20759815, -0.27741235,  0.20513098, ..., -0.5075552 ,
           0.02815552, -0.2771745 ],
         [-0.16587065, -0.2860154 , -0.11723913, ...,  0.08329422,
           0.20837966, -0.08525781],
         [ 0.06763158,  0.2358079 , -0.27557042, ...,  0.10912888,
          -0.18372208,  0.16274358],
         [ 0.19182613, -0.06025316,  0.11166722, ..., -0.16255344,
          -0.06013646, -0.24308987]],
 
        [[-0.1286703 ,  0.06272014, -0.41996783, ...,  0.15120022,
          -0.24819784,  0.09262886],
         [ 0.18137728,  0.04991252,  0.12473465, ..., -0.23988877,
           0.03044431, -0.28737277],
         [-0.26420784, -0.3068193 ,  0.11479288, ..., -0.07273901,
          -0.20195907, -0.01112762],
         [-0.00097122, -0.00742353, -0.04638174, ..., -0.11749103,
           0.1505862 ,  0.11840549]],
 
        [[ 0.04204793, -0.04697714, -0.07359485, ...,  0.14347391,
           0.14307939,  0.14285913],
         [ 0.05395318,  0.13912876,  0.09199578, ..., -0.1

In [None]:
# layer.set_weights(weights)

In [None]:
weights = model.get_weights()
# np.array(weights).shape

In [None]:
len(weights)

20

In [None]:
weights[0].shape

(5, 4, 120)

In [None]:
weights[0][0].shape

(4, 120)

In [None]:
weights[0][0][0].shape

(120,)

In [None]:
weights[19]

array([0.02913981, 0.00915099, 0.02186438, 0.01912426, 0.02128757,
       0.0229942 , 0.01872605, 0.02257514, 0.02278137, 0.02862196,
       0.02395758, 0.03187391], dtype=float32)

---
### Loading in NRC Data

In [None]:
import pandas as pd

In [None]:
# Tab-separated file without a header row, so pandas names the columns 0 and 1:
# column 0 holds the nucleotide sequence, column 1 a numeric value per sequence.
df = pd.read_csv('data/raw/CanolaOrganelles_v.1.tsv', sep="\t", header=None)

In [None]:
df

Unnamed: 0,0,1
0,AATCATAATAACTTGGTCCCGGGCATCACGGGCGAACGACGGGAAT...,0.26
1,TAATAACTTGGTCCCGGGCATCACGGGCGAACGACGGGAATTGAAC...,0.27
2,ACTTGGTCCCGGGCATCACGGGCGAACGACGGGAATTGAACCCGCG...,0.27
3,GTCCCGGGCATCACGGGCGAACGACGGGAATTGAACCCGCGATGGT...,0.27
4,GGGCATCACGGGCGAACGACGGGAATTGAACCCGCGATGGTGAATT...,0.27
...,...,...
74881,ATGGAGTTGTGTTTTGCCACCTGGAGTTTTAATGGAAGTTTGAGTG...,0.47
74882,GTTGTGTTTTGCCACCTGGAGTTTTAATGGAAGTTTGAGTGCGTCC...,0.50
74883,GTTTTGCCACCTGGAGTTTTAATGGAAGTTTGAGTGCGTCCTAAAA...,0.47
74884,GCCACCTGGAGTTTTAATGGAAGTTTGAGTGCGTCCTAAAAGCCAA...,0.51


In [None]:
# One-hot encoding of each nucleotide (cross referenced with kipoi data loader)
mapping = {"A":[1, 0, 0, 0], "T":[0, 0, 0, 1], "C":[0, 1, 0, 0], "G":[0, 0, 1, 0]}

def get_output(sequence):
    """Return the model's prediction for a single nucleotide sequence.

    The sequence is one-hot encoded with ``mapping``, wrapped into a batch of
    one, and passed to ``model.predict``.

    Parameters
    ----------
    sequence : str
        Nucleotides drawn from A/C/G/T. Lower-case letters are accepted and
        treated like their upper-case counterparts.

    Returns
    -------
    The value returned by ``model.predict`` for the one-element batch; for the
    model loaded in this notebook that is an array of shape (1, 12).

    Raises
    ------
    KeyError
        If the sequence contains a character that has no one-hot encoding.
    """
    try:
        encoded = [mapping[nt.upper()] for nt in sequence]
    except KeyError as err:
        # Same exception type as a plain dict lookup, but with a message that
        # says what went wrong instead of just echoing the offending character.
        raise KeyError(
            "sequence contains a nucleotide with no one-hot encoding: {}".format(err)
        ) from None
    # predict() expects a batch dimension, hence the extra list around `encoded`.
    return model.predict(np.array([encoded]))

In [None]:
get_output(df[0][0])

array([[ 0.05719248,  0.06287669,  0.06941698, -0.09135018, -0.13182782,
        -0.1274574 ,  0.03053499, -0.07282076, -0.02254434, -0.04860629,
        -0.1205321 , -0.10466526]], dtype=float32)

In [None]:
# Target prediction for every row sequence in the dataframe.
# All sequences are one-hot encoded up front and sent through the model in a
# single batched predict() call rather than one predict() call per row, which
# avoids tens of thousands of tiny model invocations. This requires every
# sequence to have the same length, which the model's fixed input needs anyway.
encoded_sequences = np.array(
    [[mapping[nt] for nt in sqnc] for sqnc in df[0]],
    dtype=np.float32,
)
pred = model.predict(encoded_sequences)

In [None]:
pred.shape

(74886, 12)

In [None]:
# Persist the predictions as CSV, one column per output node (node1 ... node12).
node_columns = ["node{}".format(idx) for idx in range(1, 13)]
pred_frame = pd.DataFrame(pred, columns=node_columns)
pred_frame.to_csv("data/processed/CanolaTargetsKeras.csv")

---
### Loading in hdf5 data

We never ended up using this section, and I don't claim to understand all of this code.  
http://mitra.stanford.edu/kundaje/projects/mpra/data/

In [None]:
import h5py

In [None]:
f = h5py.File('data/train.hdf5', 'r')

In [None]:
list(f.keys())

['X', 'Y', 'weights']

In [None]:
# Grab handles to the three top-level entries of the open HDF5 file.
# NOTE: `X` and `weights` were already assigned earlier in this notebook (the
# dummy model input and the model's weight list); these assignments overwrite
# them, so re-run the defining cells before re-running cells that need the
# earlier values.
X = f['X']
y = f['Y']
weights = f["weights"]

In [None]:
X

<HDF5 group "/X" (1 members)>

In [None]:
is_dataset = isinstance(X, h5py.Dataset)
is_dataset

False

In [None]:
f.close()

In [None]:
weights

<Closed HDF5 group>

In [None]:
# look at MPRA-DragoNN code to see how they do it
from keras.utils import Sequence
import h5py
import os


class MPRADataLoader(Sequence):
    """Keras ``Sequence`` that serves fixed-size batches from an HDF5 file.

    The file is ``<config.data_path>/<datatype>.hdf5``. Inputs are read from
    ``hf['X']['sequence']`` and targets from ``hf['Y']['output']``. The file is
    opened anew for every access and closed again right afterwards.
    """

    def __init__(self, config, datatype):
        """Record the batch settings and work out how many batches exist.

        Args:
            config: object exposing ``batch_size``, ``data_path`` and
                ``max_batch_steps`` (``-1`` means "no cap on batches").
            datatype: file name without the ``.hdf5`` extension.
        """
        self.config = config
        self.batch_size = config.batch_size
        self.fname = os.path.join(config.data_path, datatype + '.hdf5')

        # Only the row count is needed here, so the file is closed right away.
        with h5py.File(self.fname, 'r') as hf:
            n_rows = hf['X']['sequence'].shape[0]

        # Floor division: a trailing partial batch is never served.
        self.max_batches = n_rows // self.batch_size

        step_cap = self.config.max_batch_steps
        if step_cap != -1:
            self.max_batches = min(step_cap, self.max_batches)

    def __len__(self):
        """Number of batches this loader serves."""
        return self.max_batches

    def __getitem__(self, idx):
        """Return the ``(x, y)`` slices that make up batch number ``idx``."""
        first = idx * self.batch_size
        last = first + self.batch_size
        with h5py.File(self.fname, 'r') as hf:
            x = hf['X']['sequence'][first:last]
            y = hf['Y']['output'][first:last]
        return x, y

In [None]:
# Copy the data into memory while the file is still open: h5py datasets are
# lazy handles into the file and become unusable ("Closed HDF5 dataset") as
# soon as the `with` block exits. `[:]` reads the whole dataset; for a file
# too large for RAM, read a slice instead (e.g. `[:1000]`).
with h5py.File("data/train.hdf5", 'r') as hf:
    x, y = hf['X']['sequence'][:], hf['Y']['output'][:]

In [None]:
x

<Closed HDF5 dataset>