In [1]:
TEST = True

In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os, sys, h5py
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import tensorflow as tf
import scipy

import sys
sys.path.append('../../..')
import mutagenesisfunctions as mf
import helper 
from deepomics import neuralnetwork as nn
from deepomics import utils, fit, visualize, saliency

from Bio import AlignIO
import time as time
import pandas as pd
#---------------------------------------------------------------------------------------------------------------------------------
'''DEFINE ACTIONS'''
TRAIN = False
TEST = False
WRITE = False
FOM = False
SOMCALC = False
SOMVIS = False

if '--train' in sys.argv:
    TRAIN = True
if '--test' in sys.argv:
    TEST = True
if '--write' in sys.argv:
    WRITE = True
if '--fom' in sys.argv:
    FOM = True
if '--somcalc' in sys.argv:
    SOMCALC = True
if '--somvis' in sys.argv:
    SOMVIS = True

#---------------------------------------------------------------------------------------------------------------------------------
'''DEFINE LOOP'''
trials = ['trna', 'riboswitch', 'glna']
trainportion_list = [0.7, 0.5, 0.3, 0.1] #0.7 is the original trainportion we've been working with

datafiles = {'glna': ['glna_100k_d8.hdf5', '../../data_RFAM/glnAsim_100k.sto'], 
              'trna': ['trna_100k_d4.hdf5', '../../data_RFAM/trnasim_100k.sto'],
              'riboswitch': ['riboswitch_100k_d4.hdf5', '../../data_RFAM/riboswitch_100k.sto'],}

exp = 'trainsize'  #for the params folder


img_folder = 'Images'

for t in trials:
    print ('----------------  %s  ---------------'%(t))
    for trainportion in trainportion_list:
        print ('Sequences trained on: ', trainportion*200000)

        
        '''OPEN DATA'''

        starttime = time.time()

        #Open data from h5py
        exp_data = 'data_background'
        filename = '%s_100k_sh%.0f.hdf5'%(t, 0.25*100) #We're importing the data that we originally baselined against 75:25 profile:shuffle
        data_path = os.path.join('../..', exp_data, filename)
        with h5py.File(data_path, 'r') as dataset:
            X_data = np.array(dataset['X_data'])
            Y_data = np.array(dataset['Y_data'])

        numdata, seqlen, _, dims = X_data.shape
        dims = dims-1

        #remove gaps from sequences
        ungapped = True
        if ungapped:
            X_data = X_data[:, :, :, :dims]

        # get validation and test set from training set
        train_frac = trainportion
        valid_frac = 0.1
        test_frac = 1-trainportion-valid_frac
        N = numdata
        split_1 = int(N*(1-valid_frac-test_frac))
        split_2 = int(N*(1-test_frac))
        shuffle = np.random.permutation(N)

        #set up dictionaries
        train = {'inputs': X_data[shuffle[:split_1]], 
                 'targets': Y_data[shuffle[:split_1]]}
        valid = {'inputs': X_data[shuffle[split_1:split_2]], 
                 'targets': Y_data[shuffle[split_1:split_2]]}
        test = {'inputs': X_data[shuffle[split_2:]], 
                 'targets': Y_data[shuffle[split_2:]]}

        print ('Data extraction and dict construction completed in: ' + mf.sectotime(time.time() - starttime))

        simalign_file = datafiles[t][1]
        #Get the full secondary structure and sequence consensus from the emission
        SS = mf.getSSconsensus(simalign_file)
        SQ = mf.getSQconsensus(simalign_file)

        #Get the ungapped sequence and the indices of ungapped nucleotides
        _, ugSS, ugidx = mf.rm_consensus_gaps(X_data, SS)
        _, ugSQ, _ = mf.rm_consensus_gaps(X_data, SQ)


        #Get the sequence and indices of the conserved base pairs
        bpchars = ['(',')','<','>','{','}']
        sig_bpchars = ['<','>']
        bpidx, bpSS, nonbpidx = mf.sigbasepair(SS, bpchars)
        numbp = len(bpidx)
        numug = len(ugidx)

        #Get the bpug information
        bpugSQ, bpugidx = mf.bpug(ugidx, bpidx, SQ)
        #---------------------------------------------------------------------------------------------------------------------------------


        '''SAVE PATHS AND PARAMETERS'''
        params_results = '../../results'

        modelarch = 'resbind'
        trial = t + '_tp%.0f'%(trainportion*100)
        modelsavename = '%s_%s'%(modelarch, trial)
        
        
        '''BUILD NEURAL NETWORK'''

        def cnn_model(input_shape, output_shape):

            # create model
            layer1 = {'layer': 'input', #41
                  'input_shape': input_shape
                  }
            layer2 = {'layer': 'conv1d',
                  'num_filters': 96,
                  'filter_size': input_shape[1]-29,
                  'norm': 'batch',
                  'activation': 'relu',
                  'dropout': 0.3,
                  'padding': 'VALID',
                  }
            layer3 = {'layer': 'conv1d_residual',
                  'filter_size': 5,
                  'function': 'relu',
                  'dropout_block': 0.1,
                  'dropout': 0.3,
                  'mean_pool': 10,
                  }

            layer4 = {'layer': 'dense',        # input, conv1d, dense, conv1d_residual, dense_residual, conv1d_transpose,
                                          # concat, embedding, variational_normal, variational_softmax, + more
                'num_units': 196,
                'norm': 'batch',          # if removed, automatically adds bias instead
                'activation': 'relu',     # or leaky_relu, prelu, sigmoid, tanh, etc
                'dropout': 0.5,           # if removed, default is no dropout
                   }


            layer5 = {'layer': 'dense',
                  'num_units': output_shape[1],
                  'activation': 'sigmoid'
                  }

            model_layers = [layer1, layer2, layer3, layer4, layer5]

            # optimization parameters
            optimization = {"objective": "binary",
                        "optimizer": "adam",
                        "learning_rate": 0.0003,
                        "l2": 1e-5,
                        #"label_smoothing": 0.05,
                        #"l1": 1e-6,
                        }
            return model_layers, optimization

        tf.reset_default_graph()

        # get shapes of inputs and targets
        input_shape = list(train['inputs'].shape)
        input_shape[0] = None
        output_shape = train['targets'].shape

        # load model parameters
        model_layers, optimization = cnn_model(input_shape, output_shape)

        # build neural network class
        nnmodel = nn.NeuralNet(seed=247)
        nnmodel.build_layers(model_layers, optimization)

        # compile neural trainer
        save_path = os.path.join(params_results, exp)
        param_path = os.path.join(save_path, modelsavename)
        nntrainer = nn.NeuralTrainer(nnmodel, save='best', file_path=param_path)

        '''TEST'''
        sess = utils.initialize_session()

        # set best parameters
        nntrainer.set_best_parameters(sess)
                
        # test model
        loss, mean_vals, std_vals = nntrainer.test_model(sess, test, name='test')

----------------  trna  ---------------
Sequences trained on:  140000.0
Data extraction and dict construction completed in: 6.12s
loading model from:  ../../results/trainsize/resbind_trna_tp70_best.ckpt
INFO:tensorflow:Restoring parameters from ../../results/trainsize/resbind_trna_tp70_best.ckpt
  test  loss:		0.01359
  test  accuracy:	0.99950+/-0.00000
  test  auc-roc:	1.00000+/-0.00000
  test  auc-pr:		1.00000+/-0.00000
Sequences trained on:  100000.0
Data extraction and dict construction completed in: 5.87s
loading model from:  ../../results/trainsize/resbind_trna_tp50_best.ckpt
INFO:tensorflow:Restoring parameters from ../../results/trainsize/resbind_trna_tp50_best.ckpt
  test  loss:		0.01563
  test  accuracy:	0.99875+/-0.00000
  test  auc-roc:	0.99996+/-0.00000
  test  auc-pr:		0.99997+/-0.00000
Sequences trained on:  60000.0
Data extraction and dict construction completed in: 6.09s
loading model from:  ../../results/trainsize/resbind_trna_tp30_best.ckpt
INFO:tensorflow:Restoring 

In [28]:
100000*(1-mean_vals[0])

183.74885156967613

In [29]:
test['inputs'].shape[0] - test['inputs'].shape[0]*mean_vals[0]

294.0