In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os, sys, h5py
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sb
import tensorflow as tf
from deepomics import neuralnetwork as nn
from deepomics import utils, fit, visualize, saliency
import scipy

import sys
sys.path.append('../../../..')
import mutagenesisfunctions as mf
from Bio import AlignIO
import time as time
import pandas as pd

ImportError: No module named mutagenesisfunctions

In [4]:
# Positive controls: one-hot encode the simulated alignment through the project
# helper, then insert a singleton axis at position 2 of the encoded array.
starttime = time.time()
simalign_file = '../../../data_RFAM/riboswitch_100k.sto'
Xpos = np.expand_dims(mf.sto_onehot(simalign_file, 'rna'), axis=2)
print ('Open positive control: ' + mf.sectotime(time.time()-starttime))

# Negative controls: generated with the same sample count and sequence length
# as the positives, and with one fewer channel than Xpos's last axis.
starttime = time.time()
numdata, seqlen, _, dims = Xpos.shape
dims -= 1
# NOTE(review): 274 is presumably a random seed for the generator -- confirm in mf.
Xneg = mf.seq_generator(numdata, seqlen, dims, 274)
print ('Random sequence generation completed in: ' + mf.sectotime(time.time() - starttime))

IOError: [Errno 2] No such file or directory: '../../../data_RFAM/riboswitch_100k.sto'

In [None]:
# Alignment file whose consensus secondary structure is read below
simalign_file = '../../../data_RFAM/riboswitch_100k.sto'

# Consensus secondary-structure annotation, as returned by the project helper
SS = mf.getSSconsensus(simalign_file)

# Bracket symbols handed to sigbasepair, which returns the base-pair indices,
# the base-pair structure, and the indices of the remaining positions.
bpchars = list('()<>{}')
sig_bpchars = list('<>')  # defined here but not passed to sigbasepair in this cell
bpidx, bpSS, nonbpidx = mf.sigbasepair(SS, bpchars)

In [None]:
starttime = time.time()

# Start from a copy of the generated negatives so that Xneg itself is left
# untouched and this cell can be re-run safely. The earlier version wrote into
# `Xneg_rand` without it ever being assigned (the generation cell names its
# result `Xneg`), which raises NameError on a fresh kernel.
# NOTE(review): assumes Xneg is 4-D with trailing axes compatible with Xpos -- confirm.
Xneg_rand = np.copy(Xneg)

#insert non base pair positive control things into negative control
# (one vectorised assignment over every sequence instead of a Python loop)
Xneg_rand[:, nonbpidx, :, :] = Xpos[:, nonbpidx, :, :]

print ('Making neg control w/o base pairs: ' + mf.sectotime(time.time() - starttime))

#check
# Compare element-wise. The previous test compared `.all()` of each array, i.e.
# two single booleans, so it reported success even when the contents differed.
if np.array_equal(Xneg_rand[:, nonbpidx, :, :], Xpos[:, nonbpidx, :, :]):
    print ('Success!')
else:
    print ('Mismatch: non base pair positions differ between Xneg_rand and Xpos')

In [None]:
# Stack the positives first and the negatives second along the sample axis,
# then refresh the size variables from the combined array.
X_data = np.concatenate([Xpos, Xneg_rand], axis=0)
numdata, seqlen, _, dims = X_data.shape
dims -= 1

In [None]:
# Column vector of float labels: 1.0 for the first numdata//2 rows, 0.0 for the rest
Y_data = (np.arange(numdata) < numdata // 2).astype(float).reshape(numdata, 1)

In [None]:
# Write the input and target arrays as two datasets in a single HDF5 file
# (mode 'w' creates the file, replacing any existing one at this path)
hdf5path = '../../../data_RFAM/riboswitch_100k_t1.hdf5'
with h5py.File(hdf5path, 'w') as f:
    for dset_name, dset_array in (('X_data', X_data), ('Y_data', Y_data)):
        f.create_dataset(dset_name, data=dset_array)

In [None]:
starttime = time.time()

#Open data from h5py
# Both datasets are copied into in-memory arrays before the file is closed.
data_path = '../../../data_RFAM/riboswitch_100k_t1.hdf5'
with h5py.File(data_path, 'r') as dataset:
    X_data = np.array(dataset['X_data'])
    Y_data = np.array(dataset['Y_data'])

num_data, seq_length, _, dims = X_data.shape
dims = dims-1

#remove gaps from sequences
# Keeps only the first `dims` channels of the last axis, i.e. drops the final one.
# NOTE(review): presumably the final channel is the gap symbol -- confirm in mf.sto_onehot.
ungapped = True
if ungapped:
    X_data = X_data[:, :, :, :dims]

# get validation and test set from training set
test_frac = 0.3
valid_frac = 0.1
# Size the split from the array that was just loaded. The earlier version read
# `numdata`, which is only set by the data-generation cells, so this cell raised
# NameError (or used a stale size) when run from the saved file alone.
N = num_data
split_1 = int(N*(1-valid_frac-test_frac))
split_2 = int(N*(1-test_frac))
# NOTE(review): the permutation is unseeded, so the split differs between runs.
shuffle = np.random.permutation(N)

#set up dictionaries
# The shuffled indices are cut at split_1 and split_2 into train / valid / test.
train = {'inputs': X_data[shuffle[:split_1]],
         'targets': Y_data[shuffle[:split_1]]}
valid = {'inputs': X_data[shuffle[split_1:split_2]],
         'targets': Y_data[shuffle[split_1:split_2]]}
test = {'inputs': X_data[shuffle[split_2:]],
         'targets': Y_data[shuffle[split_2:]]}

print ('Data extraction and dict construction completed in: ' + mf.sectotime(time.time() - starttime))