In [1]:
import numpy as np
from Bio import AlignIO
import sys, h5py
sys.path.append('../..')
import mutagenesisfunctions as mf
import time as time

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [None]:
#build model from seed
# Shell call: the first path is the .cm model file, the second is the seed alignment (.sto).
# NOTE(review): the model path starts with ../../data_RFAM while the seed path starts with
# ../data_RFAM — confirm that both locations are intended relative to this notebook.
! cmbuild ../../data_RFAM/RF01739.cm ../data_RFAM/seeds/RF01739.sto

In [None]:
#emit sequences from alignment
# Shell call: emits 100000 sequences (-N) with a fixed random seed (--seed 274) from
# RF01739.cm and writes them to the file given by -o.
# NOTE(review): the output goes to ../../data_RFAM/glnAsim_100k.sto, but the cell that opens
# the file reads '../data_RFAM/glnAsim_100k.sto' — confirm which prefix is correct.
! cmemit -a -N 100000 --seed 274 -o ../../data_RFAM/glnAsim_100k.sto ../../data_RFAM/RF01739.cm

In [3]:
#open file
filename = '../data_RFAM/glnAsim_100k.sto'
# Parse the Stockholm alignment inside a context manager so the file handle is
# closed as soon as parsing finishes; the bare open() call was never closed.
with open(filename) as handle:
    alignment = AlignIO.read(handle, "stockholm")

In [2]:
# Load the simulated positive-control alignment as a one-hot array, and time the load
starttime = time.time()
simalign_file = '../data_RFAM/riboswitch_100k.sto'
# A singleton axis is inserted at position 2 of whatever mf.sto_onehot returns
Xpos = np.expand_dims(mf.sto_onehot(simalign_file, 'rna'), axis=2)
print('Open positive control: ' + mf.sectotime(time.time() - starttime))

Open positive control: 50.14s


In [3]:
# NOTE(review): this rebinds simalign_file to the tRNA simulation, while the cell that builds
# Xpos reads '../data_RFAM/riboswitch_100k.sto' — confirm that the consensus strings are
# meant to come from a different alignment than the positive-control data.
simalign_file = '../data_RFAM/trnasim_100k.sto'

#Get the full secondary structure and sequence consensus from the emission
SS = mf.getSSconsensus(simalign_file)
SQ = mf.getSQconsensus(simalign_file)

In [None]:
#Get the ungapped sequence and the indices of ungapped nucleotides
# Xpos is passed instead of X_data: X_data is only assigned in later cells, so
# referencing it here fails with NameError on a fresh top-to-bottom run. The
# first return value is discarded in both calls.
# NOTE(review): assumes mf.rm_consensus_gaps derives the ungapped consensus and
# indices from the consensus string alone — confirm against the helper.
_, ugSS, ugidx = mf.rm_consensus_gaps(Xpos, SS)
_, ugSQ, _ = mf.rm_consensus_gaps(Xpos, SQ)
len(ugidx)

In [None]:
#Get the gapped indices in the consensus SS
# gapidx is whatever mf.consensus_gaps returns for SS; the cell displays its length
gapidx = mf.consensus_gaps(SS)
len(gapidx)

In [26]:
#Get the sequence and indices of the conserved base pairs
# Bracket characters handed to mf.sigbasepair together with the consensus SS
bpchars = ['(',')','<','>','{','}']
# NOTE(review): sig_bpchars is assigned but not used in any visible cell — confirm
# whether mf.sigbasepair was meant to receive it instead of bpchars.
sig_bpchars = ['<','>']
bpidx, bpSS, nonbpidx = mf.sigbasepair(SS, bpchars)

###### Negative control generated randomly

In [None]:
#Make negative controls
starttime = time.time()
# Xpos is unpacked as a 4-D array; dims is its last-axis length minus one
# (presumably to leave out the gap channel — TODO confirm)
numdata, seqlen, _, dims = Xpos.shape
dims = dims-1
# SS is re-read from simalign_file here, so this cell does not depend on the consensus cell
SS = mf.getSSconsensus(simalign_file)
# pgaps=(0.,0.) is passed through to the generator; presumably gap probabilities — TODO confirm
Xneg = mf.seq_generator_gaps(SS, numdata, seqlen, dims, pgaps=(0.,0.))
print ('Random sequence generation completed in: ' + mf.sectotime(time.time() - starttime))

###### Negative control with copies of everything that is not a base pair

In [27]:
starttime = time.time()

#insert non base pair positive control things into negative control
# Assignment already copies the values into Xneg, so no explicit np.copy is needed.
# The per-sequence loop is kept so the temporary arrays stay small.
for s in range(Xpos.shape[0]):
    Xneg[s, nonbpidx, :, :] = Xpos[s, nonbpidx, :, :]

print ('Making neg control w/o base pairs: ' + mf.sectotime(time.time() - starttime))

#check
# Compare the copied positions element-wise, one sequence at a time. The earlier
# test compared `.all()` of two slices (two scalar booleans) and looked at gapidx
# rather than the positions that were copied, so it could print 'Success!'
# regardless of the array contents.
if all(np.array_equal(Xneg[s, nonbpidx, :, :], Xpos[s, nonbpidx, :, :])
       for s in range(Xpos.shape[0])):
    print ('Success!')

Making neg control w/o base pairs: 6.71s
Success!


###### Negative control with copies of just the gaps

In [5]:
starttime = time.time()

#insert gaps in the negative control where there were gaps in the positive control
# Boolean (sequence, position) mask of entries whose channel 4 is set in Xpos.
# One masked assignment replaces the per-sequence Python loop; the 5-vector is
# broadcast over every selected position.
# Assumes Xneg and Xpos have the same number of sequences and positions.
gapmask = Xpos[:, :, 0, 4] == 1.
Xneg[gapmask] = np.array([0., 0., 0., 0., 1.])

print ('Making neg control w/ copy of pos control gaps: '
       + mf.sectotime(time.time() - starttime))

#check
# Channel 4 must now match position by position, not just in total count
if np.array_equal(Xneg[:,:,:,4], Xpos[:,:,:,4]):
    print ('Success')

Making neg control w/ copy of pos control gaps: 1.05s
Success


###### Negative control emitted from a mean profile of the data

In [3]:
starttime = time.time()

# Shape bookkeeping: number of sequences, alignment length, last-axis length minus one
numdata, seqlen, _, dims = Xpos.shape
dims -= 1

#Emit sequences from the positive control profile
# The profile is the mean of Xpos over the sequence axis with singleton axes dropped
Xprofile = np.mean(Xpos, axis=0).squeeze()
Xnegprofile = mf.seq_generator_profile(Xprofile, numdata, seqlen, dims)

print('Making neg control emitted from frequency profile: '
      + mf.sectotime(time.time() - starttime))

Making neg control emitted from frequency profile: 23min 44.319999999999936s


###### Negative control as a shuffled positive set

In [4]:
starttime = time.time()

#Create a negative control of randomly shuffled sequences
# numdata and seqlen come from the first two axes of Xpos; dims is the last-axis length minus one
numdata, seqlen, _, dims = Xpos.shape
dims = dims-1

def seq_bunchshuffle(Xpos, numdata, seqlen, bunchsize=(10, 75), seed=None):
    """Shuffle each sequence by cutting it into contiguous bunches and permuting them.

    For every sequence a bunch count n is drawn, the sequence is cut into n
    contiguous bunches (the last one absorbs the remainder), and the bunches
    are written back in a random order.

    Parameters
    ----------
    Xpos : array indexed as [sequence, position, :, :]
        Input sequences; the array is not modified.
    numdata : int
        Number of leading sequences of Xpos to shuffle.
    seqlen : int
        Number of positions per sequence.
    bunchsize : (int, int)
        (smallest, largest) approximate bunch length; these bound the number
        of bunches drawn per sequence.
    seed : int or None
        If given, a private np.random.RandomState(seed) is used so the result
        is reproducible; otherwise the global numpy random state is used.

    Returns
    -------
    numpy array of zeros with the shape of Xpos, whose first `numdata`
    sequences hold the shuffled data.
    """
    smallbunch, largebunch = bunchsize

    # Bounds on the number of bunches. The lower bound is clamped to 1:
    # with n == 0 nothing was written and the output row stayed all zeros.
    # The upper bound (exclusive in randint) is kept above the lower bound,
    # otherwise randint raises for sequences shorter than `smallbunch`.
    n_lower = max(1, seqlen // largebunch)
    n_upper = max(n_lower + 1, seqlen // smallbunch)

    rng = np.random if seed is None else np.random.RandomState(seed)

    Xshuffle = np.zeros(np.shape(Xpos))
    for seq in range(numdata):
        n = rng.randint(n_lower, n_upper)

        # Bunch boundaries: n equal steps, with the final boundary at seqlen
        step = seqlen // n
        bunchidx = [i * step for i in range(n)]
        bunchidx.append(seqlen)

        # Lay the bunches down one after another in permuted order
        start = 0
        for idx in rng.permutation(n):
            lo, hi = bunchidx[idx], bunchidx[idx + 1]
            space = hi - lo
            Xshuffle[seq, start:start + space, :, :] = Xpos[seq, lo:hi, :, :]
            start += space

    return Xshuffle


# Shuffle every positive sequence with the default bunch sizes, and report the elapsed time
Xnegshuffle = seq_bunchshuffle(Xpos, numdata, seqlen)

print ('Making neg control as shuffled pos: ' + mf.sectotime(time.time() - starttime))

Making neg control as shuffled pos: 8.22s


###### Negative control combining profile-emitted and shuffled sequences

In [11]:
#percent of shuffled negative controls
# (expressed as a fraction between 0 and 1)
shufflepercent = 0.25
numdata, seqlen, _, dims = Xpos.shape
dims = dims-1
# Derive both counts from a single rounding step so they always add up to
# numdata. Truncating numdata*shufflepercent and numdata*(1-shufflepercent)
# separately can lose a sequence (numdata=10, shufflepercent=0.25 gave 2 + 7).
numshuffle = int(numdata*shufflepercent)
numprofile = numdata - numshuffle
shuffleportion = np.random.permutation(numdata)[:numshuffle]
Xneg = np.concatenate((Xnegprofile[:numprofile], Xnegshuffle[shuffleportion]))

In [15]:
#rejoin pos and neg controls
# Positives come first along axis 0, followed by the profile-emitted negatives.
# NOTE(review): this concatenates Xnegprofile, not the mixed Xneg built in the cell that
# combines profile and shuffled sequences — confirm which negative set is intended.
X_data = np.concatenate((Xpos, Xnegprofile), axis=0)
numdata, seqlen, _, dims = X_data.shape
dims = dims-1

In [16]:
#make Y data
# One label per row of X_data: the first half is set to 1, the rest stays 0.
# This relies on X_data holding the positives first and as many negatives as positives.
Y_data = np.zeros((numdata, 1))
Y_data[:numdata//2, :] = 1.

In [17]:
starttime = time.time()

#Save dictionaries into h5py files
hdf5path = '../data_RFAM/riboswitch_100k_d5.hdf5'
with h5py.File(hdf5path, 'w') as f:
    # Both arrays are stored as float32 with gzip compression under their own names
    for key, array in (('X_data', X_data), ('Y_data', Y_data)):
        f.create_dataset(key, data=array.astype(np.float32), compression='gzip')
print('Saving data: ' + mf.sectotime(time.time() - starttime))

Saving data: 18.24s


In [2]:
starttime = time.time()

#Open data from h5py

data_path = '../data_RFAM/trna_100k_d3.hdf5'
with h5py.File(data_path, 'r') as dataset:
    X_data = np.array(dataset['X_data'])
    Y_data = np.array(dataset['Y_data'])

# Publish the shape under the names the other cells read (numdata, seqlen), so
# that cells such as the train/valid/test split see the loaded array's size
# rather than a stale or undefined value. The previous names are kept as aliases.
numdata, seqlen, _, dims = X_data.shape
num_data, seq_length = numdata, seqlen
dims = dims-1

print ('Data extraction completed in: ' + str(time.time() - starttime) + 's')

Data extraction completed in: 9.35376906395s


In [3]:
# Take the first 100000 rows of the loaded array as the positive set.
# NOTE(review): assumes the stored X_data keeps the positives first — confirm against how the file was written.
Xpos = X_data[:100000]

In [5]:
# Overwrite rows 100000-124999 of X_data with 25000 randomly chosen rows of Xnegshuffle.
# NOTE(review): Xnegshuffle is produced by the seq_bunchshuffle cell, so that cell must have
# been run in the same session; the row counts are hard-coded.
shuffleportion = np.random.permutation(100000)[:25000]
X_data[100000:125000] = Xnegshuffle[shuffleportion]

In [10]:
# Display the dimensions of the assembled data array
X_data.shape

(200000, 513, 1, 5)

In [None]:
#remove gaps from sequences
# When enabled, keeps only the first `dims` entries of the last axis; dims is set to
# (last-axis length - 1) in earlier cells, so this drops the final channel.
ungapped = True
if ungapped:
    X_data = X_data[:, :, :, :dims]

In [None]:
# get validation and test set from training set
test_frac = 0.3
valid_frac = 0.1
# Size the split from the array actually being split. `numdata` may be stale or
# undefined here, e.g. when X_data was loaded from the hdf5 file.
N = X_data.shape[0]
split_1 = int(N*(1-valid_frac-test_frac))
split_2 = int(N*(1-test_frac))
# Random row order; the same permutation indexes inputs and targets so they stay paired
shuffle = np.random.permutation(N)

#set up dictionaries
train = {'inputs': X_data[shuffle[:split_1]], 
         'targets': Y_data[shuffle[:split_1]]}
valid = {'inputs': X_data[shuffle[split_1:split_2]], 
         'targets': Y_data[shuffle[split_1:split_2]]}
test = {'inputs': X_data[shuffle[split_2:]], 
         'targets': Y_data[shuffle[split_2:]]}

In [6]:
# NOTE(review): the number formatted into this message is the literal 2, not a measured duration.
print ('Data extraction and dict construction completed in: %ds' %2)

Data extraction and dict construction completed in: 2s
