In [7]:
import numpy as np
import sys, h5py, os
sys.path.append('../../../..')
import mutagenesisfunctions as mf
import time as time

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

np.random.seed(274)


In [8]:
#ALL OF THESE SEQUENCES ARE IN THEIR IDX FORM, NOT ONEHOT
#sequence generator function
def randomsequence(N, maxsize, sample=False):
    """Generate N random sequences in index form (each base is an int in 0..3).

    Parameters
    ----------
    N : int
        Number of sequences to generate.
    maxsize : int
        Length of every sequence when ``sample`` is False. When ``sample`` is
        True it is the *exclusive* upper bound of the sampled lengths.
    sample : bool
        If True, each sequence length is drawn uniformly from [1, maxsize).

    Returns
    -------
    list of list
        One list of base indices per sequence.
    """
    if not sample:
        # fixed length: every sequence is exactly maxsize long
        lengths = (np.ones((N))*maxsize).astype(int)
    else:
        # variable length: np.random.randint excludes the upper bound
        lengths = np.random.randint(1, maxsize, size=N)

    return [list(np.random.randint(0, 4, size=length)) for length in lengths]

#stem region generator
def stemregion(N, maxsize, sample=False):
    """Generate N stems as pairs of base-pairing strands in index form.

    The first strand of every stem comes from ``randomsequence``; the second
    strand is its reverse complement under the pairing 0<->3, 1<->2.

    Returns
    -------
    (stem1, stem2) : tuple of lists
        ``stem1[k]`` and ``stem2[k]`` are the two strands of the k-th stem.
    """
    complement = {0:3, 1:2, 2:1, 3:0}
    stem1 = randomsequence(N, maxsize, sample=sample)
    # reverse each strand and complement every base -> reverse complement
    stem2 = [[complement[base] for base in reversed(strand)] for strand in stem1]
    return (stem1, stem2)

#CONVERT IDX TO ONEHOT
def onehot(sequence):
    """Convert a sequence of base indices into a (len(sequence), 4) one-hot array.

    Row ``i`` has a single 1 in the column given by ``sequence[i]``.
    The result is returned as float32.
    """
    encoded = np.zeros((len(sequence), 4))
    for position, base in enumerate(sequence):
        encoded[position, base] = 1.
    return encoded.astype(np.float32)

#SAVE THE DATA
def savehdf5(hdf5path, X_data, Y_data):
    """Write X_data and Y_data to ``hdf5path`` as gzip-compressed float32 datasets.

    The file is opened in 'w' mode, so an existing file at that path is
    replaced. The datasets are stored under the keys 'X_data' and 'Y_data'.
    """
    with h5py.File(hdf5path, 'w') as h5file:
        for key, values in (('X_data', X_data), ('Y_data', Y_data)):
            h5file.create_dataset(key, data=values.astype(np.float32), compression='gzip')
    print ('Saving data to %s'%(hdf5path))
    
#OPEN THE DATA
def openhdf5(hdf5path):
    """Read the 'X_data' and 'Y_data' datasets from an HDF5 file.

    Both datasets are copied into numpy arrays before the file is closed.

    Returns
    -------
    (X_data, Y_data) : tuple of np.ndarray
    """
    with h5py.File(hdf5path, 'r') as h5file:
        # materialise inside the with-block: the datasets are unusable once the file closes
        X_data, Y_data = (np.array(h5file[key]) for key in ('X_data', 'Y_data'))
    return (X_data, Y_data)

In [46]:
#SHUFFLE THE XPOS SEQUENCES TO RANDOM BUNCHES
def seq_bunchshuffle(Xpos, numdata, seqlen, bunchsize=(10, 75), numbunches=None):
    """Shuffle each sequence by cutting it into contiguous bunches and permuting them.

    For every one of the first ``numdata`` sequences a bunch count ``n`` is
    drawn with ``np.random.randint(n_lower, n_upper)`` (upper bound exclusive).
    The sequence is cut into ``n`` bunches of ``seqlen//n`` positions (the last
    bunch absorbs the remainder) and the bunches are written back in a random
    order.

    Parameters
    ----------
    Xpos : array-like, indexed as Xpos[seq][position, channel]
        Sequences to shuffle (without the pseudo dimension for deepomics).
    numdata : int
        Number of leading sequences of ``Xpos`` to shuffle. Rows beyond
        ``numdata`` are left as zeros in the output.
    seqlen : int
        Length of each sequence.
    bunchsize : (smallbunch, largebunch) or None
        Bunch-size limits; converted to bunch counts
        ``seqlen//largebunch`` (lower) and ``seqlen//smallbunch`` (upper).
    numbunches : (n_lower, n_upper) or None
        Explicit bunch-count range. Overrides ``bunchsize`` when given.

    Returns
    -------
    np.ndarray
        Float array with the shape of ``Xpos`` holding the shuffled sequences.

    Raises
    ------
    ValueError
        If neither ``bunchsize`` nor ``numbunches`` is given, or if the
        resulting range contains no bunch count >= 1.
    """
    n_lower = n_upper = None
    if bunchsize:
        #n = the number of bunches per sequence
        smallbunch, largebunch = bunchsize
        n_upper = seqlen//smallbunch
        n_lower = seqlen//largebunch
    if numbunches:
        n_lower, n_upper = numbunches
    if n_lower is None:
        raise ValueError('seq_bunchshuffle needs either bunchsize or numbunches')

    # With seqlen < largebunch the lower bound is 0; drawing n == 0 copies
    # nothing and would silently leave the row all-zero, so require >= 1 bunch.
    n_lower = max(1, n_lower)
    if n_upper <= n_lower:
        raise ValueError('empty bunch-count range [%d, %d) for seqlen=%d'
                         % (n_lower, n_upper, seqlen))

    Xshuffle = np.zeros((np.shape(Xpos)))
    for seq in range(numdata):
        source = Xpos[seq]  # only read from, so no copy is needed

        n = np.random.randint(n_lower, n_upper)

        # start index of every bunch plus the end of the sequence
        bunchidx = [i*(seqlen//n) for i in range(n)]
        bunchidx.append(seqlen)

        start = 0
        for idx in np.random.permutation(n):
            space = bunchidx[idx+1]-bunchidx[idx]
            #Note this is without the pseudo dimension for deepomics
            Xshuffle[seq, start:start+space, :] = source[bunchidx[idx]:bunchidx[idx+1], :]
            start = start + space

    return (Xshuffle)


#SHUFFLE THE XPOS SEQUENCES TO RANDOM BUNCHES AND DISTRIBUTE THEM BETWEEN EACH OTHER
def interseq_bunchshuffle(Xpos, seqlen, numdata, numbunches=4):
    """Shuffle bunches *between* sequences while keeping their position.

    Every one of the first ``numdata`` sequences is cut into ``numbunches``
    contiguous bunches of ``seqlen//numbunches`` positions (the last bunch
    absorbs the remainder). For each bunch position the bunches of all
    sequences are permuted independently, then the sequences are stitched
    back together, so output sequence k is made of bunches taken from
    (generally) different input sequences.

    NOTE: the parameter order is (Xpos, seqlen, numdata) -- the reverse of
    seq_bunchshuffle, which takes (Xpos, numdata, seqlen).

    Returns
    -------
    np.ndarray
        Float array with the shape of ``Xpos``; rows beyond ``numdata`` are zeros.
    """
    n = numbunches

    # Bunch boundaries depend only on seqlen and n, so compute them once
    edges = [k*(seqlen//n) for k in range(n)]
    edges.append(seqlen)
    spans = list(zip(edges[:-1], edges[1:]))

    #Create the bunch reservoir: one pool per bunch position
    reservoirs = [[] for _ in spans]
    for seq in range(numdata):
        original = np.copy(Xpos[seq])
        for pool, (lo, hi) in zip(reservoirs, spans):
            #Note this is without the pseudo dimension for deepomics
            pool.append(original[lo:hi, :])

    #shuffle up the order within every pool
    shuffled = [np.random.permutation(pool) for pool in reservoirs]

    #stitch them back together
    Xshuffle = np.zeros((np.shape(Xpos)))
    for seq in range(numdata):
        for pool, (lo, hi) in zip(shuffled, spans):
            Xshuffle[seq, lo:hi, :] = pool[seq]
    return (Xshuffle)

# Pseudoknot Generation

We will model a simple pseudoknot

In [47]:
def buildpk(stemsize, loopsize, numstems=2, numloops=5):
    """Build one simple pseudoknot sequence in index form.

    ``numloops`` random loops are interleaved with four binding regions in the
    order stem A (5'), stem B (5'), stem A (3'), stem B (3'), i.e. the two
    stems cross each other. Only the first two generated stems are used, so
    ``numstems`` must be at least 2.

    Returns
    -------
    list
        Base indices of the assembled sequence.
    """
    loops = [randomsequence(1, loopsize)[0] for _ in range(numloops)]

    # one (first strand, second strand) pair per stem
    pairs = [stemregion(1, stemsize) for _ in range(numstems)]
    stem1s = [first[0] for first, _ in pairs]
    stem2s = [second[0] for _, second in pairs]
    stems = np.vstack([stem1s[0], stem1s[1], stem2s[0], stem2s[1]])

    #assemble: loop, binding region, loop, binding region, ...
    pk_idx = []
    for ii, loop in enumerate(loops):
        pk_idx.extend(loop)
        if ii < len(stems):
            pk_idx.extend(stems[ii])
    return(pk_idx)

def negpk(X_pos):
    """Build negatives by recombining sequence halves across the dataset.

    ``X_pos`` is split at ``seqlen//2`` along axis 1. The first halves and the
    second halves are each permuted independently along axis 0 and then
    concatenated again, so a negative sequence pairs the first half of one
    sequence with the second half of (generally) another.
    """
    _, seqlen, _ = X_pos.shape
    midpoint = seqlen//2
    halves = (X_pos[:, :midpoint, :], X_pos[:, midpoint:, :])
    # first half is permuted before the second half
    shuffled = [np.random.permutation(half) for half in halves]
    return np.concatenate(shuffled, axis=1)

### PK for transfer learning tests - hairpin + pseudoknot

We want to simulate a more complicated RNA to test transfer learning.

The sequence will have 6 binding regions with regions binding in the following pattern:
- 1-3
- 2-6
- 4-5

Here 1-3 and 2-6 make up the pseudoknot, and 4-5 form the hairpin.

In [48]:
#function to build this more complicated pseudoknot (consult notes for structure)
def build_pkhp(stemsize=6, loopsize=5, numstems=3, numloops=7):
    """Build one pseudoknot + hairpin sequence in index form.

    ``numloops`` random loops are interleaved with six binding regions. In
    sequence order the regions are: stem 0 (5'), stem 1 (5'), stem 0 (3'),
    stem 2 (5'), stem 2 (3'), stem 1 (3') -- i.e. regions 1-3 and 2-6 pair
    (pseudoknot) and regions 4-5 pair (hairpin). Exactly the first three
    generated stems are used, so ``numstems`` must be at least 3.

    Returns
    -------
    list
        Base indices of the assembled sequence.
    """
    loops = [randomsequence(1, loopsize)[0] for _ in range(numloops)]

    # one (first binding region, second binding region) pair per stem
    pairs = [stemregion(1, stemsize) for _ in range(numstems)]
    stem1s = [first[0] for first, _ in pairs]
    stem2s = [second[0] for _, second in pairs]
    #Add binding regions in the order of their place in the sequence
    stems = np.vstack([stem1s[0], stem1s[1], stem2s[0], stem1s[2], stem2s[2], stem2s[1]])

    #assemble: loop, binding region, loop, binding region, ...
    pk_idx = []
    for ii, loop in enumerate(loops):
        pk_idx.extend(loop)
        if ii < len(stems):
            pk_idx.extend(stems[ii])
    return(pk_idx)

In [57]:
#set parameters
np.random.seed(23)
stemsize = 6
numstems = 3
loopsize = 5
numloops = 7
num = 50000
seqlen = stemsize*(2*numstems) + loopsize*numloops

# which negative-set recipe to generate (1-4, see the negative-sequence cell)
datatype = 4

# Window covering the hairpin = binding region 4 + the loop after it + binding
# region 5. A sequence is laid out as loop, region, loop, region, ... so region 4
# is preceded by 4 loops and 3 regions. The bounds are half-open [hpstart, hpend)
# for use in slices. (The previous hard-coded (37, 54) was shifted one position
# to the left: it included a loop base and dropped the last paired base.)
hpstart = 4*loopsize + 3*stemsize
hpend = hpstart + 2*stemsize + loopsize

In [58]:
#GENERATE THE POSITIVE SEQUENCES

#for pkhp: every sequence is a freshly built pseudoknot + hairpin, one-hot encoded
X_pos_pkhp = np.asarray([
    onehot(build_pkhp(stemsize=stemsize, loopsize=loopsize))
    for rep in range(num)
])

#for hp: start from fully random one-hot sequences of the same length ...
X_pos_hp = np.asarray([
    onehot(randomsequence(1, maxsize=seqlen)[0])
    for rep in range(num)
])
# ... then copy the hairpin window of the matching pkhp sequence into each of them
X_pos_hp[:, hpstart:hpend, :] = X_pos_pkhp[:, hpstart:hpend, :]

In [59]:
#GENERATE THE NEGATIVE SEQUENCES
# NOTE: interseq_bunchshuffle takes (Xpos, seqlen, numdata) while seq_bunchshuffle
# takes (Xpos, numdata, seqlen). The interseq calls below pass both by keyword;
# passing (num, seqlen) positionally swaps them, which fills only the first
# `seqlen` rows (with unshuffled sequences) and leaves every other row all-zero.

#(DATA 1: BOTH AS WELL SHUFFLED COPIES)
if datatype == 1:

    X_neg_pkhp = seq_bunchshuffle(X_pos_pkhp, num, seqlen, bunchsize=None, numbunches=(2,4))

    X_neg_hp = seq_bunchshuffle(X_pos_hp, num, seqlen, bunchsize=None, numbunches=(2,4))

#(DATA 2: PKHP NEGATIVE SEQUENCES JUST A COPY OF HP)
elif datatype == 2:

    X_neg_pkhp = np.copy(X_pos_hp)

    X_neg_hp = seq_bunchshuffle(X_pos_hp, num, seqlen, bunchsize=None, numbunches=(2,4))

#(DATA 3: BOTH AS INTERSEQ SHUFFLED COPIES)
elif datatype == 3:

    X_neg_pkhp = interseq_bunchshuffle(X_pos_pkhp, seqlen=seqlen, numdata=num)

    X_neg_hp = interseq_bunchshuffle(X_pos_hp, seqlen=seqlen, numdata=num)

#(DATA 4: PKHP NEGATIVE SEQUENCES JUST A COPY OF HP BUT HP NEGS A INTERSEQ SHUFFLED SET)
elif datatype == 4:

    X_neg_pkhp = np.copy(X_pos_hp)

    X_neg_hp = interseq_bunchshuffle(X_pos_hp, seqlen=seqlen, numdata=num)

else:
    # without this, X_neg_* would be undefined (or stale from an earlier run)
    raise ValueError('Unknown datatype %r (expected 1, 2, 3 or 4)' % (datatype,))
    

In [60]:
#GENERATE TWO ARRAYS WITH YLABELS FOR POS AND NEG FOR EASY INDEXING
# one (num, 1) column per class: 1 for positives, 0 for negatives

Y_pos, Y_neg = np.ones((num, 1)), np.zeros((num, 1))

In [61]:
#PACKAGE THEM INTO A FILE

starttime = time.time()

#Save every array as a gzip-compressed float32 dataset in one h5py file
hdf5path = '../../../data_toypk/toypkhp_50_d%0.f.hdf5'%(datatype)
with h5py.File(hdf5path, 'w') as f:
    for dataset_name, dataset_values in (
        ('X_pos_pkhp', X_pos_pkhp),
        ('X_neg_pkhp', X_neg_pkhp),
        ('X_pos_hp', X_pos_hp),
        ('X_neg_hp', X_neg_hp),
        ('Y_pos', Y_pos),
        ('Y_neg', Y_neg),
    ):
        f.create_dataset(dataset_name, data=dataset_values.astype(np.float32), compression='gzip')

print ('Saving data: ' + mf.sectotime(time.time() - starttime))

Saving data: 1.53s


In [62]:
# display the path the dataset was written to
hdf5path

'../../../data_toypk/toypkhp_50_d4.hdf5'