In [1]:
import numpy as np
import sys, h5py, os
sys.path.append('../..')
import mutagenesisfunctions as mf
import time as time

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

np.random.seed(274)


In [2]:
#ALL OF THESE SEQUENCES ARE IN THEIR IDX FORM, NOT HOTPLOT
#sequence generator function
def randomsequence(N, maxsize, sample=False):
    """Generate N random sequences of integer indices drawn from {0, 1, 2, 3}.

    Parameters
    ----------
    N : int
        Number of sequences to generate.
    maxsize : int
        Length of every sequence when ``sample`` is False. When ``sample`` is
        True it is the exclusive upper bound on the randomly drawn lengths.
    sample : bool
        If True, each sequence length is drawn uniformly from
        ``[1, maxsize)``; otherwise all sequences have length ``maxsize``.

    Returns
    -------
    list of list
        One list of indices per sequence.
    """
    if sample:
        lengths = np.random.randint(1, maxsize, size=N)
    else:
        lengths = np.full(N, maxsize).astype(int)

    # Draw sequence by sequence so the random stream is consumed in order.
    return [list(np.random.randint(0, 4, size=length)) for length in lengths]

#stem region generator
def stemregion(N, maxsize, sample=False):
    """Generate N stem strands together with their reverse complements.

    The first strand of each pair comes from ``randomsequence``. The second
    strand is the first one read backwards with every index swapped through
    the pairing 0<->3, 1<->2.

    Parameters
    ----------
    N, maxsize, sample
        Forwarded unchanged to ``randomsequence``.

    Returns
    -------
    tuple
        ``(stem1, stem2)``, two lists of N index lists each, where
        ``stem2[k]`` is the reverse complement of ``stem1[k]``.
    """
    complement = {0: 3, 1: 2, 2: 1, 3: 0}
    forward = randomsequence(N, maxsize, sample=sample)
    backward = [[complement[base] for base in reversed(strand)] for strand in forward]
    return forward, backward

#CONVERT IDX TO ONEHOT
def onehot(sequence):
    """Convert a sequence of indices into a one-hot matrix.

    Parameters
    ----------
    sequence : sequence of int
        Column index (0..3) to set in each row.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(len(sequence), 4)`` with a single 1 per row.
    """
    encoded = np.zeros((len(sequence), 4), dtype=np.float32)
    # Each `row` is a view into `encoded`, so writing to it fills the matrix.
    for row, base in zip(encoded, sequence):
        row[base] = 1.
    return encoded

#SAVE THE DATA
def savehdf5(hdf5path, X_data, Y_data):
    """Write X_data and Y_data to an HDF5 file as gzip-compressed float32.

    Parameters
    ----------
    hdf5path : str
        Destination file; it is opened in 'w' mode.
    X_data, Y_data : numpy.ndarray
        Arrays stored under the dataset names 'X_data' and 'Y_data'.
    """
    named_arrays = (('X_data', X_data), ('Y_data', Y_data))
    with h5py.File(hdf5path, 'w') as f:
        for key, values in named_arrays:
            f.create_dataset(key, data=values.astype(np.float32), compression='gzip')
    print ('Saving data to %s'%(hdf5path))
    
#OPEN THE DATA
def openhdf5(hdf5path):
    """Read the 'X_data' and 'Y_data' datasets from an HDF5 file.

    Parameters
    ----------
    hdf5path : str
        File to open in read-only mode.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_data, Y_data)``, copied into memory before the file is closed.
    """
    with h5py.File(hdf5path, 'r') as dataset:
        # Unpacking consumes the generator while the file is still open.
        X_data, Y_data = (np.array(dataset[key]) for key in ('X_data', 'Y_data'))
    return (X_data, Y_data)

# Hairpin Generation Factory

This notebook serves as an easy way to generate toy hairpin sequences with all features of the data controllable. To make generation easy, there are 2 functions that create the first outer loop, the two stem regions, the inner loop and the second outer loop, which are then stitched together at the end. Additionally, for each dataset generated, a negative chimeric control should be made, for which there is also a function.

Variable kinds of hairpins that can be made, eg:
1. Hairpins with variable buffers
2. Same length sequences with stem regions in different locations
3. Stem regions of different sizes 
4. Outer loops of different sizes
5. (maybe) hairpins with gaps in them, on which we can test RNNs as a positive control for RNN experiments

In [3]:
def buildhp(stemsize, Oloopsize, Iloopsize, numstems=1, numOloops=2, numIloops=1):
    """Build one hairpin as a list of indices with equal-length outer loops.

    Layout: outer loop 0 + stem + inner loop 0 + reverse-complement stem +
    outer loop 1.

    Parameters
    ----------
    stemsize : int
        Length of each stem strand.
    Oloopsize : int
        Length of every outer loop.
    Iloopsize : int
        Length of every inner loop.
    numstems, numOloops, numIloops : int
        How many stems / outer loops / inner loops to generate. Only the first
        stem pair, the first inner loop and the first two outer loops end up
        in the result, so ``numOloops`` must be at least 2 and the other two
        at least 1.

    Returns
    -------
    list
        Indices of the assembled hairpin.
    """
    # Generation order (outer loops, inner loops, stems) fixes the RNG stream.
    outer = [randomsequence(1, Oloopsize)[0] for _ in range(numOloops)]
    inner = [randomsequence(1, Iloopsize)[0] for _ in range(numIloops)]

    left_arms, right_arms = [], []
    for _ in range(numstems):
        fwd, rev = stemregion(1, stemsize)
        left_arms.append(fwd[0])
        right_arms.append(rev[0])

    # Row 0 is the stem, row 1 its reverse complement.
    paired = np.vstack([left_arms[0], right_arms[0]])

    return outer[0] + list(paired[0]) + inner[0] + list(paired[1]) + outer[1]

def buildhp_full(stemsize, Oloopsize, Iloopsize, numstems=1, numOloops=2, numIloops=1):
    """Build one hairpin whose outer loops can each have their own length.

    Same layout as ``buildhp`` (outer loop 0 + stem + inner loop 0 +
    reverse-complement stem + outer loop 1), but ``Oloopsize`` is indexed per
    outer loop.

    Parameters
    ----------
    stemsize : int
        Length of each stem strand.
    Oloopsize : indexable of int
        ``Oloopsize[k]`` is the length of outer loop ``k``; it needs at least
        ``numOloops`` entries.
    Iloopsize : int
        Length of every inner loop.
    numstems, numOloops, numIloops : int
        How many stems / outer loops / inner loops to generate. Only the first
        stem pair, the first inner loop and the first two outer loops are used.

    Returns
    -------
    list
        Indices of the assembled hairpin.
    """
    flanks = []
    for position in range(numOloops):
        flanks.append(randomsequence(1, Oloopsize[position])[0])

    middles = []
    for _ in range(numIloops):
        middles.append(randomsequence(1, Iloopsize)[0])

    first_strands = []
    second_strands = []
    for _ in range(numstems):
        strand, partner = stemregion(1, stemsize)
        first_strands.append(strand[0])
        second_strands.append(partner[0])
    stem_rows = np.vstack([first_strands[0], second_strands[0]])

    # Stitch the pieces together in order.
    pieces = (flanks[0], list(stem_rows[0]), middles[0], list(stem_rows[1]), flanks[1])
    hp_idx = []
    for piece in pieces:
        hp_idx = hp_idx + piece
    return hp_idx

def neghp(X_pos):
    """Build chimeric negatives by shuffling sequence halves independently.

    The input is cut in two along axis 1 at ``seqlen // 2``. Each half is
    permuted along axis 0 on its own, and the halves are glued back together,
    so a row of the result generally pairs halves from different inputs.

    Parameters
    ----------
    X_pos : numpy.ndarray
        3-D array; axis 0 indexes samples and axis 1 is the sequence axis.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``X_pos``.
    """
    _, seqlen, _ = X_pos.shape
    midpoint = seqlen // 2
    left = X_pos[:, :midpoint, :]
    right = X_pos[:, midpoint:, :]
    # The left half is permuted first, then the right (fixes the RNG order).
    return np.concatenate(
        (np.random.permutation(left), np.random.permutation(right)), axis=1)

### Sizes

In [4]:
import time as time
# Generate one hairpin dataset per entry in `trials` and save each to HDF5.
# NOTE: starttime is recorded here but not read anywhere in this cell.
starttime = time.time()

#set up trials
# Each trial is a tuple unpacked below as (stemsize, Iloopsize, Oloopsize).
# Only 'mod' is active; the bracketed lists after '#' are the full set.
trial_names = ['mod']#['small', 'med', 'large']
small = (5, 5, 8)
mod = (6, 8, 7)
med = (11, 7, 6)
large = (22, 14, 12)
trials = [mod]#[small, med, large]

# Number of positive examples; an equal number of negatives is added below.
num = 50000

#set up save parameters
exp_name = 'toyhp'
exp_folder = 'data_%s'%(exp_name)


for ii, t in enumerate(trials):
    stemsize, Iloopsize, Oloopsize = t

    #Build X_pos
    # One one-hot encoded hairpin per row; every hairpin in a trial has the same length.
    X_pos = np.asarray([onehot(buildhp(stemsize=stemsize, Oloopsize=Oloopsize, Iloopsize=Iloopsize)) for r in range(num)])
    #Build X_neg as a shuffled chimera
    X_neg = neghp(X_pos)

    # Positives first, negatives second, stacked along axis 0.
    X_data = np.vstack((X_pos, X_neg))

    #Build Y labels
    # The first `num` rows (positives) are labelled 1, the remaining rows stay 0.
    Y_data = np.zeros((num*2, 1))
    Y_data[:num, :] = 1.
    
    #Save
    # File name pattern: <exp_name>_<num//1000>k_<trial name>.hdf5 under ../<exp_folder>/
    filename = '%s_%0.fk_%s.hdf5'%(exp_name, num//1000, trial_names[ii])
    savepath = os.path.join('..', exp_folder, filename)
    savehdf5(savepath, X_data, Y_data)




Saving data to ../data_toyhp/toyhp_50k_mod.hdf5


### Offcenter

In [44]:
# Off-center hairpins: every sequence is full_len long, but the stem is shifted
# by giving the two outer loops different, randomly drawn lengths.
full_len = 41
num = 50000

stemsize = 11
Iloopsize = 7

# Length of the first outer loop for each sequence: normal draw, clipped to
# [2, 10] and truncated to int.
mean_offidx, std_offidx = (6, 2)
offidxs = np.clip(np.random.normal(mean_offidx, std_offidx, num), a_max=10, a_min=2).astype(int)
# The second outer loop takes whatever length is left to reach full_len.
endloopsizes = full_len - 2*stemsize - Iloopsize - offidxs
# Materialise the pairs as a list: on Python 3 zip() returns an iterator, which
# cannot be indexed with Oloopsize[r] below.
Oloopsize = list(zip(offidxs, endloopsizes))


#set up save parameters
exp_name = 'toyhp'
exp_folder = 'data_%s'%(exp_name)
trial = 'offcenter'

#Build X_pos
X_pos = np.asarray([onehot(buildhp_full(stemsize=stemsize, Oloopsize=Oloopsize[r], Iloopsize=Iloopsize)) for r in range(num)])
#Build X_neg as a shuffled chimera
X_neg = neghp(X_pos)

# Positives first, negatives second.
X_data = np.vstack((X_pos, X_neg))

#Build Y labels
# The first `num` rows (positives) are labelled 1, the remaining rows stay 0.
Y_data = np.zeros((num*2, 1))
Y_data[:num, :] = 1.

#Save
# Written inline instead of via savehdf5 because the per-sequence offsets are
# stored as an extra 'offidxs' dataset.
filename = '%s_%0.fk_%s.hdf5'%(exp_name, num//1000, trial)
savepath = os.path.join('..', exp_folder, filename)
with h5py.File(savepath, 'w') as f:
    f.create_dataset('X_data', data=X_data.astype(np.float32), compression='gzip')
    f.create_dataset('Y_data', data=Y_data.astype(np.float32), compression='gzip')
    f.create_dataset('offidxs', data=offidxs.astype(np.int8), compression='gzip')
print ('Saving data to %s'%(savepath))

Saving data to ../data_toyhp/toyhp_50k_offcenter.hdf5


# Pseudoknot Generation

We will model a simple pseudoknot

In [20]:
def buildpk(stemsize, loopsize, numstems=2, numloops=5):
    """Build one pseudoknot as a list of indices.

    Two stem pairs (A, A') and (B, B') are interleaved with loops as
    loop0 A loop1 B loop2 A' loop3 B' loop4 ..., where A' and B' are the
    reverse complements produced by ``stemregion``.

    Parameters
    ----------
    stemsize : int
        Length of each stem strand.
    loopsize : int
        Length of each loop.
    numstems : int
        Number of stem pairs to generate; only the first two are used, so it
        must be at least 2.
    numloops : int
        Number of loops to generate and place.

    Returns
    -------
    list
        Indices of the assembled pseudoknot.
    """
    spacers = [randomsequence(1, loopsize)[0] for _ in range(numloops)]

    fwd_arms, rev_arms = [], []
    for _ in range(numstems):
        fwd, rev = stemregion(1, stemsize)
        fwd_arms.append(fwd[0])
        rev_arms.append(rev[0])
    # Row order defines the crossing pattern: A, B, A', B'.
    arms = np.vstack([fwd_arms[0], fwd_arms[1], rev_arms[0], rev_arms[1]])

    #assemble
    pk_idx = []
    for position, spacer in enumerate(spacers):
        pk_idx.extend(spacer)
        # A stem strand follows a loop only while strands remain.
        if position < len(arms):
            pk_idx.extend(arms[position])
    return pk_idx

def negpk(X_pos):
    """Create shuffled-chimera negatives for a pseudoknot data set.

    Splits every sequence at ``seqlen // 2`` along axis 1, permutes the front
    halves and the back halves along axis 0 independently, and rejoins them.

    Parameters
    ----------
    X_pos : numpy.ndarray
        3-D array with samples on axis 0 and sequence positions on axis 1.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``X_pos``.
    """
    n_samples, n_positions, n_channels = X_pos.shape
    cut = n_positions // 2
    # Front halves are shuffled before back halves to keep the RNG order.
    front = np.random.permutation(X_pos[:, :cut, :])
    back = np.random.permutation(X_pos[:, cut:, :])
    chimeras = np.concatenate((front, back), axis=1)
    return chimeras

Generate a positive control set of 50,000 pseudoknots with labels

In [21]:
import time as time
# Build the pseudoknot data set in memory and report how long it took.
starttime = time.time()

# NOTE: numstems and numloops are assigned here but not passed to buildpk
# below, so buildpk runs with its own defaults for those two arguments.
stemsize = 6
numstems = 2
loopsize = 6
numloops = 5
# Number of positive examples; an equal number of negatives is added below.
num = 50000

#Build X_pos
X_pos = np.asarray([onehot(buildpk(stemsize=stemsize, loopsize=loopsize)) for r in range(num)])
#Build X_neg as a shuffled chimera
X_neg = negpk(X_pos)

# Positives first, negatives second, stacked along axis 0.
X_data = np.vstack((X_pos, X_neg))

#Build Y labels
# The first `num` rows (positives) are labelled 1, the remaining rows stay 0.
Y_data = np.zeros((num*2, 1))
Y_data[:num, :] = 1.

              
# Elapsed seconds; shown as the cell output.
time.time() - starttime

4.342957019805908

In [23]:
# Write the pseudoknot arrays built in the previous cell to disk.
starttime = time.time()

#Save dictionaries into h5py files
# X_data and Y_data are stored as gzip-compressed float32 datasets.
hdf5path = '../data_toypk/toypk_50_d2.hdf5'
with h5py.File(hdf5path, 'w') as f:
    f.create_dataset('X_data', data=X_data.astype(np.float32), compression='gzip')
    f.create_dataset('Y_data', data=Y_data.astype(np.float32), compression='gzip')
# mf.sectotime presumably formats elapsed seconds as a readable string - TODO confirm
print ('Saving data: ' + mf.sectotime(time.time() - starttime))

Saving data: 0.68s


In [24]:
# Reload the pseudoknot data set from disk and time the read.
starttime = time.time()

#Open data from h5py

hdf5path = '../data_toypk/toypk_50_d2.hdf5'
with h5py.File(hdf5path, 'r') as dataset:
    # np.array copies each dataset into memory before the file is closed.
    X_data = np.array(dataset['X_data'])
    Y_data = np.array(dataset['Y_data'])
    
# X_data is expected to be 3-D here; the unpacking fails otherwise.
num_data, seq_length, dims = X_data.shape

    
print ('Data extraction completed in: ' + str(time.time() - starttime) + 's')

Data extraction completed in: 0.31542301178s


In [25]:
# Insert a singleton axis at position 2, turning (num_data, seq_length, dims)
# into (num_data, seq_length, 1, dims). The ndim guard makes the cell
# idempotent: re-running it no longer keeps adding axes to X_data.
if X_data.ndim == 3:
    X_data = np.expand_dims(X_data, axis=2)

X_data.shape

(100000, 54, 1, 4)