In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os, sys, h5py
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sb
import tensorflow as tf
import scipy

import sys
sys.path.append('../../..')
import mutagenesisfunctions as mf
from deepomics import neuralnetwork as nn
from deepomics import utils, fit, visualize, saliency
import time as time
import pandas as pd

  from ._conv import register_converters as _register_converters


In [2]:
starttime = time.time()

# Fixed seed for the train/valid/test shuffle so that the split is identical
# after every Restart Kernel -> Run All. Change it to draw a different split.
split_seed = 247

# Open data from h5py and copy both arrays into memory before the file closes.
data_path = '../../data_RFAM/trna_100k_d5.hdf5'
with h5py.File(data_path, 'r') as dataset:
    X_data = np.array(dataset['X_data'])
    Y_data = np.array(dataset['Y_data'])

# X_data is indexed below with four axes; the last one is the per-position
# channel axis (presumably nucleotides plus a trailing gap channel -- TODO confirm).
numdata, seqlen, _, dims = X_data.shape

# Remove gaps from sequences by dropping the last channel. `dims` is only
# reduced when the channel is actually removed, so it always matches
# X_data.shape[-1].
ungapped = True
if ungapped:
    dims = dims - 1
    X_data = X_data[:, :, :, :dims]

# Get validation and test set from training set.
test_frac = 0.3
valid_frac = 0.1
N = numdata
split_1 = int(N*(1-valid_frac-test_frac))
split_2 = int(N*(1-test_frac))
# A dedicated RandomState keeps the split reproducible without reseeding
# numpy's global random generator.
shuffle = np.random.RandomState(split_seed).permutation(N)

# Set up dictionaries: train = [0, split_1), valid = [split_1, split_2),
# test = [split_2, N) of the shuffled indices.
train = {'inputs': X_data[shuffle[:split_1]],
         'targets': Y_data[shuffle[:split_1]]}
valid = {'inputs': X_data[shuffle[split_1:split_2]],
         'targets': Y_data[shuffle[split_1:split_2]]}
test = {'inputs': X_data[shuffle[split_2:]],
        'targets': Y_data[shuffle[split_2:]]}

print ('Data extraction and dict construction completed in: ' + mf.sectotime(time.time() - starttime))

Data extraction and dict construction completed in: 28.31s


In [3]:
# Stockholm-format (.sto) alignment file that the consensus annotations are read from.
simalign_file = '../../data_RFAM/trnasim_100k.sto'

#Get the full secondary structure and sequence consensus from the emission
# (SS = secondary-structure consensus, SQ = sequence consensus; both still
# contain gap columns at this point -- presumably strings, confirm in mf)
SS = mf.getSSconsensus(simalign_file)
SQ = mf.getSQconsensus(simalign_file)

#Get the ungapped sequence and the indices of ungapped nucleotides
# The first return value of rm_consensus_gaps is discarded in both calls;
# only the ungapped consensus strings (ugSS, ugSQ) and the index list (ugidx) are kept.
_, ugSS, ugidx = mf.rm_consensus_gaps(X_data, SS)
_, ugSQ, _ = mf.rm_consensus_gaps(X_data, SQ)


#Get the sequence and indices of the conserved base pairs
# bpchars lists every bracket character treated as a base-pair symbol.
bpchars = ['(',')','<','>','{','}']
# NOTE(review): sig_bpchars is not used anywhere in the visible cells.
sig_bpchars = ['<','>']
bpidx, bpSS, nonbpidx = mf.sigbasepair(SS, bpchars)
numbp = len(bpidx)  # number of entries returned in bpidx
numug = len(ugidx)  # number of ungapped positions

#Get the bpug information
# (presumably the base-paired positions expressed in ungapped coordinates --
# confirm against mf.bpug)
bpugSQ, bpugidx = mf.bpug(ugidx, bpidx, SQ)

# Last expression of the cell: display the bracket-only structure string.
bpSS

'(((((((<<<<>>>><<<<<>>>>><<<<<>>>>>)))))))'

In [4]:
# Display the ungapped consensus secondary-structure string produced by
# mf.rm_consensus_gaps in the previous cell.
ugSS

'(((((((,,<<<<_______>>>>,<<<<<_______>>>>>,,,,<<<<<_______>>>>>))))))):'

In [5]:
# Print every position of ugSS next to its structure character, one per line,
# which makes it easy to read off the index of each bracket.
for i,s in enumerate(ugSS):
    print (i,s)

0 (
1 (
2 (
3 (
4 (
5 (
6 (
7 ,
8 ,
9 <
10 <
11 <
12 <
13 _
14 _
15 _
16 _
17 _
18 _
19 _
20 >
21 >
22 >
23 >
24 ,
25 <
26 <
27 <
28 <
29 <
30 _
31 _
32 _
33 _
34 _
35 _
36 _
37 >
38 >
39 >
40 >
41 >
42 ,
43 ,
44 ,
45 ,
46 <
47 <
48 <
49 <
50 <
51 _
52 _
53 _
54 _
55 _
56 _
57 _
58 >
59 >
60 >
61 >
62 >
63 )
64 )
65 )
66 )
67 )
68 )
69 )
70 :


I need a function that takes the ungapped secondary-structure string (`ugSS`) and returns the coordinates of its base pairs. From there, a second step can add the mirror-image coordinates, so that every base-paired position (in both orientations) can be looked up in a holistic SoM results array.

In [12]:
# Swap the two columns of every pair, turning [a, b] into [b, a].
# NOTE(review): `basepairs` is not defined at notebook scope in any visible
# cell (it only exists as a local inside bp_coords), so this cell depends on
# leftover kernel state and will fail after Restart Kernel -> Run All.
np.asarray(basepairs)[:, ::-1]

array([[20, 12],
       [21, 11],
       [22, 10],
       [23,  9],
       [37, 29],
       [38, 28],
       [39, 27],
       [40, 26],
       [41, 25],
       [58, 50],
       [59, 49],
       [60, 48],
       [61, 47],
       [62, 46],
       [63,  6],
       [64,  5],
       [65,  4],
       [66,  3],
       [67,  2],
       [68,  1],
       [69,  0]])

In [18]:
# Base-pair coordinates of the consensus structure, followed by their reflections.
# NOTE(review): bp_coords is defined in a cell further down the notebook
# (In [17]); on a fresh top-to-bottom run this call comes before the definition.
bp_coords(ugSS)

array([[12, 20],
       [11, 21],
       [10, 22],
       [ 9, 23],
       [29, 37],
       [28, 38],
       [27, 39],
       [26, 40],
       [25, 41],
       [50, 58],
       [49, 59],
       [48, 60],
       [47, 61],
       [46, 62],
       [ 6, 63],
       [ 5, 64],
       [ 4, 65],
       [ 3, 66],
       [ 2, 67],
       [ 1, 68],
       [ 0, 69],
       [20, 12],
       [21, 11],
       [22, 10],
       [23,  9],
       [37, 29],
       [38, 28],
       [39, 27],
       [40, 26],
       [41, 25],
       [58, 50],
       [59, 49],
       [60, 48],
       [61, 47],
       [62, 46],
       [63,  6],
       [64,  5],
       [65,  4],
       [66,  3],
       [67,  2],
       [68,  1],
       [69,  0]])

In [17]:
def bp_coords(ugSS, reflect=True):
    '''
    Find the coordinates of the base pairs in a secondary-structure string.

    The string is scanned left to right. Every opening bracket ('(', '<',
    '{') is remembered, and every closing bracket (')', '>', '}') is paired
    with the most recent opening bracket that is still unpaired. Bracket
    types are not distinguished from one another. All other characters are
    treated as unpaired positions. Unmatched brackets are ignored.

    Parameters
    ----------
    ugSS : str
        Ungapped secondary-structure string; indices in the result refer to
        positions in this string.
    reflect : bool, optional
        If True (default), the mirrored pairs [close, open] are appended
        after the [open, close] pairs, which gives both orientations of
        every pair for use with holistic plots. If False, only the
        [open, close] pairs are returned.

    Returns
    -------
    numpy.ndarray
        Integer array of shape (n, 2). Rows are [open_index, close_index],
        ordered by the position of the closing bracket, followed by the
        reflected rows when `reflect` is True. The shape is (0, 2) when the
        string contains no base pairs.
    '''

    bp_openers = ('(', '<', '{')
    bp_closers = (')', '>', '}')

    basepairs = []   # [open_index, close_index] pairs in order of closing
    open_stack = []  # indices of opening brackets that are not yet closed

    for position, char in enumerate(ugSS):
        if char in bp_openers:
            open_stack.append(position)
        elif char in bp_closers and open_stack:
            # a closing bracket always pairs with the innermost open bracket
            basepairs.append([open_stack.pop(), position])

    # reshape keeps the result two-dimensional even when no pair was found,
    # so the column reversal below never fails on an empty result
    basepairs = np.asarray(basepairs, dtype=np.intp).reshape(-1, 2)

    # Optional reflection
    if reflect:
        basepairs = np.vstack([basepairs, basepairs[:, ::-1]])

    return basepairs
    
    
    
