In [1]:
import numpy as np
from Bio import AlignIO

In [2]:
#build model from seed
! cmbuild ../../data_RFAM/RF01739.cm ../../data_RFAM/seeds/RF01739.sto

# cmbuild :: covariance model construction from multiple sequence alignments
# INFERNAL 1.1.2 (July 2016)
# Copyright (C) 2016 Howard Hughes Medical Institute.
# Freely distributed under a BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# CM file:                                            ../../data_RFAM/RF01739.cm
# alignment file:                                     ../../data_RFAM/seeds/RF01739.sto
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
#                                                                      rel entropy
#                                                                      -----------
# idx    name                     nseq eff_nseq   alen  clen  bps bifs    CM   HMM description
# ------ -------------------- -------- -------- ------ ----- ---- ---- ----- ----- -----------
       1 glnA                      956     7.25    274    61   16    1 0.916 0.708 glnA RNA
#
# CPU time: 0.32u 0

In [117]:
#emit sequences from alignment
! cmemit -a -N 100000 --seed 274 -o ../../data_RFAM/glnAsim_100k.sto ../../data_RFAM/RF01739.cm

In [118]:
#open file
filename = '../../data_RFAM/glnAsim_100k.sto'
alignment = AlignIO.read(open(filename), "stockholm")

In [120]:
#get SS consensus
def getSSconsensus(simalign_file):
    SS = ''
    with open(simalign_file) as f1:
        for line in f1:
            if '#=GC SS_cons' in line:
                line = line.strip()
                line = line.split()
                SS = SS + line[-1]
    return (SS)

In [None]:
#Open positive control simulated sequences
starttime = time.time()
simalign_file = '../../../data_RFAM/glnAsim_100k.sto'
X_pos = mf.sto_onehot(simalign_file, 'rna')
X_pos = np.expand_dims(X_pos, axis=2)
print (time.time() - starttime)

#Make negative controls
starttime = time.time()
num_data, seq_length, _, dims = X_pos.shape
dims = dims-1
SS = mf.getSSconsensus(simalign_file)
X_neg = mf.seq_generator_gaps(SS, num_data, seq_length, dims)
print (time.time() - starttime)

In [125]:
SS = getSSconsensus(filename)
SS

'.....:.........:....(..(.....(...(....(......,....<...<....<....<...<....<...<.................................................................._........_....._....._......_......................................................................................................................................>.....>....>....>...>....>....>.....,.......,.......,.......,.....,......,......,.....,....<....<...<.....<......................................................._........_........_....._........_.......................................................................................................................................................................................................................................................................................................>....>.....>....>....,.....,....,.....,......)....)...)...)....).....:....:.....:.....:........'

In [127]:
len(alignment[0])

893

In [124]:
print (alignment[0].seq)

-----A--------------C--G-----U---U----C------A----G---U----A----U---C----U---Uu---------------------------------------------------------------guU--------C-----A-------------uucgugccgugu-------------------------------------------------------------------------------------------------------------ugucuaauaguagA-----A----G----A---U----A----C-----C-------G-------A-------A-----A------G------U-----A----G----G---U-----Uc-----------------------------------------------------uA--------A--------A-----C--------Acaauccguagacugaaagucaagauau------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------aguucaguugcgggaagaucauggguccA----G-----C----C----G-----A----A-----G------G----A---A---C----G-----C----A--------------------


In [16]:
rnadict = {'A':0, 'C':1, 'G':2, 'U':3, 'a':0, 'c':1, 'g':2, 'u':3, '-':4, '.':4}


In [36]:
def seq_onehot(sequences, dims, alphabet, gaps=True):
    if alphabet == 'rna':
        ndict = rnadict
    onehot = np.zeros((len(sequences), len(sequences[0]), dims+1))
    for i1,seq in enumerate(sequences):
        for i2, n in enumerate(seq):
            onehot[i1][i2][ndict[n]] = 1.    
    if gaps == False:
            onehot = onehot[:, :, :dims]
    return onehot
        

In [140]:
X_data = seq_onehot(sequences, 4, 'rna')


In [141]:
X_data.shape

(100, 240, 5)

Figure out how to remove areas where gaps are in majority

In [128]:
#remove gaps in the consensus SS from the simulated sequences
def consensus_gaps(SS, gapchar='.'):
    idx = []
    for i,s in enumerate(SS):
        if s == gapchar:
            idx.append(i)
    return (idx)

def rm_consensus_gaps(X_data, SS, gapchar='.'):
    idx = []
    for i,s in enumerate(SS):
        if s != gapchar:
            idx.append(i)

    return (X_data[:, idx, :])

In [87]:
idx = np.where(SS == '.')
idx

(array([], dtype=int64),)

In [142]:
#generate random negative control sequences
seed=274
num_data = len(alignment)
seq_length = len(alignment[0])
dims = 4
pgap_ungapped = 0.05
pgap_gapped = 0.8
SS = SS
gapchar = '.'


def seq_generator_gaps(SS, num_data, seq_length, dims, pgaps=(0.05,0.8), seed=274, gapchar='.')
    np.random.seed(seed)
    Xsim = np.zeros((num_data, seq_length, 1, dims+1))
    p,q = pgaps

    for d in range(num_data):

        #ungapped random nucleotides
        Xsim_seq = np.random.choice(dims+1, seq_length, [(1-p)/4, (1-p)/4, (1-p)/4, (1-p)/4, p])
        #gapped random nucleotides
        Xsim_gaps = np.random.choice(dims+1, seq_length, [(1-q)/4, (1-q)/4, (1-q)/4, (1-q)/4, q])
        #merge
        gapidx = consensus_gaps(SS, gapchar)
        Xsim_seq[gapidx] = np.copy(Xsim_gaps[gapidx])


        Xsim_hp = np.zeros((seq_length,1, dims+1))
        for (idx,nuc) in enumerate(Xsim_seq):
            Xsim_hp[idx][0][nuc] = 1
        Xsim[d] = Xsim_hp
        
    return (Xsim)

In [133]:
#generate random negative control sequences
seed=274
num_data = len(alignment)
seq_length = len(alignment[0])
dims = 4
pgap_ungapped = 0.05
pgap_gapped = 0.8
SS = SS
gapchar = '.'


np.random.seed(seed)
Xsim = np.zeros((num_data, seq_length, 1, dims+1))
p = pgap_ungapped
q = pgap_gapped


#ungapped random nucleotides
Xsim_seq = np.random.choice(dims+1, seq_length, [(1-p)/4, (1-p)/4, (1-p)/4, (1-p)/4, p])
#gapped random nucleotides
Xsim_gaps = np.random.choice(dims+1, seq_length, [(1-q)/4, (1-q)/4, (1-q)/4, (1-q)/4, q])
#merge
gapidx = consensus_gaps(SS, gapchar)
Xsim_seq[gapidx] = np.copy(Xsim_gaps[gapidx])

In [144]:
Xsim

array([[[[0., 0., 0., 0., 1.]],

        [[0., 1., 0., 0., 0.]],

        [[0., 0., 0., 0., 1.]],

        ...,

        [[0., 0., 1., 0., 0.]],

        [[0., 1., 0., 0., 0.]],

        [[1., 0., 0., 0., 0.]]],


       [[[0., 0., 1., 0., 0.]],

        [[1., 0., 0., 0., 0.]],

        [[0., 0., 0., 1., 0.]],

        ...,

        [[0., 1., 0., 0., 0.]],

        [[0., 1., 0., 0., 0.]],

        [[0., 0., 1., 0., 0.]]],


       [[[0., 0., 1., 0., 0.]],

        [[0., 0., 1., 0., 0.]],

        [[0., 0., 0., 0., 1.]],

        ...,

        [[0., 0., 1., 0., 0.]],

        [[0., 0., 0., 0., 1.]],

        [[0., 0., 1., 0., 0.]]],


       ...,


       [[[0., 1., 0., 0., 0.]],

        [[0., 0., 0., 0., 1.]],

        [[0., 0., 1., 0., 0.]],

        ...,

        [[0., 0., 0., 0., 1.]],

        [[0., 0., 0., 0., 1.]],

        [[0., 1., 0., 0., 0.]]],


       [[[0., 0., 1., 0., 0.]],

        [[0., 0., 0., 1., 0.]],

        [[1., 0., 0., 0., 0.]],

        ...,

        [[1., 0.,

In [137]:
len(gapidx)

832