In [1]:
#########################################################
####
#### Tutorial: RNA Design with Infrared (for Developers)
####
#########################################################

###############################################
## Start with simple sampling of RNA sequences

# -- _the_ main module of Infrared
import infrared as ir

In [2]:
# -- specify a constraint network and its tree decomposition

class MyConstraintNetwork(ir.ConstraintNetwork):
    """Constraint network over ``varnum`` variables.

    Args:
        varnum: number of variables (in this tutorial: sequence positions).
        dependencies: list of dependencies, each a list of variable indices;
            ``None`` (the default) means no dependencies.
        constraints: list of constraint entries; ``None`` (the default)
            means no constraints.
        functions: list of function entries; ``None`` (the default) means
            no functions.
    """
    def __init__(self, varnum, dependencies=None, constraints=None, functions=None):
        super().__init__()
        self.varnum = varnum
        # None sentinels instead of `[]` defaults: a default list is created
        # once at definition time and would be shared by every network that
        # is constructed without that argument.
        self.dependencies = [] if dependencies is None else dependencies
        self.constraints = [] if constraints is None else constraints
        self.functions = [] if functions is None else functions

# our first CN is _very_ simple: 20 variables (= sequence positions); all other
# arguments keep their empty defaults, so there are no dependencies
cn = MyConstraintNetwork(20)

class MyTreeDecomposition(ir.TreeDecomposition):
    """Tree decomposition constructed from a constraint network ``cn``."""
    def __init__(self, cn):
        # the base class is initialized with the variable count and the dependencies
        num_vars, dependencies = cn.varnum, cn.dependencies
        super().__init__(num_vars, dependencies)
        self.cn = cn
        self.domains = 4   # 4 types of nucleotides

# tree decompose the CN
td = MyTreeDecomposition(cn)

# and construct its cluster tree from the TD
ct = td.construct_cluster_tree()

In [3]:
# -- evaluate the cluster tree once; this has to happen before samples can be produced
#    (the evaluation computes the partition function)
ct.evaluate()

In [4]:
# -- draw 10 samples and keep the values of each
samples = [ct.sample().values() for _ in range(10)]

In [5]:
# show the raw samples: each one is a list of integer values, one per variable
samples

[[2, 3, 0, 3, 2, 0, 2, 2, 0, 2, 3, 1, 1, 0, 3, 2, 1, 3, 1, 2],
 [0, 2, 3, 0, 2, 2, 0, 3, 0, 0, 2, 0, 3, 3, 0, 0, 1, 3, 0, 2],
 [2, 2, 1, 3, 3, 2, 0, 2, 2, 2, 0, 0, 3, 2, 2, 2, 3, 1, 3, 0],
 [1, 0, 0, 3, 1, 0, 0, 1, 1, 0, 1, 0, 0, 2, 1, 0, 0, 2, 2, 2],
 [3, 0, 1, 3, 0, 3, 1, 1, 0, 2, 1, 0, 1, 2, 2, 3, 2, 0, 1, 0],
 [1, 1, 0, 1, 3, 1, 2, 3, 2, 1, 0, 0, 0, 3, 0, 1, 3, 2, 3, 0],
 [1, 2, 0, 0, 0, 2, 1, 3, 2, 2, 0, 2, 3, 2, 0, 2, 1, 3, 2, 0],
 [0, 2, 0, 3, 0, 1, 0, 1, 1, 0, 1, 0, 0, 2, 1, 1, 2, 1, 1, 1],
 [1, 0, 1, 0, 0, 2, 2, 1, 2, 2, 3, 1, 2, 0, 2, 3, 0, 2, 1, 3],
 [1, 2, 1, 3, 1, 1, 3, 1, 3, 1, 1, 2, 1, 3, 3, 1, 0, 3, 0, 0]]

In [6]:
# -- and show them (pretty), i.e. converted to nucleotide strings
list(map(ir.values_to_sequence, samples))

['GUAUGAGGAGUCCAUGCUCG',
 'AGUAGGAUAAGAUUAACUAG',
 'GGCUUGAGGGAAUGGGUCUA',
 'CAAUCAACCACAAGCAAGGG',
 'UACUAUCCAGCACGGUGACA',
 'CCACUCGUGCAAAUACUGUA',
 'CGAAAGCUGGAGUGAGCUGA',
 'AGAUACACCACAAGCCGCCC',
 'CACAAGGCGGUCGAGUAGCU',
 'CGCUCCUCUCCGCUUCAUAA']

In [7]:
##################################################
## add constraints from an RNA secondary structure

import rna_support as rna
# target structure in dot-bracket notation: matching brackets are base pairs
structure = "((((...))))(((...)))"
# base pairs parsed from the structure string
bps = rna.parseRNAStructureBps(structure)

In [8]:
# show the base pairs: (i, j) tuples of 0-based sequence positions
bps

[(0, 10), (1, 9), (2, 8), (3, 7), (11, 19), (12, 18), (13, 17)]

In [21]:
# nucleotide pairs accepted as complementary; ComplConstraint reads this
# module-level list on every call, so reassigning it changes the constraint
complementary_nucleotides = ["AU","CG","GC","GU","UA","UG"]
# -- define complementarity constraints
class ComplConstraint(ir.Constraint):
    """Hard constraint: the nucleotides at positions i and j must be complementary.

    "Complementary" means that the two-letter string formed by the nucleotides
    at i and j is contained in the global list `complementary_nucleotides`.
    """
    def __init__(self, i, j):
        super().__init__([i, j])
        self.i, self.j = i, j

    def __call__(self, a):  # a=assignment / sequence of nucleotides
        values = a.values()
        pair = ir.values_to_sequence([values[self.i], values[self.j]])
        return pair in complementary_nucleotides
    
## btw, there is already a pre-defined constraint ir.ComplConstraint,
## which we could have used as well

# one entry per base pair: ([i, j], [constraint on positions i and j])
cons = [ ( [i,j], [ ComplConstraint(i,j) ] ) for (i,j) in bps ]

In [22]:
# -- infer dependencies: each base pair makes its two positions dependent
deps = [ list(bp) for bp in bps ]
deps

[[0, 10], [1, 9], [2, 8], [3, 7], [11, 19], [12, 18], [13, 17]]

In [11]:
# -- reinitialize constraint network, rebuild tree decomp and cluster tree
# (to experiment, uncomment the next line to allow only A-U pairs)
#complementary_nucleotides = ["AU"]

seqlen = len(structure) # --> number of positions / variables in the CN
cn = MyConstraintNetwork(seqlen, dependencies = deps, constraints = cons)
td = MyTreeDecomposition(cn)
ct = td.construct_cluster_tree()

In [12]:
# generate samples (with complementarity constraints)
def spit_them_samples_out(ct, num):
    """Evaluate the cluster tree `ct`, then return `num` samples as sequence strings."""
    ct.evaluate()
    # draw all samples first, convert them afterwards
    drawn = []
    for _ in range(num):
        drawn.append(ct.sample().values())
    return list(map(ir.values_to_sequence, drawn))

# two leading blanks -- presumably to line the structure up with the quoted
# sequences in the list shown below
print("  "+structure)
spit_them_samples_out(ct,10)

  ((((...))))(((...)))


['CUCGUCGUGGGAUGUCUCGU',
 'UGCCCCGGGCACAUAGGAUG',
 'GUGGCCCUUACCGACAGUCG',
 'GGUCGUAGGUCCGGAAUUUG',
 'GAGGUGUUCUUGUGGAAUGU',
 'GGUGAACCAUUCUGGCACAG',
 'UGGUCAUGCCGUUUCGCGAA',
 'AGGAGGAUUUUUGUGCGACA',
 'CAUUGAUGAUGGUUAACGAC',
 'GCUUCUAGGGCGAUGUGGUU']

In [13]:
##########################
## Control the GC content

# -- define function for GC Control
class GCControl(ir.Function):
    """Function on a single position i, used to control the GC content.

    Evaluates to `weight` if the nucleotide at position i is contained
    in "GC", and to 1.0 otherwise.
    """
    def __init__(self, i, weight):
        super().__init__([i])
        self.i = i
        self.weight = weight

    def __call__(self, a):
        nucleotide = ir.value_to_nucleotide(a.values()[self.i])
        return self.weight if nucleotide in "GC" else 1.0

## btw, there is predefined ir.GCControl

In [14]:
## -- setup functions
# with weight 1, GCControl evaluates to 1 for every nucleotide (a constant function)
gc_weight = 1

# one entry per sequence position: ([i], [GC function on position i])
gc_funs = [ ([i], [GCControl(i,gc_weight)]) for i in range(seqlen) ]

# -- reinitialize constraint network, rebuild tree decomp and cluster tree
def build_ct(seqlen, deps, cons, funs):
    """Return the cluster tree for a network of `seqlen` variables.

    Builds a MyConstraintNetwork from the dependencies, constraints and
    functions, tree-decomposes it and constructs the cluster tree.
    """
    network = MyConstraintNetwork(
        seqlen, dependencies=deps, constraints=cons, functions=funs
    )
    return MyTreeDecomposition(network).construct_cluster_tree()

# cluster tree with the complementarity constraints plus the GC functions
ct = build_ct(seqlen, deps, cons, gc_funs)

# evaluate the tree and show 10 sampled sequences
spit_them_samples_out(ct,10)

['AUUAGACUAAUUACUCCGUA',
 'UUACGUCGUGGGUGUCGCGC',
 'UGGCGAGGCUAGGCACAGUU',
 'AGGAUAUUCUUUGGGUAUCA',
 'GUGGAUACUGUUGGUAACUA',
 'ACGACGUUCGUGAAACUUUU',
 'UCUAUUUUAGGUUCCGGGAA',
 'GUGGUUUUCACGGUCAUACU',
 'UAGAAGCUUUGUAUGCUAUG',
 'AUGAGUUUUGUAGGGGCCCU']

In [15]:
##########################
## Control the BP energy

ir.set_bpenergy_table(ir.params_bp) # set bp energies to magic numbers

bpe_weight = 1
# one entry per base pair: ([i, j], [base pair energy function for i and j])
bpe_funs = [ ([i,j], [ir.BPEnergy(i,j,False,bpe_weight)]) for (i,j) in bps ] 
# the network now carries the base pair energy functions and the GC functions
ct = build_ct(seqlen, deps, cons, bpe_funs + gc_funs)

print("  "+structure)
spit_them_samples_out(ct,10)

  ((((...))))(((...)))


['CUGGAAUUCGGGCGCUACGC',
 'GUGUACCAUAUUUGAUAUGA',
 'UCUCGAAGGGGACUAUCGGU',
 'UGAUAAUGUUAUUUUAUGGG',
 'CCCAUGAUGGGUUCCUAGGA',
 'UGGUACCGCUACAUCCAGUG',
 'UCGACUUUCGAAUUCCCGGU',
 'AACCAUUGGUUUGUGGAGCG',
 'CUAGCAGCUAGUGUGGAGUA',
 'GGUUGUGAGCCGACUUUGUC']

In [23]:
### as example of additional hard constraints: avoid GG dinucleotides
class AvoidGGConstraint(ir.Constraint):
    """Hard constraint: positions i and i+1 must not spell the dinucleotide "GG"."""
    def __init__(self, i):
        super().__init__([i, i + 1])
        self.i = i

    def __call__(self, a):  # a=assignment / sequence of nucleotides
        values = a.values()
        left, right = values[self.i], values[self.i + 1]
        return ir.values_to_sequence([left, right]) != "GG"

# one constraint entry per pair of adjacent positions (i, i+1)
gg_cons = [ ([i,i+1],[AvoidGGConstraint(i)]) for i in range(seqlen-1) ]
# the variable lists of these entries are reused as the dependencies
gg_deps = [ x[0] for x in gg_cons]

In [25]:
# NOTE: the next three statements repeat the base pair energy setup of the
# earlier "Control the BP energy" cell, so that this cell does not depend on it
ir.set_bpenergy_table(ir.params_bp) # set bp energies to magic numbers

bpe_weight = 1
bpe_funs = [ ([i,j], [ ir.BPEnergy(i, j, False, bpe_weight) ]) for (i,j) in bps ] 
# add the GG-avoidance dependencies and constraints to those of the base pairs
ct = build_ct(seqlen, deps + gg_deps, cons + gg_cons, bpe_funs + gc_funs)

print("  "+structure)
spit_them_samples_out(ct,10)

  ((((...))))(((...)))


['GUUUGUAAAAUCUGCCCUAG',
 'UGUUUGUGACGUGUCCAGUA',
 'AAUGCCUCGUUGCGUCCCGC',
 'AGAUCCGAUUUUUUCGAAAA',
 'UCAUCUAAUGAGCGUCGCGC',
 'CUGAGACUUAGAGAGACUUU',
 'AUUGCAUCAAUGUGCGUCAU',
 'GUUGAUCUAGCAAGUGAUUU',
 'UGUACAGUGUACGCCGAGCG',
 'GUGACGCUCGUUAUCACAUG']

In [16]:
#### now, automatize the targeting of feature values
##

# -- we start by defining the features

class GCFeature(ir.Feature):
    """Feature named "GC": the GC content of a sample, scaled by 100."""
    def __init__(self, weight, target, tolerance):
        super().__init__("GC", weight, target, tolerance)

    def eval(self, sample):
        gc_content = rna.GC_content(sample)
        return gc_content * 100

class EnergyFeature(ir.Feature):
    """Feature named "E": the energy of a sample for a fixed structure."""
    def __init__(self, structure, weight, target, tolerance):
        super().__init__("E", weight, target, tolerance)
        self.structure = structure

    def eval(self, sample):
        # imported here, so RNA is only required once an energy is evaluated
        from RNA import energy_of_struct
        return energy_of_struct(sample, self.structure)

In [19]:
# -- next, instantiate the sampler

class MySampler(ir.MultiDimensionalBoltzmannSampler):
    """Sampler for the tutorial's design problem.

    The constraint network is generated from the notebook globals `bps`,
    `seqlen`, `deps` and `cons`; the weights of the energy and GC functions
    are taken from the features "E" and "GC".
    """
    def __init__(self, features):
        super().__init__(features)
        self.setup_engine()

    def gen_constraint_network(self, features):
        # base pair energy functions first, weighted by feature "E" ...
        energy_weight = features["E"].weight
        functions = [
            ([i, j], [ir.BPEnergy(i, j, False, energy_weight)])
            for (i, j) in bps
        ]
        # ... followed by one GC function per position, weighted by feature "GC"
        gc_w = features["GC"].weight
        functions.extend(([i], [GCControl(i, gc_w)]) for i in range(seqlen))
        return MyConstraintNetwork(seqlen, deps, cons, functions)

    def gen_tree_decomposition(self, cn):
        return MyTreeDecomposition(cn)

    def sample(self):
        # convert the sampled assignment of the base class to a sequence string
        assignment = super().sample()
        return ir.values_to_sequence(assignment.values())
                  
# -- produce the feature objects 
# !!! here we define the targeted feature values and tolerances !!!
# "E":  weight 1, target -2, tolerance 1
# "GC": weight 1, target 70, tolerance 15 (GCFeature scales the GC content by 100)
features = { "E": EnergyFeature(structure,1,-2,1),
             "GC": GCFeature(1,70,15) }

# -- from this, construct the sampler
sampler = MySampler(features)

In [20]:
# -- and sample away
import RNA # for energy evaluation (imported once, not on every iteration)

num_samples = 10 # number of targeted samples to print

# print each sequence with its energy for `structure` and its GC content
# (scaled by 100); enumerate replaces the hand-maintained counter
for count, seq in enumerate(sampler.targeted_samples(), start=1):
    print("{} {:.2f} {:.2f}".format(seq, RNA.energy_of_struct(seq,structure), 100*rna.GC_content(seq)))
    if count >= num_samples:
        break

GGCUCCUAGCUGCGAAACGC -2.30 65.00
GGUCCCCGGCCGGGUGUUCU -1.10 75.00
GGAUCCUAUCCUCCGCGGGA -2.00 65.00
CAGAGGAUCUGCGCUACGCG -1.70 65.00
CCCAAAGUGGGCCUGACGGG -2.90 70.00
GGACCUCGUCCUCACAGUGG -1.00 65.00
GGCUCGCGGUCGCUAUAGGC -1.80 70.00
CCGUAUCACGGCGAAUGUCG -1.50 60.00
GGCCAUGGGCCUGGUCGUUA -1.00 65.00
GUGCUCCGCGCCCUCGUAGG -2.50 75.00
