In [None]:
#########################################################
####
#### Tutorial: RNA Design with Infrared (for Developers)
####
#########################################################

###############################################
## Start with simple sampling of RNA sequences

# -- _the_ main module of Infrared
import infrared as ir

In [None]:
# -- specify a constraint network (CN) and its tree decomposition

# our first CN is _very_ simple: 20 variables (=sequence positions), no dependencies;
# domains=4 gives each variable 4 possible values (one per nucleotide)
cn = ir.ConstraintNetwork(varnum=20, domains=4)
# produce the cluster tree for the CN, using the default tree decomposition
ct = ir.ClusterTree(cn)

In [None]:
def show_td_info(ct,width):
    """Print key figures of a cluster tree's tree decomposition and render it.

    Prints tree width, bags and edges of the tree decomposition obtained
    from ``ct.get_td()``, writes the decomposition to "tmp_out.dot" in the
    current working directory, converts that file with
    ``treedecomp.dotfile_to_png`` and returns the image "tmp_out.png".

    Args:
        ct: cluster tree; must provide ``get_td()``.
        width: display width, passed on to ``IPython.display.Image``.

    Returns:
        ``IPython.display.Image`` referring to "tmp_out.png".
    """
    td = ct.get_td()
    print("tree width =", td.treewidth())
    print("bags =", td.get_bags())
    print("edges =", td.get_edges())

    tmpfile="tmp_out"
    dotfile = tmpfile+".dot"
    # Close (and thereby flush) the dot file before the converter reads it;
    # an unclosed handle can leave the file empty or truncated on disk.
    with open(dotfile,"w") as dot:
        td.writeTD(dot)
    import treedecomp
    # presumably writes the png next to the dot file (tmp_out.png) -- it is read from there below
    treedecomp.dotfile_to_png(dotfile)
    from IPython.display import Image
    return Image(filename=(tmpfile+".png"),width=width)
    
# show the tree decomposition of the first CN (image displayed with width 100)
show_td_info(ct,100)

In [None]:
# -- evaluate the cluster tree
#   -- in this simple case, this will count the sequences
#      (there are no constraints yet, so every assignment is counted)
count = ct.evaluate()
print("# =",int(count))
# btw, of course, 'count' is the number of all possible seqs 4**20

In [None]:
# -- draw 10 samples from the cluster tree, keeping the values of each
samples = [ct.sample().values() for _ in range(10)]

In [None]:
# -- inspect the raw samples (the values as returned by .values())
samples

In [None]:
# -- and show them (pretty): translate every sample into a nucleotide sequence
list(map(ir.values_to_sequence, samples))

In [None]:
##################################################
## add constraints from an RNA secondary structure

import rna_support as rna
# target structure in dot-bracket notation: matching brackets are base pairs, dots are unpaired
structure = "((((...))))(((...)))"
# extract the base pairs; they are used below as pairs (i,j) of sequence positions
bps = rna.parseRNAStructureBps(structure)

In [None]:
# -- show the base pairs parsed from the structure
bps

In [None]:
# -- nucleotide pairs accepted as complementary (looked up by ComplConstraint on every call)
complementary_nucleotides = ["AU","CG","GC","GU","UA","UG"]

# -- define complementarity constraints
class ComplConstraint(ir.Constraint):
    """Constraint on two positions i and j: their nucleotides must be complementary.

    The constraint depends on the variables [i,j] and is satisfied iff the
    two-letter sequence formed by the nucleotides at i and j is listed in
    the module-level ``complementary_nucleotides``.
    """

    def __init__(self,i,j):
        super().__init__([i,j])
        self.i, self.j = i, j

    def __call__(self,a): # a=assignment / sequence of nucleotides
        """Return True iff the assignment `a` has a complementary pair at (i,j)."""
        values = a.values()
        pair = ir.values_to_sequence([values[self.i], values[self.j]])
        return pair in complementary_nucleotides
    
## btw, there is already a pre-defined constraint rna.ComplConstraint,
## which we could have used as well

# one complementarity constraint per base pair of the target structure
cons = [ ComplConstraint(i,j) for (i,j) in bps ]
# the variables of each constraint, as reported by the constraint itself
deps = [ x.vars() for x in cons ]
# display both lists
cons, deps

In [None]:
# -- reinitialize constraint network, rebuild tree decomp and cluster tree
# (optional experiment: uncomment the next line to accept only the pair "AU";
#  ComplConstraint looks the list up on every call, so rebinding the name takes effect)
#complementary_nucleotides = ["AU"]

seqlen = len(structure) # --> number of positions / variables in the CN
cn = ir.ConstraintNetwork(varnum=seqlen, domains=4, constraints = cons)
ct = ir.ClusterTree(cn)

# show the tree decomposition of the constrained CN
show_td_info(ct,100)

In [None]:
# generate samples (with complementarity constraints)
def spit_them_samples_out(ct,num):
    """Draw `num` samples from the cluster tree `ct` and return them as sequences.

    All samples are drawn first (``ct.sample().values()``); afterwards each
    one is translated with ``ir.values_to_sequence``.

    Returns:
        list of the `num` translated samples, in the order they were drawn.
    """
    raw_samples = []
    for _ in range(num):
        raw_samples.append(ct.sample().values())
    return list(map(ir.values_to_sequence, raw_samples))

# count the sequences that satisfy all complementarity constraints
count = ct.evaluate()
print("# =",int(count))
# btw, count == 6**7 * 4**6, by simple combinatorics
# (7 base pairs with 6 allowed nucleotide pairs each; 6 unpaired positions with 4 nucleotides each)

# print the structure with a two-space indent above the sampled sequences
# (presumably to line it up with the quoted sequences in the list output)
print("  "+structure)
spit_them_samples_out(ct,10)

In [None]:
##########################
## Control the GC content

# -- define function for GC Control
class GCControl(ir.Function):
    """Function on a single position i: `weight` for G or C, 1.0 for anything else."""

    def __init__(self,i,weight):
        super().__init__([i])
        self.i, self.weight = i, weight

    def __call__(self,a):
        """Return the weight contributed by position i of the assignment `a`."""
        nucleotide = ir.value_to_nucleotide(a.values()[self.i])
        return self.weight if nucleotide in "GC" else 1.0

## btw, there is a predefined rna.GCControl

In [None]:
## -- setup functions
# value returned by GCControl for G and C (every other nucleotide gets 1.0),
# so a weight of 1 treats all nucleotides alike
gc_weight = 1 ## <- try different weights: 0.1, 10, ...

# one GC-control function per sequence position
gc_funs = [ GCControl(i,gc_weight) for i in range(seqlen) ]

# -- reinitialize constraint network, rebuild tree decomp and cluster tree
cn = ir.ConstraintNetwork(varnum=seqlen, domains=4,
                          constraints=cons,
                          functions=gc_funs)
ct = ir.ClusterTree(cn)

spit_them_samples_out(ct, 10)

In [None]:
##########################
## Control the BP energy

# install the base-pair energy table ir.params_bp
ir.set_bpenergy_table(ir.params_bp) # set bp energies to magic numbers

bpe_weight = 1 ## <- try different weights: 0.1, 10, ...
# one base-pair energy function per base pair of the target structure
# NOTE(review): the meaning of the third argument (False) is not visible here -- check rna.BPEnergy
bpe_funs = [ rna.BPEnergy(i,j,False,bpe_weight) for (i,j) in bps ] 

# rebuild network and cluster tree with both kinds of functions (bp energy and GC control)
cn = ir.ConstraintNetwork(varnum=seqlen, domains=4,
                          constraints=cons,
                          functions=bpe_funs + gc_funs)
ct = ir.ClusterTree(cn)


# structure (indented by two spaces) followed by 10 sampled sequences
print("  "+structure)
spit_them_samples_out(ct, 10)

In [None]:
### as example of additional hard constraints: avoid GG dinucleotides
class AvoidGGConstraint(ir.Constraint):
    """Constraint on the adjacent positions i and i+1: they must not spell "GG"."""

    def __init__(self,i):
        super().__init__([i,i+1])
        self.i = i

    def __call__(self,a): # a=assignment / sequence of nucleotides
        """Return True iff the dinucleotide at (i,i+1) of the assignment `a` is not "GG"."""
        values = a.values()
        dinucleotide = ir.values_to_sequence([values[self.i], values[self.i+1]])
        return dinucleotide != "GG"

# one constraint per pair of adjacent positions (i,i+1), i = 0 .. seqlen-2
gg_cons = [ AvoidGGConstraint(i) for i in range(seqlen-1) ]

In [None]:
# rebuild network and cluster tree; the GG-avoidance constraints are added to the
# complementarity constraints, the functions (bp energy and GC control) stay the same
cn = ir.ConstraintNetwork(varnum=seqlen, domains=4,
                          constraints = cons + gg_cons,
                          functions = bpe_funs + gc_funs)
ct = ir.ClusterTree(cn)

# structure (indented by two spaces) followed by 10 sampled sequences
print("  "+structure)
spit_them_samples_out(ct, 10)

In [None]:
#### now, automate the targeting of feature values
##

# -- we start by defining the features

class GCFeature(ir.Feature):
    """Feature named "GC": the GC content of a sample, scaled by 100.

    (A percentage, assuming rna.GC_content returns a fraction -- TODO confirm.)
    """

    def __init__(self, weight, target, tolerance):
        super().__init__( "GC", weight, target, tolerance)

    def eval(self, sample):
        """Return rna.GC_content(sample) multiplied by 100."""
        gc_content = rna.GC_content(sample)
        return gc_content * 100

class EnergyFeature(ir.Feature):
    """Feature named "E": the energy of a sample for a fixed structure."""

    def __init__(self, structure, weight, target, tolerance):
        super().__init__( "E", weight, target, tolerance )
        self.structure = structure

    def eval(self, sample):
        """Return RNA.energy_of_struct of `sample` for the stored structure."""
        # imported here, so the RNA module is required only when a sample is evaluated
        from RNA import energy_of_struct
        return energy_of_struct(sample, self.structure)

In [None]:
# -- next, instantiate the sampler

class MySampler(ir.MultiDimensionalBoltzmannSampler):
    """Sampler for the tutorial's target structure, driven by the features "E" and "GC".

    Relies on the notebook-level names `bps`, `seqlen` and `cons`.
    """

    def __init__( self, features ):
        super().__init__(features)

    def gen_constraint_network(self, features):
        """Build the constraint network from the weights of features "E" and "GC".

        Combines the complementarity constraints `cons` with one base-pair
        energy function per base pair and one GC-control function per
        position (presumably called by the base sampler -- TODO confirm).
        """
        energy_weight = features["E"].weight
        functions = [ rna.BPEnergy(i,j,False,energy_weight) for (i,j) in bps ]
        gc_content_weight = features["GC"].weight
        functions.extend( GCControl(i,gc_content_weight) for i in range(seqlen) )
        return ir.ConstraintNetwork(varnum=seqlen, domains=4,
                                    constraints = cons,
                                    functions = functions)

    def sample(self):
        """Draw one sample from the base sampler and return it as a nucleotide sequence."""
        assignment = super().sample()
        return ir.values_to_sequence(assignment.values())
                  
# -- produce the feature objects 
# !!! here we define the targeted feature values and tolerances !!!
# argument order: weight, target, tolerance (EnergyFeature takes the structure first)
features = { "E": EnergyFeature(structure,1,-2,1),  # weight 1, target -2, tolerance 1
             "GC": GCFeature(1,70,15) }             # weight 1, target 70, tolerance 15

# -- from this, construct the sampler
sampler = MySampler(features)

In [None]:
# -- and sample away
import itertools
import RNA # for energy evaluation

# print the first 10 targeted samples as: sequence, energy for the target structure, GC content * 100
# (islice ends the loop after 10 samples without a manual counter, so the
#  variable 'count' set by earlier cells keeps its value)
for seq in itertools.islice(sampler.targeted_samples(), 10):
    print("{} {:.2f} {:.2f}".format(seq, RNA.energy_of_struct(seq,structure), 100*rna.GC_content(seq)))