In [None]:
#########################################################
####
#### Tutorial: RNA Design with Infrared (for Developers)
####
#########################################################

###############################################
## Start with simple sampling of RNA sequences

# -- _the_ main module of Infrared
import infrared as ir

In [None]:
# -- specify a constraint network and its tree decomposition

# our first CN is _very_ simple: 20 variables (=sequence positions),
# 4 possible values per variable, and no dependencies between them
model = ir.Model()
model.add_variables(20,4)
cn = ir.ConstraintNetwork(model)
# produce cluster tree for the CN, using default tree decomposition
ct = ir.ClusterTree(cn)

In [None]:
def show_td_info(ct,width):
    """Print a summary of a cluster tree's tree decomposition and render it.

    Prints tree width, bags and edges of the tree decomposition returned by
    ``ct.get_td()``, writes the decomposition to ``tmp_out.dot`` in the
    current working directory, converts that file with
    ``treedecomp.dotfile_to_png`` and loads ``tmp_out.png`` for display.

    Args:
        ct: cluster tree object providing ``get_td()``.
        width: display width handed to ``IPython.display.Image``.

    Returns:
        An ``IPython.display.Image`` of the rendered tree decomposition.
    """
    td = ct.get_td()
    print("tree width =", td.treewidth())
    print("bags =", td.get_bags())
    print("edges =", td.get_edges())

    tmpfile="tmp_out"
    # Close (and thereby flush) the dot file before it is converted;
    # an unclosed handle may leave the file incomplete on disk.
    with open(tmpfile+".dot","w") as dot_file:
        td.writeTD(dot_file)
    import treedecomp
    treedecomp.dotfile_to_png(tmpfile+".dot")
    from IPython.display import Image
    return Image(filename=(tmpfile+".png"),width=width)

show_td_info(ct,500)

In [None]:
# -- evaluate the cluster tree
#   -- in this simple case (no constraints, no functions), the evaluation
#      counts the sequences
count = ct.evaluate()
print(f"# = {int(count)}")
# btw, of course, 'count' is the number of all possible seqs 4**20

In [None]:
# -- generate 10 samples; keep the values of each sampled assignment
samples = [ ct.sample().values() for _ in range(10) ]

In [None]:
# display the raw samples (the values of each sampled assignment)
samples

In [None]:
# -- and show them (pretty)
from infrared import rna
# translate the values of every sample into a sequence string
list(map(rna.values_to_sequence, samples))

In [None]:
##################################################
## add constraints from an RNA secondary structure
# target structure as a dot-bracket string; it has 20 positions, one per
# variable of the model defined above
structure = "((((...))))(((...)))"
# parse the structure into its base pairs; they are consumed below as
# (i,j) index pairs
bps = rna.parseRNAStructureBps(structure)

In [None]:
# display the base pairs parsed from the structure
bps

In [None]:
# two-letter strings of the nucleotide combinations accepted as base pairs
complementary_nucleotides = ["AU","CG","GC","GU","UA","UG"]

# -- define complementarity constraints:
#    the constraint depends on the variables i and j and holds iff the
#    sequence encoded by their two values is one of the strings listed above
ir.def_constraint_class(
    'ComplConstraint',
    lambda i, j: [i, j],
    lambda x, y: rna.values_to_sequence([x, y]) in complementary_nucleotides,
)

## btw, there is already a pre-defined constraint rna.ComplConstraint,
## which we could have used as well

# one constraint per base pair, and the variables each constraint depends on
cons = [ComplConstraint(i, j) for i, j in bps]
deps = [constraint.vars() for constraint in cons]

cons, deps

In [None]:
# -- reinitialize constraint network, rebuild tree decomp and cluster tree
# (optional experiment: uncomment the next line to shrink the list of
#  accepted nucleotide pairs before the model is rebuilt)
#complementary_nucleotides = ["AU"]

seqlen = len(structure) # --> number of positions / variables in the CN
# start from a fresh model: same variables as before, now with the
# complementarity constraints of the target structure
model = ir.Model()
model.add_variables(seqlen,4)
model.add_constraints(cons)
cn = ir.ConstraintNetwork(model)
ct = ir.ClusterTree(cn)

# print and render the tree decomposition of the constrained model
show_td_info(ct,500)

In [None]:
# generate samples (with complementarity constraints)
def spit_them_samples_out(ct,num):
    """Draw ``num`` samples from cluster tree ``ct`` and return them as sequence strings."""
    drawn = [ct.sample().values() for _ in range(num)]
    return list(map(rna.values_to_sequence, drawn))

count = ct.evaluate()
print(f"# = {int(count)}")
# btw, count == 6**7 * 4**6, by simple combinatorics:
# 7 base pairs with 6 admissible nucleotide pairs each, 6 unpaired positions

print(f"  {structure}")
spit_them_samples_out(ct,10)

In [None]:
##########################
## Control the GC content

# -- define function for GC Control:
#    the function depends on the single variable i and evaluates to True
#    if the nucleotide encoded by its value is G or C, to False otherwise
ir.def_function_class(
    'GCControl',
    lambda i: [i],
    lambda x: rna.value_to_nucleotide( x ) in "GC"
)

## btw, there is predefined rna.GCControl

In [None]:
## -- setup functions
gc_weight = 1 ## <- try different weights: 0.1, 10, ...

# one GC-indicator function per sequence position, collected in group 'gc'
gc_funs = [ GCControl( i )
              for i in range( seqlen ) ]

model.add_functions(gc_funs, 'gc')

# apply the chosen weight to the 'gc' feature; without this call, changing
# gc_weight above would have no effect on the sampled sequences
model.set_feature_weight(gc_weight, 'gc')

# -- reinitialize constraint network, rebuild tree decomp and cluster tree
cn = ir.ConstraintNetwork( model )
ct = ir.ClusterTree( cn )

spit_them_samples_out( ct, 10 )

In [None]:
##########################
## Control the BP energy

rna.set_bpenergy_table() # set bp energies to magic numbers

# one base pair energy function per base pair of the target structure;
# the third argument is presumably the is_terminal flag -- TODO confirm
bpe_funs = [ rna.BPEnergy( i, j, False ) for (i,j) in bps ] 

model.add_functions(bpe_funs, 'bpenergy')

# weight of the 'bpenergy' feature; try other values than 0 here
model.set_feature_weight(0, 'bpenergy')

# rebuild constraint network and cluster tree for the extended model
cn = ir.ConstraintNetwork(model)

ct = ir.ClusterTree(cn)


# print the structure, indented by two spaces, then draw 10 samples
print("  "+structure)
spit_them_samples_out(ct, 10)

In [None]:
### as example of additional hard constraints: avoid GG dinucleotides

# the constraint on position i depends on the variables i and i+1 and holds
# unless their two values encode the sequence "GG"
ir.def_constraint_class(
    'AvoidGGConstraint',
    lambda i: [i, i+1],
    lambda x, y: rna.values_to_sequence([x, y]) != "GG",
)

# one constraint per pair of adjacent positions
gg_cons = [AvoidGGConstraint(pos) for pos in range(seqlen - 1)]

In [None]:
# add the GG-avoiding constraints to the current model, then rebuild the
# constraint network and the cluster tree
model.add_constraints(gg_cons)

cn = ir.ConstraintNetwork(model)
ct = ir.ClusterTree(cn)

print(f"  {structure}")
spit_them_samples_out(ct, 10)

In [None]:
# -- from this, construct the sampler

# build a fresh model from the pieces defined above: complementarity
# constraints, base pair energy functions and GC functions
# (the GG-avoiding constraints are not added here)
model =  ir.Model()
model.add_variables(seqlen, 4)
model.add_constraints(cons)
model.add_functions(bpe_funs, 'bpenergy')
model.add_functions(gc_funs, 'gc')

sampler = ir.MultiDimensionalBoltzmannSampler(model)

# targets per feature; arguments are presumably
# (target value, tolerance, feature name) -- TODO confirm
sampler.set_target( -12, 1, 'bpenergy' )
sampler.set_target( 10, 2, 'gc' )

def s2s(sample):
    """Return the sequence string encoded by the values of ``sample``."""
    values = sample.values()
    return rna.values_to_sequence(values)


# -- and sample away
# each line: sequence, 'bpenergy' feature value, 'gc' feature value

for i in range(10):
    sample = sampler.targeted_sample()
    print(f"{s2s(sample)} "
          f"{model.eval_feature(sample,'bpenergy'):.2f} "
          f"{model.eval_feature(sample,'gc'):.2f}")

In [None]:
## similar but with control of Turner energy
import RNA

model.add_feature( 'Energy', # feature name
                   'bpenergy', # controlled group(s)
                   #
                   # function to evaluate the feature for a sample;
                   # NOTE how the current value of structure is bound as a
                   # default argument, so that later reassignments of the
                   # global name do not change this feature
                   lambda sample, structure=structure:
                      RNA.energy_of_struct( rna.values_to_sequence( sample.values() ),
                                            structure )
                 )

# new sampler for the model including the 'Energy' feature
sampler = ir.MultiDimensionalBoltzmannSampler(model)

# only the 'Energy' feature is targeted here; the other targets are left
# commented out as alternatives to try
#sampler.set_target( -11, 1, 'bpenergy' )
sampler.set_target( -5, 1, 'Energy' )
#sampler.set_target( 10, 2, 'gc' )

# -- and sample away
# each line: sequence, then the 'bpenergy', 'Energy' and 'gc' feature values

for i in range(10):
    sample = sampler.targeted_sample()
    print(f"{s2s(sample)} "
          f"{model.eval_feature(sample,'bpenergy'):5.2f} "
          f"{model.eval_feature(sample,'Energy'):5.2f} "
          f"{model.eval_feature(sample,'gc'):5.2f}")

# Multiple target design a la RNARedPrint in new syntax

In [None]:
import infrared as ir
from infrared import rna
import RNA

##################################################
## our target RNA secondary structures (all of the same length)
#    012345678901234567890
structures = [
    "((((...)))).(((...)))",
    "((((((......)))...)))",
    #"......(((...)))......",
]

# number of sequence positions, taken from the first structure
seqlen = len(structures[0])

######
# construct the constraint model
model = ir.Model()

# one variable X_i per position i;
# the value of X_i encodes the nucleotide at position i
model.add_variables( seqlen, 4 )

# per target structure (index i): complementarity constraints, a group
# of base pair energy functions, and a feature for the Turner energy
for i, structure in enumerate(structures):
    bps = rna.parseRNAStructureBps(structure)

    # p, q are the positions of a base pair; distinct names keep them
    # apart from the structure index i
    model.add_constraints(
        rna.ComplConstraint( i = p, j = q ) for ( p, q ) in bps
    )

    model.add_functions(
        [ rna.BPEnergy( i = p, j = q, is_terminal = False ) for ( p, q ) in bps ],
        group = f'bpenergy{i}'
    )

    model.add_feature(
        f'E{i}',         # feature name
        f'bpenergy{i}',  # controlled group(s)
        # function to evaluate the feature for a sample;
        # NOTE how we have to bind i (as default argument), since the
        # loop variable changes in later iterations
        lambda sample, i=i: RNA.energy_of_struct(
            rna.values_to_sequence( sample.values() ),
            structures[i]
        )
    )

# one GC function per sequence position, collected in group 'gc'
model.add_functions(
    [ rna.GCControl( i = pos ) for pos in range(seqlen) ],
    group = 'gc'
)

# the model generates automatic features 'bpenergyI', 'gc' from the function groups;
# as well as total feature combining all function groups;
# however, we want to directly control Turner energy (instead of base pair energy).
# For this purpose, the loop above adds the additional features 'EI'


def print_sample(sample):
    """Print one line for ``sample``.

    The line holds the sequence, the 'gc' feature value scaled by
    100/seqlen, and the value of feature 'E<k>' for every index k of
    ``structures``.
    """
    seq = rna.values_to_sequence( sample.values() )
    gc_percent = model.eval_feature(sample, 'gc') * 100 / seqlen

    print(f"{seq} GC={gc_percent:.2f}", end="")
    for k in range(len(structures)):
        energy = model.eval_feature(sample, f'E{k}')
        print(f" E{k}={energy:.2f}", end="")
    # terminate the line
    print()
    


print("###########################################")    
## Sampling at specific weights

######
# set feature weights
#
# NOTE(review): the weights are set *before* the sampler is created and
# queried. Previously they were set only after sampler.treewidth() and
# sampler.plot_td() had been called; if the sampler derives its internal
# constraint network from the model at that point, later weight changes
# would presumably not reach it -- TODO confirm against the Infrared docs.
model.set_feature_weight( -5, 'E0' )
model.set_feature_weight( -2, 'gc' )

sampler = ir.BoltzmannSampler( model )

print( "Tree width:", sampler.treewidth() )

# write a plot of the tree decomposition to the given file
sampler.plot_td("treedecomp.pdf")

######
# and draw samples
for i in range(10):
    sample = sampler.sample()
    print_sample(sample)
    
    
print("###########################################")    
## MDBS (multi-dimensional Boltzmann sampling)

# reset the weights of the controlled features to 0 before the
# multi-dimensional sampler is created
model.set_feature_weight( 0, 'E0' )
model.set_feature_weight( 0, 'E1' )
#model.set_feature_weight( 0, 'E2' )
model.set_feature_weight( 0, 'gc' )

######
# create sampler
sampler = ir.MultiDimensionalBoltzmannSampler( model )

######
# set targets

# control number of gc's; we target 85% +/- 2% GC-content
# (both values are given as fractions of seqlen)
sampler.set_target( 0.85 * seqlen, 0.02 * seqlen, 'gc' )

# control Turner energy of the first structure, target -2 +/- 0.2 kcal/mol
sampler.set_target( -2, 0.2, 'E0' )

# control Turner energy of the second structure, target -3 +/- 0.2 kcal/mol
sampler.set_target( -3, 0.2, 'E1' )

# control Turner energy of a third structure, target -1.5 +/- 0.2 kcal/mol
# (disabled, like the third structure itself)
#sampler.set_target( -1.5, 0.2, 'E2' )

######
# and draw samples
for i in range(10):
    sample = sampler.targeted_sample()
    print_sample(sample)