# Introduction

In [None]:
from infrared import *
from infrared.rna import *

In [None]:
# Target given as a dot-bracket string; its length determines the number of model variables.
target = "((((((((((...))))((((....))))))))))"
# One variable per position, each with a domain of size 4
# (presumably the four nucleotides -- confirm against infrared.rna).
model = Model(len(target), 4)
# Add one BPComp constraint for every index pair (i,j) that parse() yields for the target.
model.add_constraints(BPComp(i,j) for (i,j) in parse(target))
sampler = Sampler(model)
# Draw 10 samples from the model.
samples = [sampler.sample() for _ in range(10)]

In [None]:
sequences = [ass_to_seq(x) for x in samples]
sequences

We are going to visualize the nucleotide frequencies of the sampled sequences
if the module ```logomaker``` is available
(it can be installed, e.g., with ```conda install logomaker```).

In [None]:
def draw_logo(samples,name=None):
    """Draw a sequence logo of the given samples and return their sequences.

    Args:
        samples: iterable of sampled assignments; each one is converted to a
            sequence string with ``ass_to_seq``.
        name: optional file name; if not None, the logo figure is saved
            there via ``plt.savefig`` before it is shown.

    Returns:
        The list of sequence strings the logo was built from.

    Raises:
        ModuleNotFoundError: if ``logomaker`` or ``matplotlib`` is not
            installed (both are imported lazily inside this function).
    """
    import logomaker as lm
    import matplotlib.pyplot as plt

    sequences = [ass_to_seq(x) for x in samples]

    matrix = lm.alignment_to_matrix(sequences = sequences)
    logo = lm.Logo(matrix)
    logo.style_xticks(rotation=90, fmt='%d', anchor=0)
    logo.ax.xaxis.set_ticks_position('none')
    # All axis styling has to be applied before the figure is saved or shown;
    # setting the tick padding afterwards would not affect either output.
    logo.ax.xaxis.set_tick_params(pad=-1)
    if name is not None:
        plt.savefig(name)
    plt.show()
    return sequences

def opt_draw_logo(samples,name=None):
    """Draw a sequence logo if the plotting modules are available.

    Calls ``draw_logo(samples, name)`` and silently skips the plot when that
    raises ``ModuleNotFoundError``. In either case the samples are converted
    with ``ass_to_seq`` and returned as a list of sequences.
    """
    try:
        draw_logo(samples, name)
    except ModuleNotFoundError:
        # Plotting is optional; the sequences are returned regardless.
        pass
    return list(map(ass_to_seq, samples))


opt_draw_logo(samples)

### Multiple targets

In [None]:
#           01234567890123456789012345678901234
targets = ["((((((((((...))))((((....))))))))))",
           "((((((.((((((((....))))..))))))))))",
           ".((((((...)))))).(((((((....)))))))"]

In [None]:
# Add the BPComp constraints of every target to the existing model.
# NOTE(review): `model` already holds the constraints of the first target
# (same dot-bracket string as targets[0]), so those are added a second time;
# also, the loop rebinds the global `target` to the last entry of `targets`.
for target in targets:
    model.add_constraints(BPComp(i,j) for (i,j) in parse(target))

In [None]:
# Build a sampler for the multi-target model and draw 10 designs from it.
sampler = Sampler(model)
designs = [sampler.sample() for _ in range(10)]

# Work on a shallow copy of the designs for plotting.
samples = list(designs)
opt_draw_logo(samples)

# Methods

## Elementary use of Infrared

In [None]:
n = 35

In [None]:
model = Model(n,4)

In [None]:
target = "((((((((((...))))((((....))))))))))"
model.add_constraints(BPComp(i,j) for (i,j) in parse(target))

In [None]:
sampler = Sampler(model)
samples = [sampler.sample() for _ in range(10)]

opt_draw_logo(samples)

## Sequence constraints in IUPAC code

In [None]:
iupac_sequence = "SNNNNNNNNNRYYNNNNNNNNGNRANNNNNNNNNS"

In [None]:
for i, x in enumerate(iupac_sequence):
    model.add_constraints(ValueIn(i, iupacvalues(x)))

In [None]:
sampler = Sampler(model)
samples = [sampler.sample() for _ in range(20)]

opt_draw_logo(samples)

## Control of GC content

In [None]:
# add functions for GC control
model.add_functions([GCCont(i) for i in range(n)], 'gc')

In [None]:
# set a weight and sample
model.set_feature_weight(0.15, 'gc')

sampler = Sampler(model)
samples = [sampler.sample() for _ in range(1000)]
opt_draw_logo(samples)

In [None]:
## Code to produce the figures in the paper
import matplotlib.pyplot as plt

# Set to True to write the logo and histogram SVG files to the working directory.
WRITEFIGS = False
for name,weight in [('minus', -1), ('zero', 0), ('plus', 1)]:

    # Re-weight the 'gc' feature and draw 1000 fresh samples for this weight.
    model.set_feature_weight(weight, 'gc')
    sampler = Sampler(model)
    samples = [sampler.sample() for _ in range(1000)]

    # Passing a file name makes draw_logo save the figure, so only pass one
    # when figure output was requested.
    logo_file = f"gc_content_{name}-logo.svg" if WRITEFIGS else None
    sequences = opt_draw_logo(samples, logo_file)

    # GC content of each sequence in percent.
    gc_contents = [100*sum(x in "GC" for x in sequence)/len(sequence) for sequence in sequences]
    # Use a fresh figure per weight so that the histograms never pile up on
    # the same axes (this matters when no logo figure was created in between).
    plt.figure()
    h = plt.hist(gc_contents,bins=10,range=(0,100))
    if WRITEFIGS:
        plt.savefig(f"gc_content_{name}-hist.svg")
    plt.show()

Set a target of 75% GC content and then draw targeted samples

In [None]:
sampler = Sampler(model)

# Target value 0.75*n for the 'gc' feature; the second argument (0.01*n) is
# presumably the accepted tolerance around it -- confirm against Sampler.set_target.
sampler.set_target( 0.75 * n, 0.01 * n, 'gc' )

# Draw 1000 targeted samples and convert them to sequences (plotting is optional).
samples = [sampler.targeted_sample() for _ in range(1000)]
sequences = opt_draw_logo(samples)

# GC content per sequence in percent, then the mean over all samples.
gc_contents = [100*sum(x in "GC" for x in sequence)/len(sequence) for sequence in sequences]
gc_content = sum(gc_contents) / len(gc_contents)
print(f"GC content in samples: {gc_content:0.2f}%")

# Controlling energy - Multiple features

In [None]:
# recall current model
model = Model(n,4) 
bps = parse(target)
model.add_constraints(BPComp(i,j) for (i,j) in bps)
model.add_functions([GCCont(i) for i in range(n)], 'gc')

In [None]:
# add (base pair) energy control
model.add_functions([BPEnergy(i, j, (i-1, j+1) not in bps)
                     for (i,j) in bps], 'energy')

In [None]:
# target specific GC and low energy 
model.set_feature_weight(-2, 'energy')
sampler = Sampler(model)
sampler.set_target(0.75*n, 0.01*n, 'gc')
samples = [sampler.targeted_sample() for _ in range(10)]

In [None]:
opt_draw_logo(samples)

In [None]:
# add stacking energy control 
# - this could be used in place of defining base pair energy
#   in the code above
model.add_functions([StackEnergy(i, j)
    for (i,j) in bps if (i+1,j-1) in bps], 'energy')

## Targeting Turner energy

NOTE: Here we make use of the Vienna RNA library.
The code in this section will not work
if the library is not installed.

In [None]:
import RNA

In [None]:
# Restate current model
model = Model(n,4) 
bps = parse(target)
model.add_constraints(BPComp(i,j) for (i,j) in bps)
model.add_functions([GCCont(i) for i in range(n)], 'gc')
model.add_functions([BPEnergy(i, j, (i-1, j+1) not in bps)
                     for (i,j) in bps], 'energy')

In [None]:
# add the Turner energy feature
model.add_feature('Energy', 'energy',
    lambda sample, target=target:
        RNA.energy_of_struct(ass_to_seq(sample), target))

In [None]:
# specify targets and draw targeted samples
sampler = Sampler(model)
sampler.set_target(0.75*n, 0.05*n, 'gc')
sampler.set_target(-10, 0.5, 'Energy')
samples = [sampler.targeted_sample() for _ in range(10)]

sequences = opt_draw_logo(samples)

[(seq,RNA.energy_of_struct(seq,target)) for seq in sequences]

# Multiple target structures

In [None]:
# construct model
model = Model(n,4) 
model.add_functions([GCCont(i) for i in range(n)], 'gc')

for k, target in enumerate(targets):
    bps = parse(target)
    model.add_constraints(BPComp(i,j) for (i,j) in bps)
    model.add_functions([BPEnergy(i, j, (i-1, j+1) not in bps)
                         for (i,j) in bps], f'energy{k}')

### Target specific GC content and high affinity to all targets

In [None]:
# set weights for energy targets
for k,_ in enumerate(targets):
    model.set_feature_weight(-2, f'energy{k}')

# create sampler and set target
sampler = Sampler(model)
sampler.set_target(0.75*n, 0.05*n, 'gc')
samples = [sampler.targeted_sample() for _ in range(5)]

sequences = opt_draw_logo(samples)

# annotate sequences with energies (annotate with Turner energies only if RNA module is available)
try:
    import RNA
    sequences = ["".join([seq]+[f" {RNA.energy_of_struct(seq,target):5.1f}" for target in targets]) for seq in sequences]
except ModuleNotFoundError:
    pass

sequences

### Target specific GC content and specific Turner energies for all targets

Note: this will again require the Vienna RNA library

In [None]:
# add Turner energy features for all target structures
# Each feature 'Energy{k}' is attached to the function group 'energy{k}'.
# The default argument target=target binds the current loop value to each
# lambda, so every feature evaluates its own target.
for k, target in enumerate(targets):
    model.add_feature(f'Energy{k}', f'energy{k}',
        lambda sample, target=target:
            RNA.energy_of_struct(ass_to_seq(sample), target))

sampler = Sampler(model)
sampler.set_target(0.75*n, 0.01*n, 'gc')

# One energy target per structure, in the order of `targets`.
sampler.set_target( -15, 1, 'Energy0')
sampler.set_target( -20, 1, 'Energy1')
sampler.set_target( -20, 1, 'Energy2')

samples = [sampler.targeted_sample() for _ in range(5)]

sequences = opt_draw_logo(samples)

# annotate sequences with energies
["".join([seq]+[f" {RNA.energy_of_struct(seq,target):5.1f}" for target in targets]) for seq in sequences]

## Plot dependencies and tree decomposition

In [None]:
from IPython.display import Image

In [None]:
# Plot dependency graph
# Import re explicitly instead of relying on a star import to provide it.
import re

filename = 'dependency_graph.dot'
model.write_graph(filename, True)

# Render the dot file (presumably to files next to it with the matching
# extension -- confirm against the infrared helpers).
dotfile_to_png(filename)
dotfile_to_pdf(filename)

# Derive the PNG file name by replacing the trailing "dot" with "png".
filename = re.sub(r"dot$","png",filename)

Image(filename=filename,width=600)

In [None]:
# Plot tree decomposition
sampler = Sampler(model)
print(f"Tree width: {sampler.treewidth()}")
filename="treedecomp"
sampler.plot_td(filename,'png')
sampler.plot_td(filename,'pdf')
sampler.plot_td(filename+".dot",'dot')
Image(filename=filename+".png",width=300)

# Negative design by sampling

In [None]:
target = targets[0]
n = len(target)

In [None]:
def is_mfe_design(sequence, target):
    """Return True if the target's energy equals the MFE energy of sequence.

    Both energies are obtained from the same ``RNA.fold_compound`` and
    compared for exact equality.
    """
    compound = RNA.fold_compound(sequence)
    target_energy = compound.eval_structure(target)
    _mfe_structure, mfe_energy = compound.mfe()
    return target_energy == mfe_energy

In [None]:
def single_target_design_model(target):
    """Build the design model for a single target.

    The model has one variable (domain size 4) per position of ``target``,
    a BPComp constraint per parsed base pair, a GCCont function per position
    in group 'gc', and a BPEnergy function per base pair in group 'energy'.
    The 'energy' feature weight is set to -1.5.

    Returns:
        The configured ``Model``.
    """
    seq_len = len(target)
    pairs = parse(target)

    design = Model(seq_len, 4)
    design.add_constraints(BPComp(i, j) for (i, j) in pairs)

    gc_functions = [GCCont(pos) for pos in range(seq_len)]
    design.add_functions(gc_functions, 'gc')

    # The third BPEnergy argument is True when the pair (i-1, j+1) enclosing
    # (i, j) is not itself a base pair of the target.
    energy_functions = [BPEnergy(i, j, (i-1, j+1) not in pairs)
                        for (i, j) in pairs]
    design.add_functions(energy_functions, 'energy')

    design.set_feature_weight(-1.5, 'energy')
    return design

In [None]:
# solve by direct sampling
sampler = Sampler(single_target_design_model(target))
sampler.set_target(0.7 * n, 0.1 * n, 'gc')
# Draw 50 targeted samples and print the index and sequence of each one
# that is_mfe_design accepts for the target.
for i in range(50):
    seq = ass_to_seq(sampler.targeted_sample())
    if is_mfe_design(seq,target):
        print(f"{i} {seq}")

In [None]:
def target_frequency(sequence, target):
    """Return ``pr_structure(target)`` for the fold compound of sequence.

    ``pf()`` is called first for its effect on the fold compound; its return
    value is not used.
    """
    compound = RNA.fold_compound(sequence)
    compound.pf()
    frequency = compound.pr_structure(target)
    return frequency

In [None]:
sampler = Sampler(single_target_design_model(target))
sampler.set_target(0.7 * n, 0.1 * n, 'gc')
# Track the highest target frequency seen so far over 100 targeted samples;
# print a line each time a new best is found.
best = 0
for i in range(100):
    seq = ass_to_seq(sampler.targeted_sample())
    freq = target_frequency(seq,target)
    if freq > best:
        best = freq
        print(f"{i} {seq} {freq:.6f}")

## Disruptive base pairs - RNAPOND-like negative design

In [None]:
## a slightly harder instance
target = "(((((.((((((.((((((.((((((.((((((.((((((....((((((......)))))).)))))).(((((...(((((((...)))))))))))).)))))).((((((((((((...)))))))...))))).))))))....))))))....))))))....)))))" #Eterna100 39
n = len(target)
bps = parse(target)
steps = 50
from collections import Counter

In [None]:
def cg_design_iteration(dbps):
    """Run one round of design with the given disruptive base pairs forbidden.

    Uses the globals ``target``, ``n``, ``bps`` and ``steps``.

    Args:
        dbps: list of index pairs that get a NotBPComp constraint.

    Returns:
        A pair ``(dbps, result)``:
        - ``(dbps, "Not found")`` if the sampler's tree width exceeds 10 or
          the model is not consistent;
        - ``(dbps, seq)`` as soon as a sampled sequence has the target's
          energy equal to its MFE energy;
        - ``(dbps + new_pairs, None)`` otherwise, where ``new_pairs`` are up
          to three of the most frequent MFE base pairs that are not in ``bps``.
    """
    design_model = single_target_design_model(target)
    design_model.add_constraints(NotBPComp(i, j) for (i, j) in dbps)

    design_sampler = Sampler(design_model, lazy=True)
    design_sampler.set_target(0.7 * n, 0.1 * n, 'gc')

    tractable = (design_sampler.treewidth() <= 10
                 and design_sampler.is_consistent())
    if not tractable:
        return dbps, "Not found"

    # Count how often each base pair shows up in the MFE structures of the
    # sampled sequences.
    pair_counts = Counter()
    for _ in range(steps):
        candidate = ass_to_seq(design_sampler.targeted_sample())
        compound = RNA.fold_compound(candidate)
        mfe_structure, mfe_energy = compound.mfe()
        if compound.eval_structure(target) == mfe_energy:
            return dbps, candidate
        pair_counts.update(parse(mfe_structure))

    # Most frequent pairs first; pairs of the target itself are kept out.
    disruptive = [pair for pair, _count in pair_counts.most_common()
                  if pair not in bps]
    return dbps + disruptive[:3], None
# Start without any disruptive base pairs. Each iteration either reports a
# result (a sequence or "Not found", which ends the loop) or returns an
# extended list of disruptive base pairs together with None.
dbps, seq = [], None
while seq is None: dbps, seq = cg_design_iteration(dbps)
print(seq)

# Negative design optimization with resampling

In [None]:
## define multi-target design model for resampling of subsets 
def multi_design_model(targets, solution=None, subset=None):
    """Build a multi-target design model, optionally for resampling a subset.

    Args:
        targets: list of target strings; the length of the first one sets
            the number of positions.
        solution: current assignment; only read (via ``solution.values()``)
            for positions outside ``subset``.
        subset: set of positions that stay variable; ``None`` means all
            positions.

    Returns:
        A ``Model`` in which every position outside ``subset`` has its domain
        restricted to its value in ``solution``, with GCCont functions
        (group 'gc', weight -0.3) on the subset and, for every target base
        pair touching the subset, a BPComp constraint and a BPEnergy
        function (group 'energy', weight -1).
    """
    n = len(targets[0])
    model = Model(n, 4)

    if subset is None:
        subset = set(range(n))

    # Pin all positions that are not resampled to their current value.
    fixed_positions = set(range(n)) - subset
    for pos in fixed_positions:
        current = solution.values()[pos]
        model.restrict_domains(pos, (current, current))

    model.add_functions([GCCont(pos) for pos in subset], 'gc')

    for structure in targets:
        pairs = parse(structure)
        # Only base pairs with at least one end in the subset are relevant.
        touched = [(i, j) for (i, j) in pairs if i in subset or j in subset]
        model.add_constraints(BPComp(i, j) for (i, j) in touched)
        energy_functions = [BPEnergy(i, j, (i-1, j+1) not in pairs)
                            for (i, j) in touched]
        model.add_functions(energy_functions, 'energy')

    model.set_feature_weight(-1, 'energy')
    model.set_feature_weight(-0.3, 'gc')
    return model

In [None]:
def multi_defect(sequence, targets, xi=1):
    """Compute the multi-target defect of a sequence.

    The result is the sum of two terms:
    - the mean difference between each target's energy and the second
      component of ``fc.pf()`` (presumably the ensemble free energy --
      confirm against the ViennaRNA API);
    - ``xi`` times the mean absolute energy difference over all unordered
      pairs of targets.

    Args:
        sequence: sequence string passed to ``RNA.fold_compound``.
        targets: list of target structures.
        xi: weight of the pairwise term.
    """
    k = len(targets)
    compound = RNA.fold_compound(sequence)
    ensemble_energy = compound.pf()[1]
    energies = [compound.eval_structure(structure) for structure in targets]

    diff_ee = sum(1/k * (energy - ensemble_energy) for energy in energies)
    # Each unordered pair (i, j) with i < j is visited exactly once.
    diff_targets = sum(2/(k*(k-1)) * abs(energies[i]-energies[j])
        for i in range(k) for j in range(i + 1, k))
    return diff_ee + xi * diff_targets

In [None]:
from random import random, choices
from math import exp

In [None]:
def optimize(create_model, objective, steps, temp):
    """Minimize an objective by resampling connected components.

    Starting from one sample of ``create_model(None, None)``, each step
    picks a connected component of that model (with weight ``1/len(cc)``),
    resamples it via ``create_model(cur, cc)`` and accepts the new solution
    by the Metropolis criterion at temperature ``temp``.

    Args:
        create_model: callable ``(solution, subset) -> model``.
        objective: callable mapping a sample to a number (lower is better).
        steps: number of resampling steps.
        temp: temperature used in the acceptance probability.

    Returns:
        Tuple ``(best, bestval)`` with the best solution seen and its
        objective value. The initial sample counts as a candidate, so a
        result is returned even if no move is ever accepted or ``steps``
        is 0.
    """
    model = create_model(None, None)
    cur = Sampler(model).sample()
    curval = objective(cur)
    # The initial sample is the best solution seen so far. This also keeps
    # `best` defined in all cases and avoids needing an infinity sentinel.
    best, bestval = cur, curval
    ccs = model.connected_components()
    weights = [1/len(cc) for cc in ccs]
    for _ in range(steps):
        cc = choices(ccs,weights)[0]
        new = Sampler(create_model(cur, cc)).sample()
        newval = objective(new)
        # Always accept improvements; accept worse solutions with
        # probability exp(-(newval - curval) / temp).
        if (newval <= curval
            or random() <= exp(-(newval - curval ) / temp)):
            cur, curval = new, newval
            if curval < bestval:
                best, bestval = cur, curval
    return (best, bestval)

In [None]:
def optimize_md(targets):
    """Optimize the multi-target defect for ``targets`` and print the result.

    Runs ``optimize`` for 1000 steps at temperature 0.015 with
    ``multi_design_model`` as model factory and ``multi_defect`` (xi = 1) as
    objective. Prints the best sequence, its objective value, the result of
    ``fc.mfe()``, the energy of each target, and the GC content in percent.
    """
    def create_model(solution, subset):
        return multi_design_model(targets, solution, subset)

    def objective(assignment):
        return multi_defect(ass_to_seq(assignment), targets, xi = 1)

    best, bestval = optimize(create_model, objective, 1000, 0.015)
    best = ass_to_seq(best)

    fc = RNA.fold_compound(best)
    mfe_result = fc.mfe()
    target_energies = [fc.eval_structure(t) for t in targets]
    gc_percent = f'{100*sum(x in "GC" for x in best)/len(best):0.1f}%'
    print(best, bestval, mfe_result, target_energies, gc_percent)
optimize_md(targets)

In [None]:
# good solutions for the running 3-target example
# NOTE: the three assignments below overwrite each other; only the last
# sequence is evaluated. Reorder or comment out lines to inspect another one.

seq = "AGGGUCCGGGGGGCCCGGGGGUUGACCCCGACCCU" # all mfe; GC 65.7% (1000 steps; ~6s)
seq = "GGGGCCCGGGGGGCCCGGGGGUUGACCCCGGCCCC"
seq = "CCCCUUGCCUCAAGGGCCCUCUUCAGAGGAAGGGG"
fc = RNA.fold_compound(seq)
# Print, in order: the MFE result, the pf() result, the energy of each
# target, pr_structure of each target, and the multi-target defect (xi=1).
print(fc.mfe())
print(fc.pf())
print([fc.eval_structure(t) for t in targets])
print([fc.pr_structure(t) for t in targets])
print(multi_defect(seq,targets,xi=1))

In [None]:
#01234567890123456789012345678901234
#GGGGCCCGGGGGGCCCGGGGGUUGACCCCGGCCCC