# mRNA Sequence Optimization via Adiabatic Quantum Computing
Authors: Siona Tagare, Shannen Espinosa, Gun Suer

YQuantum 2024 April 13-14



# Terminology
When analyzing mRNA, there are some key words to remember. mRNA is made up of amino acids, such as N or M). Each amino acid is made up of a codon, a sequence of 3 nucleic acids, (such as ATG or AAT). There are multiple options of codons for each amino acid – for example, the amino acid N can be made up of either codon AAT or codon AAC. The selection of codons is extremely important to the overall mRNA sequence as choosing certain codons in re

In [3]:
pip install git+https://github.com/Benjamin-Lee/CodonAdaptationIndex.git

Collecting git+https://github.com/Benjamin-Lee/CodonAdaptationIndex.git
  Cloning https://github.com/Benjamin-Lee/CodonAdaptationIndex.git to /tmp/pip-req-build-1k91joc_
  Running command git clone --filter=blob:none --quiet https://github.com/Benjamin-Lee/CodonAdaptationIndex.git /tmp/pip-req-build-1k91joc_
  Resolved https://github.com/Benjamin-Lee/CodonAdaptationIndex.git to commit b6e017a92c58829f6a5aec8c26a21262bc2a6610
  Preparing metadata (setup.py) ... [?25ldone
Collecting biopython (from CAI==1.0.5.dev3+gb6e017a)
  Using cached biopython-1.83-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Using cached biopython-1.83-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
Building wheels for collected packages: CAI
  Building wheel for CAI (setup.py) ... [?25ldone
[?25h  Created wheel for CAI: filename=CAI-1.0.5.dev3+gb6e017a.d20240414-py3-none-any.whl size=7822 sha256=7d35d3afb9cf7299417d4ebe5c0661da594add132251e650ae7fd43b18e71cf7
 

In [None]:
pip install biopython
pip install git+https://github.com/Benjamin-Lee/CodonAdaptationIndex.git

In [7]:
import numpy as np
import math

from CAI import CAI
from Bio.Seq import Seq

from qiskit_algorithms import NumPyMinimumEigensolver
from qiskit_optimization import QuadraticProgram
from qiskit_optimization.converters import QuadraticProgramToQubo
from qiskit_optimization.algorithms import MinimumEigenOptimizer
from qiskit_algorithms import QAOA
from qiskit_algorithms.optimizers import COBYLA
from qiskit.primitives import Sampler
from qiskit_ibm_runtime import QiskitRuntimeService
from qiskit_ibm_runtime import Estimator, Sampler, Session, Options
from qiskit_aer import AerSimulator

Traceback [1;36m(most recent call last)[0m:
[1;36m  Cell [1;32mIn[7], line 8[1;36m
[1;33m    from qiskit_optimization import QuadraticProgram[1;36m
[1;31mModuleNotFoundError[0m[1;31m:[0m No module named 'qiskit_optimization'

Use %tb to get the full traceback.


In [1]:
p = "NVD"
len(p)

3

In [2]:
def p2codons(p):
    aa_codons = {
        'A': ['GCT', 'GCC', 'GCA', 'GCG'],
        'R': ['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'],
        'N': ['AAT', 'AAC'],
        'D': ['GAT', 'GAC'],
        'C': ['TGT', 'TGC'],
        'Q': ['CAA', 'CAG'],
        'E': ['GAA', 'GAG'],
        'G': ['GGT', 'GGC', 'GGA', 'GGG'],
        'H': ['CAT', 'CAC'],
        'I': ['ATT', 'ATC', 'ATA'],
        'L': ['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'],
        'K': ['AAA', 'AAG'],
        'M': ['ATG'],
        'F': ['TTT', 'TTC'],
        'P': ['CCT', 'CCC', 'CCA', 'CCG'],
        'S': ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'],
        'T': ['ACT', 'ACC', 'ACA', 'ACG'],
        'W': ['TGG'],
        'Y': ['TAT', 'TAC'],
        'V': ['GTT', 'GTC', 'GTA', 'GTG'],
    }
    
    codon_seq = []
    for aa in p:
        if aa in aa_codons:
            codons = aa_codons[aa]
            codon_seq.append(codons)
        else:
            codon_seq.append([''])

    return codon_seq

codons = p2codons(p)
print(codons)

[['AAT', 'AAC'], ['GTT', 'GTC', 'GTA', 'GTG'], ['GAT', 'GAC']]


In [3]:
def probabilities(codon):
    prob = {
        'GCT': 0.0777, 'GCC': 0.0777, 'GCA': 0.0777, 'GCG': 0.0777,
        'CGT': 0.0157, 'CGC': 0.0157, 'CGA': 0.0157, 'CGG': 0.0157,
        'AGA': 0.053, 'AGG': 0.053, 'CGA': 0.053, 'CGG': 0.053,
        'GAA': 0.0656, 'GAG': 0.0656,
        'TTT': 0.0405, 'TTC': 0.0405,
        'GGT': 0.0691, 'GGC': 0.0691, 'GGA': 0.0691, 'GGG': 0.0691,
        'CAT': 0.0227, 'CAC': 0.0227,
        'ATT': 0.0591, 'ATC': 0.0591, 'ATA': 0.0591,
        'AAA': 0.0595, 'AAG': 0.0595,
        'TTA': 0.096, 'TTG': 0.096, 'CTT': 0.096, 'CTC': 0.096, 'CTA': 0.096, 'CTG': 0.096,
        'ATG': 0.0238,
        'AAT': 0.0427, 'AAC': 0.0427,
        'CCT': 0.0469, 'CCC': 0.0469, 'CCA': 0.0469, 'CCG': 0.0469,
        'CAA': 0.0393, 'CAG': 0.0393,
        'CGT': 0.0526, 'CGC': 0.0526, 'CGA': 0.0526, 'CGG': 0.0526, 'AGA': 0.0526, 'AGG': 0.0526,
        'TCT': 0.0694, 'TCC': 0.0694, 'TCA': 0.0694, 'TCG': 0.0694, 'AGT': 0.0694, 'AGC': 0.0694,
        'ACT': 0.055, 'ACC': 0.055, 'ACA': 0.055, 'ACG': 0.055,
        'GTT': 0.0667, 'GTC': 0.0667, 'GTA': 0.0667, 'GTG': 0.0667,
        'TGG': 0.0118,
        'TAT': 0.0311, 'TAC': 0.0311
    }
    return prob.get(codon, 0)

In [4]:
def gc_content(codon):
    gc_count = 0.5
    total_count = 0
    for c in codon:
        gc_count += codon.count('G') + codon.count('C')
        total_count += len(codon)
    gc_content = gc_count / total_count if total_count > 0 else 0
    return gc_content
    
print(gc_content("GGG"))
print(gc_content("ACC"))
print(gc_content("AAA"))

1.0555555555555556
0.7222222222222222
0.05555555555555555


In [5]:
def minimize_repeats(codon1, codon2):
    repeat_count = 0

    if codon1[-1] == codon2[0]:
        repeat_count += 2
        if codon1[1] == codon1[-1] and codon1[0] == codon1[-1]:
            repeat_count += 2
        elif codon1[1] == codon1[-1]:
            repeat_count += 1

        if codon2[0] == codon2[1] and codon2[0] == codon2[-1]:
            repeat_count += 2
        elif codon2[0] == codon2[1]:
            repeat_count += 1

    if repeat_count == 0:
        return 1
    else:
        return 1 / repeat_count

print(minimize_repeats("ATT", "ACA"))
print(minimize_repeats("ATT", "TCA"))
print(minimize_repeats("ATT", "TTT"))
print(minimize_repeats("TTT", "TTA"))
print(minimize_repeats("TTT", "TTT"))

1
0.3333333333333333
0.2
0.2
0.16666666666666666


In [8]:
def cai_value(codon1, codon2):
    cai = CAI(Seq(codon1), reference=[Seq(codon2)])
    if math.isnan(cai):
        return 0
    else:
        return cai

print(cai_value("ATA", "ATG"))
print(cai_value("TAG", "GAT"))
print(cai_value("AAT", "AAC"))
print(cai_value("AAC", "ATA"))
print(cai_value("GCG", "AAA"))

1.0
0
0.5
1.0
1.0


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


In [9]:
def score(codon1, codon2):
    score = probabilities(codon1) * probabilities(codon2) * gc_content(codon1) * gc_content(codon2) * minimize_repeats(codon1, codon2) * cai_value(codon1, codon2)
    return score * 1000

print(score("ATA", "ATG"))
print(score("TAG", "GAT"))
print(score("AAT", "AAC"))
print(score("AAC", "ATA"))
print(score("GCG", "AAA"))

0.015194537037037038
0.0
0.019696033950617288
0.05452157407407408
0.27111064814814817


In [26]:
def best_score(codonlist1, codonlist2):
    max_score = 0
    
    for codon1 in codonlist1:
        for codon2 in codonlist2:
            current_score = score(codon1, codon2)
            if current_score > max_score:
                max_score = current_score
    
    return max_score

codonlist1 = ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC']
codonlist2 = ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC']

best = best_score(codonlist1, codonlist2)
print("Best score:", best)

Best score: 2.5122371604938274


In [27]:
j_matrix = np.zeros((len(p), len(p)))
codons = p2codons(p)

def calculate_j_matrix():
    for i in range(len(p) - 1):
        j_matrix[i, i + 1] = best_score(codons[i], codons[i + 1])
        j_matrix[i + 1, i] = j_matrix[i, i + 1]
    
    return j_matrix
        
print(calculate_j_matrix())

[[0.         0.79992651 0.        ]
 [0.79992651 0.         0.        ]
 [0.         0.         0.        ]]


## Finding matrix for linear coefficients
This matrix is meant to represent the probabililty of selecting a codon for an amino acid. The rows of this matrix represent all potential codons for all the amino acids we have found, and the columns represent all amino acids. Each cell represents the probability of selecting the jth codon for the ith amino acid.

In [33]:
aa_codons = {
    'A': ['GCT', 'GCC', 'GCA', 'GCG'],
    'R': ['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'],
    'N': ['AAT', 'AAC'],
    'D': ['GAT', 'GAC'],
    'C': ['TGT', 'TGC'],
    'Q': ['CAA', 'CAG'],
    'E': ['GAA', 'GAG'],
    'G': ['GGT', 'GGC', 'GGA', 'GGG'],
    'H': ['CAT', 'CAC'],
    'I': ['ATT', 'ATC', 'ATA'],
    'L': ['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'],
    'K': ['AAA', 'AAG'],
    'M': ['ATG'],
    'F': ['TTT', 'TTC'],
    'P': ['CCT', 'CCC', 'CCA', 'CCG'],
    'S': ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'],
    'T': ['ACT', 'ACC', 'ACA', 'ACG'],
    'W': ['TGG'],
    'Y': ['TAT', 'TAC'],
    'V': ['GTT', 'GTC', 'GTA', 'GTG'],
}

codons = p2codons(p)
num_codons = 0

for amino in p:
    for codon in aa_codons[amino]:
        num_codons += 1

weights_matrix = np.zeros((num_codons, len(p)))

#find probabilities
for i, amino in enumerate(p):
    for j, codon in enumerate(codons):
        for k, c in enumerate(codon) :
            if c in aa_codons[amino]:
                weights_matrix[k][i] = probabilities(c)
            
print(weights_matrix)

[['AAT', 'AAC'], ['GTT', 'GTC', 'GTA', 'GTG'], ['GAT', 'GAC']]
[[0.0427 0.0667 0.    ]
 [0.0427 0.0667 0.    ]
 [0.     0.0667 0.    ]
 [0.     0.0667 0.    ]
 [0.     0.     0.    ]
 [0.     0.     0.    ]
 [0.     0.     0.    ]
 [0.     0.     0.    ]]


# Matrix explanations
We create our algorithm by generating linear coefficients and quadratic coefficients for our codons in certain amino acid positions. 

## Linear coefficients
The linear coefficients matrix has all possible codons in the rows and all amino acids in the columns. The numbers in these matrices correspond to the probability of selecting the ith codon for the jth amino acid.

For example, with the amino acid sequence NM:

![alt](linear.jpeg)

## Quadratic coefficients
The quadratic coefficients represent the stability of the sequence based on a pair of codons. The numbers correspond to the stability if the ith codon is selected for the previous amino acid and the i+1th codon is selected for the current amino acid. This nearest neighbor stability technique is adapted by many codon optimization solutions.

## Algorithm
These coefficients are then attached to a quadratic program alongside constraints to ensure each amino acid has only one codon choice. This quadratic program is maximized to obtain an objective function, and then cast to an Ising hamiltonian by Qiskit's MinimumEigenOptimizer. A QAOA algorithm is applied to obtain final results. 

In [34]:
j_matrix = calculate_j_matrix()
binary_vars = []

qp = QuadraticProgram()
linear_coefficients = {}
quadratic_coefficients = {}

#iterate through each codon for each position, adding a binary variable representing the value 
#of that codon appearing in that position
for i in range(num_codons):
    for j in range(num_positions):
        var = qp.binary_var(name=f'x_{i}_{j}')
        binary_vars.append(var)
        #attach a linear coefficient (representing how likely that codon is to appear in that position)
        #to the ith codon at the jth position 
        linear_coefficients[f'x_{i}_{j}'] = weights_matrix[i, j]
        if j > i:
            #attach a quadratic coefficient (representing the stability of the sequence 
            #if the current codon appears in the current position and the next codon appears in the next position)
            quadratic_coefficients[(f'x_{i}_{j-1}', f'x_{i+1}_{j}')] = j_matrix[i, j]
            
#attach linear constraints so each position can only have one codon appearing with 100% certainty
#all other codons are zero
for j in range(len(j_matrix)):
    constraint_dict = {var_name: 1.0 for var_name in [f'x_{i}_{j}' for i in range(len(j_matrix))]}
    qp.linear_constraint(linear=constraint_dict, sense='==', rhs=1.0)

#obtain the objective function based on coefficients
qp.maximize(linear=linear_coefficients, quadratic=quadratic_coefficients)

Traceback [1;36m(most recent call last)[0m:
[1;36m  Cell [1;32mIn[34], line 1[1;36m
[1;33m    j_matrix = calculate_j_matrix()[1;36m
[1;31mNameError[0m[1;31m:[0m name 'calculate_j_matrix' is not defined

Use %tb to get the full traceback.


In [24]:
service = QiskitRuntimeService(channel="ibm_quantum")
backend = service.get_backend("ibmq_qasm_simulator") # delete line when ready
simulator = AerSimulator()

qiskit_runtime_service.__init__:INFO:2024-04-14 13:16:53,568: Default instance: ibm-q/open/main


In [None]:
#for comparision with classical results
exact_mes = NumPyMinimumEigensolver()
exact = MinimumEigenOptimizer(exact_mes)
exact_result = exact.solve(qp)
print("classical result")
print_results(exact_result)

In [None]:
#define a qaoa to run 
qaoa = QAOA(sampler=simulator, optimizer=COBYLA())

#find the ising hamiltonian and run the qaoa
min_eigen_optimizer = MinimumEigenOptimizer(qaoa)

quantum_result = min_eigen_optimizer.solve(qp)
print("quantum results:")
print_results(quantum_result)