<a href="https://colab.research.google.com/github/shashankcuber/mRNA_Vacc_Stabilization/blob/main/mRNA_vaccine_stabilizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mount dataset and rna tools from Drive

In [None]:
from google.colab import drive
# Make sure you grant Colab the authorization it asks for to access your Google Drive.
# The Drive is mounted at /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Test-data CSV stored on the mounted Google Drive (under /content/drive).
open_vaccine_test_data_path = '/content/drive/MyDrive/Bio_informatics_Project/open_vac_data/test_data.csv'

# Installing RNA tools

In [None]:
!cd drive/MyDrive/Bio_informatics_Project/; sh install.sh

After copying the contrafold package from Drive, correct the following files:


1.   LBFGS.ipp: at line 110, write `this->DoLineSearch`.
2.   Utilities.cpp: add `#include <climits>`, which is needed for ULONG used in this file.



# Setting up the arnie conf file and environment variable

In [None]:
import os
os.environ["ARNIEFILE"] = "/content/arnie.conf"
os.environ["DATAPATH"] = "/content/RNAstructure/data_tables"
!cd drive/MyDrive/Bio_informatics_Project/; sh make_arnie_conf.sh

/content
/content/arnie.conf
/content/RNAstructure/data_tables
TMP: /content/tmp
rnastructure: /content/RNAstructure/exe
contrafold_2: /content/contrafold-se/src
vienna_2: /usr/bin


# Importing Libraries

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm as tqdm
from arnie.mea.mea import MEA
from arnie.bpps import bpps
import arnie.utils as utils

# Test data replication using rna tools

In [None]:
class PrepareDataset:
  """Predict a secondary structure for every sequence in a CSV file using one package.

  Attributes:
    package: package name forwarded to ``bpps`` (e.g. ``'vienna_2'``).
    path: path of the CSV file that was loaded.
    df: DataFrame read from ``path``; ``make_dataset`` uses its ``'id'`` and
      ``'sequence'`` columns.
  """

  def __init__(self, rna_tool, path):
    """Remember the package name and load the CSV at ``path``.

    Args:
      rna_tool: package identifier passed to ``bpps`` as ``package``.
      path: CSV location handed to ``pd.read_csv``.
    """
    self.package = rna_tool
    self.path = path
    self.df = pd.read_csv(path)

  def predict_structure(self, id, rna_sequence):
    """Predict the structure of a single sequence.

    The base-pair probability matrix from ``bpps`` is passed to ``MEA``; the
    resulting object's ``structure`` and one element of ``score_expected()``
    are reported.

    Args:
      id: identifier of the sequence; returned unchanged.
      rna_sequence: sequence passed to ``bpps``.

    Returns:
      Tuple ``(id, rna_sequence, structure, score, package)``.
    """
    base_pairing_mat = bpps(rna_sequence, package=self.package)
    mea_result = MEA(base_pairing_mat)
    structure = mea_result.structure
    # NOTE(review): element 2 of score_expected() is presumably the expected
    # MCC of the MEA structure - confirm against the arnie documentation.
    score = mea_result.score_expected()[2]
    return id, rna_sequence, structure, score, self.package

  def make_dataset(self):
    """Run ``predict_structure`` on every row of ``self.df``.

    Returns:
      DataFrame with columns ``['id', 'sequence', 'structure', 'score',
      'package']``, one row per input row, in input order.

    Raises:
      KeyError: if ``self.df`` has no ``'id'`` or ``'sequence'`` column.
    """
    # Select the two needed columns once; .values keeps a sized sequence so
    # tqdm can show the total number of rows.
    id_seq_pairs = self.df[['id', 'sequence']].values
    records = [
        self.predict_structure(seq_id, seq)
        for seq_id, seq in tqdm(id_seq_pairs)
    ]
    return pd.DataFrame(
        records, columns=['id', 'sequence', 'structure', 'score', 'package'])

In [None]:
#vienna_2
vienna = PrepareDataset('vienna_2', open_vaccine_test_data_path)
vienna_df = vienna.make_dataset()
vienna_df.head()

100%|██████████| 5/5 [00:02<00:00,  2.41it/s]


Unnamed: 0,id,sequence,structure,score,package
0,id_4e011753b,GGAAAUAUAUUGGUCCGUGUUCAGUCUAGCCUCCUACCCGCUAGGC...,..........(((((.(((((..(((((((.........)))))))...,0.875928,vienna_2
1,id_70747b441,GGAAACGGAUGGAAAUCGUCCGCGUGCGAAAAGCACGGCGUGGAAA...,.....((((((.....))))))(((((.....)))))(((((.......,0.968446,vienna_2
2,id_a71ee9a04,GGAAAGGGAAGGGACCGCAAAGCGGAACCCGGGCGCCCGAAAAAGG...,..............((((...)))).......((.((((......(...,0.63536,vienna_2
3,id_a03b10d8f,GGAAAGCCUAAACCGGGCGAUCUAGCCAUCGGCGGCACACAGAUCG...,.(...((((.....))))...)...................((((....,0.762015,vienna_2
4,id_5d366e1ba,GGAAACCAUAAUUCAGCAGGUAUCCCGACGCCUGAAAACAGCCAAA...,.............(..((..(((.(((.((((((...............,0.745095,vienna_2


In [None]:
#contrafold_2
contrafold = PrepareDataset('contrafold_2', open_vaccine_test_data_path)
contrafold_df = contrafold.make_dataset()
contrafold_df.head()

100%|██████████| 5/5 [00:03<00:00,  1.60it/s]


Unnamed: 0,id,sequence,structure,score,package
0,id_4e011753b,GGAAAUAUAUUGGUCCGUGUUCAGUCUAGCCUCCUACCCGCUAGGC...,...........((((.(((((..(((((((.........)))))))...,0.799682,contrafold_2
1,id_70747b441,GGAAACGGAUGGAAAUCGUCCGCGUGCGAAAAGCACGGCGUGGAAA...,.....(((((.......))))).((((.....)))).(((((.......,0.799663,contrafold_2
2,id_a71ee9a04,GGAAAGGGAAGGGACCGCAAAGCGGAACCCGGGCGCCCGAAAAAGG...,..............(((.....)))........................,0.454857,contrafold_2
3,id_a03b10d8f,GGAAAGCCUAAACCGGGCGAUCUAGCCAUCGGCGGCACACAGAUCG...,................(((((((.(((......................,0.578171,contrafold_2
4,id_5d366e1ba,GGAAACCAUAAUUCAGCAGGUAUCCCGACGCCUGAAAACAGCCAAA...,........................(((.(((((................,0.661927,contrafold_2


In [None]:
#rnastructure
# Package: rnastructure - build the structure table for the test CSV and preview it.
rnastructure = PrepareDataset(rna_tool='rnastructure', path=open_vaccine_test_data_path)
rnastructure_df = rnastructure.make_dataset()
rnastructure_df.head()

100%|██████████| 5/5 [00:04<00:00,  1.01it/s]


Unnamed: 0,id,sequence,structure,score,package
0,id_4e011753b,GGAAAUAUAUUGGUCCGUGUUCAGUCUAGCCUCCUACCCGCUAGGC...,..........(((((.(((((..(((((((.........)))))))...,0.87574,rnastructure
1,id_70747b441,GGAAACGGAUGGAAAUCGUCCGCGUGCGAAAAGCACGGCGUGGAAA...,.....((((((.....))))))(((((.....)))))(((((.......,0.986415,rnastructure
2,id_a71ee9a04,GGAAAGGGAAGGGACCGCAAAGCGGAACCCGGGCGCCCGAAAAAGG...,..............((((...))))..........(((......))...,0.583683,rnastructure
3,id_a03b10d8f,GGAAAGCCUAAACCGGGCGAUCUAGCCAUCGGCGGCACACAGAUCG...,(((..((((.....))))..))).(((......))).....((((....,0.925298,rnastructure
4,id_5d366e1ba,GGAAACCAUAAUUCAGCAGGUAUCCCGACGCCUGAAAACAGCCAAA...,................((..(((.(((.((((((...............,0.675891,rnastructure
