# Using ARNIE

ARNIE can generate base-pair probability matrices (BPPs) using a variety of different packages. Each of these packages needs to be installed separately. In this notebook, we install:
* Vienna
* Contrafold
* RNAstructure
* RNAsoft

We will then generate BPPs with these packages on a small subset of the data, in order to reduce the execution time.

In [None]:
# Download and install the ViennaRNA 2.4.15 Debian package (Ubuntu 18.04 build),
# then clone the ARNIE repository into the working directory.
# NOTE(review): arnie is only cloned, never pip-installed; later `from arnie...`
# imports presumably resolve because the clone sits in the working directory — confirm.
!wget https://www.tbi.univie.ac.at/RNA/download/ubuntu/ubuntu_18_04/viennarna_2.4.15-1_amd64.deb
!apt-get install ./viennarna_2.4.15-1_amd64.deb -y
!git clone https://github.com/DasLab/arnie

# Upgrade pip, then clone draw_rna into draw_rna_pkg and install it via setup.py.
!/opt/conda/bin/python3.7 -m pip install --upgrade pip
!git clone https://www.github.com/DasLab/draw_rna draw_rna_pkg
!cd draw_rna_pkg && python setup.py install

# Install the Perl `Graph` module (`yes ''` answers every cpan prompt with an
# empty line) and clone bpRNA, whose bpRNA.pl script is run later in this notebook.
!yes '' | cpan -i Graph
!git clone https://github.com/hendrixlab/bpRNA

In [None]:
# Clone contrafold-se and build it from source with g++-4.8.
!git clone https://github.com/csfoo/contrafold-se.git
!apt-get install -y g++-4.8
# Replace the first line of the Makefile with "CXX = g++-4.8";
# sed keeps the unmodified file as Makefile.bak.
!sed -i.bak "1 s/^.*$/CXX = g++-4.8/" contrafold-se/src/Makefile
!cd contrafold-se/src; make

In [None]:
# Download and unpack the RNAstructure text interfaces. No build step is run here;
# later cells reference RNAstructure/exe and RNAstructure/data_tables from this archive.
!wget http://rna.urmc.rochester.edu/Releases/current/RNAstructureLinuxTextInterfaces64bit.tgz
!tar zxvf RNAstructureLinuxTextInterfaces64bit.tgz

In [None]:
# Download and unpack MultiRNAFold 2.1 (RNAsoft), install xutils-dev,
# then run `make depend` followed by `make` inside the MultiRNAFold directory.
# NOTE(review): xutils-dev is presumably needed for `make depend` (makedepend) — confirm.
!wget http://www.rnasoft.ca/download/MultiRNAFold-2.1.tar.gz
!tar zxvf MultiRNAFold-2.1.tar.gz
!apt-get -y install xutils-dev
!cd MultiRNAFold;make depend;make

In [None]:
import os
import sys

# Write arnie.conf in the current working directory: one "<key>: <path>" line per
# package installed above, plus a TMP directory. The first echo truncates the file
# (">"), the following ones append (">>").
!echo "vienna_2: /usr/bin" > arnie.conf
!echo "contrafold_2: /kaggle/working/contrafold-se/src" >> arnie.conf
!echo "rnastructure: /kaggle/working/RNAstructure/exe" >> arnie.conf
!echo "rnasoft: /kaggle/working/MultiRNAFold" >> arnie.conf
!echo "TMP: /kaggle/working/tmp" >> arnie.conf
# Create the TMP directory named in the config.
!mkdir -p /kaggle/working/tmp
# Point ARNIEFILE at the config written above.
# NOTE(review): the file is written relative to the working directory but referenced
# by absolute path, so this assumes the notebook runs in /kaggle/working — confirm.
os.environ["ARNIEFILE"] = f"/kaggle/working/arnie.conf"
# NOTE(review): DATAPATH is presumably read by the RNAstructure executables to find
# their parameter tables — confirm against the RNAstructure documentation.
os.environ["DATAPATH"] = f"/kaggle/working/RNAstructure/data_tables"
# Make the cloned draw_rna package (and its ipynb helpers) importable.
sys.path.append('/kaggle/working/draw_rna_pkg/')
sys.path.append('/kaggle/working/draw_rna_pkg/ipynb/')

In [None]:
# Print the generated ARNIE configuration so it can be checked by eye.
!cat arnie.conf

## Settings

In [None]:

import numpy as np
import pandas as pd
from multiprocessing import Pool
from arnie.pfunc import pfunc
from arnie.mea.mea import MEA
from arnie.free_energy import free_energy
from arnie.bpps import bpps
from arnie.mfe import mfe
import arnie.utils as utils
from tqdm.notebook import tqdm as tqdm

# NOTE(review): n_candidates is not referenced by any cell visible in this
# notebook — confirm whether it is still needed.
n_candidates = 3
# turn off for all data
# NOTE(review): no visible cell reads this module-level flag; the 10-row subset
# taken when the data is loaded is applied unconditionally.
debug = False

In [None]:
# Count the "processor" entries in /proc/cpuinfo, i.e. the logical CPUs available.
!grep processor /proc/cpuinfo | wc -l

In [None]:
# Number of worker processes passed to multiprocessing.Pool in the cells below.
MAX_THRE = 4

In [None]:
# Load the competition data (newline-delimited JSON, one record per line).
train = pd.read_json('../input/stanford-covid-vaccine/train.json', lines=True)
test = pd.read_json('../input/stanford-covid-vaccine/test.json', lines=True)
# Keep only the first 10 rows of each split so the BPP generation below stays fast.
train = train[:10]
test = test[:10]
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames the
# same way (row labels are kept, columns missing from one frame are filled with NaN).
target_df = pd.concat([train, test])

## Getting structure


In [None]:
# Output directory for the generated matrices. exist_ok keeps the cell re-runnable:
# a plain os.mkdir raises FileExistsError the second time the cell is executed.
os.makedirs('bpps', exist_ok=True)

In [None]:
def proc1(arg):
    """Obtain a base-pair probability matrix for one sequence and derive its MEA structure.

    Parameters
    ----------
    arg : sequence
        ``[sequence, id, log_gamma, package, ...]``. Only the first four items
        are used; anything after them (the callers pass the reference
        structure) is ignored.

    Returns
    -------
    tuple
        ``(id, sequence, mea_structure, log_gamma, score, package)`` where
        ``score`` is element 2 of ``MEA.score_expected()``.
        NOTE(review): presumably the expected MCC — confirm against arnie.mea.

    Side effects
    ------------
    Saves the matrix to ``bpps/{package}_{id}.npy``.
    """
    sequence, seq_id, log_gamma, package = arg[:4]

    if package == 'provided':
        # Use the matrix shipped with the competition data instead of recomputing it.
        bp_matrix = np.load(f'../input/stanford-covid-vaccine/bpps/{seq_id}.npy')
    else:
        bp_matrix = bpps(sequence, package=package)

    # log_gamma is a base-10 exponent: gamma = 10 ** log_gamma.
    mea_mdl = MEA(bp_matrix, gamma=10**log_gamma)
    np.save(f'bpps/{package}_{seq_id}.npy', bp_matrix)
    return seq_id, sequence, mea_mdl.structure, log_gamma, mea_mdl.score_expected()[2], package

# Build one task per (log_gamma, sequence, package) combination. Each task is the
# argument list proc1 expects: [sequence, id, log_gamma, package, structure].
li = []
for log_gamma in [0]:
    for arr in target_df[['sequence','id','structure']].values:
        for pack in ['rnasoft_07', 'vienna_2', 'contrafold_2', 'rnastructure']:
            li.append([arr[0], arr[1], log_gamma, pack, arr[2]])

# Run the tasks in MAX_THRE worker processes. The context manager shuts the pool
# down once every result has been consumed, so no worker processes are left behind.
with Pool(processes=MAX_THRE) as p:
    results = list(tqdm(p.imap(proc1, li), total=len(li)))
df = pd.DataFrame(results, columns=['id', 'sequence', 'structure', 'log_gamma', 'score', 'package'])

# Add the structures shipped with the data set. They get log_gamma = score = 100,
# so the descending sort on 'score' places them ahead of the predicted structures
# whenever the predicted scores are below 100.
df_tmp = target_df[['id', 'sequence', 'structure']].copy()
df_tmp['log_gamma'] = 100
df_tmp['score'] = 100
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the same
# way (df_tmp has no 'package' column, so those cells become NaN as before).
df = pd.concat([df, df_tmp]).sort_values('score', ascending=False).reset_index(drop=True)

# For every id keep one row per distinct structure (the first one in the current
# row order) and record in 'cnt' how many distinct structures that id has.
# The per-id frames are collected in a list and concatenated once instead of
# growing a DataFrame inside the loop.
unq_frames = []
for seq_id in df['id'].unique():
    # .copy() so that adding 'cnt' below does not write into a view of df.
    unq_df = df[df['id'] == seq_id].drop_duplicates('structure').copy()
    unq_df['cnt'] = unq_df.shape[0]
    unq_frames.append(unq_df)
new_df = pd.concat(unq_frames) if unq_frames else pd.DataFrame()

## Getting predicted_loop_type


In [None]:
# Directory for the temporary .dbn inputs handed to bpRNA.
os.makedirs('tmp_files', exist_ok=True)


def get_predicted_loop_type(id, sequence, structure, debug=False):
    """Annotate a dot-bracket structure by running bpRNA on it.

    Parameters
    ----------
    id : str
        Identifier of the sequence; used in the temporary file names.
    sequence : str
        Sequence, written as the first line of the bpRNA input file.
    structure : str
        Dot-bracket structure, written as the second line of the input file.
    debug : bool, optional
        If True, print the sequence, the structure and the annotation and keep
        the temporary files; otherwise the temporary files are deleted.

    Returns
    -------
    tuple
        ``(id, structure, annotation)`` where ``annotation`` is the line at
        index 5 of the ``.st`` file produced by bpRNA.
        NOTE(review): presumably the per-base loop-type string — confirm
        against the bpRNA output format.

    Raises
    ------
    subprocess.CalledProcessError
        If the bpRNA script exits with a non-zero status.
    """
    # Local import: the notebook's import cell does not provide subprocess.
    import subprocess

    # File names must not contain '.', '(' or ')', so encode the structure as digits.
    structure_fixed = structure.replace('.', '0').replace('(', '1').replace(')', '2')
    # The process id keeps names unique across pool workers.
    pid = os.getpid()
    tmp_in_file = f'tmp_files/{id}_{structure_fixed}_{pid}.dbn'
    # bpRNA writes <input basename>.st into the current working directory.
    tmp_out_file = f'{id}_{structure_fixed}_{pid}.st'

    # Write the two-line input directly instead of interpolating into a shell command.
    with open(tmp_in_file, 'w') as dbn_file:
        dbn_file.write(f'{sequence}\n{structure}\n')

    # bpRNA needs the Perl Graph module, installed under /root/perl5.
    perl_env = dict(os.environ, PERL5LIB='/root/perl5/lib/perl5')
    subprocess.run(['perl', 'bpRNA/bpRNA.pl', tmp_in_file], env=perl_env, check=True)

    # Context manager so the file handle is closed deterministically.
    with open(tmp_out_file) as st_file:
        result = [line.strip('\n') for line in st_file]

    if debug:
        print(sequence)
        print(structure)
        print(result[5])
    else:
        os.remove(tmp_out_file)
        os.remove(tmp_in_file)
    return id, structure, result[5]

def proc2(arg):
    """Pool worker: unpack ``[id, sequence, structure]`` and annotate the structure.

    Returns whatever ``get_predicted_loop_type`` returns for these three values,
    always called with ``debug=False``.
    """
    seq_id, sequence, structure = arg[0], arg[1], arg[2]
    return get_predicted_loop_type(seq_id, sequence, structure, debug=False)

# One task per augmented row, in the [id, sequence, structure] layout proc2 expects.
li = list(new_df[['id', 'sequence', 'structure']].values)

# Annotate the structures in MAX_THRE worker processes. The context manager shuts
# the pool down once every result has been consumed, so no workers are left behind.
with Pool(processes=MAX_THRE) as p:
    results_loop_type = list(tqdm(p.imap(proc2, li), total=len(li)))

# Attach the annotation to each (id, structure) row and write the augmented table.
loop_type_df = pd.DataFrame(results_loop_type, columns=['id', 'structure', 'predicted_loop_type'])
new_df = new_df.merge(loop_type_df, on=['id','structure'], how='left')
new_df.to_csv('aug_data.csv', index=False)


In [None]:
# Preview the first rows of the augmented table that was written to aug_data.csv.
new_df.head()

In [None]:
# Remove the cloned/built tools, the downloaded archives and the ARNIE config from
# the working directory.
# NOTE(review): MultiRNAFold-2.1.tar.gz and the tmp_files/ directory created earlier
# are not removed here — confirm whether that is intentional.
!rm -r tmp/
!rm -r RNAstructure/
!rm -r arnie/
!rm -r contrafold-se/
!rm -r draw_rna_pkg/
!rm -r bpRNA/
!rm -r MultiRNAFold/
!rm viennarna_2.4.15-1_amd64.deb
!rm RNAstructureLinuxTextInterfaces64bit.tgz
!rm arnie.conf

In [None]:
# Bundle every saved matrix in bpps/ into a single archive, bpps.zip.
!zip -r bpps.zip bpps/

In [None]:
# Delete the bpps directory itself; -f suppresses the error if it is already gone.
!rm -rf bpps