# Standard packages

In [1]:
import os
import numpy as np
import pandas as pd
import time

# Specific installs

In [2]:
# Label of the folding method; used further down as the prefix of the combined output file name.
method_name = "RNAfold"

In [3]:
!wget -q https://www.tbi.univie.ac.at/RNA/download/sourcecode/1_6_x/ViennaRNA-1.6.1.tar.gz
!tar xfz ViennaRNA-1.6.1.tar.gz

In [4]:
# Directory holding the compiled RNAfold executable (absolute path under /content).
install_path = "/content/ViennaRNA-1.6.1/Progs/"

# Configure and build from inside the unpacked source tree.
%cd /content/ViennaRNA-1.6.1
!./configure 
!make all
# NOTE: `make all` exits with status 2, but the RNAfold binary is still compiled OK.
# Go back to the working directory used by the remaining cells.
%cd /content

/content/ViennaRNA-1.6.1
checking for a BSD-compatible install... /usr/bin/install -c
checking whether build environment is sane... yes
checking for gawk... no
checking for mawk... mawk
checking whether make sets $(MAKE)... yes
checking if malloc debugging is wanted... no
checking build system type... x86_64-unknown-linux-gnu
checking host system type... x86_64-unknown-linux-gnu
checking for gcc... gcc
checking for C compiler default output file name... a.out
checking whether the C compiler works... yes
checking whether we are cross compiling... no
checking for suffix of executables... 
checking for suffix of object files... o
checking whether we are using the GNU C compiler... yes
checking whether gcc accepts -g... yes
checking for gcc option to accept ANSI C... none needed
checking for style of include used by make... GNU
checking dependency style of gcc... gcc3
checking whether gcc accepts -fpic... yes
checking for a BSD-compatible install... /usr/bin/install -c
checking for ranlib.

# S. cerevisiae (sce) 18 long non-coding RNA dataset
Data source: https://genie.weizmann.ac.il/pubs/PARS10/data/sce_genes_folded.tab.gz

In [5]:
# Base URL of the raw data files in the sinc-lab/lncRNA-folding repository.
gh_path = "https://raw.githubusercontent.com/sinc-lab/lncRNA-folding/master/data/"

# Headerless, tab-separated table: gene ID (used as the index), sequence and
# PARS-assisted reference folding.
sce = pd.read_csv(
    gh_path + "sce_genes_folded.tab",
    sep="\t",
    header=None,
    names=["Gene ID", "sequence", "PARS-assisted folding"],
    index_col=0,
)

In [6]:
# Gene IDs of the 18 sequences to process, in processing order.
yeast18lnc = [
    "snR81", "snR34", "snR43",
    "snR44", "snR31", "snR10",
    "snR63", "snR11", "snR82",
    "snR17b", "snR17a", "snR37",
    "SCR1", "SRG1", "snR19",
    "snR30", "LSR1", "TLC1",
]

In [7]:
# Per-gene scores (turned into folding constraints further down), indexed by the 'gene' column.
pars = pd.read_csv(gh_path + "sce_PARS_score.tab", sep="\t", index_col="gene")

# Compute structures

In [8]:
def run_folding(fasta_name):
    """Run RNAfold on `fasta_name` and return the path of the file it produced.

    The RNAfold executable found under `install_path` is called with
    `-T 37` and `-C` (see the RNAfold usage text for these flags). The input
    file is fed on stdin and standard output is written to
    "rnafold_tmp.fasta" in the current working directory, overwriting any
    previous content.

    Parameters
    ----------
    fasta_name : str
        Path of the input file to fold. (Previously this argument was ignored
        and "tmp.fasta" was always read.)

    Returns
    -------
    str
        Always "rnafold_tmp.fasta".

    Raises
    ------
    OSError
        If `fasta_name` cannot be opened or the RNAfold executable cannot be
        started. The exit status of RNAfold itself is not checked.
    """
    # Local import so this cell does not depend on an edit to the imports cell.
    import subprocess

    out_name = "rnafold_tmp.fasta"
    command = [install_path + "RNAfold", "-T", "37", "-C"]

    # Explicit file handles replace the shell "< input > output" redirections,
    # so no shell string has to be assembled from variables.
    with open(fasta_name, "rb") as fin, open(out_name, "wb") as fout:
        result = subprocess.run(command, stdin=fin, stdout=fout,
                                stderr=subprocess.PIPE)

    # Keep diagnostics visible in the notebook, as the `!` shell escape did.
    if result.stderr:
        print(result.stderr.decode(errors="replace"), end="")

    return out_name

In [9]:
!/content/ViennaRNA-1.6.1/Progs/RNAfold -h


usage:
RNAfold [-p[0]] [-C] [-T temp] [-4] [-d[2|3]] [-noGU] [-noCloseGU]
        [-noLP] [-e e_set] [-P paramfile] [-nsp pairs] [-S scale]
        [-noconv] [-circ] 



In [10]:
# Convert scores to constraints with RNAfold 1.6 format
# https://genie.weizmann.ac.il/pubs/PARS10/pars10_notes.html
def get_constraints(pars, lnc):
    """Build the constraint string for gene `lnc` from its scores.

    `pars.loc[lnc, 'score']` must be a ';'-separated string of numbers.
    Each number yields one character of the result:
    "|" when it is greater than 5, "x" when it is lower than -5 and "."
    in every other case.
    """
    values = np.array(pars.loc[lnc, 'score'].split(';'), dtype=float)
    # Conditions are tested in order; positions matching none keep the default.
    symbols = np.select([values > 5, values < -5], ["|", "x"], default=".")
    return "".join(symbols)
    
# Get prediction in a string to compare with PARS-assisted references
def get_pred(file_name, slen):
    """Return the first `slen` characters of the third line of `file_name`.

    An empty string is returned when the file has fewer than three lines.
    NOTE(review): the third line is presumably the predicted structure in the
    RNAfold output (header / sequence / structure) - confirm against a real
    output file.
    """
    with open(file_name, 'r') as handle:
        rows = handle.readlines()
    if len(rows) < 3:
        return ""
    return rows[2][:slen]

In [11]:
# Combined output file gathering every RNAfold result; start from a clean file.
out_fasta_name = method_name + "_yeast18"
if os.path.exists(out_fasta_name + ".fasta"):
    os.remove(out_fasta_name + ".fasta")

lnc_ids = yeast18lnc
print("   \t lnc \t len \t time \t =ref(Kertesz2010)?")
for i, lnc in enumerate(lnc_ids):

    start_time = time.time()
    seq = sce.loc[lnc, "sequence"]
    print(f"{i+1}/{len(lnc_ids)}\t{lnc} \t {len(seq)}", end='\t')

    # Constraint string derived from the scores of this gene
    cons = get_constraints(pars, lnc)

    # Single-record input for RNAfold: header, sequence and constraint line
    with open("tmp.fasta", "w") as ofile:
        ofile.write(f">{lnc}\n{seq}\n{cons}\n")

    dot_file_name = run_folding("tmp.fasta")

    # Append this prediction to the combined output file
    os.system(f"cat {dot_file_name} >> {out_fasta_name}.fasta")

    elapsed = time.time() - start_time
    print(f"{elapsed: .1f} s", end='\t')

    # Compare the prediction with the reference folding; show both on mismatch
    str_ref = sce.loc[lnc, "PARS-assisted folding"]
    str_pre = get_pred(dot_file_name, len(seq))
    if str_ref == str_pre:
        print("  OK")
    else:
        print("Ref: \n" + str_ref)
        print("Pre: \n" + str_pre)

   	 lnc 	 len 	 time 	 =ref(Kertesz2010)?
1/18	snR81 	 201	 0.1 s	  OK
2/18	snR34 	 203	 0.1 s	  OK
3/18	snR43 	 209	 0.1 s	  OK
4/18	snR44 	 211	 0.1 s	  OK
5/18	snR31 	 225	 0.1 s	  OK
6/18	snR10 	 245	 0.1 s	  OK
7/18	snR63 	 255	 0.1 s	  OK
8/18	snR11 	 258	 0.2 s	  OK
9/18	snR82 	 268	 0.1 s	  OK
10/18	snR17b 	 332	 0.2 s	  OK
11/18	snR17a 	 333	 0.2 s	  OK
12/18	snR37 	 386	 0.2 s	  OK
13/18	SCR1 	 522	 0.3 s	  OK
14/18	SRG1 	 551	 0.3 s	  OK
15/18	snR19 	 568	 0.4 s	  OK
16/18	snR30 	 606	 0.4 s	  OK
17/18	LSR1 	 1175	 1.8 s	  OK
18/18	TLC1 	 1301	 2.1 s	  OK
