# Standard packages

In [1]:
import os
import pandas as pd
import time
import shutil

# Specific installs
Installation instructions are documented in https://github.com/jaswindersingh2/SPOT-RNA.

This notebook is set up to run on a GPU, but a GPU is not strictly required to run this method for prediction.

In [2]:
# Name of the prediction method; also used as the prefix of the results FASTA file name
method_name = "SPOT-RNA"

In [None]:
!git clone --quiet https://github.com/jaswindersingh2/SPOT-RNA.git
os.chdir("/content/SPOT-RNA/")
!wget -q 'https://www.dropbox.com/s/dsrcf460nbjqpxa/SPOT-RNA-models.tar.gz'
!wget -q -O SPOT-RNA-models.tar.gz 'https://app.nihaocloud.com/f/fbf3315a91d542c0bdc2/?dl=1'
!tar -xzf SPOT-RNA-models.tar.gz && rm SPOT-RNA-models.tar.gz

# Older releases of some libraries are required, the following are removed to 
# avoid conflicts
!pip uninstall -qy tensorflow numpy kapre xarray scikit-image yellowbrick datascience albumentations imgaug keras-vis arviz  pymc3 pywavelets pyerfa pyarrow jaxlib jax cupy-cuda111 astropy
# And required libraries are installed
!pip install -q tensorflow-gpu==1.14.0
!pip install -q -r requirements.txt
# Library needed for dot-bracket notation
!sudo apt install -qq cpanminus && sudo cpanm Graph

os.chdir("/content/")

# S. cerevisiae (sce) 18 long non-coding RNA dataset
Data source: https://genie.weizmann.ac.il/pubs/PARS10/data/sce_genes_folded.tab.gz

In [4]:
# Base URL of the dataset files
gh_path = "https://raw.githubusercontent.com/sinc-lab/lncRNA-folding/master/data/"

# Tab-separated table without a header row; the first column ("Gene ID")
# becomes the index of the resulting frame
sce = pd.read_csv(
    f"{gh_path}sce_genes_folded.tab",
    sep="\t",
    header=None,
    names=["Gene ID", "sequence", "PARS-assisted folding"],
    index_col=0,
)

In [5]:
# Gene IDs of the 18 sequences to process, in processing order
yeast18lnc = [
    "snR81", "snR34", "snR43",
    "snR44", "snR31", "snR10",
    "snR63", "snR11", "snR82",
    "snR17b", "snR17a", "snR37",
    "SCR1", "SRG1", "snR19",
    "snR30", "LSR1", "TLC1",
]

# Compute structures

In [6]:
def _read_st_file(st_path):
  """Return the (sequence, structure) pair stored in a .st file.

  Lines starting with "#" are comments. The first remaining line is taken as
  the sequence and the second one as the dot-bracket structure; anything after
  that is ignored.

  Raises:
    ValueError: if the file holds fewer than two non-comment lines.
  """
  payload = []
  with open(st_path) as fin:
    for line in fin:
      if line.startswith("#"):
        continue # its a comment
      payload.append(line.strip())
      if len(payload) == 2:
        break
  if len(payload) < 2:
    raise ValueError(f"{st_path}: expected a sequence line and a structure line")
  return payload[0], payload[1]


def run_folding(fasta_name):
  """Predict the structure of a one-sequence fasta file with SPOT-RNA.

  SPOT-RNA is run from "/content/SPOT-RNA/" on the file found at
  "/content/<fasta_name>", and the prediction is written to a three-line file
  (">id", sequence, dot-bracket structure) in "/content/".

  Args:
    fasta_name: name of the input fasta file, relative to "/content/". Its
      first line is used as the sequence ID after removing ">", spaces and the
      newline from both ends.

  Returns:
    The name of the written file ("tmp_out.fasta"), relative to "/content/".

  Raises:
    FileNotFoundError: if SPOT-RNA did not produce "outputs/<id>.st".
    ValueError: if that file lacks the sequence or the structure line.

  The working directory is "/content/" when the function returns or raises.
  """
  out_file_name = "tmp_out.fasta"

  os.chdir("/content/SPOT-RNA/")
  try:
    # Start from an empty output folder, whether or not one is left over from
    # a previous call, so stale predictions can never be picked up.
    shutil.rmtree("outputs", ignore_errors=True)
    os.makedirs("outputs", exist_ok=True)

    # Compute structure
    status = os.system(f"python SPOT-RNA.py  --inputs ../{fasta_name} --outputs outputs --motifs True --plots True --gpu 0")
    with open(f"../{fasta_name}") as fin:
      seq_id = fin.readline().strip("> \n")

    # The exit status alone is not treated as fatal: what matters is whether
    # the .st file is there. The status is reported to help diagnose a failure.
    st_path = f"outputs/{seq_id}.st"
    if not os.path.isfile(st_path):
      raise FileNotFoundError(
          f"SPOT-RNA did not produce {st_path} (exit status {status})")

    # Read .st file and extract dot-bracket structure.
    sequence, struct = _read_st_file(st_path)
  finally:
    # Always go back, so that later relative paths are not silently broken
    # after a failure.
    os.chdir("/content/")

  # write prediction in the output file
  with open(out_file_name, "w") as fout:
    fout.write(f">{seq_id}\n")
    fout.write(f"{sequence}\n")
    fout.write(f"{struct}\n")

  return out_file_name

In [7]:
out_fasta_name = method_name + "_yeast18"
out_fasta_path = out_fasta_name + ".fasta"

# Predictions are appended one by one below, so remove any previous results file
if os.path.exists(out_fasta_path):
  os.remove(out_fasta_path)

print(" \t seq    \t len \t time")
for i, lnc in enumerate(yeast18lnc):

  start_time = time.time()
  seq = sce.loc[lnc, "sequence"]
  # Flush so the progress line is visible while the prediction is running
  print(f"{i+1: 03}/{len(yeast18lnc)} \t {lnc: <6} \t {len(seq)}", end='\t', flush=True)

  # Write a one-sequence fasta
  with open("tmp.fasta", "w") as ofile:
    ofile.write(f">{lnc}\n{seq}\n")

  dot_file_name = run_folding("tmp.fasta")

  # Concatenate outputs (done in Python rather than through a shell `cat`, so
  # that a failure raises instead of being silently ignored)
  with open(dot_file_name) as fin, open(out_fasta_path, "a") as fout:
    fout.write(fin.read())

  print(f"{time.time() - start_time: .1f} s")

 	 seq    	 len 	 time
 01/18 	 snR81  	 201	 39.5 s
 02/18 	 snR34  	 203	 35.5 s
 03/18 	 snR43  	 209	 35.4 s
 04/18 	 snR44  	 211	 35.0 s
 05/18 	 snR31  	 225	 36.1 s
 06/18 	 snR10  	 245	 37.1 s
 07/18 	 snR63  	 255	 37.7 s
 08/18 	 snR11  	 258	 37.6 s
 09/18 	 snR82  	 268	 38.1 s
 10/18 	 snR17b 	 332	 40.4 s
 11/18 	 snR17a 	 333	 40.6 s
 12/18 	 snR37  	 386	 42.9 s
 13/18 	 SCR1   	 522	 49.3 s
 14/18 	 SRG1   	 551	 50.6 s
 15/18 	 snR19  	 568	 51.3 s
 16/18 	 snR30  	 606	 54.1 s
 17/18 	 LSR1   	 1175	 116.1 s
 18/18 	 TLC1   	 1301	 131.7 s
