# Standard packages

In [1]:
import os
import pandas as pd
import time

# Specific installs

In [2]:
# Label of the folding method; reused later as the prefix of the results file name.
method_name = "LinearPartition"

In [3]:
# Download the LinearPartition sources into ./LinearPartition (compiled with `make` in a later cell).
!git clone https://github.com/LinearFold/LinearPartition

Cloning into 'LinearPartition'...
remote: Enumerating objects: 270, done.[K
remote: Counting objects: 100% (270/270), done.[K
remote: Compressing objects: 100% (192/192), done.[K
remote: Total 270 (delta 157), reused 174 (delta 74), pack-reused 0[K
Receiving objects: 100% (270/270), 3.39 MiB | 8.26 MiB/s, done.
Resolving deltas: 100% (157/157), done.


In [4]:
# Change path
# Enter the cloned repository. The path is relative, so this only succeeds
# while the current directory is the parent of the clone.
# NOTE(review): re-running this cell in the same kernel will fail because the
# working directory has already been changed.
os.chdir('LinearPartition/')

# Compile the LinearPartition executables with the repository's Makefile.
!make

chmod +x linearpartition draw_bpp_plot draw_heatmap
mkdir -p bin
g++ src/LinearPartition.cpp -std=c++11 -O3 -Dlpv -o bin/linearpartition_v 
g++ src/LinearPartition.cpp -std=c++11 -O3 -o bin/linearpartition_c


# S. cerevisiae (sce) 18 long non-coding RNA dataset
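Data source: https://genie.weizmann.ac.il/pubs/PARS10/data/sce_genes_folded.tab.gz (the cell below loads an uncompressed copy of this table from the sinc-lab/lncRNA-folding GitHub repository).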

In [None]:
import pandas as pd 
# Base URL of the data folder in the sinc-lab/lncRNA-folding repository.
gh_path = "https://raw.githubusercontent.com/sinc-lab/lncRNA-folding/master/data/"

# The table is tab-separated and has no header row, so column names are given
# explicitly; the first column ("Gene ID") becomes the index.
sce = pd.read_csv(
    gh_path + "sce_genes_folded.tab",
    sep="\t",
    header=None,
    names=["Gene ID", "sequence", "PARS-assisted folding"],
    index_col=0,
)

In [6]:
# Gene IDs of the 18 transcripts to fold, listed in the order they are processed.
sce18 = [
    "snR81",
    "snR34",
    "snR43",
    "snR44",
    "snR31",
    "snR10",
    "snR63",
    "snR11",
    "snR82",
    "snR17b",
    "snR17a",
    "snR37",
    "SCR1",
    "SRG1",
    "snR19",
    "snR30",
    "LSR1",
    "TLC1",
]

# Compute structures

In [7]:
def run_folding(fasta_name):
  """Fold the sequence of a one-record FASTA file with LinearPartition.

  The content of `fasta_name` is fed on standard input to the
  `./linearpartition` executable (looked up in the current working
  directory). Its raw standard output is stored in "tmp.dot" and the last
  line of that output is taken as the predicted structure.

  Args:
    fasta_name: path to a FASTA file holding exactly one header line followed
      by exactly one sequence line (blank lines are ignored).

  Returns:
    The name of the file that was written ("clean_tmp.dot"). It contains the
    header, the sequence and the prediction, one per line.

  Raises:
    ValueError: if the FASTA file is not a single header/sequence pair.
    subprocess.CalledProcessError: if linearpartition exits with an error.
    RuntimeError: if linearpartition produced no prediction.
  """
  # Local import so the function does not depend on another cell's imports.
  import subprocess

  # Reading sequence information
  with open(fasta_name, 'r') as fp:
    records = [line.strip() for line in fp if line.strip()]
  if len(records) != 2:
    raise ValueError(
        f'{fasta_name}: expected one header line and one sequence line, '
        f'found {len(records)} non-empty lines')
  name, sequence = records

  #==========================================
  # VIENNA
  command = ['./linearpartition', '-V', '-M']

  # CONTRAFOLD
  #command = ['./linearpartition', '-M']
  #==========================================

  # Feed the requested FASTA file (not a hard-coded one) to the tool and fail
  # loudly when it does, instead of silently keeping an empty prediction.
  with open(fasta_name, 'r') as fasta_in, open('tmp.dot', 'w') as dot_out:
    subprocess.run(command, stdin=fasta_in, stdout=dot_out, check=True)

  # Reading prediction
  with open('tmp.dot', 'r') as fp:
    prediction = fp.read().strip().split('\n')[-1]
  if not prediction:
    raise RuntimeError(f'linearpartition produced no output for {fasta_name}')

  # Building output
  out_file_name = "clean_tmp.dot"

  with open(out_file_name, "w") as out_file:
    out_file.write(f'{name}\n{sequence}\n{prediction}\n')

  return out_file_name

In [10]:
# All predictions are gathered in one file named after the folding method.
out_fasta_name = method_name + "_yeast18"
out_fasta_path = out_fasta_name + ".fasta"

# Start from an empty file so that re-running the cell does not duplicate entries.
if os.path.exists(out_fasta_path):
  os.remove(out_fasta_path)

print("   \t lnc \t len \t time")
for i, lnc in enumerate(sce18):

  start_time = time.time()
  # Single .loc lookup (row label, column label) instead of chained indexing.
  seq = sce.loc[lnc, "sequence"]
  print(f"{i+1}/{len(sce18)}\t{lnc} \t {len(seq)}", end='\t')

  # Write a one-sequence fasta
  with open("tmp.fasta", "w") as ofile:
    ofile.write(f">{lnc}\n{seq}\n")

  dot_file_name = run_folding("tmp.fasta")

  # Concatenate outputs (plain file append; no shell command is built from strings)
  with open(dot_file_name, "r") as dot_file, open(out_fasta_path, "a") as merged:
    merged.write(dot_file.read())

  print(f"{time.time() - start_time: .1f} s")

# Move the results to the parent directory. The name is derived from
# method_name, so it stays correct when the method label changes.
os.replace(out_fasta_path, os.path.join(os.pardir, out_fasta_path))

   	 lnc 	 len 	 time
1/18	snR81 	 201	 0.4 s
2/18	snR34 	 203	 0.4 s
3/18	snR43 	 209	 0.5 s
4/18	snR44 	 211	 0.4 s
5/18	snR31 	 225	 0.3 s
6/18	snR10 	 245	 0.4 s
7/18	snR63 	 255	 0.4 s
8/18	snR11 	 258	 0.4 s
9/18	snR82 	 268	 0.4 s
10/18	snR17b 	 332	 0.7 s
11/18	snR17a 	 333	 0.7 s
12/18	snR37 	 386	 0.9 s
13/18	SCR1 	 522	 1.4 s
14/18	SRG1 	 551	 1.4 s
15/18	snR19 	 568	 1.6 s
16/18	snR30 	 606	 1.7 s
17/18	LSR1 	 1175	 4.4 s
18/18	TLC1 	 1301	 4.6 s
