# Standard packages

In [1]:
import os
import pandas as pd
import time

# Specific installs


In [2]:
# Name of the folding method; used below as the prefix of the combined output file
method_name = "UFold"

In [3]:
# Fetch the UFold source code into ./UFold.
# NOTE: git refuses to clone into an existing non-empty directory, so this
# command reports an error if the cell is re-run without removing ./UFold first.
!git clone https://github.com/uci-cbcl/UFold.git

Cloning into 'UFold'...
remote: Enumerating objects: 197, done.[K
remote: Counting objects: 100% (197/197), done.[K
remote: Compressing objects: 100% (190/190), done.[K
remote: Total 197 (delta 101), reused 6 (delta 0), pack-reused 0[K
Receiving objects: 100% (197/197), 474.20 KiB | 6.32 MiB/s, done.
Resolving deltas: 100% (101/101), done.


In [4]:
# Keep a copy of the data-processing script under an `_original` suffix
# (presumably a backup in case the script is modified later; TODO confirm)
!cp UFold/process_data_newdataset.py UFold/process_data_newdataset_original.py

In [5]:
import os
# Work from inside the cloned repository: the following cells use paths
# relative to it (requirements.txt, models/, ufold_predict.py)
os.chdir("UFold")

In [6]:
# Install UFold's dependencies; requirements.txt is resolved relative to the
# current working directory, which must be the UFold checkout.
# NOTE: as the log below shows, this replaces the preinstalled numpy, pandas and
# torch with older versions (numpy 1.14.3, pandas 0.25.0, torch 1.4.0).
!pip -q install -r requirements.txt

[K     |████████████████████████████████| 753.4 MB 6.8 kB/s 
[K     |████████████████████████████████| 25.2 MB 1.2 MB/s 
[K     |████████████████████████████████| 4.9 MB 39.1 MB/s 
[K     |████████████████████████████████| 10.4 MB 25.4 MB/s 
[K     |████████████████████████████████| 22.3 MB 1.5 MB/s 
[?25h  Building wheel for numpy (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.8.0 requires tf-estimator-nightly==2.8.0.dev2021122109, which is not installed.
xarray 0.18.2 requires numpy>=1.17, but you have numpy 1.14.3 which is incompatible.
xarray 0.18.2 requires pandas>=1.0, but you have pandas 0.25.0 which is incompatible.
torchvision 0.11.1+cu111 requires torch==1.10.0, but you have torch 1.4.0 which is incompatible.
torchtext 0.11.0 requires torch==1.10.0, but you have torch 1.4.0 which is incompatible

In [7]:
# Download pretrained model weights from the official repository
!gdown --id 11KqNc--ojFskZ2s4Se79Ta_rVMs-j-kn --output models/ufold_train.pt

# Patch the prediction script in place:
#  - use device index 0 wherever the script refers to device index 1
#  - replace the checkpoint file name with the weights file downloaded above
!sed -i 's/device(1)/device(0)/g' ufold_predict.py
!sed -i 's/cuda:1/cuda:0/g' ufold_predict.py
!sed -i 's/unet_train_on_merge_alldata_98.pt/ufold_train.pt/g' ufold_predict.py

# Go back to the parent directory (an earlier cell changed into UFold)
os.chdir("../")

Downloading...
From: https://drive.google.com/uc?id=11KqNc--ojFskZ2s4Se79Ta_rVMs-j-kn
To: /content/UFold/models/ufold_train.pt
100% 34.6M/34.6M [00:00<00:00, 70.2MB/s]


# S. cerevisiae (sce) 18 long non-coding RNA dataset
Data source: https://genie.weizmann.ac.il/pubs/PARS10/data/sce_genes_folded.tab.gz

In [8]:
# Tab-separated table without a header row: column names are supplied here
# and the first column (gene ID) becomes the index.
gh_path = "https://raw.githubusercontent.com/sinc-lab/lncRNA-folding/master/data/"
sce = pd.read_csv(
    f"{gh_path}sce_genes_folded.tab",
    sep="\t",
    header=None,
    names=["Gene ID", "sequence", "PARS-assisted folding"],
    index_col=0,
)

In [9]:
# Gene IDs of the 18 sequences to process, in processing order
yeast18lnc = """
    snR81 snR34 snR43 snR44  snR31  snR10
    snR63 snR11 snR82 snR17b snR17a snR37
    SCR1  SRG1  snR19 snR30  LSR1   TLC1
""".split()

# Compute structures

In [28]:
import shutil
import subprocess

def run_folding(fasta_name, len, max_len=600):
  """Predict the secondary structure of a one-sequence FASTA file with UFold.

  The FASTA file is copied to `UFold/data/input.txt`, `ufold_predict.py` is run
  from inside the `UFold` directory, and the resulting
  `UFold/results/input_dot_ct_file.txt` is copied to `<fasta_name>.dot`.

  Args:
    fasta_name: Path of the FASTA file to fold.
    len: Length of the sequence. The name shadows the built-in `len` inside
      this function; it is kept so existing callers keep working.
    max_len: Longest sequence that is sent to UFold (default 600).

  Returns:
    The output file name, `<fasta_name>.dot`. When the sequence is longer than
    `max_len`, or the prediction script exits with an error, the file is
    created empty so that no result from a previous call is left behind.
  """
  out_file_name = f"{fasta_name}.dot"
  # Compute structure
  shutil.copyfile(f"{fasta_name}", "UFold/data/input.txt")
  if len <= max_len:
    # Run inside the checkout via `cwd` instead of a chdir pair, so the
    # notebook's working directory is never left changed after an error.
    result = subprocess.run(["python", "ufold_predict.py"], cwd="UFold")
    if result.returncode == 0:
      shutil.copyfile("UFold/results/input_dot_ct_file.txt", out_file_name)
      return out_file_name
    print(f"UFold prediction failed (exit code {result.returncode})")
  else:
    print(f"Structure cannot be computed (len>{max_len})")

  # No prediction available: create/truncate the output so a stale file from a
  # previous sequence is not mistaken for this sequence's structure.
  open(out_file_name, "w").close()
  return out_file_name

In [29]:
out_fasta_name = f"{method_name}_yeast18"
# Results are appended below, so remove the output of any previous run first
if os.path.exists(f"{out_fasta_name}.fasta"):
  os.remove(f"{out_fasta_name}.fasta")

lnc_ids = yeast18lnc
print("   \t lnc \t len \t time")
for i, lnc in enumerate(lnc_ids):

  start_time = time.time()
  seq = sce.loc[lnc, "sequence"]
  # Progress line without a newline; the elapsed time is printed at the end
  print(f"{i+1}/{len(lnc_ids)}\t{lnc} \t {len(seq)}", end='\t')

  # Write a one-sequence fasta
  with open("tmp.fasta", "w") as ofile:
    ofile.write(f">{lnc}\n{seq}\n")

  dot_file_name = run_folding("tmp.fasta", len(seq))

  # Concatenate outputs
  os.system(f"cat {dot_file_name} >> {out_fasta_name}.fasta")

  elapsed = time.time() - start_time
  print(f"{elapsed: .1f} s")

   	 lnc 	 len 	 time
1/18	snR81 	 201	 4.9 s
2/18	snR34 	 203	 4.8 s
3/18	snR43 	 209	 4.8 s
4/18	snR44 	 211	 4.8 s
5/18	snR31 	 225	 5.0 s
6/18	snR10 	 245	 5.2 s
7/18	snR63 	 255	 5.4 s
8/18	snR11 	 258	 5.4 s
9/18	snR82 	 268	 5.3 s
10/18	snR17b 	 332	 6.1 s
11/18	snR17a 	 333	 6.3 s
12/18	snR37 	 386	 7.1 s
13/18	SCR1 	 522	 9.7 s
14/18	SRG1 	 551	 9.7 s
15/18	snR19 	 568	 11.1 s
16/18	snR30 	 606	Structure cannot be computed (len>600)
 0.1 s
17/18	LSR1 	 1175	Structure cannot be computed (len>600)
 0.1 s
18/18	TLC1 	 1301	Structure cannot be computed (len>600)
 0.1 s
