<a href="https://colab.research.google.com/github/claugomezv/ProLink/blob/uniseq/ProLink.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# <font color='yellow'>***/***</font> ***ProLink*** <font color='yellow'>***/***</font>  

2022-2024 @ *Universidad de Zaragoza*

For more information please visit https://github.com/unizar-flav/ProLink


In [44]:
#@title Install environment
#@markdown Run this cell first and only once! It will install all the dependencies required to run the ProLink package.
%%capture

# Prepares the Colab runtime: system packages, standalone binaries, the ProLink
# sources and two local BLAST databases. All output of this cell is hidden by
# the %%capture magic above.
# NOTE(review): the relative downloads/extractions below (MEGA X .deb, MMseqs2
# tarball) land in the current working directory, while the PATH entries assume
# /content — presumably the cell is expected to run with cwd == /content; confirm.
import os

# System packages: refresh the apt index, fetch the MEGA X command-line .deb
# into the working directory and install it together with ghostscript and BLAST+.
!apt-get update
!wget https://www.megasoftware.net/do_force_download/megax-cc_10.2.6-1_amd64.deb
!apt-get install -y ghostscript ncbi-blast+ ./megax-cc_10.2.6-1_amd64.deb

# MUSCLE v5.1: download the prebuilt binary as /content/muscle, make it
# executable and expose /content on PATH.
# The PATH is appended to on every run, so re-running the cell duplicates entries.
!wget -O /content/muscle https://github.com/rcedgar/muscle/releases/download/v5.1/muscle5.1.linux_intel64
!chmod +x /content/muscle
os.environ['PATH'] += ":/content"

# MMseqs2 (release 15-6f452, AVX2 build): stream the tarball straight into tar,
# which unpacks it in the working directory; then expose its bin directory.
!wget -qO- https://github.com/soedinglab/MMseqs2/releases/download/15-6f452/mmseqs-linux-avx2.tar.gz | tar -xvz
os.environ['PATH'] += ":/content/mmseqs/bin"

# ProLink sources: clone the `uniseq` branch into /content/ProLink.
!git clone --branch uniseq https://github.com/claugomezv/ProLink.git /content/ProLink

# local BLAST databases
# pdbaa and landmark are downloaded from the NCBI FTP site and unpacked into
# /content/blast_db, which BLASTDB then points to.
!mkdir -p /content/blast_db
!wget -qO- https://ftp.ncbi.nlm.nih.gov/blast/db/pdbaa.tar.gz | tar -xvz -C /content/blast_db/
!wget -qO- https://ftp.ncbi.nlm.nih.gov/blast/db/landmark.tar.gz | tar -xvz -C /content/blast_db/
os.environ['BLASTDB'] = '/content/blast_db'
# Names of the databases available locally; the parameters cell checks the
# selected database against this tuple to set `blast_local`.
blast_db_local = ('pdbaa', 'landmark')

import logging
import sys
from datetime import datetime
from google.colab import files

sys.path.append("/content/ProLink")
sys.path.append("/content/ProLink/ProLink")
print("Contenido de /content/ProLink:", os.listdir("/content/ProLink"))
print("Contenido de /content/ProLink/ProLink:", os.listdir("/content/ProLink/ProLink"))

try:
    from ProLink.prolink import pro_link
    print("✅ Importación exitosa de pro_link")
except ImportError as e:
    print("❌ Error al importar pro_link:", e)

!python3 -m ProLink.prolink


In [47]:
#@title Introduce the parameters

def parse_query_proteins(raw):
    """Split a comma-separated string of protein sequence codes into a list.

    Spaces are removed before splitting, and empty entries (produced by
    trailing or doubled commas, or an empty field) are dropped so that no
    blank code is passed to the pipeline.

    Args:
        raw: The text typed in the form field, e.g. ``"ABQ62490.1, P12345"``.

    Returns:
        The list of non-empty codes, in the order given.
    """
    return [code for code in raw.replace(' ', '').split(',') if code]

# Keyword arguments collected by this cell and later passed to pro_link().
parameters = dict()

#@markdown ----QUERY PROTEINS (PROTEIN SEQUENCE CODES, COMMA SEPARATED)----
query_proteins = "ABQ62490.1" #@param {type:"string"}
query_proteins = parse_query_proteins(query_proteins)
if not query_proteins:
    # Fail here, next to the form field, rather than deep inside the pipeline.
    raise ValueError("No query proteins given: enter at least one protein sequence code")

#@markdown ----BLAST PARAMETERS----
hitlist_size = 5000 #@param {type:"integer"}

blast_database = "Reference Proteins (refseq_protein)" #@param ["Non redundant protein sequences (nr)", "Reference Proteins (refseq_protein)", "Model organisms (landmark)", "Protein Data Bank proteins (pdbaa)"]

length_restrict = False #@param {type:"boolean"}

length_margin = 0.5 #@param {type:"number"}

include_low_identity_seqs = True #@param {type:"boolean"}

identity_blast = 0.25  # @param {type:"number"}

pro_blast_ = False #@param {type:"boolean"}

min_low_identity_seqs = 0 # @param {type:"number"}

# The dropdown label has the form "Description (name)"; keep only the database
# name found between the parentheses.
blast_db_name = blast_database.split('(')[1].split(')')[0]

# Store the BLAST settings in the shared dict. `blast_local` is True when the
# chosen database is one of those listed in `blast_db_local`.
parameters.update(
    hitlist_size=hitlist_size,
    blast_database=blast_db_name,
    blast_local=blast_db_name in blast_db_local,
    length_restrict=length_restrict,
    length_margin=length_margin,
    include_low_identity_seqs=include_low_identity_seqs,
    identity_blast=identity_blast,
    pro_blast_=pro_blast_,
    min_low_identity_seqs=min_low_identity_seqs,
)

#@markdown ----CLUSTERING----
cluster_seqs = True #@param {type:"boolean"}

identity_cluster = 0.6 #@param {type:"number"} #Initial minimum sequence identity treshold to group the sequences into clusters.

pro_clustering_ = True #@param {type:"boolean"}

identity_cluster_step = 0.02 #@param {type:"number"} #Step to increase or decrease the minimum sequence identity threshold

min_number_clusters = 50 #@param {type:"integer"}

max_number_clusters = 300 #@param {type:"integer"}

# Store the clustering settings in the shared dict, one key per form field above.
parameters.update(
    cluster_seqs=cluster_seqs,
    identity_cluster=identity_cluster,
    pro_clustering_=pro_clustering_,
    identity_cluster_step=identity_cluster_step,
    min_number_clusters=min_number_clusters,
    max_number_clusters=max_number_clusters,
)

#@markdown ----PFAM DOMAINS----
check_pfam_domains = True #@param {type:"boolean"}

#@markdown ----ALIGNMENT----
align_seqs = True #@param {type:"boolean"}

trim = False #@param {type:"boolean"}

#@markdown ----SEQUENCE LOGO GENERATION----
generate_logo = True #@param {type:"boolean"}

#@markdown ----PHYLOGENETIC TREE GENERATION----
generate_tree = True #@param {type:"boolean"}

tree_type = "NJ" #@param ["NJ", "ML"]

bootstrap_replications = "250" #@param ["100", "250", "500", "1000", "2000", "5000"]

# Store the Pfam, alignment, logo and tree settings in the shared dict.
# NOTE(review): the key 'boostrap_replications' (sic) is kept exactly as it was;
# it looks like a typo of "bootstrap" — confirm which spelling pro_link() expects.
# The bootstrap value is stored as the string chosen in the dropdown.
parameters.update(
    check_pfam_domains=check_pfam_domains,
    align_seqs=align_seqs,
    trim=trim,
    generate_logo=generate_logo,
    generate_tree=generate_tree,
    tree_type=tree_type,
    boostrap_replications=bootstrap_replications,
)

#@markdown For more advanced options, please feel free to edit the ProLink/parameters.yaml file.

In [46]:
#@title Execute the script
#@markdown Running the script (This may take a while)

extra_verbose = False #@param {type:"boolean"}

# Fail fast, before any directory is created or the working directory changes,
# when the pipeline entry point is missing (install cell not run, or its import
# of ProLink failed). Otherwise every attempt leaves an empty outputs folder
# behind and ends in a bare NameError.
if 'pro_link' not in globals():
    raise NameError(
        "name 'pro_link' is not defined: run the 'Install environment' cell first "
        "and make sure the ProLink package was imported successfully"
    )

# outputs directory
# A fresh, timestamped directory under /content becomes the working directory,
# so files written with relative paths (including ProLink.log) end up inside it.
# `outputs_zip` is the archive name used by the download cell.
now = datetime.now().strftime("%Y%m%d-%H.%M.%S")
outputs_dir = f"outputs_{now}"
outputs_zip = f"ProLink_{outputs_dir}.zip"
os.makedirs(f'/content/{outputs_dir}', exist_ok=True)
os.chdir(f'/content/{outputs_dir}')

# re-configure logging to override colab settings
# force=True replaces any handlers already attached to the root logger; messages
# go both to ProLink.log in the outputs directory and to the cell output.
logging.basicConfig(
    level = logging.DEBUG if extra_verbose else logging.INFO,
    format = '%(message)s',
    handlers = [
        logging.FileHandler('ProLink.log', mode='w'),
        logging.StreamHandler()
        ],
    force = True
    )

# main script
pro_link(query_proteins, **parameters)

NameError: name 'pro_link' is not defined

In [None]:
#@title Download outputs
#@markdown Run the cell to download the results as a zip

!zip -r ../$outputs_zip *
files.download(f"../{outputs_zip}")