In [1]:
!pip install tqdm




In [4]:
import os
import subprocess

def combine_fasta_files(input_dir, output_file):
    """
    Combine every ``.fasta`` file found directly in ``input_dir`` into one file.

    Files are concatenated in sorted filename order so repeated runs produce
    the same output. Each file's content is followed by a newline only when it
    does not already end with one, so no blank lines are inserted between
    records.

    Args:
        input_dir: Directory that is scanned (non-recursively) for ``*.fasta``.
        output_file: Path of the combined file; it is overwritten if present.

    Raises:
        OSError: If ``input_dir`` cannot be listed or a file cannot be
            read/written. The directory is listed before the output file is
            opened, so a bad ``input_dir`` does not leave an empty output.
    """
    output_abspath = os.path.abspath(output_file)

    # List first: a missing input_dir should fail before output_file is truncated.
    fasta_names = sorted(
        name for name in os.listdir(input_dir) if name.endswith(".fasta")
    )

    with open(output_file, 'w') as outfile:
        for filename in fasta_names:
            filepath = os.path.join(input_dir, filename)
            # Never feed the (just truncated) output file back into itself.
            if os.path.abspath(filepath) == output_abspath:
                continue
            with open(filepath, 'r') as infile:
                content = infile.read()
            outfile.write(content)
            if content and not content.endswith('\n'):
                outfile.write('\n')
    print(f"Combined all FASTA files into: {output_file}")

def create_symbolic_links(a3m_ffdata, a3m_ffindex, cs219_ffdata, cs219_ffindex):
    """
    Create symbolic links for the .ffdata and .ffindex files.

    ``cs219_ffdata`` is linked to ``a3m_ffdata`` and ``cs219_ffindex`` to
    ``a3m_ffindex``. A link path that already exists (as a file, directory or
    working symlink) is left untouched. A dangling symlink at the link path is
    replaced, because ``os.symlink`` would otherwise fail with
    ``FileExistsError`` on it.

    Args:
        a3m_ffdata: Target path for the ``.ffdata`` link.
        a3m_ffindex: Target path for the ``.ffindex`` link.
        cs219_ffdata: Link path to create for the ``.ffdata`` file.
        cs219_ffindex: Link path to create for the ``.ffindex`` file.

    Raises:
        OSError: If a link cannot be created (e.g. missing parent directory).
    """
    for target, link in ((a3m_ffdata, cs219_ffdata), (a3m_ffindex, cs219_ffindex)):
        if os.path.islink(link) and not os.path.exists(link):
            # os.path.exists() follows symlinks, so a dangling link looks
            # "absent"; remove it explicitly before recreating it.
            os.remove(link)
        elif os.path.lexists(link):
            continue
        os.symlink(target, link)
        print(f"Created symbolic link: {link}")

def run_hhblits(bin_path, protein_file, db_path, output_hhr, output_a3m):
    """
    Run HHblits on the combined FASTA file and capture any errors.

    The command is executed as an argument list (no shell), so every path may
    contain spaces or shell metacharacters without extra quoting.

    Args:
        bin_path: Path of the ``hhblits`` executable.
        protein_file: Query file passed with ``-i``.
        db_path: Database argument passed with ``-d``.
        output_hhr: Result file passed with ``-o``.
        output_a3m: Alignment file passed with ``-oa3m``.

    Returns:
        None. On success the captured stdout is printed; on failure the
        captured stderr (or the OS error) is printed instead of raising.
    """
    import shlex  # local import: only needed to render the command for display

    cmd = [
        bin_path,
        "-i", protein_file,
        "-d", db_path,
        "-o", output_hhr,
        "-oa3m", output_a3m,
        "-cpu", "8",
    ]
    # Shell-quoted rendering so the printed line can be pasted into a terminal.
    printable_cmd = " ".join(shlex.quote(str(part)) for part in cmd)
    print(f"Running command: {printable_cmd}")

    try:
        result = subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        print("HHblits Output:", result.stdout)
    except subprocess.CalledProcessError as e:
        print("Error running HHblits:")
        print(e.stderr)
    except OSError as e:
        # Without a shell, a missing or non-executable binary surfaces here.
        print("Error running HHblits:")
        print(e)

# Define file paths
# NOTE(review): the database and binary paths below are absolute paths on one
# specific machine; adjust them before running this cell anywhere else.
input_dir = "kiba_dataset/proteins"  # Directory with individual FASTA files
combined_fasta_file = "combined_proteins.fasta"  # Output combined FASTA file
# Existing database files that the two cs219 links below will point to.
a3m_ffdata = "/home/saeed/Documents/base paper implementation/datasets/uniclust30_2018_08/uniclust30_2018_08_a3m.ffdata"
a3m_ffindex = "/home/saeed/Documents/base paper implementation/datasets/uniclust30_2018_08/uniclust30_2018_08_a3m.ffindex"
# Link paths to create; they equal db_path + "_cs219.ffdata" / "_cs219.ffindex".
cs219_ffdata = "/home/saeed/Documents/base paper implementation/datasets_cs219_cs219.ffdata"
cs219_ffindex = "/home/saeed/Documents/base paper implementation/datasets_cs219_cs219.ffindex"
bin_path = "/home/saeed/miniconda3/bin/hhblits"
# NOTE(review): the recorded run of this cell ended with
# "Could find neither hhm_db nor a3m_db!" for this value. Presumably HHblits
# treats -d as a filename prefix and no "<db_path>_a3m.*" / "<db_path>_hhm.*"
# files exist next to the cs219 links — TODO confirm and point db_path at the
# prefix of the real database files.
db_path = "/home/saeed/Documents/base paper implementation/datasets_cs219"
# Result files are written relative to the current working directory.
output_hhr = "output.hhr"
output_a3m = "output.a3m"

# Step 1: Combine all FASTA files into one
combine_fasta_files(input_dir, combined_fasta_file)

# Step 2: Create symbolic links for .ffdata and .ffindex files
# NOTE(review): this makes the "cs219" names point at the a3m files; whether
# HHblits accepts a3m data under a cs219 name is not shown here — verify.
create_symbolic_links(a3m_ffdata, a3m_ffindex, cs219_ffdata, cs219_ffindex)

# Step 3: Run HHblits on the combined FASTA file
run_hhblits(bin_path, combined_fasta_file, db_path, output_hhr, output_a3m)


Combined all FASTA files into: combined_proteins.fasta
Running command: /home/saeed/miniconda3/bin/hhblits -i combined_proteins.fasta -d "/home/saeed/Documents/base paper implementation/datasets_cs219" -o output.hhr -oa3m output.a3m -cpu 8
Error running HHblits:
- 14:55:36.507 ERROR: Could find neither hhm_db nor a3m_db!




In [13]:
import os
import subprocess

def run_hhblits(bin_path, protein_file, db_path, output_hhr, output_a3m):
    """
    Run HHblits once and print either its stdout or the reason it failed.

    The command is passed to ``subprocess.run`` as an argument list instead of
    a shell string, so paths containing spaces (such as the project directory
    used in this notebook) reach HHblits as single arguments.

    Args:
        bin_path: Path of the ``hhblits`` executable.
        protein_file: Query file passed with ``-i``.
        db_path: Database argument passed with ``-d``.
        output_hhr: Result file passed with ``-o``.
        output_a3m: Alignment file passed with ``-oa3m``.

    Returns:
        None. Failures are printed, never raised.
    """
    import shlex  # local import: only used to build the human-readable command

    cmd = [
        bin_path,
        "-i", protein_file,
        "-d", db_path,
        "-o", output_hhr,
        "-oa3m", output_a3m,
        "-cpu", "8",
    ]
    # Quote each argument so the echoed command is copy-paste safe.
    printable_cmd = " ".join(shlex.quote(str(part)) for part in cmd)
    print(f"Running command: {printable_cmd}")

    try:
        result = subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        print("HHblits Output:", result.stdout)
    except subprocess.CalledProcessError as e:
        print("Error running HHblits:")
        print(e.stderr)
    except OSError as e:
        # Raised when the executable itself cannot be started.
        print("Error running HHblits:")
        print(e)

# Example usage
bin_path = "/home/saeed/miniconda3/bin/hhblits"
# Read from the current working directory.
protein_file = "combined_proteins.fasta"
# NOTE(review): the recorded run of this cell failed with "could not open file
# '<db_path>_cs219.ffdata'", so HHblits appears to treat -d as a filename
# prefix and append suffixes itself. Confirm this prefix matches the database
# files actually present on disk.
db_path = "/home/saeed/Documents/base paper implementation/datasets/uniclust30_2018_08_a3m_db"  # Corrected path to a3m_db
# Result files are written relative to the current working directory.
output_hhr = "output.hhr"
output_a3m = "output.a3m"

run_hhblits(bin_path, protein_file, db_path, output_hhr, output_a3m)


Running command: /home/saeed/miniconda3/bin/hhblits -i combined_proteins.fasta -d "/home/saeed/Documents/base paper implementation/datasets/uniclust30_2018_08_a3m_db" -o output.hhr -oa3m output.a3m -cpu 8
Error running HHblits:
- 15:57:00.956 ERROR: In /opt/conda/conda-bld/hhsuite_1720673754217/work/src/ffindexdatabase.cpp:11: FFindexDatabase:

- 15:57:00.956 ERROR: 	could not open file '/home/saeed/Documents/base paper implementation/datasets/uniclust30_2018_08_a3m_db_cs219.ffdata'




In [12]:
import os

# Path of the database file whose presence on disk is being checked.
file_path = "/home/saeed/Documents/base paper implementation/datasets/uniclust30_2018_08/a3m_db.ffdata"

# Report the outcome on stdout; nothing is opened or modified.
found = os.path.exists(file_path)
print(f"File exists: {file_path}" if found else f"File does not exist: {file_path}")


File exists: /home/saeed/Documents/base paper implementation/datasets/uniclust30_2018_08/a3m_db.ffdata


In [3]:
#!/usr/bin/env python3

import os
import sys
import subprocess
from Bio import SeqIO
from multiprocessing import Pool

# Paths (Adjust these paths according to your setup)
# NOTE(review): all of these are absolute paths on one specific machine.
# Multi-record FASTA file that main() hands to split_fasta().
input_fasta = "/home/saeed/Documents/base paper implementation/combined_proteins.fasta"
# Directory that receives one "<record id>.fasta" file per input record.
split_sequences_dir = "/home/saeed/Documents/base paper implementation/split_sequences"
# Directory that receives "<name>.a3m" and "<name>.hhr" for every query.
alignments_dir = "/home/saeed/Documents/base paper implementation/alignments"
# Value passed to HHblits with -d.
database = "/media/saeed/88420B84420B766A/UniRef30_2022_02_hhsuite/UniRef30_2022_02"
hhblits_path = "/home/saeed/miniconda3/bin/hhblits"  # If hhblits is not in PATH, provide the full path

# Number of CPU cores to use for HHblits (adjust as needed)
# Passed to every HHblits invocation as "-cpu".
cpu_cores = 4

# Number of parallel processes for HHblits
# Size of the multiprocessing Pool. Each worker launches its own HHblits with
# "-cpu cpu_cores", so up to cpu_cores * parallel_processes threads may be
# requested at once — presumably this should not exceed the machine's core
# count; verify for your hardware.
parallel_processes = 4  # Adjust based on your system's capabilities

def split_fasta(input_file, output_dir):
    """
    Write every record of a multi-FASTA file to its own FASTA file.

    ``output_dir`` is created when missing. Each record is stored as
    ``<record id>.fasta`` inside it, with any ``/`` in the id replaced by
    ``_`` so the id cannot be interpreted as a sub-directory. Records that
    share an id overwrite one another.

    Args:
        input_file: Path of the multi-record FASTA file to read.
        output_dir: Directory that receives the per-record files.
    """
    os.makedirs(output_dir, exist_ok=True)

    with open(input_file, "r") as multi_fasta:
        records = SeqIO.parse(multi_fasta, "fasta")
        for entry in records:
            # Keep the id usable as a plain file name.
            safe_id = "_".join(entry.id.split("/"))
            target_path = os.path.join(output_dir, f"{safe_id}.fasta")
            with open(target_path, "w") as single_fasta:
                SeqIO.write(entry, single_fasta, "fasta")

def run_hhblits(seq_file):
    """
    Search one sequence file with HHblits and report the outcome on stdout.

    Uses the module-level settings ``hhblits_path``, ``database``,
    ``alignments_dir`` and ``cpu_cores``. The alignment and the result file
    are written to ``alignments_dir`` as ``<name>.a3m`` and ``<name>.hhr``,
    where ``<name>`` is the input file name without its extension.

    Args:
        seq_file: Path of the FASTA file to use as the query.

    Returns:
        None. A non-zero HHblits exit status is printed, not raised.
    """
    base_name = os.path.basename(seq_file)
    seq_name, _extension = os.path.splitext(base_name)

    a3m_path = os.path.join(alignments_dir, f"{seq_name}.a3m")
    hhr_path = os.path.join(alignments_dir, f"{seq_name}.hhr")

    # Executable first, then flag/value pairs in the order HHblits receives them.
    argv = [hhblits_path]
    argv += ["-i", seq_file]
    argv += ["-d", database]
    argv += ["-oa3m", a3m_path]
    argv += ["-o", hhr_path]
    argv += ["-n", "3"]
    argv += ["-e", "1e-5"]
    argv += ["-cpu", str(cpu_cores)]

    try:
        subprocess.run(argv, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error running HHblits on {seq_name}: {e}")
    else:
        print(f"HHblits completed for {seq_name}")

def main():
    """
    Split the combined FASTA file and run HHblits on every resulting file.

    Reads the module-level settings ``input_fasta``, ``split_sequences_dir``,
    ``alignments_dir`` and ``parallel_processes``; progress is printed to
    stdout.
    """
    # Step 1: one FASTA file per record.
    print("Splitting the combined FASTA file into individual sequences...")
    split_fasta(input_fasta, split_sequences_dir)
    print("Splitting completed.")

    # Step 2: the output directory must exist before any worker writes to it.
    os.makedirs(alignments_dir, exist_ok=True)

    # Step 3: collect every per-record FASTA file.
    seq_files = []
    for entry in os.listdir(split_sequences_dir):
        if entry.endswith(".fasta"):
            seq_files.append(os.path.join(split_sequences_dir, entry))

    # Step 4: fan the queries out over a pool of worker processes.
    print(f"Running HHblits on {len(seq_files)} sequences...")
    with Pool(processes=parallel_processes) as workers:
        workers.map(run_hhblits, seq_files)

    print("All HHblits jobs completed.")


if __name__ == "__main__":
    main()


Splitting the combined FASTA file into individual sequences...
Splitting completed.
Running HHblits on 229 sequences...


- 16:21:20.155 INFO: Searching 32053680 column state sequences.

- 16:21:20.185 INFO: Searching 32053680 column state sequences.

- 16:21:20.214 INFO: /home/saeed/Documents/base paper implementation/split_sequences/P29323.fasta is in A2M, A3M or FASTA format

- 16:21:20.246 INFO: /home/saeed/Documents/base paper implementation/split_sequences/Q15118.fasta is in A2M, A3M or FASTA format

- 16:21:20.408 INFO: Iteration 1

- 16:21:20.408 INFO: Iteration 1

- 16:21:20.913 INFO: Prefiltering database

- 16:21:21.344 INFO: Prefiltering database



KeyboardInterrupt: 

In [1]:
#!/usr/bin/env python3

import os
import subprocess
from multiprocessing import Pool

# Paths (Adjust these paths according to your setup)
# NOTE(review): all of these are absolute paths on one specific machine.
# Directory scanned by main() for "*.fasta" query files.
proteins_dir = "/home/saeed/Documents/base paper implementation/kiba_dataset/proteins"
# Directory that receives "<name>.a3m" and "<name>.hhr" for every query.
alignments_dir = "/home/saeed/Documents/base paper implementation/kiba_dataset/alignments"
# Value passed to HHblits with -d.
database = "/media/saeed/88420B84420B766A/UniRef30_2022_02_hhsuite/UniRef30_2022_02"
hhblits_path = "/home/saeed/miniconda3/bin/hhblits"  # Provide full path if hhblits is not in PATH

# Number of CPU cores to use for HHblits (adjust as needed)
# Passed to every HHblits invocation as "-cpu".
cpu_cores = 6

# Number of parallel processes for HHblits
# Size of the multiprocessing Pool. Each worker launches its own HHblits with
# "-cpu cpu_cores", so up to cpu_cores * parallel_processes threads may be
# requested at once — presumably this should not exceed the machine's core
# count; verify for your hardware.
parallel_processes = 6 # Adjust based on your system's capabilities

def run_hhblits(seq_file):
    """
    Run a single HHblits search for ``seq_file`` and print how it went.

    The executable, database, output directory and thread count come from the
    module-level names ``hhblits_path``, ``database``, ``alignments_dir`` and
    ``cpu_cores``. Outputs are named after the query file without its
    extension: ``<name>.a3m`` (alignment) and ``<name>.hhr`` (results).

    Args:
        seq_file: Path of the FASTA query file.

    Returns:
        None. A failing HHblits run is printed, not raised.
    """
    seq_name = os.path.splitext(os.path.basename(seq_file))[0]

    def in_alignments(extension):
        """Return the output path in alignments_dir for the given extension."""
        return os.path.join(alignments_dir, f"{seq_name}{extension}")

    # (flag, value) pairs in the order they are given to HHblits.
    options = (
        ("-i", seq_file),
        ("-d", database),
        ("-oa3m", in_alignments(".a3m")),
        ("-o", in_alignments(".hhr")),
        ("-n", "3"),
        ("-e", "1e-5"),
        ("-cpu", str(cpu_cores)),
    )
    hhblits_cmd = [hhblits_path]
    for flag, value in options:
        hhblits_cmd.extend((flag, value))

    try:
        subprocess.run(hhblits_cmd, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error running HHblits on {seq_name}: {e}")
        return
    print(f"HHblits completed for {seq_name}")

def main():
    """
    Run HHblits on every ``.fasta`` file in ``proteins_dir``.

    Reads the module-level settings ``proteins_dir``, ``alignments_dir`` and
    ``parallel_processes``; progress is printed to stdout.
    """
    # Workers write into this directory, so create it up front.
    os.makedirs(alignments_dir, exist_ok=True)

    # Full paths of all query files found directly in proteins_dir.
    fasta_names = filter(lambda name: name.endswith(".fasta"), os.listdir(proteins_dir))
    seq_files = [os.path.join(proteins_dir, name) for name in fasta_names]

    # One HHblits invocation per file, spread over the worker pool.
    print(f"Running HHblits on {len(seq_files)} sequences...")
    with Pool(processes=parallel_processes) as workers:
        workers.map(run_hhblits, seq_files)

    print("All HHblits jobs completed.")


if __name__ == "__main__":
    main()


Running HHblits on 229 sequences...


In [1]:
#!/usr/bin/env python3

import os
import subprocess
from multiprocessing import Pool, Manager
from tqdm import tqdm
import time

# Paths (Adjust these paths according to your setup)
# NOTE(review): all of these are absolute paths on one specific machine.
# Directory scanned by main() for "*.fasta" query files.
proteins_dir = "/home/saeed/Documents/base paper implementation/kiba_dataset/proteins"
# Directory that receives "<name>.a3m" and "<name>.hhr" for every query.
alignments_dir = "/home/saeed/Documents/base paper implementation/kiba_dataset/alignments"
# Value passed to HHblits with -d.
database = "/media/saeed/88420B84420B766A/UniRef30_2022_02_hhsuite/UniRef30_2022_02"
hhblits_path = "/home/saeed/miniconda3/bin/hhblits"  # Provide full path if hhblits is not in PATH

# Number of CPU cores to use for HHblits (adjust as needed)
# Passed to every HHblits invocation as "-cpu".
cpu_cores = 4

# Number of parallel processes for HHblits
# Size of the multiprocessing Pool. Each worker launches its own HHblits with
# "-cpu cpu_cores", so up to cpu_cores * parallel_processes threads may be
# requested at once — presumably this should not exceed the machine's core
# count; verify for your hardware.
parallel_processes = 4 # Adjust based on your system's capabilities

def run_hhblits(args):
    """
    Runs HHblits on a single sequence file and reports the outcome to a queue.

    Uses the module-level settings ``hhblits_path``, ``database``,
    ``alignments_dir`` and ``cpu_cores``.

    Args:
        args: Tuple ``(seq_file, seq_name, progress_queue)``. ``seq_file`` is
            the query path, ``seq_name`` names the ``.a3m`` / ``.hhr`` outputs
            in ``alignments_dir``, and ``progress_queue`` receives exactly one
            item per call: the elapsed seconds on success, or ``-1`` on any
            failure.

    Returns:
        None. Failures (non-zero exit status, or the executable not being
        startable at all) are printed and signalled through the queue instead
        of being raised, so the parent's progress accounting stays complete.
    """
    seq_file, seq_name, progress_queue = args
    output_a3m = os.path.join(alignments_dir, f"{seq_name}.a3m")
    output_hhr = os.path.join(alignments_dir, f"{seq_name}.hhr")

    # Record the start time
    start_time = time.time()

    # Construct the HHblits command
    hhblits_cmd = [
        hhblits_path,
        "-i", seq_file,
        "-d", database,
        "-oa3m", output_a3m,
        "-o", output_hhr,
        "-n", "3",
        "-e", "1e-5",
        "-cpu", str(cpu_cores)
    ]

    # Run HHblits. stdout is never used, so discard it instead of buffering
    # it; stderr is kept so a failure can be explained.
    try:
        subprocess.run(hhblits_cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        # Send a negative time to indicate failure
        progress_queue.put(-1)
        print(f"Error running HHblits on {seq_name}: {e}")
        if e.stderr:
            # Show only the tail of the log; that is where the error lines are.
            log_lines = [
                line for line in e.stderr.decode(errors="replace").splitlines()
                if line.strip()
            ]
            for line in log_lines[-5:]:
                print(f"  {line}")
    except OSError as e:
        # Executable missing or not runnable: still report one queue item so
        # the failure is counted instead of being lost inside the pool.
        progress_queue.put(-1)
        print(f"Error running HHblits on {seq_name}: {e}")
    else:
        # Send the time taken to the progress queue
        progress_queue.put(time.time() - start_time)

def main():
    """
    Run HHblits on every ``.fasta`` file in ``proteins_dir`` with a progress bar.

    Reads the module-level settings ``proteins_dir``, ``alignments_dir`` and
    ``parallel_processes``. Workers report one queue item per finished query
    (elapsed seconds, or a negative number on failure); this function drains
    the queue to advance the bar and to estimate the remaining wall-clock time.
    """
    # Ensure the alignments directory exists
    os.makedirs(alignments_dir, exist_ok=True)

    # Get a list of all .fasta files in the proteins directory
    seq_files = [
        (os.path.join(proteins_dir, f), os.path.splitext(f)[0])
        for f in os.listdir(proteins_dir)
        if f.endswith(".fasta")
    ]

    total_sequences = len(seq_files)
    print(f"Running HHblits on {total_sequences} sequences...")

    # Jobs run concurrently, so remaining wall-clock time is the remaining
    # per-job time divided by the number of workers.
    worker_count = parallel_processes or os.cpu_count() or 1

    # The Manager hosts the queue shared with the workers; the with-block
    # shuts its helper process down when the run is over.
    with Manager() as manager:
        progress_queue = manager.Queue()
        jobs = [
            (seq_file, seq_name, progress_queue)
            for seq_file, seq_name in seq_files
        ]

        # Start the Pool of worker processes
        with Pool(processes=parallel_processes) as pool:
            # Use tqdm to display a progress bar
            with tqdm(total=total_sequences, ncols=80) as pbar:
                result = pool.map_async(run_hhblits, jobs)

                processed_sequences = 0
                succeeded_sequences = 0
                total_time = 0.0

                while not result.ready() or not progress_queue.empty():
                    # Check if there's new progress information
                    while not progress_queue.empty():
                        elapsed_time = progress_queue.get()
                        processed_sequences += 1
                        if elapsed_time >= 0:
                            succeeded_sequences += 1
                            total_time += elapsed_time
                            # Average over successful jobs only; failures
                            # carry no timing information.
                            avg_time_per_seq = total_time / succeeded_sequences
                            remaining_sequences = total_sequences - processed_sequences
                            est_time_remaining = (
                                avg_time_per_seq * remaining_sequences / worker_count
                            )
                            # Update the progress bar description
                            pbar.set_description(f"Avg Time/Seq: {avg_time_per_seq:.2f}s, ETA: {est_time_remaining/60:.1f}m")
                        else:
                            pbar.set_description(f"Sequence {processed_sequences} failed.")
                        pbar.update(1)
                    time.sleep(0.1)  # Avoid busy waiting

                # Collect the result so an exception raised inside a worker is
                # reported instead of being silently dropped.
                try:
                    result.get()
                except Exception as exc:
                    print(f"A worker stopped with an unexpected error: {exc!r}")

    print("All HHblits jobs completed.")

if __name__ == "__main__":
    main()


Running HHblits on 229 sequences...


  0%|                                                   | 0/229 [00:00<?, ?it/s]