In [29]:
import Bio
from Bio.Align.Applications import ProbconsCommandline
import os
import tempfile
import subprocess
from typing import List

In [17]:
probcons_cline = ProbconsCommandline(input="exp.fa")

stdout, stderr = probcons_cline()
stdout

'>2022H3-MA\nMKAIIA-LSNILCLVFAQKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELV\nQNSSIGKICNSP-HQILDGGNCTLIDALLGDPQCDGFQN-KEWDLFVERSR-ANSSCYPY\nDVPDYASLRSLVASSGTLE---FKNESFNWTGV-KQNGTSSACKRGSSSSFFSRLNWLTS\nLNNIYPAQNVTMPNKEQFDKLYIWGVHHPDTDKNQFSLFAQSSGRITVSTKRSQQAVIPN\nIGSRPRVRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKI-RSGKSSIMRSDAPIG\nKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKQSTLKLATGMRNVPEKQ----TRGI\nFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQISGKLNRLIGKTNEK\nFHQIEKEFSEVEGRVQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKT\nKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNETYDHNVYRDEALNNRFQIKGVELKSGY\nKDWILWISF-AMSCFLLCIALLGFIMWACQKGNIRCNICI\n>1968H3-HK\nMKTIIA-LSYIFCLALGQDLPGNDNSTATLCLGHHAVPNGTLVKTITDDQIEVTNATELV\nQSSSTGKICNNP-HRILDGIDCTLIDALLGDPHCDVFQN-ETWDLFVERSK-AFSNCYPY\nDVPDYASLRSLVASSGTLE---FITEGFTWTGV-TQNGGSNACKRGPGSGFFSRLNWLTK\nSGSTYPVLNVTMPNNDNFDKLYIWGVHHPSTNQEQTSLYVQASGRVTVSTRRSQQTIIPN\nIGSRPWVRGMSSRISIYWTIVKPGDVLVINSNGNLIAPRGYFKM-RTGKSSIMRSDAPID\nTCISECITPNGSIPNDKPFQNVNKITYGACPKYVKQNTLKLATGMRNVPEKQ----TRGL\nFGA

In [36]:
probcons_cline = ProbconsCommandline(input="q1.fa")

stdout, stderr = probcons_cline()
stdout

'>query\nMK-CPQSFAAHG\n>ref\nMKDCPTSFAA--\n'

In [40]:
ref_numbering("MK-CPQSFAAHG", "MKDCPTSFAA--", probcons_path= "/home/william/miniforge3/envs/ab/bin/probcons")

['>query', 'MK-CPQSFAAHG', '>ref', 'MKDCPTSFAA--']


['1', '2', '4', '5', '6', '7', '8', '9', '10', '-', '-']

In [39]:
def ref_numbering(query: str, ref: str, probcons_path: str = "probcons") -> List[str]:
    if not query or not ref:
        raise ValueError("Query and reference sequences cannot be empty")

    # Remove any existing gaps for clean alignment
    query = query.replace("-", "")
    ref = ref.replace("-", "")

    # Create temporary fasta file for alignment
    with tempfile.NamedTemporaryFile(mode="w", suffix=".fasta", delete=False) as f:
        f.write(">query\n{}\n>ref\n{}\n".format(query, ref))
        temp_path = f.name

    try:
        # Run probcons alignment
        result = subprocess.run(
            [probcons_path, temp_path], capture_output=True, text=True, check=True
        )
        aligned = result.stdout.strip()
    finally:
        # Clean up temp file
        os.unlink(temp_path)

    # Parse alignment output
    lines = aligned.split("\n")
    if len(lines) < 4:
        raise ValueError("Invalid alignment output")

    print(lines)
    aligned_query = lines[1]  # Skip header line
    aligned_ref = lines[3]  # Skip header line

    if len(aligned_query) != len(aligned_ref):
        raise ValueError("Alignment lengths do not match")

    # Generate numbering
    ref_pos = 0
    numbering = []

    for q, r in zip(aligned_query, aligned_ref):
        if q == "-":
            ref_pos += 1
            continue  # Skip gaps in query
        if r == "-":
            numbering.append("-")  # Gap in reference
        else:
            ref_pos += 1
            numbering.append(str(ref_pos))

    return numbering
