In [None]:
# Load predictions file.
# Load fasta file or folder
# Fetch full sequence
# Subset to keep domain

# Main
  # Load the predictions file
  # If a single fasta file was given, load it once up front
  # For each lysin in the predictions file
    # Fetch its sequence from the fasta (whether 1 fasta file or a folder of fastas)
    # For each domain in the lysin
      # Subset the sequence to keep only that domain
      # Save to the output fasta with an appropriate header

In [None]:
import os

import pandas as pd
from Bio import SeqIO

In [None]:
def load_predictions(filename):
    """Read the predictions table and normalise its text columns.

    Args:
        filename: Path (or file-like object) of a CSV file. Its first column
            is used as the index, and it must contain ``linkers``,
            ``disordered`` and ``domains`` columns.

    Returns:
        pandas.DataFrame whose index and whose ``linkers``, ``disordered``
        and ``domains`` columns are strings. Missing cells in those three
        columns are returned as the empty string ``""``.
    """
    df = pd.read_csv(filename, index_col=0)
    df.index = df.index.astype(str)

    for column in ("linkers", "disordered", "domains"):
        # A plain astype(str) turns a missing cell (NaN) into the literal
        # text "nan", which is neither empty nor a parsable "start-end"
        # range. Map missing values to "" explicitly instead.
        df[column] = df[column].map(lambda v: "" if pd.isna(v) else str(v))

    return df

In [None]:
def search_for_fasta(directory, name):
    """Recursively look for a FASTA file called ``name`` under ``directory``.

    The comparison of both the file stem and the extension is
    case-insensitive. Only regular file entries reported by ``os.walk`` are
    considered.

    Args:
        directory: Root folder to walk.
        name: File name without extension (e.g. the protein identifier).

    Returns:
        The path of the first matching ``.fa``/``.faa``/``.fasta`` file, or
        ``None`` (after printing a message) when nothing matches.
    """
    # os.path.splitext keeps the leading dot on the extension, so the
    # accepted extensions must include it as well.
    fasta_exts = {".fa", ".faa", ".fasta"}
    name = name.lower()

    for root, _dirs, files in os.walk(directory, topdown=True):
        for entry in files:
            stem, ext = os.path.splitext(entry)
            if stem.lower() == name and ext.lower() in fasta_exts:
                return os.path.join(root, entry)

    print(f"No fasta file found for {name} with extension .fa, .faa, .fasta")
    return None

In [None]:
def load_fasta(fasta_path, prot):
    """Parse a FASTA file into a ``{header: sequence}`` dictionary.

    Args:
        fasta_path: Path of the FASTA file when ``prot`` is ``""``;
            otherwise the folder that holds one FASTA file per protein.
        prot: ``""`` to read ``fasta_path`` directly, or a protein name, in
            which case ``<prot>.faa``, ``<prot>.fasta`` or ``<prot>.fa``
            inside ``fasta_path`` is read (first one that exists).

    Returns:
        dict mapping each header line (without the leading ``>``) to its
        sequence. Sequences spanning several lines are concatenated and
        blank lines are ignored.

    Raises:
        FileNotFoundError: No per-protein file exists in the folder, or
            ``fasta_path`` itself cannot be opened.
        ValueError: Sequence data appears before the first header line.
    """
    if prot != "":
        candidates = [
            os.path.join(fasta_path, f"{prot}.{ext}")
            for ext in ("faa", "fasta", "fa")
        ]
        existing = [path for path in candidates if os.path.isfile(path)]
        if not existing:
            raise FileNotFoundError(
                f"No fasta file found for {prot} in {fasta_path} "
                "with extension .fa, .faa, .fasta"
            )
        fasta_path = existing[0]

    chunks = dict()
    key = None
    with open(fasta_path) as f:
        for line in f:
            s = line.strip()
            if not s:
                # Blank lines carry no data; indexing s[0] would fail on them.
                continue
            if s[0] == '>':
                key = s[1:]
                chunks[key] = []
            elif key is None:
                raise ValueError(
                    f"Sequence data found before any header in {fasta_path}"
                )
            else:
                # Collect every line so wrapped sequences are kept whole.
                chunks[key].append(s)

    return {header: "".join(parts) for header, parts in chunks.items()}

In [None]:
def get_seq(fasta_seqs, prot):
    """Return the sequence stored for ``prot`` in a parsed FASTA dictionary.

    Args:
        fasta_seqs: dict mapping FASTA header text (without ``>``) to the
            sequence string.
        prot: Protein identifier to look up; it is converted to ``str``.

    Returns:
        The sequence whose header equals ``prot`` or, failing that, the
        first sequence whose header starts with ``prot`` followed by
        whitespace (i.e. the identifier is the first header token).

    Raises:
        KeyError: No header matches ``prot``.
    """
    prot = str(prot)

    if prot in fasta_seqs:
        return fasta_seqs[prot]

    # Headers often carry a free-text description after the identifier,
    # so fall back to comparing only the first whitespace-separated token.
    for header, seq in fasta_seqs.items():
        tokens = header.split(None, 1)
        if tokens and tokens[0] == prot:
            return seq

    raise KeyError(f"No sequence found for {prot}")

In [None]:
def subset_dom_seq(preds, prot, seq, mode="domains"):
    """Cut the predicted regions of one protein out of its sequence.

    Args:
        preds: Table addressed as ``preds.loc[prot, mode]``; the cell holds
            ``;``-separated ``start-end`` ranges.
        prot: Row label of the protein in ``preds``.
        seq: Full sequence of the protein.
        mode: Name of the column to read the ranges from. Defaults to
            ``"domains"``.

    Returns:
        list of sub-sequences, one per range, in the order listed in the
        cell. Ranges are 1-based with an inclusive end. An empty, missing
        (NaN/None) or ``"nan"`` cell yields an empty list.
    """
    cell = preds.loc[prot, mode]

    # A missing value may arrive as None, as a float NaN, or - after a
    # str() conversion of the column - as the literal text "nan".
    if cell is None or (isinstance(cell, float) and cell != cell):
        return []
    cell = str(cell).strip()
    if cell in ("", "nan"):
        return []

    dom_seqs = []
    for dom in cell.split(";"):
        dom = dom.strip()
        if not dom:
            # Tolerate a trailing or doubled ";" separator.
            continue
        start_txt, end_txt = dom.split("-")[:2]
        start_ind = int(start_txt) - 1  # 1-based -> 0-based
        end_ind = int(end_txt)          # inclusive end == exclusive slice end
        dom_seqs.append(seq[start_ind:end_ind])

    return dom_seqs

In [None]:
def save_to_fasta(dom_seqs, prot, output_path):
    """Append the domain sequences of one protein to a FASTA file.

    Each sequence is written as a two-line record: a header of the form
    ``>{prot}_{i}`` (``i`` is the 0-based position in ``dom_seqs``)
    followed by the sequence on its own line.

    Args:
        dom_seqs: Iterable of sequence strings.
        prot: Protein identifier used in the record headers.
        output_path: File to append to; it is created if it does not exist.
    """
    with open(output_path, "a") as f:
        for i, dom in enumerate(dom_seqs):
            # Header and sequence each need their own line terminator,
            # otherwise all records run together on a single line.
            f.write(f">{prot}_{i}\n")
            f.write(f"{dom}\n")

In [None]:
def main(pred_path, fasta_path, output_path="predicted_domain_seqs.faa", mode="domains"):
    """Extract the predicted region sequences of every protein into one FASTA.

    Args:
        pred_path: Predictions file, passed to ``load_predictions``.
        fasta_path: Either a single FASTA file, which is loaded once, or a
            folder from which a FASTA is loaded separately for each protein.
        output_path: FASTA file to write. An existing file is removed first
            so the run starts from an empty output.
        mode: Column of the predictions table to take the ranges from;
            forwarded to ``subset_dom_seq``.

    Raises:
        ValueError: The single FASTA file could not be loaded.
        FileNotFoundError: ``fasta_path`` is neither a file nor a folder.

    Proteins whose sequence lookup raises ``FileNotFoundError`` or
    ``KeyError`` are reported with a printed message and skipped.
    """
    preds = load_predictions(pred_path)

    single_file = os.path.isfile(fasta_path)
    if single_file:
        try:
            fasta_seqs = load_fasta(fasta_path, prot="")
        except Exception as exc:
            # Stop here: continuing would only fail later on the
            # undefined fasta_seqs with a less helpful error.
            raise ValueError(f"Fasta file not formatted correctly: {fasta_path}") from exc
    elif not os.path.isdir(fasta_path):
        raise FileNotFoundError(f"Fasta path is neither a file nor a folder: {fasta_path}")

    if os.path.isfile(output_path):
        print("Output file already exists and will be overwritten.")
        # save_to_fasta appends, so the old file has to be removed up front.
        os.remove(output_path)

    for prot in preds.index:
        # fetch sequence from fasta
        try:
            if single_file:
                seq = get_seq(fasta_seqs, prot)
            else:
                fasta_seq = load_fasta(fasta_path, prot)
                seq = get_seq(fasta_seq, prot)
        except (FileNotFoundError, KeyError) as exc:
            print(f"Skipping {prot}: {exc}")
            continue

        # subset domains from sequence
        dom_seqs = subset_dom_seq(preds, prot, seq, mode=mode)

        # save to output fasta file
        save_to_fasta(dom_seqs, prot, output_path)