In [4]:
# Load predictions file.
# Load fasta file or folder
# Fetch full sequence
# Subset to keep domain

# Main
  # Load predictions file
  # Load fasta if 1 fasta file
  # For lysin in file
    # Fetch sequence from fasta (whether 1 fasta file or folder of fastas)
    # For domain in lysin
      # Subset to keep domain
      # Save to output fasta with appropriate header

In [38]:
import os

In [41]:
def load_predictions(filename):
    df = pd.read_csv(filename, index_col= 0)
    df.index = df.index.astype(str)
    df.linkers = df.linkers.astype(str)
    df.disordered = df.disordered.astype(str)
    df.domains = df.domains.astype(str)
    return df

In [69]:
def search_for_fasta(directory, name):
    for root, dirs, files in os.walk(directory, topdown=True):
        for e in files+dirs:
            if os.path.splitext(e)[0] == name:
                if os.path.splitext(e)[1] in [".faa", ".fasta", ".fa"]:
                    return os.path.join(root, e)

    print(f"No fasta file found for {name} with extension .fa, .faa or .fasta")
    return None

In [78]:
def load_fasta(fasta_path, prot):
    if prot != "":
        fasta_path = search_for_fasta(fasta_path, prot)
        if fasta_path is None: return None
    
    seqs = dict()
    with open(fasta_path) as f:
        lines = f.readlines()
        for i in range(0, len(lines)):
            s=lines[i].strip()
            display(s)
            if s == "": pass
            elif s[0] == '>':
                key=s[1:]
            else:
                seqs[key] = s
    return seqs

In [71]:
def get_seq(fasta_seqs, prot):
    return fasta_seqs[prot]

In [72]:
def subset_dom_seq(preds, prot, seq, mode="domains"):
    if preds.loc[prot, "domains"] == "": return []

    dom_seqs = []
    for dom in preds.loc[prot, "domains"].split(";"):
        start_ind = int(dom.split("-")[0])-1
        end_ind = int(dom.split("-")[1])-1

        dom_seq = seq[start_ind:end_ind+1]
        dom_seqs.append(dom_seq)
        
    return dom_seqs

In [73]:
def save_to_fasta(dom_seqs, prot, output_path):
    with open(output_path, "a") as f:
        for i, dom in enumerate(dom_seqs):
            header=f">{prot}_{i}"
            f.write(header); f.write("\n")
            f.write(dom); f.write("\n")

In [74]:
def main(pred_path, fasta_path, output_path="predicted_domain_seqs.faa", mode="domains"):
    preds = load_predictions(pred_path)

    if os.path.isfile(fasta_path):
        try: fasta_seqs = load_fasta(fasta_path, prot="")
        except: print(f"Fasta file not formatted correctly: {fasta_path}")

    if os.path.isfile(output_path):
        print("Output file already exists and will be overwritten.")
        os.remove(output_path)

    for prot in preds.index:
        #fetch sequence from fasta
        if os.path.isfile(fasta_path):
            seq = get_seq(fasta_seqs, prot)
        else:
            fasta_seq = load_fasta(fasta_path, prot)
            if fasta_seq is None: continue
            seq = get_seq(fasta_seq, prot)

        #subset domains from sequence
        dom_seqs = subset_dom_seq(preds, prot, seq)

        #save to output fasta file
        save_to_fasta(dom_seqs, prot, output_path)

In [58]:
main("../Roberto/spaed_predictions_roberto.csv", "../Roberto/lysins/lysins_wDomains.faa")

Output file already exists and will be overwritten.


In [80]:
main("../Roberto/spaed_predictions_roberto.csv", "../Roberto/lysins/test")

Output file already exists and will be overwritten.


'>A0A060QRT9'

'MSVPQSIVNWFVIHRNLLTYSMFGSRNGSDGTADCSGSMSQALKDAGIPIQGLPSTVTLGQQLAKNGFYRVSINQDWDASTGDIVMMSWGADMSQSGGAGGHVGVMMDSVNFISCDYSTQGAVGQAINTYPWNDYYAANKPNYIEVWRYAESAPQTNNQANTAVVPQQKAYYEANDVQFVNGIWQIKCDYLCPIGFNYFQNGVPVTMVNWVDKDGNDLPDGADQEFKAGMFFSFAGDENNITDTGEGGYYGGYYYRRFEFGQFGTVWLSCWNKDDLVNYYQ'

No fasta file found for A0A068YG91 with extension .fa, .faa or .fasta
No fasta file found for A0A097PAT3 with extension .fa, .faa or .fasta
No fasta file found for A0A0A0YV41 with extension .fa, .faa or .fasta
No fasta file found for A0A0B4MYW5 with extension .fa, .faa or .fasta
No fasta file found for A0A0B5A075 with extension .fa, .faa or .fasta
No fasta file found for A0A0M3ULP7 with extension .fa, .faa or .fasta
No fasta file found for A0A126GGF1 with extension .fa, .faa or .fasta
No fasta file found for A0A126GGG2 with extension .fa, .faa or .fasta
No fasta file found for A0A126GGH3 with extension .fa, .faa or .fasta
No fasta file found for A0A126GGK7 with extension .fa, .faa or .fasta
No fasta file found for A0A126GGL2 with extension .fa, .faa or .fasta
No fasta file found for A0A126GGL6 with extension .fa, .faa or .fasta
No fasta file found for A0A141E024 with extension .fa, .faa or .fasta
No fasta file found for A0A1B0RXF2 with extension .fa, .faa or .fasta
No fasta file found 

'>PlySW7'

'MSVQQSIVNWFVNHRGKLTYSMFGSRNGSDGTADCSGSISQALKEAGIGIQGLPSTVTLGQQLANNGFYRVSINQDWDALTGDIVMMSWGADMSQSGGAGGHVGVMMDATYFISCDYSTQGAVGQAINTYPWNDYYAANKPSYIEVWRYSDSATQTNNQANTAVAPQQKAYYEANDVQFVNGIWQIKCDYLCPIGFDWVNMVNWVDKDGNDLPDGADQEFKAGMFFSFAGDENNITDTGEGGYYGGYYFRKFEFGQFGTVWLSCWNKDDLVNYYQ'

''

No fasta file found for PlyVA214 with extension .fa, .faa or .fasta
No fasta file found for Q2I7P1 with extension .fa, .faa or .fasta
No fasta file found for Q6DMS8 with extension .fa, .faa or .fasta
No fasta file found for Q708K0 with extension .fa, .faa or .fasta
No fasta file found for Q9AF60 with extension .fa, .faa or .fasta
No fasta file found for Q9MCJ0 with extension .fa, .faa or .fasta
No fasta file found for Q9T1D5 with extension .fa, .faa or .fasta
No fasta file found for RVLD_1039 with extension .fa, .faa or .fasta
No fasta file found for RVLD_1052 with extension .fa, .faa or .fasta
No fasta file found for RVLD_1082 with extension .fa, .faa or .fasta
No fasta file found for RVLD_1085 with extension .fa, .faa or .fasta
No fasta file found for RVLD_1112 with extension .fa, .faa or .fasta
No fasta file found for RVLD_1139 with extension .fa, .faa or .fasta
No fasta file found for RVLD_1142 with extension .fa, .faa or .fasta
No fasta file found for RVLD_1148 with extension .fa,

In [47]:
import pandas as pd
lysins = pd.read_excel("../Roberto/lysins/lysins_wDomains.xlsx")

In [48]:
with open("../Roberto/lysins/lysins_wDomains.faa", "w") as file:
    for l in lysins.proteinID:
        if len(lysins.loc[lysins.proteinID == l, "seq.aa"]) > 0:
            file.write(f">{l}\n")
            file.write(lysins.loc[lysins.proteinID == l, "seq.aa"].values[0])
            file.write("\n")