## Hybrid de novo protein assembly workflow

The hybrid assembly workflow combines two assembly methods: contigs are first built with the greedy method and are then re-assembled with a de Bruijn graph (DBG).

In [1]:
import os
import sys

script_dir = os.getcwd()
sys.path.append(os.path.join(script_dir, "../src"))

In [21]:
# my modules
import dbg
import greedy_method as greedy
import mapping as map
import consensus as cons
import alignment as align
import clustering as clus
import preprocessing as prep
import compute_statistics as comp_stat
import model_peptide_selector as selector

# import libraries
from pathlib import Path
from Bio import SeqIO

import joblib
import json
import Bio
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
try:
    # Script execution: __file__ is defined, so anchor on this file's location
    # and take the directory three levels above it.
    BASE_DIR = Path(__file__).resolve().parents[2]
except NameError:
    # Notebook execution: __file__ is undefined. Start from the working
    # directory and climb until a folder named "InstaNexus" is reached.
    # If no ancestor carries that name, the walk stops at the filesystem root.
    BASE_DIR = Path().resolve()
    while not (BASE_DIR.name == "InstaNexus" or BASE_DIR == BASE_DIR.parent):
        BASE_DIR = BASE_DIR.parent

# Project sub-folders, all resolved relative to BASE_DIR.
JSON_DIR = BASE_DIR.joinpath("json")
INPUT_DIR = BASE_DIR.joinpath("inputs")
FASTA_DIR = BASE_DIR.joinpath("fasta")
OUTPUTS_DIR = BASE_DIR.joinpath("outputs")
FIGURES_DIR = BASE_DIR.joinpath("figures")

In [4]:
def get_sample_metadata(run, chain="", json_path=JSON_DIR / "sample_metadata.json"):
    """Return the metadata entry for a given run and chain.

    The JSON file is expected to map each run name to a list of entries,
    each of which has a "chain" key.

    Args:
        run: Run name used as the top-level key in the JSON file.
        chain: Chain label to match against each entry's "chain" value.
        json_path: Path to the sample-metadata JSON file.

    Returns:
        The first entry of the run whose "chain" equals ``chain``.

    Raises:
        ValueError: If the run is missing from the file, or no entry of the
            run has the requested chain.
    """
    with open(json_path, "r") as handle:
        metadata_by_run = json.load(handle)

    if run not in metadata_by_run:
        raise ValueError(f"Run '{run}' not found in metadata.")

    # First entry with a matching chain, or None when there is no match.
    match = next(
        (item for item in metadata_by_run[run] if item["chain"] == chain),
        None,
    )
    if match is None:
        raise ValueError(f"No metadata found for run '{run}' with chain '{chain}'.")

    return match

In [5]:
def get_colors_from_run(cat, is_scaffold=False, json_path=JSON_DIR / "colors.json"):
    """Look up the colour configured for a category in the colours JSON file.

    The file is read as a two-level mapping: ``colors[category][key]``,
    where ``key`` is "scaffold" or "contig".

    Args:
        cat: Category label. Only the text before the first underscore,
            lower-cased, is used for the lookup.
        is_scaffold: Select the "scaffold" colour when True, otherwise the
            "contig" colour.
        json_path: Path to the colours JSON file.

    Returns:
        The value stored under ``colors[category][key]``.

    Raises:
        FileNotFoundError: If ``json_path`` does not exist.
        ValueError: If the category or key is missing from the file. The
            original ``KeyError`` is attached as the cause.
    """
    if not os.path.exists(json_path):
        raise FileNotFoundError(f"Missing color file: {json_path}")

    with open(json_path, "r") as f:
        colors = json.load(f)

    category = cat.split("_")[0].lower()
    key = "scaffold" if is_scaffold else "contig"

    try:
        return colors[category][key]
    except KeyError as err:
        # Chain explicitly so the traceback shows which lookup failed as the
        # direct cause, rather than "during handling ... another exception".
        raise ValueError(
            f"Color not defined for category '{category}' and key '{key}'"
        ) from err

In [6]:
def get_combination_name(
    ass_method,
    conf,
    kmer_size,
    size_threshold,
    min_overlap
):
    """Build the name that identifies one parameter combination.

    Args:
        ass_method: Assembly method; one of "dbg", "hybrid" or "greedy".
        conf: Confidence threshold.
        kmer_size: K-mer size. Included in the name only for "dbg" and
            "hybrid"; ignored for "greedy".
        size_threshold: Size threshold.
        min_overlap: Minimum overlap.

    Returns:
        ``comb_<method>_c<conf>_ks<kmer>_ts<size>_mo<overlap>`` for "dbg" and
        "hybrid", or ``comb_greedy_c<conf>_ts<size>_mo<overlap>`` for
        "greedy".

    Raises:
        ValueError: If ``ass_method`` is not one of the supported methods.
    """
    if ass_method in ("dbg", "hybrid"):
        return f"comb_{ass_method}_c{conf}_ks{kmer_size}_ts{size_threshold}_mo{min_overlap}"

    if ass_method == "greedy":
        return f"comb_{ass_method}_c{conf}_ts{size_threshold}_mo{min_overlap}"

    # Fail loudly instead of returning None, which would otherwise end up
    # inside output paths as the literal text "None".
    raise ValueError(
        f"Unknown assembly method '{ass_method}'; expected 'dbg', 'hybrid' or 'greedy'."
    )

In [7]:
run = "ma1"
chain = "light"

meta = get_sample_metadata(run, chain)

protein = meta["protein"]
chain = meta["chain"]
proteases = meta["proteases"]

print(chain)
print(proteases)

light
['Thermo', 'Papain', 'Chemo', 'Trypsin', 'Elastase', 'ProtK', 'GluC']


In [8]:
ass_method = "hybrid"
kmer_size = 6
conf = 0.95
size_threshold = 0
min_overlap = 2

In [9]:
comb = get_combination_name(
    ass_method,
    conf,
    kmer_size,
    size_threshold,
    min_overlap)

print(comb)

comb_hybrid_c0.95_ks6_ts0_mo2


In [10]:
params = {
    "ass_method": ass_method,
    "conf": conf,
    "size_threshold": size_threshold,
    "min_overlap": min_overlap
}

In [11]:
# Per-sample output folder: <OUTPUTS_DIR>/<run><chain>
folder_outputs = OUTPUTS_DIR / f"{run}{chain}"

prep.create_directory(folder_outputs)

# Reuse the name produced by get_combination_name() instead of rebuilding it
# by hand, so this folder always matches the `comb`-based paths used later
# (the hand-built name always contained "_ks<kmer_size>", which the "greedy"
# naming scheme omits).
combination_folder_out = os.path.join(folder_outputs, comb)

prep.create_subdirectories_outputs(combination_folder_out)

### Data cleaning

In [12]:
protein_norm = prep.normalize_sequence(protein)

In [13]:
# Load the peptide predictions for this run from <INPUT_DIR>/<run>.csv.
df = pd.read_csv(INPUT_DIR / f"{run}.csv")

# Add a "protease" column derived from each row's experiment name.
# NOTE(review): presumably extract_protease returns the entry of `proteases`
# that appears in the name — confirm against the preprocessing module.
df["protease"] = df["experiment_name"].apply(
    lambda name: prep.extract_protease(name, proteases)
)

df.head(10)

Unnamed: 0,experiment_name,scan_number,preds,log_probs,protease
0,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_The...,49,VLLPLSLLR,-11.921587,Thermo
1,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_The...,52,KLLLLKKK,-23.50606,Thermo
2,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_The...,56,,-1.0,Thermo
3,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_The...,57,VLVLSDTNSDSDDDK,-24.000002,Thermo
4,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_The...,72,LPQLLLLGR,-10.374332,Thermo
5,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_The...,75,,-1.0,Thermo
6,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_The...,86,PSPEEDEEEEQK,-14.129684,Thermo
7,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_The...,89,CMDCDDDEDEDSK,-24.80718,Thermo
8,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_The...,110,LLVPLLAPK,-10.222238,Thermo
9,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_The...,148,HCCEGGGGGGGGGGGK,-28.656862,Thermo


In [14]:
df.protease.unique()

array(['Thermo', 'Papain', 'Chemo', 'Trypsin', 'Elastase', 'ProtK',
       'GluC'], dtype=object)

In [15]:
df = prep.clean_dataframe(df)

In [16]:
df.head()

Unnamed: 0,experiment_name,scan_number,preds,log_probs,protease,conf
19671,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_Pap...,10553,DYFPEPVT,-0.000135,Papain,0.999865
12570,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_Pap...,2359,LFPPKPKD,-0.000173,Papain,0.999827
12531,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_Pap...,2315,CSVMHEALH,-0.000177,Papain,0.999823
27565,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_Che...,5996,LFPPKPKDTLY,-0.000201,Chemo,0.999799
4364,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_The...,6010,LFPPKPKDTLY,-0.000211,Thermo,0.999789


In [17]:
df["cleaned_preds"] = df["preds"].apply(prep.remove_modifications)

In [19]:
df.head(10)

Unnamed: 0,experiment_name,scan_number,preds,log_probs,protease,conf,cleaned_preds
19671,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_Pap...,10553,DYFPEPVT,-0.000135,Papain,0.999865,DYFPEPVT
12570,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_Pap...,2359,LFPPKPKD,-0.000173,Papain,0.999827,LFPPKPKD
12531,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_Pap...,2315,CSVMHEALH,-0.000177,Papain,0.999823,CSVMHEALH
27565,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_Che...,5996,LFPPKPKDTLY,-0.000201,Chemo,0.999799,LFPPKPKDTLY
4364,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_The...,6010,LFPPKPKDTLY,-0.000211,Thermo,0.999789,LFPPKPKDTLY
60271,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_Pro...,4915,CQASQDLSNY,-0.000217,ProtK,0.999783,CQASQDLSNY
4984,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_The...,6741,LFPPKPKDTLY,-0.000226,Thermo,0.999774,LFPPKPKDTLY
29948,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_Che...,8931,NWYVDGVEVH,-0.00023,Chemo,0.99977,NWYVDGVEVH
28077,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_Che...,6645,LFPPKPKDTLY,-0.000235,Chemo,0.999765,LFPPKPKDTLY
13649,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_Pap...,3595,NKALPAPLEKTLSK,-0.00024,Papain,0.99976,NKALPAPLEKTLSK


In [20]:
df.shape

(71308, 7)

In [22]:
def filter_contaminants_df(df, seq_col, run, contaminants_fasta):
    """Drop rows whose sequence occurs inside a known contaminant sequence.

    A row is treated as a contaminant when the value in ``seq_col`` is a
    substring of at least one sequence read from ``contaminants_fasta``.

    Args:
        df: DataFrame holding the sequences to screen.
        seq_col: Name of the column containing the sequences.
        run: Run name. For the "bsa" run, FASTA records whose description
            contains "Bovine serum albumin precursor" are not used as
            contaminants.
        contaminants_fasta: Path to the contaminants FASTA file.

    Returns:
        A copy of ``df`` restricted to the non-contaminant rows. The number
        of removed and remaining rows is printed.
    """
    contaminant_seqs = [
        str(rec.seq)
        for rec in SeqIO.parse(contaminants_fasta, "fasta")
        if not (run == "bsa" and "Bovine serum albumin precursor" in rec.description)
    ]

    def _is_contaminant(peptide):
        # True as soon as one contaminant sequence contains the peptide.
        for contaminant in contaminant_seqs:
            if peptide in contaminant:
                return True
        return False

    keep = ~df[seq_col].apply(_is_contaminant)

    filtered_df = df[keep].copy()
    removed_count = (~keep).sum()

    print(f"Removed {removed_count} contaminant sequences, {len(filtered_df)} remaining.")
    return filtered_df

In [23]:
df = filter_contaminants_df(df, "cleaned_preds", run, FASTA_DIR / "contaminants.fasta")

Removed 276 contaminant sequences, 71032 remaining.


In [26]:
df.reset_index(drop=True, inplace=True)

In [28]:
df.head()

Unnamed: 0,experiment_name,scan_number,preds,log_probs,protease,conf,cleaned_preds
0,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_Pap...,10553,DYFPEPVT,-0.000135,Papain,0.999865,DYFPEPVT
1,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_Pap...,2359,LFPPKPKD,-0.000173,Papain,0.999827,LFPPKPKD
2,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_Pap...,2315,CSVMHEALH,-0.000177,Papain,0.999823,CSVMHEALH
3,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_Che...,5996,LFPPKPKDTLY,-0.000201,Chemo,0.999799,LFPPKPKDTLY
4,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_The...,6010,LFPPKPKDTLY,-0.000211,Thermo,0.999789,LFPPKPKDTLY


### Comments

A reference-free approach assumes that no reference protein is available.\
In this scenario it is not possible to know when the PSM coverage reaches 100%.\
High PSM coverage is what allows a graph-based approach to work properly.

In [29]:
print(conf)

# Keep only the rows whose "conf" value is strictly greater than the
# configured threshold `conf`.
df = df[df["conf"] > conf]

0.95


In [33]:
filtered_preds = df["cleaned_preds"].tolist()

In [34]:
filtered_preds[:5]

['DYFPEPVT', 'LFPPKPKD', 'CSVMHEALH', 'LFPPKPKDTLY', 'LFPPKPKDTLY']

### Hybrid assembly

In [35]:
greedy_contigs = greedy.assemble_contigs(filtered_preds, min_overlap)

Finding overlaps: 1654it [00:03, 481.01it/s]
Finding overlaps: 1402it [00:02, 550.71it/s]
Finding overlaps: 1226it [00:01, 619.45it/s]
Finding overlaps: 1084it [00:01, 696.13it/s]
Finding overlaps: 967it [00:01, 773.77it/s]
Finding overlaps: 879it [00:01, 863.97it/s]
Finding overlaps: 810it [00:00, 928.87it/s]
Finding overlaps: 748it [00:00, 997.25it/s] 
Finding overlaps: 698it [00:00, 1060.10it/s]
Finding overlaps: 655it [00:00, 1127.22it/s]
Finding overlaps: 620it [00:00, 1165.81it/s]
Finding overlaps: 592it [00:00, 1238.30it/s]
Finding overlaps: 570it [00:00, 1257.30it/s]
Finding overlaps: 556it [00:00, 1278.16it/s]
Finding overlaps: 545it [00:00, 1304.58it/s]
Finding overlaps: 536it [00:00, 1308.69it/s]
Finding overlaps: 530it [00:00, 1339.45it/s]
Finding overlaps: 524it [00:00, 1352.94it/s]
Finding overlaps: 518it [00:00, 1356.83it/s]
Finding overlaps: 514it [00:00, 1353.87it/s]
Finding overlaps: 511it [00:00, 1322.89it/s]
Finding overlaps: 510it [00:00, 1342.90it/s]


In [None]:
assembled_contigs = list(set(greedy_contigs)) # Remove duplicates

In [None]:
# Number of input peptides vs. number of unique greedy contigs.
# `filtered_preds` is the list that was passed to the greedy assembler; the
# previously referenced `final_psms` is not assigned in any visible cell.
print(len(filtered_preds))
print(len(assembled_contigs))

In [None]:
# print the minimum length of the assembled_contigs
print(min([len(c) for c in assembled_contigs]))
print(max([len(c) for c in assembled_contigs]))

In [None]:
assembled_contigs = sorted(assembled_contigs, key=len, reverse=True)

In [None]:
display(assembled_contigs[:5])
print(len(assembled_contigs))

In [None]:
# remove sequences shorter than 7 from a list of strings
assembled_contigs = [c for c in assembled_contigs if len(c) >= 7]
print(len(assembled_contigs))

### Optimised DBG approach

In [None]:
# Use the configured k-mer size rather than a hard-coded 9, so the graph is
# built with the same value that is recorded in the combination name
# ("ks<kmer_size>") and in the output folder.
kmers = dbg.get_kmers(assembled_contigs, kmer_size=kmer_size)

In [None]:
print(len(kmers))

In [None]:
edges = dbg.get_debruijn_edges_from_kmers(kmers)

In [None]:
dbg_contigs = dbg.assemble_contigs(edges)

In [None]:
dbg_contigs = sorted(dbg_contigs, key=len, reverse=True)

In [None]:
dbg_contigs = list(set(dbg_contigs))

In [None]:
dbg_contigs = [seq for seq in dbg_contigs if len(seq) > size_threshold]

In [None]:
dbg_contigs = sorted(dbg_contigs, key=len, reverse=True)

In [None]:
print(dbg_contigs[:5])
print(len(dbg_contigs))

In [None]:
dbg_contigs

### Scaffolds

In [None]:
def _unique_longest_first(sequences, min_length_exclusive):
    """Deduplicate sequences, drop short ones and sort longest first.

    Sequences whose length is not strictly greater than
    ``min_length_exclusive`` are discarded. Ties in length are broken
    alphabetically so the order (and therefore the scaffold numbering
    derived from it) is the same on every run; sorting a set by length
    alone leaves tie order to the per-process string hash seed.

    Assumes the sequences are plain strings — TODO confirm against the
    return types of dbg.create_scaffolds / dbg.merge_sequences.
    """
    kept = {seq for seq in sequences if len(seq) > min_length_exclusive}
    return sorted(kept, key=lambda seq: (-len(seq), seq))


# Build scaffolds from the DBG contigs, then clean the list.
assembled_scaffolds = dbg.create_scaffolds(dbg_contigs, min_overlap)
assembled_scaffolds = _unique_longest_first(assembled_scaffolds, size_threshold)

# Merge the scaffolds and apply the same clean-up to the merged result.
assembled_scaffolds = dbg.merge_sequences(assembled_scaffolds)
assembled_scaffolds = _unique_longest_first(assembled_scaffolds, size_threshold)

In [None]:
protein_norm = prep.normalize_sequence(protein)

In [None]:
protein_norm

In [None]:
assembled_scaffolds

In [None]:
# Map the assembled scaffolds onto the normalized reference protein and plot
# the result.
# NOTE(review): `max_mismatches` and `min_identity` are not assigned in any
# cell visible in this notebook export; they must be defined before this cell
# runs, otherwise it raises NameError.
# NOTE(review): `map` is the project's `mapping` module (imported as `map`),
# which shadows the builtin of the same name.
mapped_scaffolds = map.process_protein_contigs_scaffold(
    assembled_scaffolds, protein_norm, max_mismatches, min_identity
)

map.mapping_substitutions(mapped_scaffolds, protein_norm, title= "scaffolds mapped in RF-selected peptides")

In [None]:
# Import the classes explicitly: `import Bio` alone does not load the
# Bio.Seq / Bio.SeqRecord submodules, so attribute access such as
# `Bio.SeqRecord.SeqRecord` only works as a side effect of another import.
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# One FASTA record per scaffold, numbered from 1 in list order, with the
# scaffold length stored in the description.
records = [
    SeqRecord(Seq(seq), id=f"scaffold_{i}", description=f"length: {len(seq)}")
    for i, seq in enumerate(assembled_scaffolds, start=1)
]

In [None]:
Bio.SeqIO.write(
    records,
    f"{combination_folder_out}/scaffolds/{ass_method}_scaffold_{conf}_{run}.fasta",
    "fasta",
)

### Clustering

In [None]:
scaffolds_folder_out = OUTPUTS_DIR / f"{run}{chain}/{comb}/scaffolds"
print(f"scaffolds_folder_out: {scaffolds_folder_out}")

In [None]:
clus.cluster_fasta_files(input_folder=scaffolds_folder_out)

In [None]:
cluster_folder_out = os.path.join(scaffolds_folder_out, "cluster")
print(cluster_folder_out)

In [None]:
cluster_tsv_folder = os.path.join(scaffolds_folder_out, "cluster")
output_base_folder = os.path.join(scaffolds_folder_out, "cluster_fasta")

# Run the cluster post-processing on every FASTA file that sits directly in
# the scaffolds folder; other entries are ignored.
for fasta_file in os.listdir(scaffolds_folder_out):
    if not fasta_file.endswith(".fasta"):
        continue

    fasta_path = os.path.join(scaffolds_folder_out, fasta_file)
    clus.process_fasta_and_clusters(fasta_path, cluster_tsv_folder, output_base_folder)

### Alignment

In [None]:
cluster_fasta_folder = os.path.join(scaffolds_folder_out, "cluster_fasta")
align_folder = os.path.join(scaffolds_folder_out, "align")
prep.create_directory(align_folder)

In [None]:
# Mirror each cluster sub-folder under `align_folder` and write one
# "<name>_out.afa" file per input FASTA file.
for cluster_folder in os.listdir(cluster_fasta_folder):
    cluster_folder_path = os.path.join(cluster_fasta_folder, cluster_folder)
    if not os.path.isdir(cluster_folder_path):
        # Only directories are treated as clusters.
        continue

    output_cluster_folder = os.path.join(align_folder, cluster_folder)
    os.makedirs(output_cluster_folder, exist_ok=True)

    for fasta_file in os.listdir(cluster_folder_path):
        if not fasta_file.endswith(".fasta"):
            continue

        fasta_file_path = os.path.join(cluster_folder_path, fasta_file)
        base_filename, _ = os.path.splitext(fasta_file)
        output_file = os.path.join(output_cluster_folder, f"{base_filename}_out.afa")

        align.align_or_copy_fasta(fasta_file_path, output_file)

print("All alignment tasks completed.")

### Consensus

In [None]:
consensus_folder = os.path.join(scaffolds_folder_out, "consensus")

In [None]:
cons.process_alignment_files(align_folder, consensus_folder)

In [None]:
all_sequences = cons.load_all_consensus_sequences(consensus_folder)

In [None]:
all_sequences