## Hybrid de novo protein assembly workflow

The hybrid assembly workflow combines contigs obtained from two assembly methods: greedy overlap assembly and de Bruijn graph (DBG) assembly.

In [1]:
import os
import sys

script_dir = os.getcwd()
sys.path.append(os.path.join(script_dir, "../src"))

# my modules
import dbg
import greedy_method as greedy
import mapping as map
import consensus as cons
import alignment as align
import clustering as clus
import preprocessing as prep
import compute_statistics as comp_stat
import model_peptide_selector as selector

# import libraries
from pathlib import Path

import joblib
import json
import Bio
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
try:
    # Script execution: __file__ is defined, so anchor on this file's location.
    BASE_DIR = Path(__file__).resolve().parents[2]
except NameError:
    # Notebook execution: __file__ is undefined. Walk from the working
    # directory towards the filesystem root and stop at the first folder
    # named "InstaNexus"; if none matches, the loop ends on the root itself.
    _here = Path().resolve()
    for BASE_DIR in (_here, *_here.parents):
        if BASE_DIR.name == "InstaNexus":
            break

# Project sub-folders, all resolved relative to the project root.
JSON_DIR, INPUT_DIR, FASTA_DIR, OUTPUTS_DIR, FIGURES_DIR = (
    BASE_DIR / sub_folder
    for sub_folder in ("json", "inputs", "fasta", "outputs", "figures")
)

In [3]:
def get_sample_metadata(run, chain="", json_path=JSON_DIR / "sample_metadata.json"):
    """Return the metadata entry of ``run`` whose "chain" field equals ``chain``.

    The JSON file is read as a mapping from run name to a list of entries;
    the first entry with a matching "chain" value is returned.

    Raises
    ------
    ValueError
        If ``run`` is not a key of the metadata file, or if none of its
        entries has the requested chain.
    """
    with open(json_path, "r") as handle:
        metadata_by_run = json.load(handle)

    # Guard clause: unknown run.
    if run not in metadata_by_run:
        raise ValueError(f"Run '{run}' not found in metadata.")

    # First entry for the requested chain, or None when there is no match.
    match = next(
        (candidate for candidate in metadata_by_run[run] if candidate["chain"] == chain),
        None,
    )
    if match is None:
        raise ValueError(f"No metadata found for run '{run}' with chain '{chain}'.")

    return match

In [4]:
def get_colors_from_run(cat, is_scaffold=False, json_path=JSON_DIR / "colors.json"):
    """Look up the plot color configured for a category.

    Parameters
    ----------
    cat : str
        Category label; only the part before the first underscore is used,
        lower-cased, as the lookup key.
    is_scaffold : bool
        Select the "scaffold" color when True, the "contig" color otherwise.
    json_path : path-like
        JSON file indexed as ``colors[category][key]``.

    Returns
    -------
    The value stored under ``colors[category][key]``.

    Raises
    ------
    FileNotFoundError
        If ``json_path`` does not exist.
    ValueError
        If the category or the key is missing from the color file.
    """
    if not os.path.exists(json_path):
        raise FileNotFoundError(f"Missing color file: {json_path}")

    with open(json_path, "r") as f:
        colors = json.load(f)

    category = cat.split("_")[0].lower()
    key = "scaffold" if is_scaffold else "contig"

    try:
        return colors[category][key]
    except KeyError as err:
        # Chain explicitly so the traceback shows which lookup failed.
        raise ValueError(
            f"Color not defined for category '{category}' and key '{key}'"
        ) from err

In [5]:
def get_combination_name(
    ass_method,
    conf,
    kmer_size,
    size_threshold,
    min_overlap
):
    """Build the folder/label name that identifies a parameter combination.

    Parameters
    ----------
    ass_method : str
        One of "dbg", "hybrid" or "greedy".
    conf, kmer_size, size_threshold, min_overlap
        Parameter values interpolated into the name. ``kmer_size`` is only
        included for the "dbg" and "hybrid" methods.

    Returns
    -------
    str
        e.g. ``"comb_hybrid_c0.95_ks6_ts0_mo2"``.

    Raises
    ------
    ValueError
        If ``ass_method`` is not a supported method (previously the function
        fell through and returned None).
    """
    if ass_method in ("dbg", "hybrid"):
        return f"comb_{ass_method}_c{conf}_ks{kmer_size}_ts{size_threshold}_mo{min_overlap}"

    if ass_method == "greedy":
        return f"comb_{ass_method}_c{conf}_ts{size_threshold}_mo{min_overlap}"

    raise ValueError(
        f"Unknown assembly method '{ass_method}'; expected 'dbg', 'hybrid' or 'greedy'."
    )

In [6]:
run = "ma1"
chain = "light"

meta = get_sample_metadata(run, chain)

protein = meta["protein"]
chain = meta["chain"]
proteases = meta["proteases"]

print(chain)
print(proteases)

light
['Thermo', 'Papain', 'Chemo', 'Trypsin', 'Elastase', 'ProtK', 'GluC']


In [8]:
ass_method = "hybrid"
kmer_size = 6
conf = 0.95
size_threshold = 0
min_overlap = 2

In [9]:
comb = get_combination_name(
    ass_method,
    conf,
    kmer_size,
    size_threshold,
    min_overlap)

print(comb)

comb_hybrid_c0.95_ks6_ts0_mo2


In [10]:
params = {
    "ass_method": ass_method,
    "conf": conf,
    "size_threshold": size_threshold,
    "min_overlap": min_overlap
}

In [11]:
# Root output folder for this run/chain pair.
folder_outputs = OUTPUTS_DIR / f"{run}{chain}"

prep.create_directory(folder_outputs)

# Reuse the name computed by get_combination_name() instead of rebuilding it
# by hand: the hand-written pattern always contained the k-mer size, so for
# the greedy method it would not match the `comb`-based paths used later on.
combination_folder_out = os.path.join(folder_outputs, comb)

prep.create_subdirectories_outputs(combination_folder_out)

### Data cleaning

In [12]:
#protein_norm = prep.normalize_sequence(protein)

df = pd.read_csv(INPUT_DIR / f"{run}.csv")

df["protease"] = df["experiment_name"].apply(
    lambda name: prep.extract_protease(name, proteases)
)

df.head(10)

Unnamed: 0,experiment_name,scan_number,preds,log_probs,protease
0,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_The...,49,VLLPLSLLR,-11.921587,Thermo
1,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_The...,52,KLLLLKKK,-23.50606,Thermo
2,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_The...,56,,-1.0,Thermo
3,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_The...,57,VLVLSDTNSDSDDDK,-24.000002,Thermo
4,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_The...,72,LPQLLLLGR,-10.374332,Thermo
5,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_The...,75,,-1.0,Thermo
6,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_The...,86,PSPEEDEEEEQK,-14.129684,Thermo
7,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_The...,89,CMDCDDDEDEDSK,-24.80718,Thermo
8,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_The...,110,LLVPLLAPK,-10.222238,Thermo
9,20240628_FJ_Jubba_MP_Easy15_35min_1835_DDA_The...,148,HCCEGGGGGGGGGGGK,-28.656862,Thermo


In [13]:
df = prep.clean_dataframe(df)

In [14]:
df["cleaned_preds"] = df["preds"].apply(prep.remove_modifications)

In [15]:
df[df.duplicated(subset=["cleaned_preds"], keep=False)][["cleaned_preds"]].value_counts()

cleaned_preds
LFPPKPKDT        30
LFPPKPKDTLY      25
DLVMTQSPSS       23
TLSKADYEKHKVY    20
DLVMTQSPSSL      19
                 ..
FGQGTKVELKRTV     2
LVKGFYPSDLLG      2
LVKRVVG           2
FGLDRLLR          2
YVPRGPG           2
Name: count, Length: 1664, dtype: int64

In [16]:
print(df.loc[df["cleaned_preds"].str.len().sort_values(ascending=False).index, "cleaned_preds"])

71525    RGGGGGGGGGGGGGGGGGGGGGGGGGGGGGR
77062    QPPRLLRHLLQQTPTPGLVQKNLLTVMRLLT
31831    MLVMTQSPPEEEEELKPSPGPPTLTNTLSNY
25822    LGSSSSSHSSSSSSSSSPSEDSDSPPEEEDK
3607     NPSSASFSPQSSSLQSSHLLSLHQELHTGEK
                      ...               
75243                              FLLLR
24999                              LYSKL
71                                 PREAK
23391                              YCQRY
23199                               MDCC
Name: cleaned_preds, Length: 71308, dtype: object


In [17]:
df.protease.unique()

array(['Papain', 'Chemo', 'Thermo', 'ProtK', 'Trypsin', 'Elastase',
       'GluC'], dtype=object)

In [18]:
cleaned_psms = df["cleaned_preds"].tolist()

In [19]:
filtered_psms = prep.filter_contaminants(
cleaned_psms, run, FASTA_DIR / "contaminants.fasta"
)

In [20]:
# Keep only PSMs that survived contaminant filtering. Copy the selection so
# the column assignments made in later cells act on an independent frame
# rather than on a slice of the original one.
df = df[df["cleaned_preds"].isin(filtered_psms)].copy()

### Peptide selection

In [None]:
model_path = BASE_DIR / "peptide_selector.pkl"

In [None]:
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code; only load model bundles that come from a trusted source.
bundle = joblib.load(model_path)
# The bundle is indexed like a mapping with "model", "threshold" and
# "features" entries.
model = bundle["model"]
threshold = bundle["threshold"]
feature_cols = bundle["features"]

print(f"Model loaded from {model_path}")

In [None]:
aa_props = selector.load_aa_properties(JSON_DIR / "aa_properties.json")
protease_rules = selector.load_protease_rules(JSON_DIR / "protease_rules.json")

df =selector.build_reference_free_features(df, aa_props, protease_rules)

In [None]:
df.columns

In [None]:
# Replace protease names with integer category codes.
# NOTE(review): the codes are derived from the protease values present in
# this dataframe; presumably they must match the encoding used when the
# model was trained. Confirm before using a run with a different protease set.
df["protease"] = df["protease"].astype("category").cat.codes

In [None]:
df.head(5)

In [None]:
# Feature matrix restricted to the columns listed in the model bundle.
X = df[feature_cols]
# Column 1 of predict_proba is used as the score
# (presumably the probability of the positive class; confirm against the model).
df["model_score"] = model.predict_proba(X)[:, 1]
# 1 when the score reaches the bundled threshold, 0 otherwise.
df["accepted"] = (df["model_score"] >= threshold).astype(int)

# Keep accepted peptides only
accepted = df[df["accepted"] == 1]

In [None]:
accepted_df = accepted.reset_index(drop=True)

accepted_df.head(5)

In [None]:
print(f"Total PSMs: {len(df)}, Accepted: {len(accepted)}, Rejected: {len(df) - len(accepted)}")

In [None]:
def plot_peptide_length_distribution(df, output_dir="figures"):
    """Plot peptide length distribution (Accepted vs Discarded) using counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'seq_length' and 'accepted' (0/1) columns. The frame is
        copied, so the caller's data is not modified.
    output_dir : str or pathlib.Path
        Directory that is created if missing. Saving the figure is currently
        disabled, so the plot is only displayed.
    """
    df = df.copy()
    df["status"] = df["accepted"].map({1: "Accepted", 0: "Discarded"})

    plt.figure(figsize=(8, 5))
    ax = sns.histplot(
        data=df,
        x="seq_length",
        hue="status",
        bins=30,
        kde=False,
        alpha=0.6,
        stat="count",
        palette={"Accepted": "#FF7F0E", "Discarded": "#1F77B4"},
    )
    ax.set_title("Peptide length distribution")
    ax.set_xlabel("Peptide length")
    ax.set_ylabel("Count")
    ax.grid(False)
    ax.tick_params(bottom=True, left=True, top=False, right=False)

    # seaborn draws the hue legend itself and its artists carry no labels, so
    # calling ax.legend() would replace that legend with an empty one.
    # Restyle the existing legend in place instead.
    if ax.get_legend() is not None:
        sns.move_legend(ax, "upper right", title=None, frameon=False)

    plt.tight_layout()
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    # Saving to disk is intentionally left disabled here; only show the figure.
    plt.show()


In [None]:
plot_peptide_length_distribution(df, FIGURES_DIR)

In [None]:
protein_norm = prep.normalize_sequence(protein)

In [None]:
final_psms = accepted_df["cleaned_preds"].tolist()

In [None]:
# number of accepted PSMs (duplicates still included)
print(len(final_psms))

In [None]:
unique_peptides = list(set(final_psms)) # remove duplicates
print(len(unique_peptides))

In [None]:
# Keep only peptides that are not a substring of an already-kept peptide.
# Longest peptides go first so that every shorter peptide is tested against
# all longer candidates. Ties are broken alphabetically because
# `unique_peptides` comes from a set, whose iteration order is not stable
# between interpreter runs; without the tie-break the output order (and
# anything order-sensitive downstream) would change from run to run.
non_redundant = []
sorted_peps = sorted(unique_peptides, key=lambda pep: (-len(pep), pep))

for pep in sorted_peps:
    if not any(pep in other for other in non_redundant):
        non_redundant.append(pep)


In [None]:
print(len(non_redundant))

In [None]:
# remove short peptides
filtered = [p for p in non_redundant if len(p) >= 7]
print(len(filtered))

In [None]:
from Bio import pairwise2
from collections import Counter, defaultdict


def seq_identity(a, b):
    """Compute global sequence identity between two peptides.

    The identity is the ``pairwise2.align.globalxx`` alignment score divided
    by the length of the longer peptide.

    Parameters
    ----------
    a, b : str
        Peptide sequences.

    Returns
    -------
    float
        Identity ratio. Two empty peptides give 1.0; an empty peptide
        compared with a non-empty one gives 0.0.
    """
    if not a or not b:
        # Nothing to align: handle the degenerate cases up front so that the
        # division below can never see a zero length or a missing score.
        return 1.0 if a == b else 0.0

    aln_score = pairwise2.align.globalxx(a, b, one_alignment_only=True, score_only=True)
    return aln_score / max(len(a), len(b))


def cluster_peptides_by_identity_same_length(peptides, threshold=0.9):
    """
    Group equal-length peptides whose sequence identity reaches ``threshold``.

    A peptide joins the first existing cluster (of its own length) that has
    at least one member with ``seq_identity >= threshold``; otherwise it
    starts a new cluster. Each cluster is represented by its most frequent
    peptide in ``peptides``, the earliest member winning ties.

    Returns
    -------
    tuple
        ``(representatives, clusters)``: one representative per cluster, and
        the clusters themselves as lists of peptides.
    """
    occurrences = Counter(peptides)

    # Bucket peptides by length, keeping input order inside each bucket.
    length_buckets = defaultdict(list)
    for peptide in peptides:
        length_buckets[len(peptide)].append(peptide)

    clusters = []
    for bucket in length_buckets.values():
        bucket_clusters = []
        for peptide in bucket:
            # First cluster holding a sufficiently similar member, if any.
            home = next(
                (
                    members
                    for members in bucket_clusters
                    if any(seq_identity(peptide, member) >= threshold for member in members)
                ),
                None,
            )
            if home is None:
                bucket_clusters.append([peptide])
            else:
                home.append(peptide)
        clusters += bucket_clusters

    # max() keeps the first member among equally ranked ones.
    representatives = [
        max(members, key=lambda pep: (occurrences[pep], len(pep)))
        for members in clusters
    ]

    return representatives, clusters


In [None]:
representatives, clusters = cluster_peptides_by_identity_same_length(filtered, threshold=0.9)

print(f"Original peptides: {len(filtered)}")
print(f"Clusters found: {len(clusters)}")
print(f"Non-redundant peptides: {len(representatives)}")


In [None]:
representatives

In [None]:
# Map the representative peptides onto the normalized reference protein.
# NOTE(review): `max_mismatches` and `min_identity` are not assigned in any
# cell visible in this notebook; define them before running this cell.
mapped_psms = map.process_protein_contigs_scaffold(
    representatives, protein_norm, max_mismatches, min_identity
)

In [None]:
mapped_psms[:5]

In [None]:
map.mapping_substitutions(mapped_psms, protein_norm, title= "psms mapped in RF-selected peptides")

In [None]:
def plot_confidence_distribution(df, accepted_mask, output_dir, filename="psm_selected_rf_distribution.svg"):
    """
    Plot and save the distribution of peptide confidence scores using Seaborn.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'conf' column.
    accepted_mask
        Row selector passed to ``df.loc`` to pick the accepted peptides.
    output_dir : str or pathlib.Path
        Directory where the SVG file is written; created if missing.
    filename : str
        Output filename.
    """
    accepted = df.loc[accepted_mask, "conf"]

    plt.figure(figsize=(8, 5))
    sns.histplot(
        accepted,
        bins=50,
        kde=False,
        color="#2E86AB",
        edgecolor="black",
        alpha=0.8,
    )

    plt.xlabel("Confidence score", fontsize=12)
    plt.ylabel("Peptide count", fontsize=12)
    plt.title("Distribution of accepted peptides", fontsize=13, pad=15)
    sns.despine()
    plt.tight_layout()

    # Accept plain strings as well as Path objects, then ensure the
    # directory exists.
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    out_path = output_dir / filename
    plt.savefig(out_path, format="svg", dpi=300)
    # Close the figure so it is not kept open after saving.
    plt.close()

    print(f"Confidence distribution plot saved as: {out_path}")

In [None]:
plot_confidence_distribution(df, df["accepted"] == 1, FIGURES_DIR)

In [None]:
def plot_length_distribution(df, output_dir, filename="length_distribution.svg"):
    """
    Plot peptide length density for accepted vs discarded peptides.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'seq_length' (int) and 'accepted' (0/1) columns.
    output_dir : str or pathlib.Path
        Directory where the SVG file will be saved; created if missing.
    filename : str
        Output filename.

    Returns
    -------
    pathlib.Path
        Path to the saved SVG file.
    """
    # Note: this changes the seaborn theme for subsequent plots as well.
    sns.set_theme(style="white", font_scale=1.1)
    plt.figure(figsize=(7, 5))

    # One density curve per value of 'accepted'; common_norm=False
    # normalizes each curve on its own.
    sns.kdeplot(
        data=df,
        x="seq_length",
        hue="accepted",
        common_norm=False,
        fill=True,
        alpha=0.5,
        palette={0: "#B0BEC5", 1: "#2E86AB"},
        linewidth=1.5,
    )

    plt.xlabel("Peptide Length (AAs)", fontsize=12)
    plt.ylabel("Density", fontsize=12)
    # The two curves are the accepted (1) and discarded (0) groups, not
    # "accepted vs all" as the title previously stated.
    plt.title("Distribution of Peptide Lengths (Accepted vs Discarded)", fontsize=13, pad=12)
    sns.despine()
    plt.grid(False)
    plt.tight_layout()

    # Accept plain strings as well as Path objects.
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    out_path = output_dir / filename
    plt.savefig(out_path, format="svg", dpi=300)
    plt.close()

    print(f"Peptide length distribution saved as: {out_path}")
    return out_path


In [None]:
greedy_psms = representatives.copy()
dbg_psms = representatives.copy()
print(len(greedy_psms), len(dbg_psms))

### Optimised greedy contig approach

In [None]:
greedy_contigs = greedy.assemble_contigs(greedy_psms, min_overlap)

In [None]:
assembled_contigs = list(set(greedy_contigs))

In [None]:
greedy_contigs = sorted(greedy_contigs, key=len, reverse=True)

In [None]:
greedy_contigs

In [None]:
print(greedy_contigs[:10])
print(len(greedy_contigs))

In [None]:
greedy_contigs

In [None]:
protein_norm

In [None]:
mapped_contigs = map.process_protein_contigs_scaffold(
    greedy_contigs, protein_norm, max_mismatches, min_identity
)

In [None]:
mapped_contigs

In [None]:
# This cell plots the greedy contigs, so label the figure accordingly
# (the title was copied from the PSM mapping plot).
map.mapping_substitutions(mapped_contigs, protein_norm, title="contigs mapped in RF-selected peptides")

### Optimised DBG approach

In [None]:
# Use the configured k-mer size: the output folder name embeds `kmer_size`,
# so a hard-coded value here would mislabel the results.
kmers = dbg.get_kmers(dbg_psms, kmer_size=kmer_size)

In [None]:
edges = dbg.get_debruijn_edges_from_kmers(kmers)

In [None]:
dbg_contigs = dbg.assemble_contigs(edges)

In [None]:
dbg_contigs = sorted(dbg_contigs, key=len, reverse=True)

In [None]:
dbg_contigs = list(set(dbg_contigs))

In [None]:
dbg_contigs = [seq for seq in dbg_contigs if len(seq) > size_threshold]

In [None]:
dbg_contigs = sorted(dbg_contigs, key=len, reverse=True)

In [None]:
print(dbg_contigs[:5])
print(len(dbg_contigs))

In [None]:
dbg_contigs

### Scaffolds

In [None]:
total_contigs = greedy_contigs + dbg_contigs
print(len(total_contigs))

In [None]:
total_contigs

In [None]:
# order contigs by length
total_contigs = sorted(total_contigs, key=len, reverse=True)

In [None]:
# remove duplicates in the list; a set is unordered, so re-sort afterwards
# (longest first, alphabetical among equal lengths for a reproducible order)
total_contigs = sorted(set(total_contigs), key=lambda contig: (-len(contig), contig))

In [None]:
len(total_contigs)

In [None]:
# Build scaffolds from the pooled greedy + DBG contigs. The hybrid workflow
# is meant to combine both assemblers, but this step used to receive only
# the greedy contigs while `total_contigs` was never consumed.
assembled_scaffolds = dbg.create_scaffolds(total_contigs, min_overlap)

# Deduplicate, drop scaffolds not longer than the size threshold, and order
# longest first (alphabetical among equal lengths so runs are reproducible).
assembled_scaffolds = sorted(
    (scaffold for scaffold in set(assembled_scaffolds) if len(scaffold) > size_threshold),
    key=lambda scaffold: (-len(scaffold), scaffold),
)

assembled_scaffolds = dbg.merge_sequences(assembled_scaffolds)

# Same clean-up again, applied to the merged sequences.
assembled_scaffolds = sorted(
    (scaffold for scaffold in set(assembled_scaffolds) if len(scaffold) > size_threshold),
    key=lambda scaffold: (-len(scaffold), scaffold),
)

In [None]:
assembled_scaffolds

In [None]:
mapped_scaffolds = map.process_protein_contigs_scaffold(
    assembled_scaffolds, protein_norm, max_mismatches, min_identity
)

map.mapping_substitutions(mapped_scaffolds, protein_norm, title= "scaffolds mapped in RF-selected peptides")

In [None]:
# `import Bio` alone does not load the Bio.Seq / Bio.SeqRecord submodules,
# so import the classes explicitly instead of relying on attribute access.
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# One FASTA record per scaffold, numbered from 1 in the current list order.
records = [
    SeqRecord(Seq(seq), id=f"scaffold_{i}", description=f"length: {len(seq)}")
    for i, seq in enumerate(assembled_scaffolds, start=1)
]

In [None]:
# Import the submodule explicitly: `import Bio` does not load Bio.SeqIO.
from Bio import SeqIO

# Write all scaffold records to a single FASTA file in the scaffolds folder.
SeqIO.write(
    records,
    f"{combination_folder_out}/scaffolds/{ass_method}_scaffold_{conf}_{run}.fasta",
    "fasta",
)

### Clustering

In [None]:
scaffolds_folder_out = OUTPUTS_DIR / f"{run}{chain}/{comb}/scaffolds"
print(f"scaffolds_folder_out: {scaffolds_folder_out}")

In [None]:
clus.cluster_fasta_files(input_folder=scaffolds_folder_out)

In [None]:
cluster_folder_out = os.path.join(scaffolds_folder_out, "cluster")
print(cluster_folder_out)

In [None]:
# Input folder with the clustering results and output folder for the
# per-cluster FASTA files, both inside the scaffolds folder.
cluster_tsv_folder = os.path.join(scaffolds_folder_out, "cluster")
output_base_folder = os.path.join(scaffolds_folder_out, "cluster_fasta")

# Every FASTA file sitting directly in the scaffolds folder is processed.
fasta_names = [
    entry for entry in os.listdir(scaffolds_folder_out) if entry.endswith(".fasta")
]
for fasta_name in fasta_names:
    clus.process_fasta_and_clusters(
        os.path.join(scaffolds_folder_out, fasta_name),
        cluster_tsv_folder,
        output_base_folder,
    )

### Alignment

In [None]:
cluster_fasta_folder = os.path.join(scaffolds_folder_out, "cluster_fasta")
align_folder = os.path.join(scaffolds_folder_out, "align")
prep.create_directory(align_folder)

In [None]:
# Mirror every cluster sub-folder into the alignment folder and align (or
# copy) each FASTA file it contains.
for cluster_name in os.listdir(cluster_fasta_folder):
    source_dir = os.path.join(cluster_fasta_folder, cluster_name)
    if not os.path.isdir(source_dir):
        continue  # only cluster sub-folders are of interest

    target_dir = os.path.join(align_folder, cluster_name)
    os.makedirs(target_dir, exist_ok=True)

    for fasta_name in os.listdir(source_dir):
        if not fasta_name.endswith(".fasta"):
            continue

        stem = os.path.splitext(fasta_name)[0]
        align.align_or_copy_fasta(
            os.path.join(source_dir, fasta_name),
            os.path.join(target_dir, f"{stem}_out.afa"),
        )

print("All alignment tasks completed.")

### Consensus

In [None]:
consensus_folder = os.path.join(scaffolds_folder_out, "consensus")

In [None]:
cons.process_alignment_files(align_folder, consensus_folder)

In [None]:
all_sequences = cons.load_all_consensus_sequences(consensus_folder)

In [None]:
all_sequences