## Hybrid de novo protein assembly workflow

The hybrid assembly workflow considers contigs obtained from two assembly methods: greedy and de Bruijn graph (DBG).

In [None]:
import os
import sys

# Make the project's modules importable by adding "<cwd>/../src/instanexus"
# to the module search path.
# NOTE(review): the path is relative to the current working directory, so this
# presumably expects the notebook to be started from its own folder — confirm.
script_dir = os.getcwd()
sys.path.append(os.path.join(script_dir, "../src/instanexus"))

In [None]:
# my modules
import dbg
import greedy_method as greedy
import mapping as map
import consensus as cons
import alignment as align
import clustering as clus
import preprocessing as prep
import compute_statistics as comp_stat
#import model_peptide_selector as selector

# import libraries
from pathlib import Path
from Bio import SeqIO

#import joblib
import json
import Bio
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Locate the project root so that every data folder can be addressed relative
# to it, whether this code runs as a script or inside a notebook.
try:
    # Script: __file__ exists; the root is two levels above this file's folder.
    BASE_DIR = Path(__file__).resolve().parents[2]
except NameError:
    # Notebook: __file__ is not defined, so search from the working directory
    # upwards for the folder named "InstaNexus".
    _start_dir = Path().resolve()
    BASE_DIR = next(
        (
            folder
            for folder in (_start_dir, *_start_dir.parents)
            if folder.name == "InstaNexus"
        ),
        None,
    )
    if BASE_DIR is None:
        # Fail here rather than silently ending up at the filesystem root,
        # which would make the folders below point at "/json", "/outputs", ...
        raise FileNotFoundError(
            f"Could not find an 'InstaNexus' folder at or above {_start_dir}."
        ) from None

# Project data folders, all relative to the project root.
JSON_DIR = BASE_DIR / "json"
INPUT_DIR = BASE_DIR / "inputs"
FASTA_DIR = BASE_DIR / "fasta"
OUTPUTS_DIR = BASE_DIR / "outputs"
FIGURES_DIR = BASE_DIR / "figures"

In [None]:
def get_sample_metadata(run, chain="", json_path=JSON_DIR / "sample_metadata.json"):
    """Return the metadata entry of a run for the requested chain.

    The JSON file at ``json_path`` is read as a mapping from run name to a
    list of entries; the first entry whose ``"chain"`` value equals ``chain``
    is returned.

    Raises:
        ValueError: if ``run`` is not a key of the file, or if none of the
            run's entries has the requested chain.
    """
    with open(json_path, "r") as handle:
        metadata_by_run = json.load(handle)

    if run not in metadata_by_run:
        raise ValueError(f"Run '{run}' not found in metadata.")

    # First entry for this run that matches the chain, or None if there is none.
    matching_entry = next(
        (item for item in metadata_by_run[run] if item["chain"] == chain),
        None,
    )
    if matching_entry is None:
        raise ValueError(f"No metadata found for run '{run}' with chain '{chain}'.")

    return matching_entry

In [None]:
def get_colors_from_run(cat, is_scaffold=False, json_path=JSON_DIR / "colors.json"):
    """Look up the plot color configured for a category.

    Args:
        cat: category label; only the lower-cased part before the first
            underscore is used as the lookup key.
        is_scaffold: select the ``"scaffold"`` color instead of the
            ``"contig"`` color.
        json_path: JSON file mapping category -> {"contig": ..., "scaffold": ...}.

    Returns:
        The value stored under ``colors[category][key]``.

    Raises:
        FileNotFoundError: if ``json_path`` does not exist.
        ValueError: if the category or the key is missing from the file.
    """
    if not os.path.exists(json_path):
        raise FileNotFoundError(f"Missing color file: {json_path}")

    with open(json_path, "r") as f:
        colors = json.load(f)

    category = cat.split("_")[0].lower()
    key = "scaffold" if is_scaffold else "contig"

    try:
        return colors[category][key]
    except KeyError as err:
        # Chain explicitly so the traceback shows which lookup failed.
        raise ValueError(
            f"Color not defined for category '{category}' and key '{key}'"
        ) from err

In [None]:
def get_combination_name(
    ass_method,
    conf,
    kmer_size,
    size_threshold,
    min_overlap
):
    """Build the name that identifies one combination of assembly parameters.

    For ``"dbg"`` and ``"hybrid"`` the name includes the k-mer size
    (``ks...``); for ``"greedy"`` it does not.

    Returns:
        A string such as ``"comb_hybrid_c0.95_ks6_ts10_mo3"``.

    Raises:
        ValueError: if ``ass_method`` is not one of ``"dbg"``, ``"hybrid"``
            or ``"greedy"`` (previously this silently returned ``None``).
    """
    if ass_method in ("dbg", "hybrid"):
        return f"comb_{ass_method}_c{conf}_ks{kmer_size}_ts{size_threshold}_mo{min_overlap}"

    if ass_method == "greedy":
        return f"comb_{ass_method}_c{conf}_ts{size_threshold}_mo{min_overlap}"

    raise ValueError(
        f"Unknown assembly method '{ass_method}'; expected 'dbg', 'hybrid' or 'greedy'."
    )

In [None]:
# Select the sample to analyse and look up its metadata entry.
run = "ma1"
chain = "light"

meta = get_sample_metadata(run, chain)

# Fields read from the metadata entry. `chain` is re-read from the entry; it
# equals the requested value because the lookup matches on it.
protein = meta["protein"]
chain = meta["chain"]
proteases = meta["proteases"]

print(chain)
print(proteases)

In [None]:
# Assembly parameters for this run; together they form the combination name.
ass_method = "hybrid"
kmer_size = 6  # appears as "ks" in the combination name
conf = 0.95  # rows are kept only when their "conf" is above this value
size_threshold = 10  # DBG contigs must be longer than this; also passed to scaffolding
min_overlap = 3  # passed to the greedy assembler and to both scaffolding steps

In [None]:
# Name identifying this parameter combination.
comb = get_combination_name(
    ass_method,
    conf,
    kmer_size,
    size_threshold,
    min_overlap)

print(comb)

In [None]:
# Parameters of this run collected in one dictionary.
# NOTE(review): kmer_size is not included although the hybrid combination name
# uses it, and `params` is not read anywhere in the visible code — confirm
# whether the key is missing or the dictionary is still needed.
params = {
    "ass_method": ass_method,
    "conf": conf,
    "size_threshold": size_threshold,
    "min_overlap": min_overlap
}

In [None]:
# Per-sample output folder, named "<run><chain>".
folder_outputs = OUTPUTS_DIR / f"{run}{chain}"

prep.create_directory(folder_outputs)

# Reuse `comb` (built by get_combination_name) instead of repeating the name
# pattern here, so the folder name cannot drift from the combination name.
combination_folder_out = os.path.join(folder_outputs, comb)

prep.create_subdirectories_outputs(combination_folder_out)

### Data cleaning

In [None]:
# Normalized form of the reference protein sequence from the metadata; the
# scaffolds are mapped against it later on.
protein_norm = prep.normalize_sequence(protein)

In [None]:
# Load the table for this run from the inputs folder.
df = pd.read_csv(INPUT_DIR / f"{run}.csv")

# Derive a "protease" column from each row's experiment name, using the
# proteases listed in the sample metadata.
df["protease"] = df["experiment_name"].apply(
    lambda name: prep.extract_protease(name, proteases)
)

# Preview the first ten rows.
df.head(10)

In [None]:
# Distinct protease labels that were assigned.
df.protease.unique()

In [None]:
# Clean the table with the project's preprocessing helper (the exact rules
# live in prep.clean_dataframe).
df = prep.clean_dataframe(df)

In [None]:
# Preview the cleaned table.
df.head()

In [None]:
# Store each prediction, passed through prep.remove_modifications, in a new
# "cleaned_preds" column; the original "preds" column is kept.
df["cleaned_preds"] = df["preds"].apply(prep.remove_modifications)

In [None]:
# Preview the first ten rows, now including "cleaned_preds".
df.head(10)

In [None]:
# (rows, columns) before contaminant filtering.
df.shape

In [None]:
def filter_contaminants_df(df, seq_col, run, contaminants_fasta):
    """Drop the rows whose sequence occurs inside a contaminant sequence.

    Args:
        df: DataFrame holding the sequences.
        seq_col: name of the column with the sequences to test.
        run: run name; for ``"bsa"`` the records whose description contains
            "Bovine serum albumin precursor" are not treated as contaminants.
        contaminants_fasta: FASTA file with the contaminant sequences.

    Returns:
        A copy of the rows of ``df`` whose sequence is not a substring of any
        retained contaminant sequence. The number of removed and remaining
        rows is printed.
    """
    contaminant_seqs = [
        str(entry.seq)
        for entry in SeqIO.parse(contaminants_fasta, "fasta")
        if not (run == "bsa" and "Bovine serum albumin precursor" in entry.description)
    ]

    def _hits_contaminant(peptide):
        # True as soon as the peptide is found inside one contaminant.
        for contaminant in contaminant_seqs:
            if peptide in contaminant:
                return True
        return False

    is_contaminant = df[seq_col].apply(_hits_contaminant)
    mask = ~is_contaminant

    removed_count = (~mask).sum()
    filtered_df = df[mask].copy()

    print(f"Removed {removed_count} contaminant sequences, {len(filtered_df)} remaining.")
    return filtered_df

In [None]:
# Remove rows whose cleaned prediction is contained in a contaminant sequence.
df = filter_contaminants_df(df, "cleaned_preds", run, FASTA_DIR / "contaminants.fasta")

In [None]:
# Renumber the rows in place after filtering; the old index is discarded.
df.reset_index(drop=True, inplace=True)

In [None]:
# Preview the table after contaminant filtering.
df.head()

### Comments

A reference-free approach assumes that no reference protein is available.\
In this scenario it is not possible to know when the PSM coverage reaches 100%.\
High PSM coverage would allow a graph-based approach to work properly.

In [None]:
print(conf)

# Keep only the rows whose confidence is strictly above the threshold.
df = df[df["conf"] > conf]

In [None]:
# Cleaned predictions of the remaining rows, as a plain list for the assemblers.
filtered_preds = df["cleaned_preds"].tolist()

In [None]:
# Show the first five predictions.
filtered_preds[:5]

### Hybrid assembly

#### Greedy assembly

In [None]:
# Greedy assembly of the filtered predictions into contigs; min_overlap is
# passed through to the assembler.
greedy_contigs = greedy.assemble_contigs(filtered_preds, min_overlap)

In [None]:
# Drop duplicate contigs. Going through a set does not preserve the order.
assembled_contigs = list(set(greedy_contigs))

In [None]:
# Display the de-duplicated contigs (notebook cell output).
assembled_contigs

In [None]:
# Longest contigs first. Contigs of equal length are ordered alphabetically:
# the list was built from a set, whose iteration order for strings can differ
# between interpreter sessions, so sorting by length alone would not give a
# reproducible order. Assumes the contigs are strings.
assembled_contigs = sorted(assembled_contigs, key=lambda contig: (-len(contig), contig))

In [None]:
# Show the five longest contigs and the total number of contigs.
display(assembled_contigs[:5])
print(len(assembled_contigs))

In [None]:
# Build scaffolds from the greedy contigs with the project's iterative greedy
# scaffolder; min_overlap and size_threshold are passed through unchanged.
greedy_scaffolds = greedy.scaffold_iterative_greedy(assembled_contigs,
                                                   min_overlap,
                                                   size_threshold)

In [None]:
# Display the greedy scaffolds (notebook cell output).
greedy_scaffolds

#### DBG assembly

In [None]:
# Split the greedy contigs into k-mers. Use the configured kmer_size so the
# k-mer length matches the "ks" value in the combination name (it was
# previously hard-coded to 9 here).
kmers = dbg.get_kmers(assembled_contigs, kmer_size=kmer_size)

In [None]:
# Number of k-mers obtained.
print(len(kmers))

In [None]:
# De Bruijn graph edges derived from the k-mers.
edges = dbg.get_debruijn_edges_from_kmers(kmers)

In [None]:
# Assemble contigs from the graph edges.
dbg_contigs = dbg.assemble_contigs(edges)

In [None]:
# Longest contigs first.
dbg_contigs = sorted(dbg_contigs, key=len, reverse=True)

In [None]:
# Display the sorted DBG contigs (notebook cell output).
dbg_contigs

In [None]:
# Drop duplicate contigs. Going through a set discards the previous ordering.
dbg_contigs = list(set(dbg_contigs))

In [None]:
# Keep only the contigs strictly longer than size_threshold.
dbg_contigs = [seq for seq in dbg_contigs if len(seq) > size_threshold]

In [None]:
# Longest contigs first. Contigs of equal length are ordered alphabetically:
# the list was built from a set, whose iteration order for strings can differ
# between interpreter sessions, so sorting by length alone would not give a
# reproducible order. Assumes the contigs are strings.
dbg_contigs = sorted(dbg_contigs, key=lambda contig: (-len(contig), contig))

In [None]:
# Display the filtered, sorted DBG contigs (notebook cell output).
dbg_contigs

In [None]:
# Build scaffolds from the DBG contigs; min_overlap and size_threshold are
# passed through unchanged.
dbg_scaffolds = dbg.scaffold_iterative(dbg_contigs, min_overlap, size_threshold)

In [None]:
# Hybrid result: DBG scaffolds followed by greedy scaffolds. The two lists are
# concatenated as they are, so a scaffold found by both methods appears twice.
all_scaffolds = dbg_scaffolds + greedy_scaffolds

In [None]:
# Scaffold counts: combined, DBG only, greedy only.
print(len(all_scaffolds))
print(len(dbg_scaffolds))
print(len(greedy_scaffolds))

In [None]:
# Map all scaffolds against the normalized reference protein.
mapped_scaffolds = map.process_protein_contigs_scaffold(
    all_scaffolds, protein_norm, max_mismatches = 0, min_identity = 0.90
)

# NOTE(review): the title mentions "RF-selected peptides", but no such
# selection happens in the visible code (the model_peptide_selector import is
# commented out) — confirm the title is still accurate.
map.mapping_substitutions(mapped_scaffolds, protein_norm, title= "scaffolds mapped in RF-selected peptides")

In [None]:
# Import the record classes explicitly: Bio.Seq and Bio.SeqRecord are only
# reachable as attributes of `Bio` once those submodules have been imported
# somewhere, which a plain `import Bio` does not guarantee.
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# One FASTA record per scaffold, numbered from 1, with its length in the
# description.
records = [
    SeqRecord(Seq(seq), id=f"scaffold_{number}", description=f"length: {len(seq)}")
    for number, seq in enumerate(all_scaffolds, start=1)
]

In [None]:
# Write all scaffold records to "scaffolds/scaffolds.fasta" inside the
# combination output folder.
Bio.SeqIO.write(
    records,
    f"{combination_folder_out}/scaffolds/scaffolds.fasta",
    "fasta",
)

## Postprocessing

In [None]:
# Identifier of this run, passed to the consensus step below.
run_id = f"{ass_method}_{conf}_{run}"

# Folder holding the scaffolds and one subfolder per postprocessing step.
scaffolds_folder_out = Path(f"{combination_folder_out}/scaffolds")
clustering_out = scaffolds_folder_out / "clustering"
alignment_out = scaffolds_folder_out / "alignment"
consensus_out = scaffolds_folder_out / "consensus"

# Create the three step folders (and any missing parents); existing folders
# are left untouched.
for _step_folder in (clustering_out, alignment_out, consensus_out):
    _step_folder.mkdir(parents=True, exist_ok=True)

### Clustering

In [None]:
# Cluster the FASTA files found in the scaffolds folder.
clus.cluster_fasta_files(input_folder=str(scaffolds_folder_out))

fasta_input = scaffolds_folder_out / "scaffolds.fasta"

# NOTE(review): not read anywhere in the visible code — confirm it is needed.
cluster_tsv_folder = clustering_out / run_id

# Process the scaffolds FASTA together with the clustering results.
clus.process_fasta_and_clusters(
    fasta_file=str(fasta_input),
    input_folder=str(scaffolds_folder_out),
)

### Alignment

In [None]:
# Run the alignment step on the scaffolds folder.
align.process_alignment(input_folder=str(scaffolds_folder_out))

### Consensus

In [None]:
# Run the consensus step on the files in the alignment folder, writing to the
# consensus folder under this run's identifier.
cons.process_alignment_files(
        align_folder=str(alignment_out),
        output_folder=str(consensus_out),
        run_id=run_id,
    )