## Testing the DBG weighted, DBG X and FUSION assembly modes

In [None]:
import os
import sys

# Put the in-repo source folder on the import path (presumably where the
# `mapping` and `helpers` modules imported further down live - verify).
# The path is relative to the current working directory, so the notebook must
# be started from a folder that sits one level below the repository root.
script_dir = os.getcwd()
_instanexus_src = os.path.abspath(os.path.join(script_dir, "..", "src", "instanexus"))
# Re-running this cell must not keep appending the same entry to sys.path.
if _instanexus_src not in sys.path:
    sys.path.append(_instanexus_src)

import json
from pathlib import Path

In [None]:
import pandas as pd
import mapping as map
import helpers
import numpy as np
from instanexus.assembly import Assembler
import instanexus.preprocessing

In [None]:
# read a pre-cleaned data file
#data = pd.read_csv("../outputs/bsa/comb_dbg_c0.9_ks7_ts12_mo3/cleaned/cleaned_data.csv")

In [None]:
# Load the raw prediction table for the selected input and derive a per-row
# confidence value from the log-probabilities.

import re

file_name = 'bsa'

# The f-string already interpolates file_name; the extra .format() call that
# used to follow it was redundant and would raise if the name ever contained
# "{" or "}".
data = pd.read_csv(f'../inputs/{file_name}.csv')

# Remap log_probs values of exactly -1 to -10
# (presumably -1 is a placeholder score - TODO confirm).
data["log_probs"] = data["log_probs"].replace(-1, -10)

# Rows without a prediction would break the regex substitution below.
data = data.dropna(subset=["preds"])

# Remove every parenthesised group (shortest match) from the predicted strings.
data["preds"] = data["preds"].apply(lambda x: re.sub(r"\(.*?\)", "", x))


# conf = exp(log_probs)
data.loc[:, "conf"] = np.exp(data["log_probs"])

data.head()

In [None]:
# Keep only rows whose confidence is strictly above 0.88, then renumber them.
is_confident = data["conf"] > 0.88
data = data.loc[is_confident].reset_index(drop=True)

# Surviving predictions as a plain Python list.
cleaned_psms = data["preds"].tolist()

run = "bsa"

In [None]:
from pathlib import Path

repo_folder = Path("../")
contaminants_fasta = repo_folder / "fasta/contaminants.fasta"

# Filter the prediction list against the contaminants FASTA file.
filtered_psms = instanexus.preprocessing.filter_contaminants(
    cleaned_psms, run, contaminants_fasta
)

# Keep only the table rows whose prediction is in the filtered list.
in_filtered = data["preds"].isin(filtered_psms)
data = data[in_filtered]

In [None]:
# Preview the first 15 rows of the current table.
data.head(15)

In [None]:
# (rows, columns) of the current table.
data.shape

### DBG weighted

In [None]:
# Assembler input: the non-null entries of the "preds" column as a plain list.
sequences = data["preds"].dropna().tolist()
# Relative path, so it is resolved against the current working directory.
output_folder = "outputs/weighted_dbg_run"

In [None]:
# Assembler configured for the "dbg_weighted" mode.
assembler = Assembler(
    mode="dbg_weighted",
    kmer_size=7,          # presumably the k-mer length used to build the graph - TODO confirm
    size_threshold=0,    # 0 here, whereas the dbgX and fusion cells use 10 - NOTE(review): confirm intent
    min_weight=2,         # filter low-weight edges
    refine_rounds=3,      # optional iterative refinement
)

In [None]:
# Run the "dbg_weighted" assembler on the prediction list. protein_norm=None:
# no reference sequence is handed to the run; the reference is only used in the
# mapping cells further down.
scaffolds_dbg_w = assembler.run(sequences, output_folder=output_folder, protein_norm=None)

In [None]:
# Longest scaffolds first (copy, then stable in-place sort by length).
scaffolds_dbg_w = list(scaffolds_dbg_w)
scaffolds_dbg_w.sort(key=len, reverse=True)

In [None]:
# Keep the first 20 entries - the 20 longest scaffolds, provided the
# length-sorting cell has been run before this one.
scaffolds_dbg_w = scaffolds_dbg_w[:20]

In [None]:
# Display the retained scaffolds.
scaffolds_dbg_w

In [None]:
run = "bsa"

# Fetch the sample metadata for this run and pull out the three fields used below.
meta = helpers.get_sample_metadata(run)
protein, chain, proteases = meta["protein"], meta["chain"], meta["proteases"]

In [None]:
# Normalise the reference protein sequence; this normalised form is what the
# scaffolds are mapped against in the cells below.
protein_norm = instanexus.preprocessing.normalize_sequence(protein)

# Print it for a quick visual check.
print(protein_norm)

In [None]:
# Map the retained weighted-DBG scaffolds onto the normalised reference protein.
mapped_contigs = map.process_protein_contigs_scaffold(
    scaffolds_dbg_w,
    protein_norm,
    max_mismatches=10,
    min_identity=0.8,
)

In [None]:
# Display the mapping result.
mapped_contigs

In [None]:
# Locate the repository root, then derive the standard project folders from it.
try:
    # Script execution: __file__ is defined; the root is taken two levels above
    # the folder that contains this file.
    BASE_DIR = Path(__file__).resolve().parents[2]
except NameError:
    # Notebook execution: __file__ is undefined. Starting at the working
    # directory, pick the nearest folder named "InstaNexus"; if there is none,
    # the search ends at the filesystem root.
    _cwd = Path().resolve()
    BASE_DIR = next(
        (folder for folder in (_cwd, *_cwd.parents) if folder.name == "InstaNexus"),
        Path(_cwd.anchor),
    )

JSON_DIR, INPUT_DIR, FASTA_DIR, OUTPUTS_DIR, FIGURES_DIR = (
    BASE_DIR / sub for sub in ("json", "inputs", "fasta", "outputs", "figures")
)

In [None]:
def get_colors_from_run(cat, is_scaffold=False, json_path=None):
    """Return the colour configured for a run category.

    The category is the part of ``cat`` before the first underscore,
    lower-cased (``"BSA_run1"`` -> ``"bsa"``). The colour file is read as JSON
    and indexed as ``colors[category]["contig" | "scaffold"]``.

    Args:
        cat: Run or category name.
        is_scaffold: If true, return the ``"scaffold"`` entry, otherwise the
            ``"contig"`` entry.
        json_path: Path of the JSON colour file. ``None`` (the default) means
            ``JSON_DIR / "colors.json"``, resolved at call time so that
            re-running the cell that defines ``JSON_DIR`` is picked up.

    Returns:
        The value stored in the colour file for the category and key.

    Raises:
        FileNotFoundError: If the colour file does not exist.
        ValueError: If the category or the key is missing from the file.
    """
    if json_path is None:
        json_path = JSON_DIR / "colors.json"

    try:
        with open(json_path, "r", encoding="utf-8") as f:
            colors = json.load(f)
    except FileNotFoundError as err:
        raise FileNotFoundError(f"Missing color file: {json_path}") from err

    category = cat.split("_")[0].lower()
    key = "scaffold" if is_scaffold else "contig"

    try:
        return colors[category][key]
    except KeyError as err:
        # Chain the KeyError so the traceback shows which lookup failed.
        raise ValueError(
            f"Color not defined for category '{category}' and key '{key}'"
        ) from err

In [None]:
# Plot the weighted-DBG mapping against the reference protein.
# The contig colour is taken from the "bsa" entry, as in the DBG X and FUSION
# plots of this notebook; it used to be looked up under "nanobodies" although
# the run plotted here is BSA.
map.mapping_substitutions(
    mapped_sequences=mapped_contigs,
    prot_seq=protein_norm,
    title=f"{run} DBG Weighted Assembly",
    contig_colors=get_colors_from_run("bsa", is_scaffold=False),
    match_color=get_colors_from_run("bsa", is_scaffold=False),
    output_file=f"{run}_dbg_weighted.svg",
    output_folder=".",
)

### DBG X

In [None]:
# Assembler configured for the "dbgX" mode.
assembler_dbgx = Assembler(
    mode="dbgX",
    kmer_size=7,
    size_threshold=10,     # 10 here, whereas the dbg_weighted cell uses 0
    min_weight=2,         # same value as in the dbg_weighted cell
)

In [None]:
# Give the DBG X run its own output folder. It used to reuse `output_folder`
# ("outputs/weighted_dbg_run"), so its files landed in the weighted-DBG run's
# folder; the FUSION run already has a dedicated folder.
output_folder_dbgx = "outputs/dbgx_run"

scaffolds_dbgx = assembler_dbgx.run(
    sequences=sequences,
    output_folder=output_folder_dbgx,
    protein_norm=None,
)

In [None]:
# Map the DBG X scaffolds onto the normalised reference protein.
mapped_scaffolds_dbgx = map.process_protein_contigs_scaffold(
    scaffolds_dbgx,
    protein_norm,
    max_mismatches=10,
    min_identity=0.8,
)

In [None]:
# Keep the first 20 mapped entries.
# NOTE(review): unlike the weighted-DBG flow, nothing is sorted by length before
# this cut, so "top 20" is simply the first 20 in the order the mapping step
# returned them - confirm that this is intended.
mapped_scaffolds_dbgx = mapped_scaffolds_dbgx[:20]

In [None]:
# Both colour arguments use the "bsa" contig colour, so look it up once.
bsa_contig_color = get_colors_from_run("bsa", is_scaffold=False)

map.mapping_substitutions(
    mapped_sequences=mapped_scaffolds_dbgx,
    prot_seq=protein_norm,
    title=f"{run} DBG eXtended Assembly",
    contig_colors=bsa_contig_color,
    match_color=bsa_contig_color,
    output_file=f"{run}_dbgx.svg",
    output_folder=".",
)

### FUSION

In [None]:
# Assembler configured for the "fusion" mode.
assembler_fusion = Assembler(
    mode="fusion",
    kmer_size=7,
    size_threshold=10,
    min_overlap=3,    # only passed in this mode's cell; not used by the dbg_weighted or dbgX cells
    min_weight=2,
)

In [None]:
# Dedicated output folder for the fusion run (relative to the working directory).
output_folder_fusion = "outputs/fusion_run"

In [None]:
# Run the "fusion" assembler on the same prediction list; protein_norm=None,
# as in the other assembly runs of this notebook.
scaffolds_fusion = assembler_fusion.run(
    sequences=sequences,
    output_folder=output_folder_fusion,
    protein_norm=None
)

In [None]:
# Map the fusion scaffolds onto the normalised reference protein and keep the
# first 20 entries of the result.
mapped_scaffolds_fusion = map.process_protein_contigs_scaffold(
    scaffolds_fusion,
    protein_norm,
    max_mismatches=10,
    min_identity=0.8,
)[:20]

In [None]:
# Display the retained fusion mappings.
mapped_scaffolds_fusion

In [None]:
# Both colour arguments use the "bsa" contig colour, so look it up once.
bsa_contig_color = get_colors_from_run("bsa", is_scaffold=False)

map.mapping_substitutions(
    mapped_sequences=mapped_scaffolds_fusion,
    prot_seq=protein_norm,
    title=f"{run} Fusion Assembly",
    contig_colors=bsa_contig_color,
    match_color=bsa_contig_color,
    output_file=f"{run}_fusion.svg",
    output_folder=".",
)