## De Bruijn assembly workflow

In [1]:
import os
import sys

# Make the project's modules importable: add the "src" folder that sits one
# level above the current working directory to the module search path.
script_dir = os.getcwd()
src_dir = os.path.join(script_dir, "../src")
sys.path.append(src_dir)

In [2]:
# my modules
import dbg
import mapping as map
import consensus as cons
import alignment as align
import clustering as clus
import preprocessing as prep
import compute_statistics as comp_stat

# import libraries
import json
from pathlib import Path

# `import Bio` alone does not load its submodules; import the ones that are
# accessed below as Bio.Seq / Bio.SeqIO / Bio.SeqRecord explicitly.
import Bio
import Bio.Seq
import Bio.SeqIO
import Bio.SeqRecord
import pandas as pd

In [3]:
def find_project_root(start, project_name="InstaNexus"):
    """Return the closest directory named *project_name* at or above *start*.

    Args:
        start: ``pathlib.Path`` to start searching from.
        project_name: Folder name that marks the project root.

    Returns:
        *start* itself or its nearest ancestor whose name is *project_name*.

    Raises:
        FileNotFoundError: If neither *start* nor any ancestor has that name.
            Failing here avoids silently using the filesystem root as the
            project folder.
    """
    for candidate in (start, *start.parents):
        if candidate.name == project_name:
            return candidate
    raise FileNotFoundError(
        f"No '{project_name}' folder found at or above '{start}'."
    )


try:
    # works if you are in a script: __file__ exists
    BASE_DIR = Path(__file__).resolve().parents[2]
except NameError:
    # works if you are in a notebook: __file__ does not exist
    BASE_DIR = None

if BASE_DIR is None:
    # go up from the working directory until the project folder
    BASE_DIR = find_project_root(Path().resolve())

# Project sub-folders used throughout the workflow.
JSON_DIR = BASE_DIR / "json"
INPUT_DIR = BASE_DIR / "inputs"
FASTA_DIR = BASE_DIR / "fasta"
OUTPUTS_DIR = BASE_DIR / "outputs"
FIGURES_DIR = BASE_DIR / "figures"

In [4]:
def get_sample_metadata(run, chain="", json_path=JSON_DIR / "sample_metadata.json"):
    """Return the metadata entry of *run* whose ``"chain"`` equals *chain*.

    Args:
        run: Top-level key to look up in the metadata JSON file.
        chain: Value the entry's ``"chain"`` field has to match.
        json_path: Location of the metadata JSON file.

    Returns:
        The first entry stored under *run* with a matching ``"chain"`` field.

    Raises:
        ValueError: If *run* is not a key of the file, or none of its entries
            has the requested chain.
    """
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(json_path, "r", encoding="utf-8") as f:
        all_meta = json.load(f)

    if run not in all_meta:
        raise ValueError(f"Run '{run}' not found in metadata.")

    for entry in all_meta[run]:
        # An entry without a "chain" field is simply not a match; it must not
        # abort the lookup with a KeyError.
        if "chain" in entry and entry["chain"] == chain:
            return entry

    raise ValueError(f"No metadata found for run '{run}' with chain '{chain}'.")

In [5]:
def get_colors_from_run(cat, is_scaffold=False, json_path=JSON_DIR / "colors.json"):
    """Look up the plotting color configured for a category.

    The category is the part of *cat* before the first underscore, lowercased.

    Args:
        cat: Category label, e.g. ``"bsa"``.
        is_scaffold: Select the ``"scaffold"`` color instead of ``"contig"``.
        json_path: Location of the color JSON file.

    Returns:
        The value stored at ``colors[category]["scaffold" | "contig"]``.

    Raises:
        FileNotFoundError: If *json_path* does not exist.
        ValueError: If the category or the key is missing from the file.
    """
    if not os.path.exists(json_path):
        raise FileNotFoundError(f"Missing color file: {json_path}")

    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(json_path, "r", encoding="utf-8") as f:
        colors = json.load(f)

    category = cat.split("_")[0].lower()
    key = "scaffold" if is_scaffold else "contig"

    try:
        return colors[category][key]
    except KeyError as err:
        # Chain the original KeyError so the traceback shows which lookup failed.
        raise ValueError(
            f"Color not defined for category '{category}' and key '{key}'"
        ) from err

In [6]:
def get_combination_name(
    ass_method,
    conf,
    kmer_size,
    size_threshold,
    min_overlap,
    min_identity,
    max_mismatches,
):
    """Build the label that identifies one parameter combination.

    The label has the form
    ``comb_<method>_c<conf>[_ks<kmer>]_ts<size>_mo<overlap>_mi<identity>_mm<mismatches>``;
    the ``_ks`` part is only present when *ass_method* is ``"dbg"``.
    """
    # Only the "dbg" method carries the k-mer size in its label.
    kmer_part = f"_ks{kmer_size}" if ass_method == "dbg" else ""
    return (
        f"comb_{ass_method}_c{conf}{kmer_part}"
        f"_ts{size_threshold}_mo{min_overlap}_mi{min_identity}_mm{max_mismatches}"
    )

In [7]:
# Sample to process; its metadata entry is the one with an empty chain label.
run = "bsa"

meta = get_sample_metadata(run, chain="")

# Pull the fields the rest of the workflow uses out of the metadata entry.
protein, chain, proteases = meta["protein"], meta["chain"], meta["proteases"]

In [None]:
# antibodies,ma3heavy,dbg,comb_dbg_c0.9_ks7_ts0_mo4_mi0.8_mm12,39,0.968,72,0.251,30,0.144,0.939,0.907,0.619
# nb6 comb_dbg_c0.86_ks6_ts0_mo4_mi0.9_mm12
# bd17 comb_dbg_c0.88_ks7_ts10_mo4_mi0.9_mm12
# bd15 comb_dbg_c0.92_ks6_ts0_mo4_mi0.9_mm12

# best contig dbg results

# ma1 heavy comb_dbg_c0.88_ks7_ts0_mo4_mi0.9_mm14
# ma2 heavy comb_dbg_c0.86_ks6_ts0_mo3_mi0.9_mm10
# ma3 heavy comb_dbg_c0.88_ks6_ts0_mo4_mi0.8_mm8

# ma1 light comb_dbg_c0.86_ks7_ts0_mo4_mi0.9_mm8
# ma2 light comb_dbg_c0.92_ks7_ts5_mo3_mi0.9_mm10
# ma3 light comb_dbg_c0.92_ks6_ts10_mo3_mi0.8_mm14

In [8]:
# Assembly method; "dbg" is the one whose combination name includes the k-mer size.
ass_method = "dbg"
# k-mer size handed to dbg.get_kmers.
kmer_size = 7

# Rows are kept only when their "conf" value is greater than this.
conf = 0.8
# Assembled sequences are kept only when they are longer than this.
size_threshold = 20

# Overlap handed to dbg.create_scaffolds.
min_overlap = 3

# Both are handed to map.process_protein_contigs_scaffold.
max_mismatches = 20
min_identity = 0.8

In [9]:
# Label of the current parameter combination (also used in output paths).
comb = get_combination_name(
    ass_method=ass_method,
    conf=conf,
    kmer_size=kmer_size,
    size_threshold=size_threshold,
    min_overlap=min_overlap,
    min_identity=min_identity,
    max_mismatches=max_mismatches,
)
print(comb)

comb_dbg_c0.8_ks7_ts20_mo3_mi0.8_mm20


In [10]:
# Parameter set forwarded as keyword arguments to the statistics computation.
params = dict(
    ass_method=ass_method,
    conf=conf,
    kmer_size=kmer_size,
    min_overlap=min_overlap,
    min_identity=min_identity,
    max_mismatches=max_mismatches,
    size_threshold=size_threshold,
)

In [11]:
# Output root of this run: <outputs>/<run><chain>
folder_outputs = OUTPUTS_DIR / f"{run}{chain}"

prep.create_directory(folder_outputs)

# Reuse the combination label computed above instead of re-building it by
# hand: the hand-built name always contained "_ks<kmer_size>", which disagrees
# with `comb` (and therefore with the scaffolds folder derived from it) for
# any assembly method other than "dbg".
combination_folder_out = os.path.join(folder_outputs, comb)

prep.create_subdirectories_outputs(combination_folder_out)

### Data cleaning

In [12]:
# Reference sequence after prep.normalize_sequence.
protein_norm = prep.normalize_sequence(protein)

# Input table of this run: <inputs>/<run>.csv
input_csv = INPUT_DIR / f"{run}.csv"
df = pd.read_csv(input_csv)

In [13]:
# Derive the "protease" column: prep.extract_protease is called with each
# experiment name plus the run's protease list.
df["protease"] = df["experiment_name"].apply(
    prep.extract_protease, args=(proteases,)
)

df = prep.clean_dataframe(df)

In [14]:
# Derive "cleaned_preds" by applying prep.remove_modifications to every value
# of "preds" (presumably strips modification annotations — confirm in preprocessing).
df["cleaned_preds"] = df["preds"].apply(prep.remove_modifications)

In [15]:
# Plain Python list of the cleaned predictions, as input for contaminant filtering.
cleaned_psms = df["cleaned_preds"].tolist()

In [16]:
# FASTA file of contaminant sequences used by prep.filter_contaminants.
contaminants_fasta = FASTA_DIR / "contaminants.fasta"
filtered_psms = prep.filter_contaminants(cleaned_psms, run, contaminants_fasta)

In [17]:
# Keep only rows whose cleaned prediction is in the filtered list.
# .copy() makes the result an independent frame, so adding columns to it
# afterwards does not write to a slice of the previous frame
# (pandas SettingWithCopyWarning).
df = df[df["cleaned_preds"].isin(filtered_psms)].copy()

In [18]:
# Flag each prediction with the string "True"/"False" depending on whether it
# is contained in the normalized reference.
df["mapped"] = df["cleaned_preds"].apply(lambda pred: str(pred in protein_norm))

In [19]:
# Keep only rows whose "conf" value is strictly above the threshold.
above_threshold = df["conf"] > conf
df = df.loc[above_threshold]

In [20]:
# Renumber the remaining rows from 0 and discard the old index.
df = df.reset_index(drop=True)

In [21]:
# Cleaned predictions that survived all filters; these are the assembly input.
final_psms = df["cleaned_preds"].tolist()

In [22]:
# Map the final predictions onto the normalized reference with the configured
# mismatch and identity limits.
# NOTE(review): `map` is the project's mapping module here and shadows the
# builtin of the same name.
mapped_psms = map.process_protein_contigs_scaffold(
    final_psms, protein_norm, max_mismatches, min_identity
)

### Assembly

In [23]:
# Derive k-mers of the configured size from the final predictions.
kmers = dbg.get_kmers(final_psms, kmer_size=kmer_size)

In [24]:
# Build the De Bruijn graph edges from the k-mers.
edges = dbg.get_debruijn_edges_from_kmers(kmers)

In [25]:
# Assemble contigs from the graph edges.
assembled_contigs = dbg.assemble_contigs(edges)

Traversing nodes: 100%|██████████| 333/333 [00:00<00:00, 208525.42it/s]


In [26]:
# Order the contigs longest first.
assembled_contigs = sorted(assembled_contigs, key=len, reverse=True)

In [27]:
# Drop duplicate contigs. dict.fromkeys keeps the first occurrence of each
# contig in its current position, whereas set() yields a run-dependent order
# for strings; that made the numbering of equal-length contigs irreproducible.
assembled_contigs = list(dict.fromkeys(assembled_contigs))

In [28]:
# Keep only contigs strictly longer than the size threshold.
assembled_contigs = [
    contig for contig in assembled_contigs if len(contig) > size_threshold
]

In [29]:
# Final ordering: longest contig first (stable for equal lengths).
assembled_contigs.sort(key=len, reverse=True)

In [None]:
# assembled_contigs = list(dict.fromkeys(assembled_contigs))

# assembled_contigs = [seq for seq in assembled_contigs if len(seq) > size_threshold]

# assembled_contigs = sorted(assembled_contigs, key=len, reverse=True)

# set could be the problem

In [None]:
# Wrap every contig in a SeqRecord with a 1-based id and its length as description.
records = []
for idx, contig in enumerate(assembled_contigs):
    records.append(
        Bio.SeqRecord.SeqRecord(
            Bio.Seq.Seq(contig),
            id=f"contig_{idx+1}",
            description=f"length: {len(contig)}",
        )
    )

In [None]:
# Write the contig records as FASTA into the combination's "contigs" folder.
contigs_fasta = (
    f"{combination_folder_out}/contigs/{ass_method}_contig_{conf}_{run}.fasta"
)
Bio.SeqIO.write(records, contigs_fasta, "fasta")

In [30]:
# Map the assembled contigs onto the normalized reference with the configured
# mismatch and identity limits.
mapped_contigs = map.process_protein_contigs_scaffold(
    assembled_contigs, protein_norm, max_mismatches, min_identity
)

In [31]:
# Tabulate the contig mapping results as a DataFrame for the statistics step.
df_contigs_mapped = map.create_dataframe_from_mapped_sequences(data=mapped_contigs)

In [32]:
# Compute assembly statistics for the mapped contigs; the run parameters are
# forwarded as keyword arguments and the "statistics" folder of this
# combination is passed as output folder.
comp_stat.compute_assembly_statistics(
    df=df_contigs_mapped,
    sequence_type="contigs",
    output_folder=f"{combination_folder_out}/statistics",
    reference=protein_norm,
    **params,
)

{'ass_method': 'dbg',
 'conf': 0.8,
 'kmer_size': 7,
 'min_overlap': 3,
 'min_identity': 0.8,
 'max_mismatches': 20,
 'size_threshold': 20,
 'reference_start': 0,
 'reference_end': 608,
 'total_sequences': 278,
 'average_length': 29.194244604316548,
 'min_length': 22,
 'max_length': 51,
 'coverage': 0.662828947368421,
 'mean_identity': 0.9023056708308186,
 'median_identity': 0.9018429487179487,
 'perfect_matches': 10,
 'total_mismatches': 32,
 'N50': 28,
 'N90': 24}

In [None]:
# Colors and target file for the contig substitution map.
# NOTE(review): the contig color is looked up under "nanobodies" while the run
# is "bsa", and the full figure path is passed as output_file with
# output_folder="." — the scaffold figure passes a bare file name plus
# FIGURES_DIR instead. Confirm both are intended.
contig_color = get_colors_from_run("nanobodies", is_scaffold=False)
match_color = get_colors_from_run("bsa", is_scaffold=False)
contig_figure = FIGURES_DIR / f"fig_X_{run}_substitution_map_contigs_dbg.svg"

map.mapping_substitutions(
    mapped_sequences=mapped_contigs,
    prot_seq=protein_norm,
    title=f"Contig mapping to reference sequence, {run}",
    contig_colors=contig_color,
    match_color=match_color,
    output_file=contig_figure,
    output_folder=".",
)

If `scaffold_iterative` is too slow or does not work, it is recommended to run only the few iterations in the following raw cell.

In [None]:
def _unique_longest_first(sequences, min_length):
    """Deduplicate *sequences*, keep those longer than *min_length*, longest first.

    Duplicates are removed with dict.fromkeys, which keeps first-seen order;
    set() would give a run-dependent order for strings and therefore an
    irreproducible ranking among sequences of equal length.
    """
    unique = dict.fromkeys(sequences)
    kept = [seq for seq in unique if len(seq) > min_length]
    return sorted(kept, key=len, reverse=True)


# Round 1: build scaffolds from the contigs, then clean up the result.
assembled_scaffolds = _unique_longest_first(
    dbg.create_scaffolds(assembled_contigs, min_overlap), size_threshold
)

# Round 2: merge the scaffolds and apply the same clean-up once more.
assembled_scaffolds = _unique_longest_first(
    dbg.merge_sequences(assembled_scaffolds), size_threshold
)

In [None]:
# Wrap every scaffold in a SeqRecord with a 1-based id and its length as description.
records = [
    Bio.SeqRecord.SeqRecord(
        Bio.Seq.Seq(seq), id=f"scaffold_{i+1}", description=f"length: {len(seq)}"
    )
    for i, seq in enumerate(assembled_scaffolds)
]

In [None]:
# Write the scaffold records as FASTA into the combination's "scaffolds" folder.
scaffolds_fasta = f"{combination_folder_out}/scaffolds/{ass_method}_scaffold_{conf}_{kmer_size}_{run}.fasta"
Bio.SeqIO.write(records, scaffolds_fasta, "fasta")

In [None]:
# Map the assembled scaffolds onto the normalized reference with the
# configured mismatch and identity limits.
mapped_scaffolds = map.process_protein_contigs_scaffold(
    assembled_scaffolds, protein_norm, max_mismatches, min_identity
)

In [None]:
# Tabulate the scaffold mapping results as a DataFrame for the statistics step.
df_scaffolds_mapped = map.create_dataframe_from_mapped_sequences(data=mapped_scaffolds)

In [None]:
# Compute assembly statistics for the mapped scaffolds; the run parameters are
# forwarded as keyword arguments and the "statistics" folder of this
# combination is passed as output folder.
comp_stat.compute_assembly_statistics(
    df=df_scaffolds_mapped,
    sequence_type="scaffolds",
    output_folder=f"{combination_folder_out}/statistics",
    reference=protein_norm,
    **params,
)

In [None]:
# Colors and file name for the scaffold substitution map; both colors come
# from the "bsa" scaffold entry of the color file.
scaffold_color = get_colors_from_run("bsa", is_scaffold=True)
scaffold_match_color = get_colors_from_run("bsa", is_scaffold=True)
scaffold_figure = f"fig_X_{run}{chain}_substitution_map_scaffolds_dbg.svg"

map.mapping_substitutions(
    mapped_sequences=mapped_scaffolds,
    prot_seq=protein_norm,
    title=f"Scaffold mapping to reference sequence, {run} {chain}",
    contig_colors=scaffold_color,
    match_color=scaffold_match_color,
    output_file=scaffold_figure,
    output_folder=FIGURES_DIR,
)

### Clustering

In [None]:
# Folder holding the scaffold FASTA of the current run and parameter combination.
scaffolds_folder_out = OUTPUTS_DIR.joinpath(f"{run}{chain}/{comb}/scaffolds")
print(f"scaffolds_folder_out: {scaffolds_folder_out}")

In [None]:
# Run the clustering step on the scaffolds folder.
clus.cluster_fasta_files(input_folder=scaffolds_folder_out)

In [None]:
# Path of the "cluster" subfolder inside the scaffolds folder, printed for inspection.
cluster_folder_out = os.path.join(scaffolds_folder_out, "cluster")
print(cluster_folder_out)

In [None]:
# Folders inside the scaffolds folder: "cluster" is handed over as the cluster
# TSV folder, "cluster_fasta" as the output base folder.
cluster_tsv_folder = os.path.join(scaffolds_folder_out, "cluster")
output_base_folder = os.path.join(scaffolds_folder_out, "cluster_fasta")

# Process every FASTA file that sits directly in the scaffolds folder.
for entry_name in os.listdir(scaffolds_folder_out):
    if not entry_name.endswith(".fasta"):
        continue
    clus.process_fasta_and_clusters(
        os.path.join(scaffolds_folder_out, entry_name),
        cluster_tsv_folder,
        output_base_folder,
    )

### Alignment

In [None]:
# Input of the alignment step (per-cluster FASTA files) and its output folder,
# both inside the scaffolds folder.
cluster_fasta_folder = os.path.join(scaffolds_folder_out, "cluster_fasta")
align_folder = os.path.join(scaffolds_folder_out, "align")
prep.create_directory(align_folder)

In [None]:
# Mirror every cluster sub-folder into the alignment folder and run
# align.align_or_copy_fasta on each FASTA file, writing "<name>_out.afa".
for cluster_name in os.listdir(cluster_fasta_folder):
    cluster_dir = os.path.join(cluster_fasta_folder, cluster_name)
    if not os.path.isdir(cluster_dir):
        continue

    aligned_dir = os.path.join(align_folder, cluster_name)
    os.makedirs(aligned_dir, exist_ok=True)

    for fasta_name in os.listdir(cluster_dir):
        if not fasta_name.endswith(".fasta"):
            continue

        base_filename = os.path.splitext(fasta_name)[0]
        align.align_or_copy_fasta(
            os.path.join(cluster_dir, fasta_name),
            os.path.join(aligned_dir, f"{base_filename}_out.afa"),
        )

print("All alignment tasks completed.")

### Consensus

In [None]:
# Folder for the consensus output, inside the scaffolds folder.
consensus_folder = os.path.join(scaffolds_folder_out, "consensus")

In [None]:
# Run the consensus step on the alignment folder, targeting the consensus folder.
cons.process_alignment_files(align_folder, consensus_folder)

In [None]:
# Load the consensus sequences back from the consensus folder.
all_sequences = cons.load_all_consensus_sequences(consensus_folder)