## Composite score notebook

In [None]:
import os
import pandas as pd
import plotly.graph_objects as go
from sklearn.preprocessing import MinMaxScaler
import json
import numpy as np
import re
import glob

In [None]:
def plot_composite_scores_plotly(
    df_contigs,
    df_scaffolds,
    runs_to_include,
    contig_color,
    scaffold_color,
    title,
    title_x_axis,
    save=True,
    output_dir=".",
    file_suffix="",
):
    """Draw a grouped bar chart of the best composite score of each run.

    For every run in ``runs_to_include`` the maximum ``composite_score`` found
    in ``df_contigs`` and in ``df_scaffolds`` is shown side by side. A run that
    has no row in one of the tables is drawn with a score of 0 for that table.

    Args:
        df_contigs: Table with ``run`` and ``composite_score`` columns (contigs).
        df_scaffolds: Table with the same two columns (scaffolds).
        runs_to_include: Run names, in the order they appear on the x axis.
        contig_color: Bar colour of the "Contigs" trace.
        scaffold_color: Bar colour of the "Scaffolds" trace.
        title: Figure title.
        title_x_axis: X-axis title; also used as prefix of the saved file name.
        save: When true, write the figure to ``output_dir`` as SVG.
        output_dir: Target directory for the SVG; created when missing.
        file_suffix: Text inserted just before the ``.svg`` extension.
    """
    # The figure is half as wide when all three binder runs are requested.
    all_binders_requested = {"BIND15", "BIND16", "BIND17"} <= set(runs_to_include)
    width = 600 if all_binders_requested else 1200
    height = 500

    def best_score_per_run(table):
        per_run_max = table.groupby("run")["composite_score"].max()
        # Align to the requested runs; runs absent from the table become 0.
        return per_run_max.reindex(runs_to_include).fillna(0)

    traces = [
        ("Contigs", best_score_per_run(df_contigs), contig_color),
        ("Scaffolds", best_score_per_run(df_scaffolds), scaffold_color),
    ]

    fig = go.Figure()
    for trace_name, scores, color in traces:
        fig.add_trace(
            go.Bar(x=runs_to_include, y=scores, name=trace_name, marker_color=color)
        )

    axis_style = dict(showline=True, mirror=False, linecolor="black", ticks="outside")
    fig.update_layout(
        title=title,
        xaxis=dict(title=title_x_axis, tickangle=0, **axis_style),
        yaxis=dict(title="Composite score", range=[0, 1], **axis_style),
        barmode="group",
        bargap=0.3,
        width=width,
        height=height,
        plot_bgcolor="white",
        paper_bgcolor="white",
    )

    if not save:
        return

    os.makedirs(output_dir, exist_ok=True)
    filename = f"{title_x_axis}_composite_score_grouped_barplot{file_suffix}.svg"
    fig.write_image(os.path.join(output_dir, filename), format="svg", scale=2)


def plot_max_coverage_plotly(
    df_contigs,
    df_scaffolds,
    runs_to_include,
    title,
    title_x_axis,
    contig_color,
    scaffold_color,
    save=True,
    output_dir=".",
    file_suffix="",
):
    """Draw a grouped bar chart of the maximum coverage of each run.

    For every run in ``runs_to_include`` the largest ``coverage`` value found
    in ``df_contigs`` and in ``df_scaffolds`` is shown side by side. A run that
    has no row in one of the tables is drawn with a coverage of 0.

    Args:
        df_contigs: Table with ``run`` and ``coverage`` columns (contigs).
        df_scaffolds: Table with the same two columns (scaffolds).
        runs_to_include: Run names, in the order they appear on the x axis.
        title: Figure title.
        title_x_axis: X-axis title; also used as prefix of the saved file name.
        contig_color: Bar colour of the "Contigs" trace.
        scaffold_color: Bar colour of the "Scaffolds" trace.
        save: When true, write the figure to ``output_dir`` as SVG.
        output_dir: Target directory for the SVG; created when missing.
        file_suffix: Text inserted just before the ``.svg`` extension.
    """
    # The figure is half as wide when all three binder runs are requested.
    all_binders_requested = {"BIND15", "BIND16", "BIND17"} <= set(runs_to_include)
    width = 600 if all_binders_requested else 1200
    height = 500

    def max_coverage_per_run(table):
        per_run_max = table.groupby("run")["coverage"].max()
        # Align to the requested runs; runs absent from the table become 0.
        return per_run_max.reindex(runs_to_include).fillna(0)

    traces = [
        ("Contigs", max_coverage_per_run(df_contigs), contig_color),
        ("Scaffolds", max_coverage_per_run(df_scaffolds), scaffold_color),
    ]

    fig = go.Figure()
    for trace_name, coverage, color in traces:
        fig.add_trace(
            go.Bar(x=runs_to_include, y=coverage, name=trace_name, marker_color=color)
        )

    axis_style = dict(showline=True, linecolor="black", ticks="outside")
    fig.update_layout(
        title=title,
        xaxis=dict(title=title_x_axis, **axis_style),
        yaxis=dict(title="Max Coverage", range=[0, 1], **axis_style),
        barmode="group",
        bargap=0.3,
        width=width,
        height=height,
        plot_bgcolor="white",
        paper_bgcolor="white",
    )

    if not save:
        return

    os.makedirs(output_dir, exist_ok=True)
    filename = f"{title_x_axis}_max_coverage_grouped_barplot{file_suffix}.svg"
    fig.write_image(os.path.join(output_dir, filename), format="svg", scale=2)

In [None]:
def compute_composite_score(*dfs):
    """Rank assemblies by a weighted sum of min-max scaled metrics.

    The given frames are concatenated and processed separately for each
    ``ass_method`` value. Inside a group every metric is scaled with
    ``MinMaxScaler``; the scaled ``total_sequences`` is then inverted
    (``1 - value``) so that the group's smallest sequence count gets the
    highest value. The composite score is
    ``0.1 * total_sequences + 0.1 * max_length + 0.3 * N50 + 0.5 * coverage``
    computed on the scaled values.

    Args:
        *dfs: DataFrames holding at least the columns ``total_sequences``,
            ``max_length``, ``N50``, ``coverage`` and ``ass_method``.

    Returns:
        A DataFrame sorted by ``composite_score`` (highest first) containing
        the input columns, one ``<metric>_scaled`` column per metric and
        ``composite_score``. When a ``category`` column is present only
        ``category``, ``source``, ``ass_method``, the metric / scaled-metric
        pairs, ``composite_score`` and ``run`` are returned, in that order.
    """
    weights = {"total_sequences": 0.1, "max_length": 0.1, "N50": 0.3, "coverage": 0.5}
    metrics = ["total_sequences", "max_length", "N50", "coverage"]
    weight_vector = pd.Series(weights)
    scaled_names = {metric: f"{metric}_scaled" for metric in metrics}

    combined = pd.concat(dfs, ignore_index=True)

    scored_groups = []
    for _, method_rows in combined.groupby("ass_method"):
        # Scaling is fitted per assembly method, across all rows of that method.
        normalized = pd.DataFrame(
            MinMaxScaler().fit_transform(method_rows[metrics].copy()),
            columns=metrics,
            index=method_rows.index,
        )
        normalized["total_sequences"] = 1 - normalized["total_sequences"]
        normalized["composite_score"] = normalized[list(weights)].dot(weight_vector)
        normalized = normalized.rename(columns=scaled_names)

        # Both sides get a fresh 0..n-1 index so they line up row by row.
        scored_groups.append(
            pd.concat(
                [
                    method_rows.reset_index(drop=True),
                    normalized.reset_index(drop=True),
                ],
                axis=1,
            )
        )

    ranked = pd.concat(scored_groups, ignore_index=True).sort_values(
        by="composite_score", ascending=False
    )

    if "category" not in ranked.columns:
        return ranked

    ordered = ["category", "source", "ass_method"]
    for metric in metrics:
        ordered += [metric, scaled_names[metric]]
    ordered += ["composite_score", "run"]
    return ranked[ordered]


def extract_grid_params_from_source(df):
    """Parse grid-search parameters out of the ``source`` column.

    The columns ``conf``, ``kmer_size``, ``size_threshold``, ``min_overlap``,
    ``min_identity`` and ``max_mismatches`` are (re)created on ``df`` itself
    and initialised with ``pd.NA``. For each row the pattern matching its
    ``ass_method`` (``"dbg"`` or ``"greedy"``) is searched in ``source`` and
    every captured value is stored as a number. Rows with another method, a
    non-string ``source`` or a ``source`` that does not match keep ``pd.NA``.
    The greedy pattern has no ``kmer_size`` group, so that column stays
    ``pd.NA`` for greedy rows.

    Values are written by row position, so the result is correct for any
    index, including one with duplicate labels.

    Args:
        df: DataFrame with a ``source`` column and, optionally, an
            ``ass_method`` column. It is modified in place.

    Returns:
        The same ``df`` object with the parameter columns filled in.
    """
    pattern_dbg = re.compile(
        r"c(?P<conf>[\d.]+)_ks(?P<kmer_size>\d+)_ts(?P<size_threshold>\d+)_mo(?P<min_overlap>\d+)_mi(?P<min_identity>[\d.]+)_mm(?P<max_mismatches>\d+)"
    )

    pattern_greedy = re.compile(
        r"c(?P<conf>[\d.]+)_ts(?P<size_threshold>\d+)_mo(?P<min_overlap>\d+)_mi(?P<min_identity>[\d.]+)_mm(?P<max_mismatches>\d+)"
    )
    patterns = {"dbg": pattern_dbg, "greedy": pattern_greedy}

    param_cols = [
        "conf",
        "kmer_size",
        "size_threshold",
        "min_overlap",
        "min_identity",
        "max_mismatches",
    ]
    for col in param_cols:
        df[col] = pd.NA

    if len(df) == 0:
        return df

    sources = df["source"].tolist()
    if "ass_method" in df.columns:
        methods = df["ass_method"].tolist()
    else:
        # Without a method no pattern can be selected; every row stays NA.
        methods = [None] * len(sources)

    col_positions = {col: df.columns.get_loc(col) for col in param_cols}

    for row_pos, (source_str, method) in enumerate(zip(sources, methods)):
        pattern = patterns.get(method)
        if pattern is None or not isinstance(source_str, str):
            continue

        match = pattern.search(source_str)
        if match is None:
            continue

        for param, raw_value in match.groupdict().items():
            # Positional write: label-based ``.at`` would hit every row that
            # shares the label when the index is not unique.
            df.iat[row_pos, col_positions[param]] = pd.to_numeric(
                raw_value, errors="coerce"
            )

    return df


def combine_json_to_csv(run, type_sequence, output_base="../outputs"):
    """Collect the statistics JSON files of a run into one CSV per method.

    Every directory below ``<output_base>/<run>`` is inspected for a file
    ``statistics/<type_sequence>_stats.json``. The assembly method is derived
    from the directory name: ``comb_dbg*`` -> ``dbg``, ``comb_greedy*`` ->
    ``greedy``, anything else -> ``other``. All records of one method are
    concatenated and written to
    ``<output_base>/<run>/<type_sequence>_combined_stats_<method>.csv`` with
    the extra columns ``source`` (directory name), ``sequence_type``, ``run``
    and ``ass_method``.

    Args:
        run: Name of the run directory inside ``output_base``.
        type_sequence: Prefix of the statistics file, e.g. ``"contigs"`` or
            ``"scaffolds"``.
        output_base: Directory that contains the run directories.

    Returns:
        None. Progress and load errors are reported with ``print``.
    """
    base_path = os.path.join(output_base, run)
    frames_by_method = {}

    for root, dirs, _ in os.walk(base_path):
        for dir_name in dirs:
            json_path = os.path.join(
                root, dir_name, "statistics", f"{type_sequence}_stats.json"
            )
            if not os.path.exists(json_path):
                continue

            if dir_name.startswith("comb_dbg"):
                method = "dbg"
            elif dir_name.startswith("comb_greedy"):
                method = "greedy"
            else:
                method = "other"

            try:
                # Read as UTF-8 explicitly instead of the locale default.
                with open(json_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
                df = pd.json_normalize(data)
            except Exception as e:
                # Best effort: one unreadable file must not abort the whole run.
                print(f"Error loading {json_path}: {e}")
                continue

            df["source"] = dir_name
            frames_by_method.setdefault(method, []).append(df)

    for method, dfs in frames_by_method.items():
        combined_df = pd.concat(dfs, ignore_index=True)
        combined_df["sequence_type"] = type_sequence
        combined_df["run"] = run
        combined_df["ass_method"] = method

        output_file = os.path.join(
            base_path, f"{type_sequence}_combined_stats_{method}.csv"
        )
        combined_df.to_csv(output_file, index=False, sep=",", header=True)
        print(f"[{method}] Combined JSON saved to CSV: {output_file}")
        print(f"[{method}] Files successfully added: {len(dfs)}")


def summarize_mode(df, group_col="run", value_cols=None):
    """Return the most frequent value of each column for every group.

    Args:
        df: Input table.
        group_col: Column to group by.
        value_cols: Columns to summarise. Defaults to every column except
            ``group_col`` and ``source``.

    Returns:
        DataFrame with one row per group: ``group_col`` followed by one
        ``<column>_mode`` entry per summarised column. When the mode cannot be
        determined (for example the group holds only missing values) the
        entry is ``None``.
    """
    columns = (
        df.columns.difference([group_col, "source"])
        if value_cols is None
        else value_cols
    )

    def first_mode(frame, column):
        # ``mode()`` may return several values; the first one is kept.
        try:
            return frame[column].mode().iloc[0]
        except Exception:
            return None

    records = []
    for key, members in df.groupby(group_col):
        record = {group_col: key}
        record.update(
            (f"{column}_mode", first_mode(members, column)) for column in columns
        )
        records.append(record)
    return pd.DataFrame(records)


def summarize_mode_by_category(df, category_col="category", value_cols=None):
    """Return the most frequent value of each column for every category.

    Args:
        df: DataFrame with composite scores.
        category_col: Column to group by.
        value_cols: Columns to summarise. Defaults to every column except
            ``run``, ``source``, ``composite_score`` and ``category``.

    Returns:
        DataFrame with one row per category: ``category_col`` followed by one
        ``<column>_mode`` entry per summarised column. When the mode cannot be
        determined the entry is ``None``.
    """
    columns = (
        df.columns.difference(["run", "source", "composite_score", "category"])
        if value_cols is None
        else value_cols
    )

    def first_mode(frame, column):
        # ``mode()`` may return several values; the first one is kept.
        try:
            return frame[column].mode().iloc[0]
        except Exception:
            return None

    records = []
    for label, members in df.groupby(category_col):
        record = {category_col: label}
        record.update(
            (f"{column}_mode", first_mode(members, column)) for column in columns
        )
        records.append(record)
    return pd.DataFrame(records)


def get_category(name):
    """Map a run name to its category label (case-insensitive).

    ``bsa`` (exact match) -> ``"bsa"``; names starting with ``nb`` ->
    ``"nanobodies"``, with ``bind`` -> ``"binders"``, with ``pama`` ->
    ``"policlonals"``; every other name -> ``"antibodies"``.
    """
    lowered = name.lower()
    if lowered == "bsa":
        return "bsa"

    prefix_to_category = (
        ("nb", "nanobodies"),
        ("bind", "binders"),
        ("pama", "policlonals"),
    )
    for prefix, category in prefix_to_category:
        if lowered.startswith(prefix):
            return category
    return "antibodies"


def summarize_best_param_modes_by_category(df, category_col="category"):
    """Return the most frequent best-run parameter values of every category.

    The grid-search parameters are first parsed from ``source`` with
    ``extract_grid_params_from_source``, which adds the parameter columns to
    ``df`` itself. Then, inside each category, the row with the highest
    ``composite_score`` of every run is selected and the mode of each
    parameter over those rows is reported.

    Args:
        df: DataFrame with ``source``, ``run`` and ``composite_score`` columns
            plus the grouping column.
        category_col: Column to group by.

    Returns:
        DataFrame with one row per category holding ``category`` and one
        ``<param>_mode`` column per parameter (``None`` when no mode could be
        determined).
    """
    df = extract_grid_params_from_source(df)

    tracked_params = (
        "conf",
        "size_threshold",
        "min_identity",
        "max_mismatches",
        "kmer_size",
        "min_overlap",
    )

    def first_mode(frame, column):
        # ``mode()`` may return several values; the first one is kept.
        try:
            return frame[column].mode().iloc[0]
        except Exception:
            return None

    summaries = []
    for category, members in df.groupby(category_col):
        winners = members.loc[members.groupby("run")["composite_score"].idxmax()]

        # NOTE(review): the key is always "category", even when a different
        # ``category_col`` is passed - confirm whether that is intended.
        summary = {"category": category}
        summary.update(
            (f"{param}_mode", first_mode(winners, param)) for param in tracked_params
        )
        summaries.append(summary)

    return pd.DataFrame(summaries)

In [None]:
# Runs of each category, in the order in which they are looked up when the
# x axis of the per-category bar plots is built.
order_by_cat = {}
order_by_cat["policlonals"] = [
    "pama1light", "pama1heavy",
    "pama2light", "pama2heavy",
    "pama3light", "pama3heavy",
]
order_by_cat["antibodies"] = [
    "ma1light", "ma1heavy",
    "ma2light", "ma2heavy",
    "ma3light", "ma3heavy",
]
order_by_cat["nanobodies"] = [
    "NB1", "NB2", "NB3", "NB4", "NB5",
    "NB6", "NB8", "NB10", "NB12", "NB13",
]
order_by_cat["binders"] = ["BIND15", "BIND16", "BIND17"]
order_by_cat["bsa"] = ["bsa"]

In [None]:
# Every sub-directory of the output folder is treated as one run.
output_base = "../outputs"
runs = [
    entry
    for entry in os.listdir(output_base)
    if os.path.isdir(os.path.join(output_base, entry))
]
seq_types = ["contigs", "scaffolds"]

# (Re)build the per-method combined CSV files of every run.
for r in runs:
    for seq in seq_types:
        combine_json_to_csv(run=r, type_sequence=seq)

contig_dfs, scaffold_dfs = [], []

# Load the combined CSV files back, tagging each table with run and category.
for r in runs:
    cat = get_category(r)
    run_dir = os.path.join(output_base, r)

    for csv_pattern, collected in (
        ("contigs_combined_stats_*.csv", contig_dfs),
        ("scaffolds_combined_stats_*.csv", scaffold_dfs),
    ):
        for csv_path in glob.glob(os.path.join(run_dir, csv_pattern)):
            df = pd.read_csv(csv_path)
            df["run"] = r
            df["category"] = cat
            collected.append(df)

contigs = pd.DataFrame() if not contig_dfs else pd.concat(contig_dfs, ignore_index=True)
scaffolds = (
    pd.DataFrame() if not scaffold_dfs else pd.concat(scaffold_dfs, ignore_index=True)
)

def _score_or_empty(table):
    """Return the composite-score table, or an empty frame for an empty input."""
    if table.empty:
        return pd.DataFrame()
    return compute_composite_score(table)


# Split both tables by assembly method ...
contigs_dbg = contigs[contigs["ass_method"].eq("dbg")]
contigs_greedy = contigs[contigs["ass_method"].eq("greedy")]

scaffolds_dbg = scaffolds[scaffolds["ass_method"].eq("dbg")]
scaffolds_greedy = scaffolds[scaffolds["ass_method"].eq("greedy")]

# ... and score every (sequence type, method) combination on its own.
contig_scores_dbg = _score_or_empty(contigs_dbg)
contig_scores_greedy = _score_or_empty(contigs_greedy)

scaffold_scores_dbg = _score_or_empty(scaffolds_dbg)
scaffold_scores_greedy = _score_or_empty(scaffolds_greedy)

summary_dir = "summary_tables"
os.makedirs(summary_dir, exist_ok=True)

# Export the full score table of every category, one file per sequence type
# and assembly method.
# NOTE(review): the per-category loops further down write their best-row
# tables to these same file names.
for prefix, method, scores in (
    ("contig", "dbg", contig_scores_dbg),
    ("contig", "greedy", contig_scores_greedy),
    ("scaffold", "dbg", scaffold_scores_dbg),
    ("scaffold", "greedy", scaffold_scores_greedy),
):
    for cat in scores["category"].dropna().unique():
        subdir = os.path.join(summary_dir, cat)
        os.makedirs(subdir, exist_ok=True)
        df_cat = scores[scores["category"] == cat]
        df_cat.to_csv(
            os.path.join(subdir, f"{prefix}_scores_{cat}_{method}.csv"), index=False
        )

# Grid-search parameter column names of each assembly method; used to order
# the columns of the exported best-row tables.
gridsearch_params_dict = {
    "dbg": [
        "kmer_size", "min_identity", "max_mismatches",
        "size_threshold", "min_overlap", "conf",
    ],
    "greedy": [
        "min_identity", "max_mismatches",
        "size_threshold", "min_overlap", "conf",
    ],
}


summary_dir = "summary_tables"
os.makedirs(summary_dir, exist_ok=True)

# Every category that occurs in at least one of the four score tables.
all_scores = pd.concat(
    [
        contig_scores_dbg,
        contig_scores_greedy,
        scaffold_scores_dbg,
        scaffold_scores_greedy,
    ]
)
cats = all_scores["category"].dropna().unique()

def format_table(df):
    """Round float columns to 3 decimals and move the key columns to the front.

    Resulting column order: ``category``, ``run``, ``ass_method``, ``source``,
    the grid-search parameters of the method being processed (module-level
    ``gridsearch_params``) that exist in ``df``, then all remaining columns in
    their current order.
    """
    df = df.copy()
    float_cols = df.select_dtypes(include="float").columns
    df[float_cols] = df[float_cols].round(3)
    id_cols = ["category", "run", "ass_method", "source"]
    param_cols = [c for c in gridsearch_params if c in df.columns]
    other_cols = [c for c in df.columns if c not in id_cols + gridsearch_params]
    return df[id_cols + param_cols + other_cols]


def _best_rows_per_run(scores, method_params, all_params):
    """Return the top-scoring row of every run with its grid-search parameters.

    The parameters are parsed from ``source`` on a copy of the selected rows,
    so ``scores`` is left untouched. Parameter columns that are not listed in
    ``method_params`` are dropped again.
    """
    best = scores.loc[scores.groupby("run")["composite_score"].idxmax()].copy()
    # Without this step the parameter columns ordered by ``format_table``
    # would not exist yet when the tables are written.
    best = extract_grid_params_from_source(best)
    unused = [p for p in all_params if p not in method_params]
    return best.drop(columns=unused, errors="ignore")


# (contig colour, scaffold colour) of each category; grey for anything else.
category_colors = {
    "antibodies": ("#8dd3c7", "#1b9e77"),
    "nanobodies": ("#a6cee3", "#1f78b4"),
    "binders": ("#D8D9E8", "#BCBDD9"),
    "bsa": ("#fdbb84", "#e34a33"),
}
default_colors = ("lightgrey", "darkgrey")

# All parameter names known to any method, first occurrence order.
all_grid_params = list(
    dict.fromkeys(p for params in gridsearch_params_dict.values() for p in params)
)

# For each assembly method and category: export the best row of every run and
# draw the composite-score and max-coverage bar plots.
for method, contig_scores, scaffold_scores in (
    ("dbg", contig_scores_dbg, scaffold_scores_dbg),
    ("greedy", contig_scores_greedy, scaffold_scores_greedy),
):
    gridsearch_params = gridsearch_params_dict[method]

    for cat in cats:
        subdir = os.path.join(summary_dir, cat)
        os.makedirs(subdir, exist_ok=True)

        contig_sub = contig_scores[contig_scores["category"] == cat]
        scaffold_sub = scaffold_scores[scaffold_scores["category"] == cat]

        # Keep the configured display order, restricted to runs that have
        # contig results for this method.
        wanted = order_by_cat.get(cat, [])
        actual_runs = set(contig_sub["run"].unique())
        run_ids = [r for r in wanted if r in actual_runs]

        contig_color, scaffold_color = category_colors.get(cat, default_colors)

        best_contigs = _best_rows_per_run(contig_sub, gridsearch_params, all_grid_params)
        best_scaffolds = _best_rows_per_run(
            scaffold_sub, gridsearch_params, all_grid_params
        )

        best_contigs_formatted = format_table(best_contigs)
        best_scaffolds_formatted = format_table(best_scaffolds)

        best_contigs_formatted.to_csv(
            os.path.join(subdir, f"contig_scores_{cat}_{method}.csv"), index=False
        )
        best_scaffolds_formatted.to_csv(
            os.path.join(subdir, f"scaffold_scores_{cat}_{method}.csv"), index=False
        )

        composite_dir = os.path.join("barplot_composite", cat)
        plot_composite_scores_plotly(
            contig_sub,
            scaffold_sub,
            run_ids,
            contig_color,
            scaffold_color,
            title=f"Composite score comparison - {method}",
            title_x_axis=cat,
            save=True,
            output_dir=composite_dir,
            file_suffix=f"_{method}",
        )

        coverage_dir = os.path.join("barplot_coverage", cat)
        plot_max_coverage_plotly(
            contig_sub,
            scaffold_sub,
            run_ids,
            title=f"Max coverage comparison - {method}",
            title_x_axis=cat,
            contig_color=contig_color,
            scaffold_color=scaffold_color,
            save=True,
            output_dir=coverage_dir,
            file_suffix=f"_{method}",
        )


def _best_param_mode_table(scores, file_name):
    """Build, save and return the per-category table of best-parameter modes.

    The table is computed with ``summarize_best_param_modes_by_category``,
    gets ``category`` as first column, has its float columns rounded to 3
    decimals, is sorted by category and written to ``summary_dir``.
    """
    table = summarize_best_param_modes_by_category(scores)
    ordered = ["category"] + [name for name in table.columns if name != "category"]
    table = table[ordered]
    float_part = table.select_dtypes(include="float")
    table[float_part.columns] = float_part.round(3)
    table = table.sort_values(by="category")
    table.to_csv(os.path.join(summary_dir, file_name), index=False)
    return table


# DBG contigs
best_param_modes_contigs_dbg = _best_param_mode_table(
    contig_scores_dbg, "best_param_modes_contigs_dbg.csv"
)

# DBG scaffolds
best_param_modes_scaffolds_dbg = _best_param_mode_table(
    scaffold_scores_dbg, "best_param_modes_scaffolds_dbg.csv"
)

# GREEDY contigs
best_param_modes_contigs_greedy = _best_param_mode_table(
    contig_scores_greedy, "best_param_modes_contigs_greedy.csv"
)

# GREEDY scaffolds
best_param_modes_scaffolds_greedy = _best_param_mode_table(
    scaffold_scores_greedy, "best_param_modes_scaffolds_greedy.csv"
)

In [None]:
def plot_coverage_boxplots(
    contig_path: str,
    scaffold_path: str,
    output_path: str = "coverage_boxplot_separated_dots.svg",
    y_range=(0.94, 0.98),
):
    """Compare contig and scaffold coverage with box plots and raw points.

    Both CSV files must contain a ``coverage`` column. The contig box is drawn
    at x=0 and the scaffold box at x=1 (with mean marker, without box points).
    The individual values are added as separate scatter traces, jittered
    around x=-0.5 and x=1.5, i.e. beside the boxes rather than on top of them.
    The jitter is random and not seeded, so point positions differ per call.

    The figure is shown and then written as SVG.

    Args:
        contig_path: CSV file with the contig scores.
        scaffold_path: CSV file with the scaffold scores.
        output_path: File the SVG is written to.
        y_range: ``(low, high)`` limits of the coverage axis, or ``None`` to
            let plotly choose the range from the data.
    """
    contig_coverage = pd.read_csv(contig_path)["coverage"]
    scaffold_coverage = pd.read_csv(scaffold_path)["coverage"]

    # (legend name, values, colour, x of the box, x centre of the jittered dots)
    series = [
        ("Contig", contig_coverage, "#a6cee3", 0, -0.5),
        ("Scaffold", scaffold_coverage, "#1f78b4", 1, 1.5),
    ]

    fig = go.Figure()

    # Boxes first, dots second, to keep the trace order of the figure stable.
    for label, values, color, box_x, _ in series:
        fig.add_trace(
            go.Box(
                y=values,
                x=[box_x] * len(values),
                name=label,
                marker_color=color,
                boxmean=True,
                boxpoints=False,
            )
        )

    for _, values, color, _, dots_x in series:
        jittered_x = np.random.normal(loc=dots_x, scale=0.03, size=len(values))
        fig.add_trace(
            go.Scatter(
                x=jittered_x,
                y=values,
                mode="markers",
                marker=dict(color=color, size=6),
                showlegend=False,
            )
        )

    fig.update_layout(
        title="Coverage comparison: contig and scaffold",
        yaxis=dict(
            title="Coverage",
            showgrid=True,
            zeroline=True,
            showline=True,
            linecolor="black",
            linewidth=1,
            ticks="outside",
            range=list(y_range) if y_range is not None else None,
        ),
        xaxis=dict(
            title="",
            showline=True,
            linecolor="black",
            linewidth=1,
            ticks="outside",
            tickvals=[0, 1],
            ticktext=["Contig", "Scaffold"],
        ),
        width=500,
        height=400,
        template="plotly_white",
    )

    fig.show()
    fig.write_image(output_path, format="svg", scale=2)

In [6]:
# Coverage of the exported greedy nanobody tables, contigs vs. scaffolds.
plot_coverage_boxplots(
    contig_path="summary_tables/nanobodies/contig_scores_nanobodies_greedy.csv",
    scaffold_path="summary_tables/nanobodies/scaffold_scores_nanobodies_greedy.csv",
)