# <b>spacemake</b> - Saturation Analysis

In [None]:
import datetime
from functools import partial

import pandas as pd
import spacemake as smk
from spacemake.report import utils as report_utils

In [None]:
# Run parameters. The Ellipsis placeholders carry no usable value and have to
# be replaced before the cells below run (presumably by parameter injection
# from the caller — confirm).
project_id = ...
sample_id = ...
# Either a mapping keyed
# "downsampled_dge_summary.{run_mode}.{pct}.{puck_barcode_file_id}", or a
# single path / list of paths (normalised in the following cell).
downsampled_dge_summary = ...
# A single run-mode name or a list of run-mode names.
run_modes = ...
puck_barcode_file_id = ...
# NOTE(review): not referenced by any other cell of this notebook.
PCT_DOWNSAMPLE_TO_PLOT = [20, 40, 60, 80, 100]
# Downsampling percentages for which a DGE summary is loaded: 10, 20, ..., 100.
DOWNSAMPLE_PCTS = list(range(10, 110, 10))

# Sample-info fields copied into report["sampleinfo"]. Each field is listed
# once; the resulting dict is the same as with the former duplicate entry.
SAMPLEINFO_VARS = [
    "species",
    "sequencing_date",
    "investigator",
    "experiment",
    "barcode_flavor",
    "puck",
]

In [None]:
# Normalise the inputs so that `run_modes` is a list and
# `downsampled_dge_summary` is a mapping keyed
# "downsampled_dge_summary.{run_mode}.{pct}.{puck_barcode_file_id}" — the key
# format that load_dge_summary_downsampling() looks up.
if isinstance(downsampled_dge_summary, str):
    downsampled_dge_summary = [downsampled_dge_summary]
if isinstance(run_modes, str):
    run_modes = [run_modes]

# Anything that is not a list (e.g. an already keyed mapping) is left untouched.
if isinstance(run_modes, list) and isinstance(downsampled_dge_summary, list):
    # Every run mode is loaded at every percentage in DOWNSAMPLE_PCTS, so a flat
    # list must hold one file per (run_mode, pct) pair. Zipping run modes, files
    # and percentages together would assign a single percentage to each run
    # mode and leave most of the looked-up keys missing.
    expected_files = len(run_modes) * len(DOWNSAMPLE_PCTS)
    if len(downsampled_dge_summary) != expected_files:
        raise ValueError(
            "'downsampled_dge_summary' must contain one file per run mode and "
            f"downsampling percentage ({len(run_modes)} run mode(s) x "
            f"{len(DOWNSAMPLE_PCTS)} percentages = {expected_files} files), "
            f"got {len(downsampled_dge_summary)}"
        )

    # NOTE(review): assumes the list is ordered by run mode first and by
    # ascending percentage second — confirm against whatever builds the list.
    summary_keys = [
        f"downsampled_dge_summary.{run_mode}.{d_pct}.{puck_barcode_file_id}"
        for run_mode in run_modes
        for d_pct in DOWNSAMPLE_PCTS
    ]
    downsampled_dge_summary = dict(zip(summary_keys, downsampled_dge_summary))

In [None]:
def load_dge_summary_downsampling(
    downsampled_dge_summary, run_mode, puck_barcode_file_id, downsample_pcts=None
):
    """Load the per-percentage DGE summaries of one run mode into one table.

    Parameters
    ----------
    downsampled_dge_summary : mapping
        Maps "downsampled_dge_summary.{run_mode}.{pct}.{puck_barcode_file_id}"
        to whatever ``pd.read_csv`` accepts (a path or a file-like object).
    run_mode : str
        Run mode whose summaries are loaded.
    puck_barcode_file_id : str
        Last component of the lookup key.
    downsample_pcts : iterable of int, optional
        Percentages to load, in output order. Defaults to the notebook-level
        ``DOWNSAMPLE_PCTS``.

    Returns
    -------
    pd.DataFrame
        The rows of all summaries stacked in the order of ``downsample_pcts``,
        each file keeping its own row index, plus a "_downsample_pct_report"
        column holding the percentage a row came from. An empty DataFrame is
        returned when there are no percentages to load.

    Raises
    ------
    KeyError
        If the mapping has no entry for one of the requested percentages.
    """
    if downsample_pcts is None:
        downsample_pcts = DOWNSAMPLE_PCTS

    # Collect the frames and concatenate once: growing a DataFrame inside the
    # loop re-copies all previous rows on every iteration, and seeding it with
    # an empty frame relies on pandas ignoring empty entries when inferring
    # dtypes.
    frames = []
    for downsample_pct in downsample_pcts:
        key = f"downsampled_dge_summary.{run_mode}.{downsample_pct}.{puck_barcode_file_id}"
        frame = pd.read_csv(downsampled_dge_summary[key])
        frame["_downsample_pct_report"] = downsample_pct
        frames.append(frame)

    if not frames:
        # pd.concat raises on an empty list; keep returning an empty table.
        return pd.DataFrame()
    return pd.concat(frames)

In [None]:
# Report metadata. The three plot containers start out empty and are attached
# directly under "plots" instead of first storing a throw-away empty list.
main_plots = {"histostats": [], "medianstats": [], "deciledmedian": []}
report = {
    "type": "saturation_analysis",
    "runinformation": [],
    "date": datetime.date.today().strftime("%Y/%m/%d"),
    "plots": main_plots,
}

# Loading project_df for metadata
# NOTE(review): ConfigFile and ProjectDF are not imported in any cell of this
# notebook — confirm they are in scope when this cell runs. Both files are
# resolved relative to the current working directory.
config = ConfigFile.from_yaml("config.yaml")
project_df = ProjectDF("project_df.csv", config=config)

# Table: sample info
sample_info = project_df.get_sample_info(project_id, sample_id)
report["sampleinfo"] = {
    "project_id": project_id,
    "sample_id": sample_id,
    **{svar: sample_info[svar] for svar in SAMPLEINFO_VARS},
}

# Loading all dge data summaries
# Table: summarised metrics over beads
# One stacked summary table (all downsampling percentages) per run mode.
dge_summaries = {
    run_mode: load_dge_summary_downsampling(
        downsampled_dge_summary, run_mode, puck_barcode_file_id
    )
    for run_mode in run_modes
}

## Overview

### Run information

saturation analysis v0.8.1, generated on {{ report.date }}

Contact: <a href="mailto:tamasryszard.sztanka-toth@mdc-berlin.de" class="email">tamasryszard.sztanka-toth@mdc-berlin.de</a>, <a href="mailto:nikolaos.karaiskos@mdc-berlin.de" class="email">nikolaos.karaiskos@mdc-berlin.de</a>, <a href="mailto:daniel.leonperinan@mdc-berlin.de" class="email">daniel.leonperinan@mdc-berlin.de</a>

In [None]:
# Table

## Downstream stats

To assess whether deeper sequencing would yield more information, we downsampled the data (the final.bam file) to contain 10%, 20%, …, 90% of the reads, and then created the DigitalExpression matrix for each subsample (as in the normal dropseq pipeline).

This gives us insight into whether we have reached the saturation point (in terms of median UMIs per cell and median genes per cell) or whether we should sequence deeper.

Results of this are plotted below.

### Histograms per run mode

In [None]:
visualizer = smk.pl.TabVisualizer()

# One row per histogram:
# (metric column, key into `clrs`, Plot description, title passed to the plot function).
# NOTE(review): `clrs` is not defined in any cell of this notebook — confirm
# it is in scope when this cell runs.
histogram_specs = (
    ("total_counts", "umis", "# of UMIs\nper spatial unit", "# of UMIs\nper spatial unit"),
    ("n_reads", "reads", "# of reads\nper spatial unit", "# of reads\nper spatial unit"),
    ("reads_per_counts", "pcr", "median reads/UMI\nper spatial unit", "reads/UMI\nper spatial unit"),
)

# Build one plot group per run mode, holding the three density histograms.
for run_mode, dge_summary in dge_summaries.items():
    plots = []
    for metric, color_key, description, plot_title in histogram_specs:
        # partial() binds the current dge_summary now; the plot function is
        # only called later, with these arguments already filled in.
        draw = partial(
            smk.pl.density_metric_faceted,
            dge_summary,
            metric,
            log_scale=True,
            color=clrs[color_key],
            title=plot_title,
        )
        plots.append(smk.pl.Plot(title=metric, description=description, plot_func=draw))

    visualizer.add_plot_group(
        smk.pl.PlotGroup(
            name=f"Run Mode: {run_mode}",
            description=f"Analysis results for {run_mode}",
            plots=plots,
        )
    )

In [None]:
# Render the histogram plot groups to HTML and show the result as cell output.
histograms_html = visualizer.generate_html()
display(histograms_html)

### Median plots per run mode

In [None]:
visualizer = smk.pl.TabVisualizer()

# One row per median plot: (metric column, key into `clrs`, label). The label
# is passed both as the Plot description and as the plot function's title, so
# the two cannot drift apart (the UMI plot used to be titled "median reads").
# NOTE(review): `clrs` is not defined in any cell of this notebook — confirm
# it is in scope when this cell runs.
median_specs = (
    ("total_counts", "umis", "median UMIs\nper spatial unit"),
    ("n_reads", "reads", "median reads\nper spatial unit"),
    ("reads_per_counts", "pcr", "median reads/UMI\nper spatial unit"),
)

# Build one plot group per run mode, holding the three median plots.
for run_mode, dge_summary in dge_summaries.items():
    configured_cutoffs = project_df.config.get_run_mode(run_mode).variables["umi_cutoff"]
    # Unique integer cutoffs in ascending order; a cutoff of 1 is always included.
    umi_cutoffs_values = sorted({int(u) for u in configured_cutoffs} | {1})

    plots = []
    for metric, color_key, label in median_specs:
        # partial() binds the current dge_summary and cutoffs now; the plot
        # function itself is only called later.
        draw = partial(
            smk.pl.spatial,
            dge_summary,
            metric,
            umi_cutoffs_values,
            color=clrs[color_key],
            title=label,
        )
        plots.append(smk.pl.Plot(title=metric, description=label, plot_func=draw))

    group = smk.pl.PlotGroup(
        name=f"Run Mode: {run_mode}",
        description=f"Analysis results for {run_mode}",
        plots=plots,
    )
    visualizer.add_plot_group(group)

In [None]:
# Render the median plot groups to HTML and show the result as cell output.
median_plots_html = visualizer.generate_html()
display(median_plots_html)

### Deciled median plots per run mode

In [None]:
visualizer = smk.pl.TabVisualizer()

# Build one plot group per run mode, holding a single deciled-median plot.
for run_mode, dge_summary in dge_summaries.items():
    # Computed eagerly here; only the plotting call is deferred via partial().
    decile_dat = report_utils.generate_deciled_data(dge_summary)

    plot = smk.pl.Plot(
        title="Deciled median",
        description="Deciled median",
        plot_func=partial(smk.pl.deciled_median, decile_dat),
    )

    group = smk.pl.PlotGroup(
        name=f"Run Mode: {run_mode}",
        description=f"Analysis results for {run_mode}",
        plots=[plot],
    )
    visualizer.add_plot_group(group)

In [None]:
# Render the deciled-median plot groups to HTML and show the result as cell output.
deciled_median_html = visualizer.generate_html()
display(deciled_median_html)