# <b>spacemake</b> - Quality Control

In [None]:
import numpy as np
import scanpy as sc
import spacemake as smk
import matplotlib.pyplot as plt

from itables import init_notebook_mode
from IPython import get_ipython
from functools import partial

init_notebook_mode(all_interactive=True)

In [None]:
# Inputs injected by snakemake when spacemake executes this notebook.
# To run the notebook standalone, replace every `snakemake.*` lookup below
# with a literal value of the type noted next to it.

# -- sample identity (snakemake wildcards) --
project_id = snakemake.wildcards.project_id # str
sample_id = snakemake.wildcards.sample_id # str
puck_barcode_file_id_qc = snakemake.wildcards.puck_barcode_file_id_qc # str (not a path, just the ID)

# -- rule parameters --
run_modes = snakemake.params.run_modes # list
complete_data_root = snakemake.params.complete_data_root # complete_data for a sample
is_spatial = snakemake.params['is_spatial'] # bool

# -- input files --
# NOTE(review): `snakemake.input` is the whole input collection, and the next
# line reads 'reads_type_out' from that same collection — confirm that
# adata_paths really holds only the dge/*.h5ad paths.
adata_paths = snakemake.input # paths to the dge/*.h5ad file(s)
split_reads_read_type = snakemake.input['reads_type_out'] # path to the split_reads file

# -- project-level files, located at the root spacemake folder --
config_yaml_path = "config.yaml"
project_df_path = "project_df.csv"

In [None]:
# Load the spacemake configuration and project table, then look up this sample.
config = smk.config.ConfigFile.from_yaml(config_yaml_path)
project_df = smk.project_df.ProjectDF(project_df_path, config=config)
sample_info = project_df.get_sample_info(project_id, sample_id)

# A single path given as a bare string is wrapped in a one-element list.
adata_paths = [adata_paths] if isinstance(adata_paths, str) else adata_paths
split_reads_read_type = (
    [split_reads_read_type]
    if isinstance(split_reads_read_type, str)
    else split_reads_read_type
)

# Without explicit run modes, fall back to the "run_mode" entry of the sample
# info; a single run mode given as a string is wrapped in a list.
if run_modes is None:
    run_modes = sample_info["run_mode"]
elif isinstance(run_modes, str):
    run_modes = [run_modes]

# Run modes and adata paths are paired by position, so their counts must agree.
if len(run_modes) != len(adata_paths):
    raise ValueError("'run_modes' and 'adata_paths' must have the same length")

# Maps the string form of each run mode to its adata path.
run_modes_adatas = {f'{mode}': path for mode, path in zip(run_modes, adata_paths)}

## QC tables

In [None]:
# Build the four QC tables with the helpers in smk.report.qc_sequencing.
qc_seq = smk.report.qc_sequencing

sample_info_df = qc_seq.create_sample_info_df(
    project_df, project_id, sample_id, puck_barcode_file_id_qc
)
run_modes_df = qc_seq.create_run_modes_df(run_modes, project_df)
mapping_stats_df = qc_seq.create_mapping_stats_df(split_reads_read_type, complete_data_root)
summary_beads_df = qc_seq.create_summary_beads_df(run_modes_adatas)

In [None]:
# Pass each QC table to spacemake's HTML table helper.
# NOTE(review): these are bare expression statements; a notebook cell only
# auto-displays the value of its last expression, so the first three tables
# appear only if metrics_table_html displays as a side effect — confirm.
smk.pl.metrics_table_html(sample_info_df)
smk.pl.metrics_table_html(run_modes_df)
smk.pl.metrics_table_html(mapping_stats_df)
smk.pl.metrics_table_html(summary_beads_df)

In [None]:
# Collect the QC tables into one tabbed view, one tab per table.
# The previous version added the sample-information table four times and
# never included the run-mode, mapping-statistics or bead-summary tables.
# NOTE(review): no generate_html() call on this visualizer is visible, and the
# name `visualizer` is rebound by the knee-plot cell — confirm whether this
# view is meant to be displayed.
visualizer = smk.pl.TabVisualizer()

# (tab title, dataframe) pairs, in display order.
qc_tables = [
    ("Sample Information", sample_info_df),
    ("Run Modes", run_modes_df),
    ("Mapping Statistics", mapping_stats_df),
    ("Summary Beads", summary_beads_df),
]

for table_title, table_df in qc_tables:
    visualizer.add_plot_group(
        smk.pl.PlotGroup(
            name=table_title,
            description="QC tables for the sample",
            plots=[
                smk.pl.DataFrameTable(
                    title=table_title,
                    description="[Description]",
                    data=table_df,
                )
            ],
        )
    )

## QC plots

Each QC plot is shown separately for every run mode, so that any downstream differences caused by the run mode settings become visible.

### 'Knee'-plot

Below we plot a so-called 'Knee'-plot: the y-axis shows the cumulative sum of reads, and the x-axis shows the bead barcodes sorted by number of reads. For single-cell samples, this plot tells you roughly how many beads are in the sample.

In [None]:
# One tab per run mode, each holding a single knee plot.
visualizer = smk.pl.TabVisualizer()
metric_desc = "Knee-plot"

for mode_name, adata_path in run_modes_adatas.items():
    # partial() binds the data argument now; the Plot object holds the callable.
    knee = smk.pl.Plot(
        title=metric_desc,
        description=f"Knee-plot for run mode {mode_name}",
        plot_func=partial(smk.pl.knee_plot, adata_path),
    )
    visualizer.add_plot_group(
        smk.pl.PlotGroup(
            name=f"Run Mode: {mode_name}",
            description=f"Analysis results for {mode_name}",
            plots=[knee],
        )
    )

In [None]:
# Render the tabbed view currently bound to `visualizer`.
tabs_html = visualizer.generate_html()
display(tabs_html)

### UMI-cutoff plots

In [None]:
# One tab per run mode, each holding a single UMI-cutoff plot.
visualizer = smk.pl.TabVisualizer()
metric_desc = "UMI-cutoff plots"

for mode_name, adata_path in run_modes_adatas.items():
    # partial() binds the data argument now; the Plot object holds the callable.
    cutoff_plot = smk.pl.Plot(
        title=metric_desc,
        description=f"Distribution of {metric_desc.lower()} for run mode {mode_name}",
        plot_func=partial(smk.pl.umi_cutoff, adata_path),
    )
    visualizer.add_plot_group(
        smk.pl.PlotGroup(
            name=f"Run Mode: {mode_name}",
            description=f"Analysis results for {mode_name}",
            plots=[cutoff_plot],
        )
    )

In [None]:
# Render the tabbed view currently bound to `visualizer`.
tabs_html = visualizer.generate_html()
display(tabs_html)

### Histogram of metrics over beads

Next we show metrics such as the number of UMIs, genes, reads and PCR per physical spot. We further distinguish between run modes, showing one histogram for each.

In [None]:
# One tab per run mode, each holding a single plot.
# NOTE(review): this cell sits under the "Histogram of metrics over beads"
# heading, but its title and plot function are the same as in the
# "Nucleotide distribution per beads" cell — presumably a copy-paste leftover;
# confirm which smk.pl function should be plotted here.
visualizer = smk.pl.TabVisualizer()
metric_desc = "Nucleotide distribution per beads"

for mode_name, adata_path in run_modes_adatas.items():
    # partial() binds the data argument now; the Plot object holds the callable.
    bead_plot = smk.pl.Plot(
        title=metric_desc,
        description=f"Distribution of {metric_desc.lower()} for run mode {mode_name}",
        plot_func=partial(smk.pl.nucleotide_distribution_beads, adata_path),
    )
    visualizer.add_plot_group(
        smk.pl.PlotGroup(
            name=f"Run Mode: {mode_name}",
            description=f"Analysis results for {mode_name}",
            plots=[bead_plot],
        )
    )

In [None]:
# Render the tabbed view currently bound to `visualizer`.
tabs_html = visualizer.generate_html()
display(tabs_html)

### Nucleotide distribution per beads

Next we bin the data into quartiles based on reads. For each run_mode the beads are divided into 4 bins by reads. This means that the first bin contains the beads which account for the first 25% of the reads, the second bin contains the beads which account for the second 25% of the reads, and so on.

<b>For each run mode we plot the nucleotide distribution per quartile.</b>

<b>Only non-meshed run_mode(s) are shown</b>

In [None]:
# One tab per run mode, each holding a single nucleotide-distribution plot.
visualizer = smk.pl.TabVisualizer()
metric_desc = "Nucleotide distribution per beads"

for mode_name, adata_path in run_modes_adatas.items():
    # partial() binds the data argument now; the Plot object holds the callable.
    nucleotide_plot = smk.pl.Plot(
        title=metric_desc,
        description=f"Distribution of {metric_desc.lower()} for run mode {mode_name}",
        plot_func=partial(smk.pl.nucleotide_distribution_beads, adata_path),
    )
    visualizer.add_plot_group(
        smk.pl.PlotGroup(
            name=f"Run Mode: {mode_name}",
            description=f"Analysis results for {mode_name}",
            plots=[nucleotide_plot],
        )
    )

In [None]:
# Render the tabbed view currently bound to `visualizer`.
tabs_html = visualizer.generate_html()
display(tabs_html)

### Shannon entropy and string compression

In [None]:
# One tab per run mode, each holding a single entropy/compression plot.
visualizer = smk.pl.TabVisualizer()
metric_desc = "Shannon Entropy and String Compression"

for mode_name, adata_path in run_modes_adatas.items():
    # partial() binds the data argument now; the Plot object holds the callable.
    entropy_plot = smk.pl.Plot(
        title=metric_desc,
        description=f"Distribution of {metric_desc.lower()} for run mode {mode_name}",
        plot_func=partial(smk.pl.entropy_compression, adata_path),
    )
    visualizer.add_plot_group(
        smk.pl.PlotGroup(
            name=f"Run Mode: {mode_name}",
            description=f"Analysis results for {mode_name}",
            plots=[entropy_plot],
        )
    )

In [None]:
# Render the tabbed view currently bound to `visualizer`.
tabs_html = visualizer.generate_html()
display(tabs_html)

In [None]:
# The remaining cells produce spatial QC plots, so stop here when the sample
# is NOT spatial. (The condition was previously inverted: it halted spatial
# samples and let non-spatial ones continue.)
if not is_spatial:
    print("WARNING: Notebook will not continue - it is not a spatial sample!")
    # NOTE(review): confirm that the object returned by get_ipython() provides
    # a `stop_here` method in the environment this notebook runs in.
    get_ipython().stop_here()

### Spatial QC

In [None]:
# Metric key -> human-readable label used as plot title and in descriptions.
# Insertion order determines the order in which the plots are created.
spatial_metrics = dict(
    n_genes_by_counts="# of genes per spatial unit",
    total_counts="# of UMIs per spatial unit",
    pct_counts_mt="# % mt counts per spatial unit",
    n_reads="# of reads per spatial unit",
    reads_per_counts="reads/UMI per spatial unit",
    n_joined="# beads joined per spatial unit",
    exact_entropy="Shannon entropy per spatial unit",
    exact_compression="barcode length after compression per spatial unit",
)

In [None]:
# One tab per run mode; each tab holds one spatial plot per entry of
# spatial_metrics, in that dictionary's order.
visualizer = smk.pl.TabVisualizer()

for mode_name, adata_path in run_modes_adatas.items():
    # partial() binds the data and the metric key at creation time, so each
    # Plot keeps its own metric.
    metric_plots = [
        smk.pl.Plot(
            title=label,
            description=f"Distribution of {label.lower()} for run mode {mode_name}",
            plot_func=partial(smk.pl.spatial, adata_path, color=key),
        )
        for key, label in spatial_metrics.items()
    ]

    visualizer.add_plot_group(
        smk.pl.PlotGroup(
            name=f"Run Mode: {mode_name}",
            description=f"Analysis results for {mode_name}",
            plots=metric_plots,
        )
    )

In [None]:
# Render the tabbed view currently bound to `visualizer`.
tabs_html = visualizer.generate_html()
display(tabs_html)