# Visualizing pre-processing results

In [None]:
import holoviews as hv

from hv_anndata import ACCESSOR as A
from hv_anndata import register

# NOTE(review): presumably hooks hv-anndata's AnnData support into HoloViews,
# which the cells below rely on when passing `adata` straight to elements —
# confirm against the hv-anndata documentation.
register()

# Use Bokeh as the HoloViews plotting backend for this notebook.
hv.extension("bokeh")

In [None]:
import numpy as np
import scanpy as sc
from anndata import AnnData
from fast_array_utils import stats

In [None]:
# Example dataset from scanpy's `datasets` module; every cell below works on this object.
adata = sc.datasets.pbmc68k_reduced()

{func}`scanpy.pl.highest_expr_genes`

missing:
- horizontal box plots

In [None]:
def highest_expr_genes(
    adata: AnnData,
    n_top: int = 20,
    *,
    layer: str | None = None,
    gene_symbols: str | None = None,
) -> AnnData:
    """Collect per-cell expression of the ``n_top`` most expressed genes.

    Counts are normalized per cell to a total of 100, so values read as
    "% of total counts". Genes are ranked by their mean normalized value
    across cells and the ``n_top`` highest are kept, highest first.

    Parameters
    ----------
    adata
        Annotated data matrix. It is not modified, because normalization
        runs with ``inplace=False``.
    n_top
        Number of genes to keep.
    layer
        Passed through to :func:`scanpy.pp.normalize_total` as ``layer``.
    gene_symbols
        Name of a column in ``adata.var`` whose values are used as gene
        labels. When ``None``, ``adata.var_names`` is used.

    Returns
    -------
    A new :class:`~anndata.AnnData` holding the normalized values of the
    selected genes, with ``obs_names`` copied from ``adata`` and
    ``var_names`` set to the gene labels.
    """
    norm_expr = sc.pp.normalize_total(
        adata, target_sum=100, layer=layer, inplace=False
    )["X"]
    mean_percent = stats.mean(norm_expr, axis=0)
    # Ascending argsort reversed -> indices of the genes with the largest mean first.
    top_idx = np.argsort(mean_percent)[::-1][:n_top]
    counts_top_genes = norm_expr[:, top_idx]
    var_labels = (
        adata.var_names[top_idx]
        if gene_symbols is None
        else adata.var[gene_symbols].iloc[top_idx].astype("string")
    )
    # AnnData only interprets the mapping keys "obs_names"/"var_names" as the
    # index. Any other key (such as "names") becomes an ordinary column and
    # leaves the index as "0", "1", ..., so `.to_df()` would lose the labels.
    return AnnData(
        counts_top_genes,
        obs=dict(obs_names=adata.obs_names),
        var=dict(var_names=var_labels),
    )


# Reshape the cells x genes table to long form (one row per cell/gene pair)
# and draw one box per gene.
hv.BoxWhisker(
    highest_expr_genes(adata).to_df().melt(
        var_name="gene",
        value_name="% of total counts",
    ),
    kdims=["gene"],
    vdims=["% of total counts"],
).opts(
    xrotation=90,  # gene labels are rotated so they do not overlap
    width=400,
)

~~{func}`scanpy.pl.filter_genes_dispersion`~~ deprecated in favor of:

{func}`scanpy.pl.highly_variable_genes`

In [None]:
# The next cell reads `adata.uns["hvg"]["flavor"]` plus the `means`,
# `highly_variable` and dispersion/variance columns of `var`, which this call
# is expected to populate.
sc.pp.highly_variable_genes(adata)
# scanpy's own plot, reproduced with HoloViews in the next cell:
# sc.pl.highly_variable_genes(adata)

In [None]:
# The column names of the raw and normalized statistic depend on the flavor
# that was recorded when the highly variable genes were computed.
if adata.uns["hvg"]["flavor"] == "seurat_v3":
    d1, d2 = "variances", "variances_norm"
else:
    d1, d2 = "dispersions", "dispersions_norm"

# One scatter per statistic (normalized first, then raw), plotted against the
# mean and colored by the `highly_variable` flag.
hv.Layout([
    hv.Scatter(
        adata,
        kdims=[A.var["means"]],
        vdims=[A.var[d], A.var["highly_variable"]],
    ).opts(
        color=A.var["highly_variable"],
        cmap={True: "black", False: "gray"},
    )
    for d in (d2, d1)
])

{func}`scanpy.pl.scrublet_score_distribution`

TODO:
- batches

missing:
- where are the ticks on the y axis?

In [None]:
# Simulate doublets from `adata`, then run scrublet on `adata` with them.
# The next cell reads the `doublet_score` column of `obs` and
# `adata.uns["scrublet"]` (`doublet_scores_sim`, `threshold`).
adata_sim = sc.pp.scrublet_simulate_doublets(adata)
sc.pp.scrublet(adata, adata_sim)
# scanpy's own plot, reproduced with HoloViews in the next cell:
# sc.pl.scrublet_score_distribution(adata)

In [None]:
# Axis labels shared by both histograms.
labels = {
    "xlabel": "Doublet score",
    "ylabel": "Probability density",
}

# First panel: scores of the observed cells (log-scaled y axis).
# Second panel: scores of the simulated doublets.
# The threshold line is combined with the layout via `*`.
hv.Layout([
    hv.Dataset(adata, kdims=[], vdims=[A.obs["doublet_score"]])
    .hist(A.obs["doublet_score"], adjoin=False)
    .opts(xlim=(0, 1), logy=True, ylim=(1, None), **labels),
    hv.Table(adata.uns["scrublet"]["doublet_scores_sim"], kdims="scores")
    .hist("scores", adjoin=False)
    .opts(xlim=(0, 1), shared_axes=False, **labels),
]) * hv.VLine(adata.uns["scrublet"]["threshold"])