In [1]:
import numpy as np
import pandas as pd
from distortions.geometry import Geometry, bind_metric, local_distortions
# Seed NumPy's global RNG so the points drawn by two_clusters_differential
# (which samples through np.random.normal) are the same on every run.
np.random.seed(20250702)

def two_clusters_differential(n):
    """Simulate two 2D Gaussian clusters with very different spreads.

    Rows alternate between the two clusters: even rows come from a diffuse
    cluster centred at the origin (standard deviation 10 on each axis), odd
    rows from a tight cluster centred at (30, 0) (standard deviation 1).
    Samples are drawn from NumPy's global RNG, so seed it beforehand when
    reproducible output is needed.

    Parameters
    ----------
    n : int
        Number of points to draw from *each* cluster.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(2 * n, 2)``. For ``n == 0`` this is an empty
        ``(0, 2)`` array, so callers can always rely on two columns.
    """
    points = []
    for _ in range(n):
        # Draw order (wide x, wide y, tight x, tight y) is kept fixed so a
        # given seed always yields the same sample.
        points.append([10 * np.random.normal(), 10 * np.random.normal()])
        points.append([30 + np.random.normal(), np.random.normal()])
    # Without the reshape an empty draw would come back 1-D with shape (0,).
    return np.array(points, dtype=float).reshape(-1, 2)

In [2]:
from anndata import AnnData
import scanpy as sc

# M points per cluster; n_neighbors is shared by the scanpy neighbor graph
# and the Geometry adjacency settings below.
M = 500
n_neighbors = 50
data = two_clusters_differential(M)

# Build the neighbor graph and UMAP layout with scanpy, then take a private
# copy of the coordinates stored under "X_umap".
adata = AnnData(X=data, obs=pd.DataFrame(range(2 * M)))
sc.pp.neighbors(adata, n_neighbors=n_neighbors)
sc.tl.umap(adata)
embedding = adata.obsm["X_umap"].copy()

# The affinity radius is the mean of the stored neighbor-graph distances.
radius = np.mean(adata.obsp["distances"].data)
geom = Geometry(
    "brute",
    laplacian_method="geometric",
    affinity_kwds={"radius": radius},
    adjacency_kwds={"n_neighbors": n_neighbors},
    laplacian_kwds={"scaling_epps": 1},
)
H, Hvv, Hs = local_distortions(embedding, data, geom)

# postprocessing: cap the Hs values at 5, rescale them to mean 1 (both in
# place), then rebuild every H[i] as Hvv[i] @ diag(Hs[i]) @ Hvv[i].T so that
# H agrees with the adjusted Hs.
Hs[Hs > 5] = 5
Hs /= Hs.mean()
for i in range(len(H)):
    V_i, s_i = Hvv[i], Hs[i]
    H[i] = V_i @ np.diag(s_i) @ V_i.T

# From here on `embedding` is whatever bind_metric returns, not the raw
# coordinate array. Labels alternate A, B because two_clusters_differential
# appends one point of each cluster per iteration, in that order.
embedding = bind_metric(embedding, Hvv, Hs)
embedding["cluster"] = ["A", "B"] * M

In [None]:
import altair as alt

# Scatter the first column of Hs against the second, one point per
# observation, coloured by cluster label.
df_plot = pd.DataFrame(
    {
        "s0": Hs[:, 0],
        "s1": Hs[:, 1],
        "cluster": embedding["cluster"],
    }
)
lambda_plot = (
    alt.Chart(df_plot)
    .mark_circle()
    .encode(
        x=alt.X("s0", axis=alt.Axis(title="λ₁")),
        y=alt.Y("s1", axis=alt.Axis(title="λ₂")),
        color=alt.Color(
            "cluster",
            scale=alt.Scale(domain=["A", "B"], range=["#40e0d0", "#ff9d06"]),
        ),
    )
    .configure_axis(grid=False)
)

#lambda_plot.save("../paper/figures/two_clusters_lambda.svg")

In [4]:
# Plotting frame for the original (pre-embedding) coordinates.
#
# It is rebuilt from adata.X -- the matrix the AnnData object was constructed
# from -- instead of from the previous value of `data`. The earlier form
# (`data = pd.DataFrame(data)` followed by assigning three column names) only
# worked on the first run: once `data` was already the five-column frame,
# re-running the cell failed with a column-length mismatch.
#
# NOTE: `data` is still rebound from the raw array to this DataFrame, because
# later cells plot from it under that name.
data = pd.DataFrame(np.asarray(adata.X), columns=["x", "y"]).assign(
    cluster=embedding["cluster"],
    # Constant s0/s1 for every point (presumably so the ellipse glyphs in the
    # original space are drawn as equal circles -- confirm against dplot).
    s0=1,
    s1=1,
)

In [5]:
from distortions.visualization import dplot

# Registry of named figures; the entry below shows the points in their
# original coordinates, coloured by cluster.
plots = {}
plots["two_clusters"] = (
    dplot(data, width=450, height=350, labelFontSize=14)
    .mapping(x="x", y="y", color="cluster")
    .scale_color(scheme=["turquoise", "orange"])
    .geom_ellipse(radiusMax=6, radiusMin=1)
    .labs(x="Original 1", y="Original 2")
)

In [6]:
from distortions.geometry import neighborhoods

# Neighborhood structure passed to the edge-link interaction below.
N = neighborhoods(adata, threshold=0.1, outlier_factor=3)

# The parenthesised chain is the cell's final expression, so the widget is
# still rendered as the cell output.
(
    dplot(embedding, width=450, height=350, labelFontSize=14)
    .mapping(x="embedding_0", y="embedding_1", color="cluster")
    .inter_edge_link(N=N, threshold=1, backgroundOpacity=0.8)
    .scale_color(scheme=["turquoise", "orange"])
    .geom_ellipse(radiusMax=10, radiusMin=1)
    .labs(x="UMAP1", y="UMAP2")
)

dplot(dataset=[{'embedding_0': -7.3624162673950195, 'embedding_1': 5.324365139007568, 'x0': -0.061769458747554…

In [7]:
# One entry of H per observation, keyed by row position, as passed to
# inter_isometry below.
metrics = {k: H[k] for k in range(len(H))}


def _isometry_plot(color):
    """Build the isometry-interaction view of the UMAP embedding.

    The three figures stored below are identical except for the column mapped
    to colour, so the shared chain lives here instead of being copied.

    Parameters
    ----------
    color : str
        Name of the field mapped to the colour channel.

    Returns
    -------
    The configured dplot object (reads the module-level `embedding` and
    `metrics` at call time).
    """
    return (
        dplot(embedding, width=450, height=350, labelFontSize=14)
        .mapping(x="embedding_0", y="embedding_1", color=color)
        .geom_ellipse(radiusMin=1, radiusMax=10)
        .inter_isometry(metrics=metrics, metric_bw=1, transformation_bw=.1, stroke="#f7f7f7")
        .scale_color(scheme=["turquoise", "orange"])
        .scale_size()
        .labs(x="UMAP1", y="UMAP2")
    )


# Same keys, same insertion order as before: isometry, metric, transform.
plots.update(
    {
        plot_name: _isometry_plot(color_field)
        for plot_name, color_field in [
            ("two_clusters_isometry", "cluster"),
            ("two_clusters_metric", "kernel_metric"),
            ("two_clusters_transform", "kernel_transform"),
        ]
    }
)

In [None]:
# Uncomment to export every figure stored in `plots` as an SVG for the paper:
#[p.save(f"../paper/figures/{k}.svg") for k, p in plots.items()]

[None, None, None, None]

In [9]:
# Render every stored figure. A plain loop replaces the list comprehension so
# the cell no longer builds a throwaway list and echoes it as
# [None, None, ...] after the figures.
for figure in plots.values():
    display(figure)

dplot(dataset=[{'x': -10.53549603766011, 'y': -1.8629896720824943, 'cluster': 'A', 's0': 1, 's1': 1}, {'x': 29…

dplot(dataset=[{'embedding_0': -7.3624162673950195, 'embedding_1': 5.324365139007568, 'x0': -0.061769458747554…

dplot(dataset=[{'embedding_0': -7.3624162673950195, 'embedding_1': 5.324365139007568, 'x0': -0.061769458747554…

dplot(dataset=[{'embedding_0': -7.3624162673950195, 'embedding_1': 5.324365139007568, 'x0': -0.061769458747554…

[None, None, None, None]

Here is an example with a random initialization.

In [10]:
# Recompute UMAP on the same neighbor graph, this time from a random initial
# layout. This overwrites adata.obsm["X_umap"], hence the copy.
sc.tl.umap(adata, init_pos="random")
embedding_random = adata.obsm["X_umap"].copy()

# Pass adata.X as the original-space coordinates. By this point `data` has
# been rebound to the plotting DataFrame (x, y, cluster, s0, s1), which is not
# the raw coordinate matrix that local_distortions received for the first
# embedding.
# NOTE: H, Hvv and Hs from the first embedding are overwritten here, and the
# cap/rescale postprocessing applied to the first embedding is not repeated
# (TODO confirm that this is intended).
H, Hvv, Hs = local_distortions(embedding_random, adata.X, geom)
embedding_random = bind_metric(embedding_random, Hvv, Hs)
# Labels alternate A, B in the same row order as the simulated points.
embedding_random["cluster"] = ["A", "B"] * M

In [11]:
# Neighborhood structure for the randomly initialised embedding (adata now
# holds that layout under "X_umap").
N = neighborhoods(adata, threshold=0.1, outlier_factor=2)

# Same edge-link view as for the default embedding. Axis labels are set
# explicitly so this figure matches the labelled UMAP plot of the first
# embedding instead of falling back to the raw column names.
(
    dplot(embedding_random, width=450, height=350, labelFontSize=14)
    .mapping(x="embedding_0", y="embedding_1", color="cluster")
    .inter_edge_link(N=N, threshold=1, backgroundOpacity=0.8)
    .scale_color(scheme=["turquoise", "orange"])
    .geom_ellipse(radiusMax=6, radiusMin=1)
    .labs(x="UMAP1", y="UMAP2")
)

dplot(dataset=[{'embedding_0': 11.932002067565918, 'embedding_1': 3.433741331100464, 'x0': -0.8767524756933656…