## UMAP caveats

In [None]:
import sklearn.datasets
from sklearn.datasets import fetch_openml
import sklearn

import umap
import umap.plot as uplot

import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's default theme to the matplotlib figures drawn below
sns.set_theme()

### Case when distances between clusters have meaning

In [None]:
# Pull the 'mnist_784' digits dataset from OpenML
mnist = fetch_openml('mnist_784', parser='auto')

# Keep the targets as labels and rescale every feature by 1/255
# (presumably mapping 0-255 pixel intensities into [0, 1] -- confirm)
mnist_label = mnist.target
mnist_data = mnist.data / 255

In [None]:
# Fit UMAP with its default hyper-parameters on the normalised digits.
# The fitted estimator is kept because the plotting helpers below take it
# as input; `embedding` holds the value returned by fit_transform.
# NOTE(review): no random_state is passed, so the layout may differ
# between runs -- confirm whether reproducibility matters here.
mnist_umap = umap.UMAP()
embedding = mnist_umap.fit_transform(mnist_data)

In [None]:
# Scatter plot of the fitted embedding, coloured by digit label
uplot.points(mnist_umap, labels=mnist_label)

In [None]:
# Connectivity plot for the fitted model, with the embedded points drawn
# on top and coloured by digit label. Uses the `uplot` alias for
# consistency with the other plotting cells.
uplot.connectivity(mnist_umap, show_points=True, labels=mnist_label)

In [None]:
# PCA-based diagnostic plot of the fitted model. Uses the `uplot` alias
# for consistency with the other plotting cells.
uplot.diagnostic(mnist_umap, diagnostic_type='pca')

### Case when distance between clusters is misleading

In [None]:
# Three blobs in 3-D whose centres sit at 1, 10 and 100 on every axis,
# so the gaps between neighbouring clusters differ by an order of magnitude
blob_centers = [[1, 1, 1], [10, 10, 10], [100, 100, 100]]
blob_data, blob_ids = sklearn.datasets.make_blobs(
    n_samples=300, n_features=3, centers=blob_centers,
)

# Show the first two features of the raw data, coloured by blob id
sns.set_theme()
sns.scatterplot(
    x=blob_data[:, 0], y=blob_data[:, 1], hue=blob_ids,
).set_title('Original data')


In [None]:
# Fit UMAP on the 3-D blobs using 50 neighbours per point
umap_obj_blobs = umap.UMAP(n_neighbors=50)
embedding_blobs = umap_obj_blobs.fit_transform(blob_data)

# Plot the embedding, coloured by blob id
uplot.points(umap_obj_blobs, labels=blob_ids)

In [None]:
# Connectivity plot for the blob model, with the embedded points drawn on
# top and coloured by blob id. Uses the `uplot` alias for consistency with
# the other plotting cells.
uplot.connectivity(umap_obj_blobs, show_points=True, labels=blob_ids)

In [None]:
# PCA-based diagnostic plot of the blob model. Uses the `uplot` alias for
# consistency with the other plotting cells.
uplot.diagnostic(umap_obj_blobs, diagnostic_type='pca')

### Case when cluster size is misleading

In [None]:
# Three 2-D blobs; the third is given a standard deviation of 4 while the
# first two use 1, so it is visibly wider in the raw data
blob_centers = [[1, 1], [10, 10], [50, 50]]
blob_data, blob_ids = sklearn.datasets.make_blobs(
    n_samples=600, n_features=2, centers=blob_centers, cluster_std=[1, 1, 4],
)

# Raw data for reference, coloured by blob id
sns.scatterplot(
    x=blob_data[:, 0], y=blob_data[:, 1], hue=blob_ids,
).set_title('Original data')

In [None]:
# Fit UMAP on the unequal-width blobs using 30 neighbours per point
umap_obj_blobs = umap.UMAP(n_neighbors=30)
embedding_blobs = umap_obj_blobs.fit_transform(blob_data)

# Plot the embedding, coloured by blob id, for comparison with the
# scatter plot of the original data
uplot.points(umap_obj_blobs, labels=blob_ids)

### Case when random noise does not look random

In [None]:
# A single wide blob (standard deviation 10) centred at (1, 1); with only
# one centre there is no cluster structure to recover
blob_centers = [[1, 1]]
blob_data, blob_ids = sklearn.datasets.make_blobs(
    n_samples=600, n_features=2, centers=blob_centers, cluster_std=[10],
)

In [None]:
# Raw noise for reference; every point carries the same blob id
sns.set_theme()
sns.scatterplot(
    x=blob_data[:, 0], y=blob_data[:, 1], hue=blob_ids,
).set_title('Original data')

In [None]:
# Fit UMAP on the single noise blob with a very small neighbourhood
# (4 neighbours per point)
umap_obj_blobs = umap.UMAP(n_neighbors=4)
embedding_blobs = umap_obj_blobs.fit_transform(blob_data)

# Plot the embedding, coloured by blob id (a single id here)
uplot.points(umap_obj_blobs, labels=blob_ids)