# DiRe RAPIDS Installation

### Development install (enabled by default; comment out if you use the PyPI install)

In [None]:
# Clone the repository (development install from a source checkout)
!git clone https://github.com/sashakolpakov/dire-rapids.git
# Move into the checkout so that the editable install below resolves "."
%cd dire-rapids
# Install FlowIO for cell data
%pip install flowio
# Editable install of the checkout with its `cuda` extra (CUDA support)
%pip install -e .[cuda]

### PyPI install (uncomment for use)

In [None]:
#!pip install dire-rapids
#!pip install flowio
# Clone the repository to access extra utils
#!git clone https://github.com/sashakolpakov/dire-rapids.git
#%cd dire-rapids

## Reducer Instance Runner

In [None]:
from dire_rapids import ReducerConfig, ReducerRunner

## Importing Reducer Factories

In [None]:
from dire_rapids import create_dire
from cuml import UMAP
from cuml import TSNE

## Data Transforms

In [None]:
#
import numpy as np
from sklearn.preprocessing import StandardScaler

# --- Common transforms -------------------------------------------------
def scale_center(X, y):
    """Rescale features to float32 via ``X / 255 - 0.5`` and cast labels to int32.

    The divisor suggests 8-bit intensities (0..255 -> -0.5..0.5); this is an
    assumption about the intended input, not something enforced here.

    Parameters
    ----------
    X : numpy.ndarray
        Feature array.
    y : numpy.ndarray or None
        Labels; passed through as ``None`` when absent.

    Returns
    -------
    tuple
        ``(X_scaled, y_int32_or_None)``.
    """
    features = X.astype(np.float32, copy=False)
    features = features / 255.0 - 0.5
    labels = None if y is None else y.astype(np.int32, copy=False)
    return features, labels

def z_score(X, y):
    """Standardize features with a freshly fitted ``StandardScaler``; cast labels to int32.

    Parameters
    ----------
    X : numpy.ndarray
        Feature array; converted to float32 before scaling.
    y : numpy.ndarray or None
        Labels; passed through as ``None`` when absent.

    Returns
    -------
    tuple
        ``(X_standardized, y_int32_or_None)``.
    """
    as_float = X.astype(np.float32, copy=False)
    standardized = StandardScaler().fit_transform(as_float)
    if y is None:
        return standardized, None
    return standardized, y.astype(np.int32, copy=False)

## Reducer Configuration and Running

In [None]:
# Print the built-in documentation of ReducerRunner
help(ReducerRunner)

In [None]:
# Print the built-in documentation of ReducerConfig
help(ReducerConfig)

## Some standard datasets

In [None]:
#
# Reducer configuration: DiRe
#
# Shared DiRe configuration used by the standard-dataset cells.
dire_config = ReducerConfig(
    name='dire',
    reducer_class=create_dire,
    reducer_kwargs={
        "backend": 'auto',
        "memory_efficient": True,
        "n_components": 2,
        "n_neighbors": 16,
        "init": 'pca',
        "max_iter_layout": 64,
        "min_dist": 1e-4,
        "spread": 2.0,
        "cutoff": 2.0,
        "n_sample_dirs": 8,
        "sample_size": 32,
        "neg_ratio": 8,
        "random_state": 42,
        "verbose": True,
    },
    visualize=True,
    categorical_labels=True,
)
#
# Reducer configuration UMAP
#
# Shared cuML UMAP configuration used by the comparison cells.
umap_config = ReducerConfig(
    name='umap',
    reducer_class=UMAP,
    reducer_kwargs={
        "n_components": 2,
        "n_neighbors": 16,
        "init": 'spectral',
        "verbose": True,
    },
    visualize=True,
    categorical_labels=True,
)
#
# Reducer configuration tSNE
#
# Shared cuML t-SNE configuration used by the comparison cells.
tsne_config = ReducerConfig(
    name='tsne',
    reducer_class=TSNE,
    reducer_kwargs={
        "n_components": 2,
        "init": 'pca',
        "random_state": 42,
        "verbose": True,
    },
    visualize=True,
    categorical_labels=True,
)

### Blobs

In [None]:
# Blobs with DiRe. This run uses 6_000_000 points; runs with 100_000 or
# 1_000_000 points and a few dozen centers are also worth trying.
runner = ReducerRunner(dire_config)
res_blobs_dire = runner.run(
    "sklearn:blobs",
    dataset_kwargs={
        "n_samples": 6_000_000,
        "centers": 32,
        "n_features": 100,
        "random_state": 42,
    },
)

In [None]:
# Blobs with UMAP. This run uses 6_000_000 points; runs with 100_000 or
# 1_000_000 points and a few dozen centers are also worth trying.
runner = ReducerRunner(umap_config)
res_blobs_umap = runner.run(
    "sklearn:blobs",
    dataset_kwargs={
        "n_samples": 6_000_000,
        "centers": 32,
        "n_features": 100,
        "random_state": 42,
    },
)

In [None]:
# Blobs with t-SNE, here on 100_000 points; 1_000_000 points and a few dozen
# centers are also worth trying.
runner = ReducerRunner(tsne_config)
res_blobs_tsne = runner.run(
    "sklearn:blobs",
    dataset_kwargs={
        "n_samples": 100_000,
        "centers": 32,
        "n_features": 100,
        "random_state": 42,
    },
)

### Uniform Distribution inside a Disk

In [None]:
# Uniform disk dataset embedded with DiRe.
runner = ReducerRunner(dire_config)
res_disk_dire = runner.run(
    "dire:disk_uniform",
    dataset_kwargs={"n_samples": 100_000, "n_features": 2, "random_state": 42},
)

In [None]:
# Uniform disk dataset embedded with UMAP (50_000 samples in this run).
runner = ReducerRunner(umap_config)
res_disk_umap = runner.run(
    "dire:disk_uniform",
    dataset_kwargs={"n_samples": 50_000, "n_features": 2, "random_state": 42},
)

In [None]:
# Uniform disk dataset embedded with t-SNE.
runner = ReducerRunner(tsne_config)
res_disk_tsne = runner.run(
    "dire:disk_uniform",
    dataset_kwargs={"n_samples": 100_000, "n_features": 2, "random_state": 42},
)

### Uniform Distribution on a Sphere

In [None]:
# Uniform sphere dataset embedded with DiRe.
runner = ReducerRunner(dire_config)
res_sphere_dire = runner.run(
    "dire:sphere_uniform",
    dataset_kwargs={"n_samples": 100_000, "n_features": 3, "random_state": 42},
)

In [None]:
# Uniform sphere dataset embedded with UMAP (10_000 samples in this run).
runner = ReducerRunner(umap_config)
res_sphere_umap = runner.run(
    "dire:sphere_uniform",
    dataset_kwargs={"n_samples": 10_000, "n_features": 3, "random_state": 42},
)

In [None]:
# Uniform sphere dataset embedded with t-SNE.
runner = ReducerRunner(tsne_config)
res_sphere_tsne = runner.run(
    "dire:sphere_uniform",
    dataset_kwargs={"n_samples": 100_000, "n_features": 3, "random_state": 42},
)

### Uniform Distribution on an Ellipsoid

In [None]:
# Uniform ellipsoid dataset embedded with DiRe. The result is stored under its
# own name so it does not overwrite res_blobs_dire from the blobs section.
runner = ReducerRunner(dire_config)
res_ellipsoid_dire = runner.run(
    "dire:ellipsoid_uniform",
    dataset_kwargs=dict(n_samples=100_000, semi_axes=[2, 3, 5], random_state=42),
)

In [None]:
# Uniform ellipsoid dataset embedded with UMAP. The result is stored under its
# own name so it does not overwrite res_blobs_umap from the blobs section.
runner = ReducerRunner(umap_config)
res_ellipsoid_umap = runner.run(
    "dire:ellipsoid_uniform",
    dataset_kwargs=dict(n_samples=50_000, semi_axes=[2, 3, 5], random_state=42),
)

In [None]:
# Uniform ellipsoid dataset embedded with t-SNE. The result is stored under its
# own name so it does not overwrite res_blobs_tsne from the blobs section.
runner = ReducerRunner(tsne_config)
res_ellipsoid_tsne = runner.run(
    "dire:ellipsoid_uniform",
    dataset_kwargs=dict(n_samples=100_000, semi_axes=[2, 3, 5], random_state=42),
)

### MNIST Digits (small)

In [None]:
# Small digits dataset ("sklearn:load_digits") embedded with DiRe.
runner = ReducerRunner(dire_config)
res_digits_dire = runner.run(
    "sklearn:load_digits",
    dataset_kwargs={},
)

In [None]:
# Small digits dataset ("sklearn:load_digits") embedded with UMAP.
runner = ReducerRunner(umap_config)
res_digits_umap = runner.run(
    "sklearn:load_digits",
    dataset_kwargs={},
)

In [None]:
# Small digits dataset ("sklearn:load_digits") embedded with t-SNE.
runner = ReducerRunner(tsne_config)
res_digits_tsne = runner.run(
    "sklearn:load_digits",
    dataset_kwargs={},
)

### Half-moons

In [None]:
#
# Half-moons dataset embedded with DiRe.
runner = ReducerRunner(dire_config)
res_moons_dire = runner.run(
    "sklearn:moons",
    dataset_kwargs={"n_samples": 100_000, "noise": 0.05, "random_state": 42},
)

In [None]:
# Half-moons dataset embedded with UMAP.
# Kept at 50_000 samples: 100_000 may break cuML UMAP (it failed on a T4).
runner = ReducerRunner(umap_config)
res_moons_umap = runner.run(
    "sklearn:moons",
    dataset_kwargs={"n_samples": 50_000, "noise": 0.05, "random_state": 42},
)

In [None]:
# Half-moons dataset embedded with t-SNE.
runner = ReducerRunner(tsne_config)
res_moons_tsne = runner.run(
    "sklearn:moons",
    dataset_kwargs={"n_samples": 100_000, "noise": 0.05, "random_state": 42},
)

### Swiss roll

In [None]:
# Swiss roll with DiRe. Categorical labels are switched off on the shared
# config for this dataset (presumably because its labels are continuous).
dire_config.categorical_labels = False
runner = ReducerRunner(dire_config)
res_swiss_dire = runner.run(
    "sklearn:swiss_roll",
    dataset_kwargs={"n_samples": 25_000, "noise": 0.05, "random_state": 42},
)

In [None]:
# Swiss roll with UMAP. Categorical labels are switched off on the shared
# config for this dataset (presumably because its labels are continuous).
umap_config.categorical_labels = False
runner = ReducerRunner(umap_config)
res_swiss_umap = runner.run(
    "sklearn:swiss_roll",
    dataset_kwargs={"n_samples": 25_000, "noise": 0.05, "random_state": 42},
)

In [None]:
# Swiss roll with t-SNE. Categorical labels are switched off on the shared
# config for this dataset (presumably because its labels are continuous).
tsne_config.categorical_labels = False
runner = ReducerRunner(tsne_config)
res_swiss_tsne = runner.run(
    "sklearn:swiss_roll",
    dataset_kwargs={"n_samples": 25_000, "noise": 0.05, "random_state": 42},
)

### UCI ML Wine dataset

In [None]:
# Wine dataset with DiRe. Categorical labels are switched back on because the
# swiss-roll cell turned them off on the shared config.
dire_config.categorical_labels = True
runner = ReducerRunner(dire_config)
res_wine_dire = runner.run(
    "sklearn:wine",
    dataset_kwargs={"random_state": 42},
)

### New runner: PCA + DiRe for Classification dataset

In [None]:
# DiRe configuration for the high-dimensional classification example.
hd_dire_config = ReducerConfig(
    name='hd_dire',
    reducer_class=create_dire,
    reducer_kwargs={
        "backend": 'auto',
        "memory_efficient": True,
        "n_components": 2,
        "n_neighbors": 32,
        "init": 'pca',
        "max_iter_layout": 128,
        "min_dist": 1e-4,
        "spread": 2.0,
        "cutoff": 16.0,
        "n_sample_dirs": 4,
        "sample_size": 16,
        "neg_ratio": 32,
        "random_state": 42,
        "verbose": True,
    },
    visualize=True,
    categorical_labels=True,
)


# --- PCA *before* kNN ---
def pca_factory(n_components=128, whiten: bool = False, random_state: int = 42):
    """
    Build an ``(X, y) -> (X_reduced, y)`` transform that runs PCA on the features.

    cuML's PCA is used when ``cuml.decomposition`` can be imported; otherwise
    scikit-learn's PCA is used. No z-scoring; PCA centers data internally.

    Parameters
    ----------
    n_components : int
        Upper bound on the number of retained components; it is clipped to the
        number of input columns at transform time.
    whiten : bool
        Forwarded to the PCA estimator.
    random_state : int
        Forwarded to the scikit-learn PCA only; the cuML path does not use it.

    Returns
    -------
    callable
        ``_pca(X, y)`` returning the float32 PCA projection of ``X`` and ``y``.
        Labels with a string/bytes/object dtype are replaced by int32 codes
        (index into the sorted unique values); other labels, and ``None``,
        pass through unchanged.
    """
    # Backend selection happens once, when the factory is called.
    try:
        from cuml.decomposition import PCA as cuPCA
    except Exception:
        # Deliberately broad: any failure to load cuML selects scikit-learn.
        cuPCA = None

    if cuPCA is None:
        from sklearn.decomposition import PCA as skPCA

    def _encode_labels(y):
        """Map string/bytes/object labels to int32 codes; leave others as-is."""
        kind = getattr(getattr(y, "dtype", None), "kind", None)
        if kind not in {"U", "S", "O"}:
            return y
        # Inverse indices into the sorted unique values are the label codes.
        _, codes = np.unique(y, return_inverse=True)
        return codes.astype(np.int32)

    def _pca(X, y):
        # No `copy=False` here: NumPy < 2.0 rejects the keyword on asarray and
        # NumPy >= 2.0 raises when the float32 conversion needs a copy (e.g.
        # float64 input). Plain asarray still avoids the copy when it can.
        X = np.asarray(X, dtype=np.float32)
        k = int(min(n_components, X.shape[1]))
        if cuPCA is not None:
            pca = cuPCA(n_components=k, whiten=whiten)
        else:
            # Randomized SVD only when this is a genuine truncation.
            svd_solver = "randomized" if k < min(X.shape[0], X.shape[1]) else "full"
            pca = skPCA(
                n_components=k,
                whiten=whiten,
                svd_solver=svd_solver,
                random_state=random_state,
            )
        Xr = pca.fit_transform(X)
        # label coercion kept consistent with runner
        return np.asarray(Xr, dtype=np.float32), _encode_labels(y)

    return _pca

# Redundant features help separate the two classes in 2D here, whereas too
# many informative features make that harder. A smaller sample size with a
# higher negative-sampling ratio makes the clusters more pronounced.
hd_runner = ReducerRunner(
    hd_dire_config,
    default_transform=pca_factory(n_components=64, random_state=42),
)
res_classification_dire = hd_runner.run(
    "sklearn:classification",
    dataset_kwargs={
        "n_samples": 10_000,
        "n_features": 2_048,
        "n_informative": 32,
        "n_redundant": 64,
        "n_repeated": 4,
        "n_clusters_per_class": 4,
        "random_state": 42,
    },
)

In [None]:
# PCA + UMAP on the classification dataset. Categorical labels are switched
# back on because the swiss-roll cell turned them off on the shared config.
umap_config.categorical_labels = True
hd_runner = ReducerRunner(
    umap_config,
    default_transform=pca_factory(n_components=64, random_state=42),
)
res_classification_umap = hd_runner.run(
    "sklearn:classification",
    dataset_kwargs={
        "n_samples": 10_000,
        "n_features": 2_048,
        "n_informative": 32,
        "n_redundant": 64,
        "n_repeated": 4,
        "n_clusters_per_class": 4,
        "random_state": 42,
    },
)

In [None]:
# PCA + t-SNE on the classification dataset. Categorical labels are switched
# back on because the swiss-roll cell turned them off on the shared config.
tsne_config.categorical_labels = True
hd_runner = ReducerRunner(tsne_config, default_transform=pca_factory(n_components=64, random_state=42))
# Stored as *_tsne so the UMAP result from the previous cell is not overwritten.
res_classification_tsne = hd_runner.run(
    "sklearn:classification",
    dataset_kwargs=dict(n_samples=10_000, n_features=2_048, n_informative=32, n_redundant=64, n_repeated=4, n_clusters_per_class=4, random_state=42),
)

## OpenML Datasets

In [None]:
# DiRe configuration used for the OpenML datasets below.
openml_dire_config = ReducerConfig(
    name='openml_dire',
    reducer_class=create_dire,
    reducer_kwargs={
        "backend": 'auto',
        "memory_efficient": True,
        "n_components": 2,
        "n_neighbors": 24,
        "init": 'pca',
        "max_iter_layout": 128,
        "min_dist": 1e-2,
        "spread": 2.0,
        "cutoff": 16.0,
        "n_sample_dirs": 4,
        "sample_size": 32,
        "neg_ratio": 16,
        "random_state": 42,
        "verbose": True,
    },
    visualize=True,
    categorical_labels=True,
)

### MNIST Digits (70k)

In [None]:
# MNIST (70k samples, 784 dimensions) with DiRe; OpenML dataset id 554.
openml_runner = ReducerRunner(openml_dire_config, default_transform=scale_center)
res_mnist_dire = openml_runner.run(
    "openml:mnist_784",
    dataset_kwargs={"as_frame": False, "cache": True},
)

In [None]:
# MNIST with UMAP; OpenML dataset id 554.
openml_runner = ReducerRunner(umap_config, default_transform=scale_center)
res_mnist_umap = openml_runner.run(
    "openml:mnist_784",
    dataset_kwargs={"as_frame": False, "cache": True},
)

In [None]:
# MNIST with t-SNE; OpenML dataset id 554.
openml_runner = ReducerRunner(tsne_config, default_transform=scale_center)
res_mnist_tsne = openml_runner.run(
    "openml:mnist_784",
    dataset_kwargs={"as_frame": False, "cache": True},
)

### Fashion MNIST

In [None]:
# Fashion-MNIST (70k samples, 784 dimensions) with DiRe; OpenML dataset id 40996.
openml_runner = ReducerRunner(openml_dire_config, default_transform=scale_center)
res_fashion_dire = openml_runner.run(
    "openml:Fashion-MNIST",
    dataset_kwargs={"as_frame": False, "cache": True},
)

In [None]:
# Fashion-MNIST with UMAP; OpenML dataset id 40996.
openml_runner = ReducerRunner(umap_config, default_transform=scale_center)
res_fashion_umap = openml_runner.run(
    "openml:Fashion-MNIST",
    dataset_kwargs={"as_frame": False, "cache": True},
)

In [None]:
# Fashion-MNIST with t-SNE; OpenML dataset id 40996.
openml_runner = ReducerRunner(tsne_config, default_transform=scale_center)
res_fashion_tsne = openml_runner.run(
    "openml:Fashion-MNIST",
    dataset_kwargs={"as_frame": False, "cache": True},
)

### UCI HAR Smartphones dataset

In [None]:
# UCI HAR (Smartphones; 10k samples, 561 dimensions) with DiRe, z-scored first.
openml_runner = ReducerRunner(openml_dire_config, default_transform=z_score)
res_har_dire = openml_runner.run(
    "openml:4153",  # Smartphone-Based HAR
    dataset_kwargs={"as_frame": False, "cache": True},
)

In [None]:
# UCI HAR (Smartphones) with UMAP, z-scored first.
openml_runner = ReducerRunner(umap_config, default_transform=z_score)
res_har_umap = openml_runner.run(
    "openml:4153",  # Smartphone-Based HAR
    dataset_kwargs={"as_frame": False, "cache": True},
)

In [None]:
# UCI HAR (Smartphones) with t-SNE, z-scored first.
openml_runner = ReducerRunner(tsne_config, default_transform=z_score)
res_har_tsne = openml_runner.run(
    "openml:4153",  # Smartphone-Based HAR
    dataset_kwargs={"as_frame": False, "cache": True},
)

## Cytometry (CyTOF) datasets (Levine)

In [None]:
# DiRe configuration used for the Levine datasets below.
cyto_dire_config = ReducerConfig(
    name='cyto',
    reducer_class=create_dire,
    reducer_kwargs={
        "n_components": 2,
        "n_neighbors": 64,
        "init": 'pca',
        "max_iter_layout": 64,
        "min_dist": 1e-4,
        "spread": 1.0,
        "cutoff": 4.0,
        "n_sample_dirs": 16,
        "sample_size": 16,
        "neg_ratio": 4,
        "random_state": 42,
        "verbose": True,
    },
    visualize=True,
    categorical_labels=True,
)

### Levine 13

In [None]:
# Levine 13 dataset embedded with DiRe.
cyto_runner = ReducerRunner(cyto_dire_config)
res_levine13_dire = cyto_runner.run(
    "cytof:levine13",
    dataset_kwargs={"arcsinh_cofactor": 5.0, "drop_unassigned": True},
)

In [None]:
# Levine 13 dataset embedded with UMAP; plotting and categorical labels are
# set explicitly on the shared config before the run.
umap_config.visualize = True
umap_config.categorical_labels = True
cyto_runner = ReducerRunner(umap_config)
res_levine13_umap = cyto_runner.run(
    "cytof:levine13",
    dataset_kwargs={"arcsinh_cofactor": 5.0, "drop_unassigned": True},
)

In [None]:
# Levine 13 dataset embedded with t-SNE; plotting and categorical labels are
# set explicitly on the shared config before the run.
tsne_config.visualize = True
tsne_config.categorical_labels = True
cyto_runner = ReducerRunner(tsne_config)
res_levine13_tsne = cyto_runner.run(
    "cytof:levine13",
    dataset_kwargs={"arcsinh_cofactor": 5.0, "drop_unassigned": True},
)

### Levine 32

In [None]:
# Levine 32 dataset embedded with DiRe.
cyto_runner = ReducerRunner(cyto_dire_config)
res_levine32_dire = cyto_runner.run(
    "cytof:levine32",
    dataset_kwargs={"arcsinh_cofactor": 5.0, "drop_unassigned": True},
)