# DiRe RAPIDS Installation

In [None]:
# Clone the repository and change into the checkout so that the editable
# install below resolves "." to the cloned sources.
!git clone https://github.com/sashakolpakov/dire-rapids.git
%cd dire-rapids
# With CUDA support (editable install requesting the package's "cuda" extra)
%pip install -e .[cuda]
# Install FlowIO for cell data
%pip install flowio

## DiRe Instance Runner

In [None]:
# Execute the benchmarking runner script from the cloned checkout. Later cells
# use `DiReRunner` and `np` without defining them, so both are presumably
# provided by this script -- TODO confirm. The absolute /content path assumes
# the repository was cloned from /content.
%run /content/dire-rapids/benchmarking/dire_runner.py

## Data Transforms

In [None]:
#
from sklearn.preprocessing import StandardScaler

# --- Common transforms -------------------------------------------------
def scale_center(X, y):
    """Map X from the 0..255 range to -0.5..0.5 and cast labels to int32.

    X is converted to float32 (without copying when it already is float32),
    divided by 255 and shifted down by 0.5. ``y`` is passed through as
    ``None`` when absent, otherwise converted to int32.

    Returns the ``(X, y)`` pair after conversion.
    """
    pixels = X.astype(np.float32, copy=False)
    pixels = pixels / 255.0 - 0.5
    labels = None if y is None else y.astype(np.int32, copy=False)
    return pixels, labels

def z_score(X, y):
    """Standardize X with a freshly fitted ``StandardScaler``.

    X is converted to float32 before fitting. ``y`` is passed through as
    ``None`` when absent, otherwise converted to int32.

    Returns the ``(X, y)`` pair after conversion.
    """
    features = X.astype(np.float32, copy=False)
    standardized = StandardScaler().fit_transform(features)
    labels = None if y is None else y.astype(np.int32, copy=False)
    return standardized, labels

## Some standard datasets

In [None]:
from dire_rapids import create_dire

# Runner shared by the standard-dataset cells below. The options are handed to
# DiReRunner alongside the `create_dire` factory (presumably forwarded to it
# when an embedding is built -- TODO confirm).
runner = DiReRunner(
    dire_class=create_dire,
    dire_kwargs={
        "backend": "auto",
        "memory_efficient": True,
        "n_components": 2,
        "n_neighbors": 16,
        "init": "pca",
        "max_iter_layout": 64,
        "min_dist": 1e-4,
        "spread": 2.0,
        "cutoff": 2.0,
        "n_sample_dirs": 8,
        "sample_size": 32,
        "neg_ratio": 8,
        "random_state": 42,
        "verbose": True,
    },
)


### Blobs

In [None]:
# Blobs: 1,000,000 samples, 100 features, 12 centers
res_blobs = runner.run(
    "sklearn:blobs",
    dataset_kwargs={
        "n_samples": 1_000_000,
        "centers": 12,
        "n_features": 100,
        "random_state": 42,
    },
)

### Digits (scikit-learn `load_digits`, small)

In [None]:
# Small digits dataset (the "sklearn:load_digits" entry); no loader options.
res_digits = runner.run(
    "sklearn:load_digits",
    dataset_kwargs={},
)

### Half-moons

In [None]:
# Half-moons: 200,000 samples with noise 0.05
res_moons = runner.run(
    "sklearn:moons",
    dataset_kwargs={
        "n_samples": 200_000,
        "noise": 0.05,
        "random_state": 42,
    },
)

### Swiss roll

In [None]:
# Swiss roll: 120,000 samples with noise 0.05
res_swiss = runner.run(
    "sklearn:swiss_roll",
    dataset_kwargs={
        "n_samples": 120_000,
        "noise": 0.05,
        "random_state": 42,
    },
)

### UCI ML Wine dataset

In [None]:
# Wine is a fixed dataset (presumably scikit-learn's `load_wine`, which takes
# no `random_state`). Passing one would raise TypeError if the runner forwards
# dataset_kwargs to the loader verbatim -- TODO confirm against the runner.
# No loader options are given, matching the digits cell.
res_wine = runner.run(
    "sklearn:wine",
    dataset_kwargs=dict(),
    )

### New runner: PCA + DiRe for Classification dataset

In [None]:
# Runner for the high-dimensional classification dataset; compared with the
# first runner it uses more neighbours, a larger cutoff and a higher negative
# ratio, with fewer sample directions and a smaller sample size.
hd_runner = DiReRunner(
    dire_class=create_dire,
    dire_kwargs={
        "backend": "auto",
        "memory_efficient": True,
        "n_components": 2,
        "n_neighbors": 32,
        "init": "pca",
        "max_iter_layout": 64,
        "min_dist": 1e-4,
        "spread": 1.0,
        "cutoff": 16.0,
        "n_sample_dirs": 4,
        "sample_size": 16,
        "neg_ratio": 32,
        "random_state": 42,
        "verbose": True,
    },
)


# --- PCA *before* kNN ---
def pca_factory(n_components=128, whiten: bool = False, random_state: int = 42):
    """
    Build a ``transform(X, y)`` callable that reduces X with PCA.

    cuML's PCA is used when ``cuml`` can be imported; otherwise scikit-learn's
    PCA is used. No z-scoring; PCA centers data internally.

    Parameters
    ----------
    n_components : int
        Upper bound on the number of components kept; it is capped at the
        number of input features when the transform runs.
    whiten : bool
        Passed through to the PCA estimator.
    random_state : int
        Seed given to scikit-learn's PCA. The cuML estimator is constructed
        without it.

    Returns
    -------
    callable
        ``_pca(X, y) -> (X_reduced, y)``. ``X_reduced`` is a float32 NumPy
        array. Labels with a string/bytes/object dtype are re-coded as int32
        indices into their sorted unique values; other labels (and ``None``)
        are returned unchanged.
    """
    # Any failure while importing cuML (package missing, unusable GPU stack,
    # ...) is deliberately treated as "not available" so the CPU path is used.
    try:
        from cuml.decomposition import PCA as cuPCA
    except Exception:
        cuPCA = None

    if cuPCA is None:
        # Imported eagerly so that a missing scikit-learn is reported when the
        # factory is called rather than on the first transform.
        from sklearn.decomposition import PCA as skPCA

    def _encode_labels(y):
        # label coercion kept consistent with runner
        if y is None or getattr(y, "dtype", None) is None:
            return y
        if y.dtype.kind not in {"U", "S", "O"}:
            return y
        # The inverse indices are each label's position among the sorted
        # unique values, i.e. the integer code of that label.
        _, codes = np.unique(y, return_inverse=True)
        return codes.astype(np.int32).reshape(y.shape)

    def _pca(X, y):
        # Plain np.asarray converts only when needed and works on NumPy 1.x
        # and 2.x alike. ``copy=False`` is rejected by NumPy 1.x and raises on
        # NumPy 2.x whenever the dtype conversion requires a copy.
        X = np.asarray(X, dtype=np.float32)
        k = int(min(n_components, X.shape[1]))
        if cuPCA is not None:
            pca = cuPCA(n_components=k, whiten=whiten)
        else:
            # Randomized SVD for a truncated decomposition, exact otherwise.
            svd_solver = "randomized" if k < min(X.shape[0], X.shape[1]) else "full"
            pca = skPCA(
                n_components=k,
                whiten=whiten,
                svd_solver=svd_solver,
                random_state=random_state,
            )
        Xr = pca.fit_transform(X)
        return np.asarray(Xr, dtype=np.float32), _encode_labels(y)

    return _pca

# Here, redundant features help separate the two classes in 2D, whereas too many informative features make separation harder.
# A smaller sample size combined with a higher negative-sampling ratio makes the clusters more pronounced.
# 100,000 samples in 2,048 dimensions, reduced by the PCA transform
# (at most 256 components) before being handed to the runner's embedding.
res_classification = hd_runner.run(
    "sklearn:classification",
    dataset_kwargs={
        "n_samples": 100_000,
        "n_features": 2_048,
        "n_informative": 32,
        "n_redundant": 64,
        "n_repeated": 4,
        "n_clusters_per_class": 4,
        "random_state": 42,
    },
    transform=pca_factory(n_components=256, random_state=42),
)

## OpenML Datasets

In [None]:
# Runner shared by the OpenML cells below; it uses twice the layout
# iterations (128) of the other runners in this notebook.
openml_runner = DiReRunner(
    dire_class=create_dire,
    dire_kwargs={
        "backend": "auto",
        "memory_efficient": True,
        "n_components": 2,
        "n_neighbors": 32,
        "init": "pca",
        "max_iter_layout": 128,
        "min_dist": 1e-4,
        "spread": 1.0,
        "cutoff": 8.0,
        "n_sample_dirs": 4,
        "sample_size": 32,
        "neg_ratio": 16,
        "random_state": 42,
        "verbose": True,
    },
)

### MNIST Digits (70k)

In [None]:
# --- MNIST (70k, 784d) ---------------------------------------------
# Pixels are rescaled to [-0.5, 0.5] by `scale_center` before embedding.
res_mnist = openml_runner.run(
    "openml:mnist_784",  # OpenML dataset id 554
    dataset_kwargs={"as_frame": False, "cache": True},
    transform=scale_center,
)

### Fashion MNIST

In [None]:
# --- Fashion-MNIST (70k, 784d) -------------------------------------
# No transform is passed here (unlike the MNIST cell).
res_fashion = openml_runner.run(
    "openml:Fashion-MNIST",  # OpenML dataset id 40996
    dataset_kwargs={"as_frame": False, "cache": True},
)

### UCI HAR Smartphones dataset

In [None]:
# --- UCI HAR (Smartphones) (10k, 561d) -----------------------------
# Features are standardized by `z_score` before embedding.
res_har = openml_runner.run(
    "openml:4153",  # Smartphone-Based HAR
    dataset_kwargs={"as_frame": False, "cache": True},
    transform=z_score,
)

## Cytometry datasets (Levine)

In [None]:
# Runner for the Levine datasets. Unlike the other runners in this notebook,
# no `backend` or `memory_efficient` option is set here.
cyto_runner = DiReRunner(
    dire_class=create_dire,
    dire_kwargs={
        "n_components": 2,
        "n_neighbors": 64,
        "init": "pca",
        "max_iter_layout": 64,
        "min_dist": 1e-4,
        "spread": 1.0,
        "cutoff": 4.0,
        "n_sample_dirs": 16,
        "sample_size": 16,
        "neg_ratio": 4,
        "random_state": 42,
        "verbose": True,
    },
)


### Levine 13

In [None]:
# Levine 13, loaded with arcsinh cofactor 5.0 and drop_unassigned enabled.
res_levine13 = cyto_runner.run(
    "cytof:levine13",
    dataset_kwargs={
        "arcsinh_cofactor": 5.0,
        "drop_unassigned": True,
    },
)

### Levine 32

In [None]:
# Levine 32, loaded with arcsinh cofactor 5.0 and drop_unassigned enabled.
res_levine32 = cyto_runner.run(
    "cytof:levine32",
    dataset_kwargs={
        "arcsinh_cofactor": 5.0,
        "drop_unassigned": True,
    },
)