Add diffusion map dimensionality reduction and simple Principal Component Analysis (#17)

This PR adds two dimensionality reduction techniques.
 * Principal Component Analysis is a very common linear reduction technique based on the first- and second-order moments of the data. It centers the data and projects it so that the maximum amount of variance is preserved. Neither sparse nor kernel PCA are implemented.
 * Diffusion Maps, on the other hand, are a non-linear reduction method suited to more complex data. A kernel has to be chosen, which projects the data into a higher-dimensional space. With a sparse graph structure, an approximation is derived, modeling the probability of diffusion between different sample points. A short mathematical sketch of both techniques follows this list.
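
As a rough sketch of the two techniques (notation assumed here, not taken from this PR):

```latex
% PCA: center the data and project onto the top-d eigenvectors W of the
% empirical covariance, keeping as much variance as possible:
\Sigma = \tfrac{1}{n} \sum_{i=1}^{n} (x_i - \bar{x})(x_i - \bar{x})^{\top},
\qquad y_i = W^{\top} (x_i - \bar{x})

% Diffusion map: embed each point through the leading eigenpairs
% (\lambda_j, \psi_j) of the degree-normalized kernel P = D^{-1} K,
% weighted by t diffusion steps:
\Psi_t(x_i) = \left( \lambda_1^{t} \psi_1(i), \ldots, \lambda_d^{t} \psi_d(i) \right)
```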
bytesnake committed Jul 13, 2020
1 parent 8d37fc7 commit 7b6075e
Showing 18 changed files with 800 additions and 1 deletion.
5 changes: 4 additions & 1 deletion Cargo.toml
@@ -22,7 +22,10 @@ rand_isaac = "0.2.0"
ndarray-npy = { version = "0.5", default-features = false }

[workspace]
members = ["linfa-clustering"]
members = [
"linfa-clustering",
"linfa-reduction"
]

[profile.release]
opt-level = 3
3 changes: 3 additions & 0 deletions linfa-clustering/Cargo.toml
@@ -16,7 +16,10 @@ categories = ["algorithms", "mathematics", "science"]
ndarray = { version = "0.13" , features = ["rayon", "serde", "approx"]}
ndarray-rand = "0.11"
ndarray-stats = "0.3"
ndarray-linalg = { version = "0.12", features = ["openblas"] }
sprs = "0.7"
serde = { version = "1", features = ["derive"] }
num-traits = "0.1.32"

[dev-dependencies]
rand_isaac = "0.2.0"
1 change: 1 addition & 0 deletions linfa-clustering/examples/kmeans.rs
@@ -37,3 +37,4 @@ fn main() {
)
.expect("Failed to write .npy file");
}

4 changes: 4 additions & 0 deletions linfa-clustering/src/lib.rs
@@ -18,6 +18,10 @@
//! Implementation choices, algorithmic details and a tutorial can be found [here](struct.KMeans.html).
//!
//! Check [here](https://github.com/LukeMathWalker/clustering-benchmarks) for extensive benchmarks against `scikit-learn`'s K-means implementation.

extern crate ndarray;
extern crate ndarray_linalg;

mod dbscan;
#[allow(clippy::new_ret_no_self)]
mod k_means;
32 changes: 32 additions & 0 deletions linfa-reduction/Cargo.toml
@@ -0,0 +1,32 @@
[package]
name = "linfa-reduction"
version = "0.1.0"
authors = ["Lorenz Schmidt <bytesnake@mailbox.org>"]
description = "A collection of dimensionality reduction techniques"
edition = "2018"
license = "MIT/Apache-2.0"

repository = "https://github.com/LukeMathWalker/linfa"
readme = "README.md"

keywords = ["dimensionality reduction", "machine-learning", "linfa", "spectral", "unsupervised"]
categories = ["algorithms", "mathematics", "science"]

[dependencies]
ndarray = { version = "0.13" , features = ["rayon", "serde", "approx"]}
ndarray-rand = "0.11"
ndarray-stats = "0.3"
ndarray-linalg = { version = "0.12", features = ["openblas"] }
sprs = "0.7"
serde = { version = "1", features = ["derive"] }
num-traits = "0.1.32"
hnsw = "0.6"
space = "0.10"

[dev-dependencies]
rand_isaac = "0.2.0"
ndarray-npy = { version = "0.5", default-features = false }
criterion = "0.3"
serde_json = "1"
approx = "0.3"
linfa-clustering = { path = "../linfa-clustering" }
34 changes: 34 additions & 0 deletions linfa-reduction/examples/diffusion_map.rs
@@ -0,0 +1,34 @@
use linfa_reduction::utils::generate_convoluted_rings;
use linfa_reduction::kernel::{IntoKernel, SparsePolynomialKernel};
use ndarray_npy::write_npy;
use ndarray_rand::rand::SeedableRng;
use rand_isaac::Isaac64Rng;

// A routine diffusion map task: build a synthetic dataset, compute its
// spectral embedding and save both training data and embedding to disk.
fn main() {
// Our random number generator, seeded for reproducibility
let mut rng = Isaac64Rng::seed_from_u64(42);

// Number of data points to generate per ring
let n = 3000;

// generate three convoluted rings
let dataset = generate_convoluted_rings(&[(0.0, 3.0), (10.0, 13.0), (20.0, 23.0)], n, &mut rng);

// generate sparse polynomial kernel with k = 14, c = 5 and d = 2
let diffusion_map = SparsePolynomialKernel::new(&dataset, 14, 5.0, 2.0)
.reduce_fixed(4);

// get embedding
let embedding = diffusion_map.embedding();

// Save to disk both the dataset and the computed embedding
// We use the `npy` format for compatibility with NumPy
write_npy("diffusion_map_dataset.npy", dataset).expect("Failed to write .npy file");
write_npy(
"diffusion_map_embedding.npy",
embedding
)
.expect("Failed to write .npy file");
}
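
The fitted map exposes more than the embedding; a hypothetical continuation of the `main` above, using only the accessors defined in `src/diffusion_map/algorithms.rs` later in this diff, could inspect the spectrum:

```rust
// Continuation sketch, not part of this commit: `eigvals()` returns a
// copy of the eigenvalue spectrum and `estimate_clusters()` derives a
// crude cluster-count estimate from it.
let eigvals = diffusion_map.eigvals();
let n_clusters = diffusion_map.estimate_clusters();
println!("{} eigenvalues, roughly {} clusters", eigvals.len(), n_clusters);
```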
34 changes: 34 additions & 0 deletions linfa-reduction/examples/pca.rs
@@ -0,0 +1,34 @@
use linfa_clustering::generate_blobs;
use linfa_reduction::PrincipalComponentAnalysis;
use ndarray::array;
use ndarray_npy::write_npy;
use ndarray_rand::rand::SeedableRng;
use rand_isaac::Isaac64Rng;

// A routine PCA task: build a synthetic dataset, fit the algorithm on it
// and save both training data and embedding to disk.
fn main() {
// Our random number generator, seeded for reproducibility
let mut rng = Isaac64Rng::seed_from_u64(42);

// For each of our expected centroids, generate `n` data points around it (a "blob")
let expected_centroids = array![[10., 10.], [1., 12.], [20., 30.], [-20., 30.],];
let n = 10;
let dataset = generate_blobs(n, &expected_centroids, &mut rng);

dbg!(&dataset);

let embedding = PrincipalComponentAnalysis::fit(dataset.clone(), 1)
.predict(&dataset);

dbg!(&embedding);

// Save to disk both the dataset and the computed embedding
// We use the `npy` format for compatibility with NumPy
write_npy("pca_dataset.npy", dataset).expect("Failed to write .npy file");
write_npy(
"pca_embedding.npy",
embedding
)
.expect("Failed to write .npy file");
}
113 changes: 113 additions & 0 deletions linfa-reduction/src/diffusion_map/algorithms.rs
@@ -0,0 +1,113 @@
use ndarray::{s, Array1, Array2};
use ndarray_linalg::{TruncatedOrder, lobpcg::LobpcgResult, lobpcg, eigh::EighInto, lapack::UPLO, Scalar};
use ndarray_rand::{rand_distr::Uniform, RandomExt};
use num_traits::NumCast;

use crate::Float;
use crate::kernel::{Kernel, IntoKernel};
use super::hyperparameters::DiffusionMapHyperParams;

pub struct DiffusionMap<A> {
hyperparameters: DiffusionMapHyperParams,
embedding: Array2<A>,
eigvals: Array1<A>
}

impl<A: Float> DiffusionMap<A> {
pub fn project(
hyperparameters: DiffusionMapHyperParams,
kernel: impl IntoKernel<A>
) -> Self {
// compute spectral embedding with diffusion map
let (embedding, eigvals) = compute_diffusion_map(kernel.into_kernel(), hyperparameters.steps(), 0.0, hyperparameters.embedding_size(), None);

DiffusionMap {
hyperparameters,
embedding,
eigvals
}
}

/// Return the hyperparameters used to compute this diffusion map instance.
pub fn hyperparameters(&self) -> &DiffusionMapHyperParams {
&self.hyperparameters
}

/// Estimate the number of clusters in this embedding (very crude for now)
pub fn estimate_clusters(&self) -> usize {
let mean = self.eigvals.sum() / NumCast::from(self.eigvals.len()).unwrap();
self.eigvals.iter().filter(|x| *x > &mean).count() + 1
}

/// Return a copy of the eigenvalues
pub fn eigvals(&self) -> Array1<A> {
self.eigvals.clone()
}

pub fn embedding(&self) -> Array2<A> {
self.embedding.clone()
}
}

fn compute_diffusion_map<A: Float>(kernel: impl Kernel<A>, steps: usize, alpha: f32, embedding_size: usize, guess: Option<Array2<A>>) -> (Array2<A>, Array1<A>) {
assert!(embedding_size < kernel.size());

let d = kernel.sum()
.mapv(|x| x.recip());

let d2 = d.mapv(|x| x.powf(NumCast::from(0.5 + alpha).unwrap()));

// use full eigenvalue decomposition for small problem sizes
let (vals, mut vecs) = if kernel.size() < 5 * embedding_size + 1 {
let mut matrix = kernel.mul_similarity(&Array2::from_diag(&d).view());
matrix.gencolumns_mut().into_iter().zip(d.iter()).for_each(|(mut a,b)| a *= *b);

let (vals, vecs) = matrix.eigh_into(UPLO::Lower).unwrap();
let (vals, vecs) = (vals.slice_move(s![..; -1]), vecs.slice_move(s![.., ..; -1]));
(
vals.slice_move(s![1..embedding_size+1]).mapv(|x| Scalar::from_real(x)),
vecs.slice_move(s![..,1..embedding_size+1])
)
} else {
// calculate truncated eigenvalue decomposition
let x = guess.unwrap_or(
Array2::random((kernel.size(), embedding_size + 1), Uniform::new(0.0f64, 1.0))
.mapv(|x| NumCast::from(x).unwrap())
);

let result = lobpcg::lobpcg(|y| {
let mut y = y.to_owned();
y.genrows_mut().into_iter().zip(d2.iter()).for_each(|(mut a,b)| a *= *b);
let mut y = kernel.mul_similarity(&y.view());
y.genrows_mut().into_iter().zip(d2.iter()).for_each(|(mut a,b)| a *= *b);

y
}, x, |_| {}, None, 1e-15, 200, TruncatedOrder::Largest);

let (vals, vecs) = match result {
LobpcgResult::Ok(vals, vecs, _) | LobpcgResult::Err(vals, vecs, _, _) => { (vals, vecs)},
_ => panic!("Eigendecomposition failed!")
};


// cut away first eigenvalue/eigenvector
(
vals.slice_move(s![1..]),
vecs.slice_move(s![..,1..])
)
};

let d = d.mapv(|x| x.sqrt());

for (mut col, val) in vecs.genrows_mut().into_iter().zip(d.iter()) {
col *= *val;
}

let steps = NumCast::from(steps).unwrap();
for (mut vec, val) in vecs.gencolumns_mut().into_iter().zip(vals.iter()) {
vec *= val.powf(steps);
}

(vecs, vals)
}
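
In matrix notation, a sketch of what `compute_diffusion_map` evaluates for `alpha = 0` (the value passed by `project` above); this is a reading of the code, not documentation from the PR:

```latex
% Symmetric degree normalization of the kernel K (alpha = 0):
M = D^{-1/2} K D^{-1/2}, \qquad D = \mathrm{diag}(K \mathbf{1})

% The trivial leading eigenpair of M is discarded; the remaining
% eigenvectors \psi_j are rescaled by D^{-1/2} and weighted by
% t = steps powers of their eigenvalues to give the embedding:
\Psi_t = D^{-1/2} \, [\, \psi_1 \mid \cdots \mid \psi_d \,] \,
         \mathrm{diag}(\lambda_1^{t}, \ldots, \lambda_d^{t})
```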

48 changes: 48 additions & 0 deletions linfa-reduction/src/diffusion_map/hyperparameters.rs
@@ -0,0 +1,48 @@
pub struct DiffusionMapHyperParams {
steps: usize,
embedding_size: usize,
}

pub struct DiffusionMapHyperParamsBuilder {
steps: usize,
embedding_size: usize,
}

impl DiffusionMapHyperParamsBuilder {
pub fn steps(mut self, steps: usize) -> Self {
self.steps = steps;

self
}

pub fn build(self) -> DiffusionMapHyperParams {
DiffusionMapHyperParams::build(
self.steps,
self.embedding_size,
)
}
}

impl DiffusionMapHyperParams {
pub fn new(embedding_size: usize) -> DiffusionMapHyperParamsBuilder {
DiffusionMapHyperParamsBuilder {
steps: 10,
embedding_size
}
}

pub fn steps(&self) -> usize {
self.steps
}

pub fn embedding_size(&self) -> usize {
self.embedding_size
}

pub fn build(steps: usize, embedding_size: usize) -> Self {
DiffusionMapHyperParams {
steps,
embedding_size,
}
}
}
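
For illustration, a hypothetical call site for this builder (assuming the type is re-exported at the crate root, which this diff does not show):

```rust
// Hypothetical usage of the builder above: `new` fixes the embedding
// size and defaults to 10 diffusion steps; `steps` overrides that.
use linfa_reduction::DiffusionMapHyperParams;

fn main() {
    let params = DiffusionMapHyperParams::new(2) // embed into 2 dimensions
        .steps(5)                                // override the default of 10
        .build();
    assert_eq!(params.steps(), 5);
    assert_eq!(params.embedding_size(), 2);
}
```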
5 changes: 5 additions & 0 deletions linfa-reduction/src/diffusion_map/mod.rs
@@ -0,0 +1,5 @@
mod algorithms;
mod hyperparameters;

pub use algorithms::*;
pub use hyperparameters::*;
59 changes: 59 additions & 0 deletions linfa-reduction/src/kernel/gaussian.rs
@@ -0,0 +1,59 @@
use num_traits::NumCast;
use ndarray::{Array2, ArrayView1};
use sprs::CsMat;
use std::iter::Sum;

use crate::Float;
use crate::kernel::{IntoKernel, dense_from_fn, sparse_from_fn};

fn kernel<A: Float>(a: ArrayView1<A>, b: ArrayView1<A>, eps: A) -> A {
let distance = a.iter().zip(b.iter()).map(|(x,y)| (*x-*y)*(*x-*y))
.sum::<A>();

(-distance / eps).exp()
}

pub struct GaussianKernel<A> {
pub data: Array2<A>
}

impl<A: Float + Sum<A>> GaussianKernel<A> {
pub fn new(dataset: &Array2<A>, eps: f32) -> Self {
let eps = NumCast::from(eps).unwrap();

GaussianKernel {
data: dense_from_fn(dataset, |a,b| kernel(a,b,eps))
}
}
}

impl<A: Float> IntoKernel<A> for GaussianKernel<A> {
type IntoKer = Array2<A>;

fn into_kernel(self) -> Self::IntoKer {
self.data
}
}

pub struct SparseGaussianKernel<A> {
similarity: CsMat<A>
}

impl<A: Float> SparseGaussianKernel<A> {
pub fn new(dataset: &Array2<A>, k: usize, eps: f32) -> Self {
let eps = NumCast::from(eps).unwrap();

SparseGaussianKernel {
similarity: sparse_from_fn(dataset, k, |a,b| kernel(a,b,eps))
}
}
}

impl<A: Float> IntoKernel<A> for SparseGaussianKernel<A> {
type IntoKer = CsMat<A>;

fn into_kernel(self) -> Self::IntoKer {
self.similarity
}
}
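
In the usual notation, the `kernel` function at the top of this file computes the Gaussian (RBF) similarity:

```latex
% Gaussian similarity as implemented by `kernel` above, with squared
% Euclidean distance and bandwidth \varepsilon:
k_\varepsilon(a, b) = \exp\!\left( -\frac{\lVert a - b \rVert^{2}}{\varepsilon} \right)
```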
