Add diffusion map dimensionality reduction and simple Principal Component Analysis (#17)

This PR adds two dimensionality reduction techniques.
 * Principal Component Analysis is a very common linear reduction technique based on the first- and second-order moments of the data. It centers the data and projects it so that the maximum amount of variance is preserved. Neither sparse nor kernel PCA are implemented.
 * Diffusion Maps, on the other hand, are a non-linear reduction method suited to more complex data. A kernel has to be chosen, which projects the data into a higher-dimensional space. With a sparse graph structure, an approximation is derived, modeling the probability of diffusion between different sample points. A short mathematical sketch of both techniques follows this list.
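
As a rough sketch of the two techniques (notation assumed here, not taken from this PR):

```latex
% PCA: center the data and project onto the top-d eigenvectors W of the
% empirical covariance, keeping as much variance as possible:
\Sigma = \tfrac{1}{n} \sum_{i=1}^{n} (x_i - \bar{x})(x_i - \bar{x})^{\top},
\qquad y_i = W^{\top} (x_i - \bar{x})

% Diffusion map: embed each point through the leading eigenpairs
% (\lambda_j, \psi_j) of the degree-normalized kernel P = D^{-1} K,
% weighted by t diffusion steps:
\Psi_t(x_i) = \left( \lambda_1^{t} \psi_1(i), \ldots, \lambda_d^{t} \psi_d(i) \right)
```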
bytesnake committed Jul 13, 2020
1 parent 8d37fc7 commit 7b6075e
Showing 18 changed files with 800 additions and 1 deletion.
5 changes: 4 additions & 1 deletion Cargo.toml
@@ -22,7 +22,10 @@ rand_isaac = "0.2.0"
ndarray-npy = { version = "0.5", default-features = false }

[workspace]
members = ["linfa-clustering"]
members = [
"linfa-clustering",
"linfa-reduction"
]

[profile.release]
opt-level = 3
3 changes: 3 additions & 0 deletions linfa-clustering/Cargo.toml
@@ -16,7 +16,10 @@ categories = ["algorithms", "mathematics", "science"]
ndarray = { version = "0.13" , features = ["rayon", "serde", "approx"]}
ndarray-rand = "0.11"
ndarray-stats = "0.3"
ndarray-linalg = { version = "0.12", features = ["openblas"] }
sprs = "0.7"
serde = { version = "1", features = ["derive"] }
num-traits = "0.1.32"

[dev-dependencies]
rand_isaac = "0.2.0"
1 change: 1 addition & 0 deletions linfa-clustering/examples/kmeans.rs
@@ -37,3 +37,4 @@ fn main() {
)
.expect("Failed to write .npy file");
}

4 changes: 4 additions & 0 deletions linfa-clustering/src/lib.rs
@@ -18,6 +18,10 @@
//! Implementation choices, algorithmic details and a tutorial can be found [here](struct.KMeans.html).
//!
//! Check [here](https://github.com/LukeMathWalker/clustering-benchmarks) for extensive benchmarks against `scikit-learn`'s K-means implementation.

extern crate ndarray;
extern crate ndarray_linalg;

mod dbscan;
#[allow(clippy::new_ret_no_self)]
mod k_means;
32 changes: 32 additions & 0 deletions linfa-reduction/Cargo.toml
@@ -0,0 +1,32 @@
[package]
name = "linfa-reduction"
version = "0.1.0"
authors = ["Lorenz Schmidt <bytesnake@mailbox.org>"]
description = "A collection of dimensionality reduction techniques"
edition = "2018"
license = "MIT/Apache-2.0"

repository = "https://github.com/LukeMathWalker/linfa"
readme = "README.md"

keywords = ["dimensionality reduction", "machine-learning", "linfa", "spectral", "unsupervised"]
categories = ["algorithms", "mathematics", "science"]

[dependencies]
ndarray = { version = "0.13" , features = ["rayon", "serde", "approx"]}
ndarray-rand = "0.11"
ndarray-stats = "0.3"
ndarray-linalg = { version = "0.12", features = ["openblas"] }
sprs = "0.7"
serde = { version = "1", features = ["derive"] }
num-traits = "0.1.32"
hnsw = "0.6"
space = "0.10"

[dev-dependencies]
rand_isaac = "0.2.0"
ndarray-npy = { version = "0.5", default-features = false }
criterion = "0.3"
serde_json = "1"
approx = "0.3"
linfa-clustering = { path = "../linfa-clustering" }
34 changes: 34 additions & 0 deletions linfa-reduction/examples/diffusion_map.rs
@@ -0,0 +1,34 @@
use linfa_reduction::utils::generate_convoluted_rings;
use linfa_reduction::kernel::{IntoKernel, SparsePolynomialKernel};
use ndarray_npy::write_npy;
use ndarray_rand::rand::SeedableRng;
use rand_isaac::Isaac64Rng;

// A routine diffusion map task: build a synthetic dataset, compute its
// spectral embedding and save both training data and embedding to disk.
fn main() {
// Our random number generator, seeded for reproducibility
let mut rng = Isaac64Rng::seed_from_u64(42);

// Number of data points to generate per ring
let n = 3000;

// generate three convoluted rings
let dataset = generate_convoluted_rings(&[(0.0, 3.0), (10.0, 13.0), (20.0, 23.0)], n, &mut rng);

// generate sparse polynomial kernel with k = 14, c = 5 and d = 2
let diffusion_map = SparsePolynomialKernel::new(&dataset, 14, 5.0, 2.0)
.reduce_fixed(4);

// get embedding
let embedding = diffusion_map.embedding();

// Save to disk both the dataset and the computed embedding
// We use the `npy` format for compatibility with NumPy
write_npy("diffusion_map_dataset.npy", dataset).expect("Failed to write .npy file");
write_npy(
"diffusion_map_embedding.npy",
embedding
)
.expect("Failed to write .npy file");
}
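
The fitted map exposes more than the embedding; a hypothetical continuation of the `main` above, using only the accessors defined in `src/diffusion_map/algorithms.rs` later in this diff, could inspect the spectrum:

```rust
// Continuation sketch, not part of this commit: `eigvals()` returns a
// copy of the eigenvalue spectrum and `estimate_clusters()` derives a
// crude cluster-count estimate from it.
let eigvals = diffusion_map.eigvals();
let n_clusters = diffusion_map.estimate_clusters();
println!("{} eigenvalues, roughly {} clusters", eigvals.len(), n_clusters);
```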
34 changes: 34 additions & 0 deletions linfa-reduction/examples/pca.rs
@@ -0,0 +1,34 @@
use linfa_clustering::generate_blobs;
use linfa_reduction::PrincipalComponentAnalysis;
use ndarray::array;
use ndarray_npy::write_npy;
use ndarray_rand::rand::SeedableRng;
use rand_isaac::Isaac64Rng;

// A routine PCA task: build a synthetic dataset, fit the algorithm on it
// and save both training data and embedding to disk.
fn main() {
// Our random number generator, seeded for reproducibility
let mut rng = Isaac64Rng::seed_from_u64(42);

// For each of our expected centroids, generate `n` data points around it (a "blob")
let expected_centroids = array![[10., 10.], [1., 12.], [20., 30.], [-20., 30.],];
let n = 10;
let dataset = generate_blobs(n, &expected_centroids, &mut rng);

dbg!(&dataset);

let embedding = PrincipalComponentAnalysis::fit(dataset.clone(), 1)
.predict(&dataset);

dbg!(&embedding);

// Save to disk both the dataset and the computed embedding
// We use the `npy` format for compatibility with NumPy
write_npy("pca_dataset.npy", dataset).expect("Failed to write .npy file");
write_npy(
"pca_embedding.npy",
embedding
)
.expect("Failed to write .npy file");
}
113 changes: 113 additions & 0 deletions linfa-reduction/src/diffusion_map/algorithms.rs
@@ -0,0 +1,113 @@
use ndarray::{s, Array1, Array2};
use ndarray_linalg::{TruncatedOrder, lobpcg::LobpcgResult, lobpcg, eigh::EighInto, lapack::UPLO, Scalar};
use ndarray_rand::{rand_distr::Uniform, RandomExt};
use num_traits::NumCast;

use crate::Float;
use crate::kernel::{Kernel, IntoKernel};
use super::hyperparameters::DiffusionMapHyperParams;

pub struct DiffusionMap<A> {
hyperparameters: DiffusionMapHyperParams,
embedding: Array2<A>,
eigvals: Array1<A>
}

impl<A: Float> DiffusionMap<A> {
pub fn project(
hyperparameters: DiffusionMapHyperParams,
kernel: impl IntoKernel<A>
) -> Self {
// compute spectral embedding with diffusion map
let (embedding, eigvals) = compute_diffusion_map(kernel.into_kernel(), hyperparameters.steps(), 0.0, hyperparameters.embedding_size(), None);

DiffusionMap {
hyperparameters,
embedding,
eigvals
}
}

/// Return the hyperparameters used to compute this diffusion map instance.
pub fn hyperparameters(&self) -> &DiffusionMapHyperParams {
&self.hyperparameters
}

/// Estimate the number of clusters in this embedding (very crude for now)
pub fn estimate_clusters(&self) -> usize {
let mean = self.eigvals.sum() / NumCast::from(self.eigvals.len()).unwrap();
self.eigvals.iter().filter(|x| *x > &mean).count() + 1
}

/// Return a copy of the eigenvalues
pub fn eigvals(&self) -> Array1<A> {
self.eigvals.clone()
}

pub fn embedding(&self) -> Array2<A> {
self.embedding.clone()
}
}

fn compute_diffusion_map<A: Float>(kernel: impl Kernel<A>, steps: usize, alpha: f32, embedding_size: usize, guess: Option<Array2<A>>) -> (Array2<A>, Array1<A>) {
assert!(embedding_size < kernel.size());

let d = kernel.sum()
.mapv(|x| x.recip());

let d2 = d.mapv(|x| x.powf(NumCast::from(0.5 + alpha).unwrap()));

// use full eigenvalue decomposition for small problem sizes
let (vals, mut vecs) = if kernel.size() < 5 * embedding_size + 1 {
let mut matrix = kernel.mul_similarity(&Array2::from_diag(&d).view());
matrix.gencolumns_mut().into_iter().zip(d.iter()).for_each(|(mut a,b)| a *= *b);

let (vals, vecs) = matrix.eigh_into(UPLO::Lower).unwrap();
let (vals, vecs) = (vals.slice_move(s![..; -1]), vecs.slice_move(s![.., ..; -1]));
(
vals.slice_move(s![1..embedding_size+1]).mapv(|x| Scalar::from_real(x)),
vecs.slice_move(s![..,1..embedding_size+1])
)
} else {
// calculate truncated eigenvalue decomposition
let x = guess.unwrap_or(
Array2::random((kernel.size(), embedding_size + 1), Uniform::new(0.0f64, 1.0))
.mapv(|x| NumCast::from(x).unwrap())
);

let result = lobpcg::lobpcg(|y| {
let mut y = y.to_owned();
y.genrows_mut().into_iter().zip(d2.iter()).for_each(|(mut a,b)| a *= *b);
let mut y = kernel.mul_similarity(&y.view());
y.genrows_mut().into_iter().zip(d2.iter()).for_each(|(mut a,b)| a *= *b);

y
}, x, |_| {}, None, 1e-15, 200, TruncatedOrder::Largest);

let (vals, vecs) = match result {
LobpcgResult::Ok(vals, vecs, _) | LobpcgResult::Err(vals, vecs, _, _) => { (vals, vecs)},
_ => panic!("Eigendecomposition failed!")
};


// cut away first eigenvalue/eigenvector
(
vals.slice_move(s![1..]),
vecs.slice_move(s![..,1..])
)
};

let d = d.mapv(|x| x.sqrt());

for (mut col, val) in vecs.genrows_mut().into_iter().zip(d.iter()) {
col *= *val;
}

let steps = NumCast::from(steps).unwrap();
for (mut vec, val) in vecs.gencolumns_mut().into_iter().zip(vals.iter()) {
vec *= val.powf(steps);
}

(vecs, vals)
}
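
In matrix notation, a sketch of what `compute_diffusion_map` evaluates for `alpha = 0` (the value passed by `project` above); this is a reading of the code, not documentation from the PR:

```latex
% Symmetric degree normalization of the kernel K (alpha = 0):
M = D^{-1/2} K D^{-1/2}, \qquad D = \mathrm{diag}(K \mathbf{1})

% The trivial leading eigenpair of M is discarded; the remaining
% eigenvectors \psi_j are rescaled by D^{-1/2} and weighted by
% t = steps powers of their eigenvalues to give the embedding:
\Psi_t = D^{-1/2} \, [\, \psi_1 \mid \cdots \mid \psi_d \,] \,
         \mathrm{diag}(\lambda_1^{t}, \ldots, \lambda_d^{t})
```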

48 changes: 48 additions & 0 deletions linfa-reduction/src/diffusion_map/hyperparameters.rs
@@ -0,0 +1,48 @@
pub struct DiffusionMapHyperParams {
steps: usize,
embedding_size: usize,
}

pub struct DiffusionMapHyperParamsBuilder {
steps: usize,
embedding_size: usize,
}

impl DiffusionMapHyperParamsBuilder {
pub fn steps(mut self, steps: usize) -> Self {
self.steps = steps;

self
}

pub fn build(self) -> DiffusionMapHyperParams {
DiffusionMapHyperParams::build(
self.steps,
self.embedding_size,
)
}
}

impl DiffusionMapHyperParams {
pub fn new(embedding_size: usize) -> DiffusionMapHyperParamsBuilder {
DiffusionMapHyperParamsBuilder {
steps: 10,
embedding_size
}
}

pub fn steps(&self) -> usize {
self.steps
}

pub fn embedding_size(&self) -> usize {
self.embedding_size
}

pub fn build(steps: usize, embedding_size: usize) -> Self {
DiffusionMapHyperParams {
steps,
embedding_size,
}
}
}
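
For illustration, a hypothetical call site for this builder (assuming the type is re-exported at the crate root, which this diff does not show):

```rust
// Hypothetical usage of the builder above: `new` fixes the embedding
// size and defaults to 10 diffusion steps; `steps` overrides that.
use linfa_reduction::DiffusionMapHyperParams;

fn main() {
    let params = DiffusionMapHyperParams::new(2) // embed into 2 dimensions
        .steps(5)                                // override the default of 10
        .build();
    assert_eq!(params.steps(), 5);
    assert_eq!(params.embedding_size(), 2);
}
```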
5 changes: 5 additions & 0 deletions linfa-reduction/src/diffusion_map/mod.rs
@@ -0,0 +1,5 @@
mod algorithms;
mod hyperparameters;

pub use algorithms::*;
pub use hyperparameters::*;
59 changes: 59 additions & 0 deletions linfa-reduction/src/kernel/gaussian.rs
@@ -0,0 +1,59 @@
use num_traits::NumCast;
use ndarray::{Array2, ArrayView1};
use sprs::CsMat;
use std::iter::Sum;

use crate::Float;
use crate::kernel::{IntoKernel, dense_from_fn, sparse_from_fn};

fn kernel<A: Float>(a: ArrayView1<A>, b: ArrayView1<A>, eps: A) -> A {
let distance = a.iter().zip(b.iter()).map(|(x,y)| (*x-*y)*(*x-*y))
.sum::<A>();

(-distance / eps).exp()
}

pub struct GaussianKernel<A> {
pub data: Array2<A>
}

impl<A: Float + Sum<A>> GaussianKernel<A> {
pub fn new(dataset: &Array2<A>, eps: f32) -> Self {
let eps = NumCast::from(eps).unwrap();

GaussianKernel {
data: dense_from_fn(dataset, |a,b| kernel(a,b,eps))
}
}
}

impl<A: Float> IntoKernel<A> for GaussianKernel<A> {
type IntoKer = Array2<A>;

fn into_kernel(self) -> Self::IntoKer {
self.data
}
}

pub struct SparseGaussianKernel<A> {
similarity: CsMat<A>
}

impl<A: Float> SparseGaussianKernel<A> {
pub fn new(dataset: &Array2<A>, k: usize, eps: f32) -> Self {
let eps = NumCast::from(eps).unwrap();

SparseGaussianKernel {
similarity: sparse_from_fn(dataset, k, |a,b| kernel(a,b,eps))
}
}
}

impl<A: Float> IntoKernel<A> for SparseGaussianKernel<A> {
type IntoKer = CsMat<A>;

fn into_kernel(self) -> Self::IntoKer {
self.similarity
}
}
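
In the usual notation, the `kernel` function at the top of this file computes the Gaussian (RBF) similarity:

```latex
% Gaussian similarity as implemented by `kernel` above, with squared
% Euclidean distance and bandwidth \varepsilon:
k_\varepsilon(a, b) = \exp\!\left( -\frac{\lVert a - b \rVert^{2}}{\varepsilon} \right)
```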
