-
-
Notifications
You must be signed in to change notification settings - Fork 235
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add diffusion map dimensionality reduction and simple Principal Compo…
…nent Analysis (#17) This PR adds two dimensionality reduction technique. * Principal Component Analysis is a very common linear reduction technique based on the first and second order of the data. It projects data in such a way that the maximal of variance is preserved, while centering it. Neither sparse nor kernel PCA are implemented. * Diffusion Maps on the other hand are a non-linear reduction method, capable for more complex data. A kernel has to be chosen, which projects the data into a higher-dimensional space. With a sparse graph structure a approximation is derived, modeling the probability of diffusion between different sample points.
- Loading branch information
Showing
18 changed files
with
800 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -37,3 +37,4 @@ fn main() { | |
) | ||
.expect("Failed to write .npy file"); | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
[package] | ||
name = "linfa-reduction" | ||
version = "0.1.0" | ||
authors = ["Lorenz Schmidt <bytesnake@mailbox.org>"] | ||
description = "A collection of dimensionality reduction techniques" | ||
edition = "2018" | ||
license = "MIT/Apache-2.0" | ||
|
||
repository = "https://github.com/LukeMathWalker/linfa" | ||
readme = "README.md" | ||
|
||
keywords = ["dimensionality reduction", "machine-learning", "linfa", "spectral", "unsupervised"] | ||
categories = ["algorithms", "mathematics", "science"] | ||
|
||
[dependencies] | ||
ndarray = { version = "0.13" , features = ["rayon", "serde", "approx"]} | ||
ndarray-rand = "0.11" | ||
ndarray-stats = "0.3" | ||
ndarray-linalg = { version = "0.12", features = ["openblas"] } | ||
sprs = "0.7" | ||
serde = { version = "1", features = ["derive"] } | ||
num-traits = "0.1.32" | ||
hnsw = "0.6" | ||
space = "0.10" | ||
|
||
[dev-dependencies] | ||
rand_isaac = "0.2.0" | ||
ndarray-npy = { version = "0.5", default-features = false } | ||
criterion = "0.3" | ||
serde_json = "1" | ||
approx = "0.3" | ||
linfa-clustering = { path = "../linfa-clustering" } |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
use linfa_reduction::utils::generate_convoluted_rings; | ||
use linfa_reduction::kernel::{IntoKernel, SparsePolynomialKernel}; | ||
use ndarray_npy::write_npy; | ||
use ndarray_rand::rand::SeedableRng; | ||
use rand_isaac::Isaac64Rng; | ||
|
||
// A routine K-means task: build a synthetic dataset, fit the algorithm on it | ||
// and save both training data and predictions to disk. | ||
fn main() { | ||
// Our random number generator, seeded for reproducibility | ||
let mut rng = Isaac64Rng::seed_from_u64(42); | ||
|
||
// For each our expected centroids, generate `n` data points around it (a "blob") | ||
let n = 3000; | ||
|
||
// generate three convoluted rings | ||
let dataset = generate_convoluted_rings(&[(0.0, 3.0), (10.0, 13.0), (20.0, 23.0)], n, &mut rng); | ||
|
||
// generate sparse polynomial kernel with k = 14, c = 5 and d = 2 | ||
let diffusion_map = SparsePolynomialKernel::new(&dataset, 14, 5.0, 2.0) | ||
.reduce_fixed(4); | ||
|
||
// get embedding | ||
let embedding = diffusion_map.embedding(); | ||
|
||
// Save to disk our dataset (and the cluster label assigned to each observation) | ||
// We use the `npy` format for compatibility with NumPy | ||
write_npy("diffusion_map_dataset.npy", dataset).expect("Failed to write .npy file"); | ||
write_npy( | ||
"diffusion_map_embedding.npy", | ||
embedding | ||
) | ||
.expect("Failed to write .npy file"); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
use linfa_clustering::generate_blobs; | ||
use linfa_reduction::PrincipalComponentAnalysis; | ||
use ndarray::array; | ||
use ndarray_npy::write_npy; | ||
use ndarray_rand::rand::SeedableRng; | ||
use rand_isaac::Isaac64Rng; | ||
|
||
// A routine K-means task: build a synthetic dataset, fit the algorithm on it | ||
// and save both training data and predictions to disk. | ||
fn main() { | ||
// Our random number generator, seeded for reproducibility | ||
let mut rng = Isaac64Rng::seed_from_u64(42); | ||
|
||
// For each our expected centroids, generate `n` data points around it (a "blob") | ||
let expected_centroids = array![[10., 10.], [1., 12.], [20., 30.], [-20., 30.],]; | ||
let n = 10; | ||
let dataset = generate_blobs(n, &expected_centroids, &mut rng); | ||
|
||
dbg!(&dataset); | ||
|
||
let embedding = PrincipalComponentAnalysis::fit(dataset.clone(), 1) | ||
.predict(&dataset); | ||
|
||
dbg!(&embedding); | ||
|
||
// Save to disk our dataset (and the cluster label assigned to each observation) | ||
// We use the `npy` format for compatibility with NumPy | ||
write_npy("pca_dataset.npy", dataset).expect("Failed to write .npy file"); | ||
write_npy( | ||
"pca_embedding.npy", | ||
embedding | ||
) | ||
.expect("Failed to write .npy file"); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
use ndarray::{Array1, Array2}; | ||
use ndarray_linalg::{TruncatedOrder, lobpcg::LobpcgResult, lobpcg, eigh::EighInto, lapack::UPLO, Scalar}; | ||
use ndarray_rand::{rand_distr::Uniform, RandomExt}; | ||
use num_traits::NumCast; | ||
|
||
use crate::Float; | ||
use crate::kernel::{Kernel, IntoKernel}; | ||
use super::hyperparameters::DiffusionMapHyperParams; | ||
|
||
pub struct DiffusionMap<A> { | ||
hyperparameters: DiffusionMapHyperParams, | ||
embedding: Array2<A>, | ||
eigvals: Array1<A> | ||
} | ||
|
||
impl<A: Float> DiffusionMap<A> { | ||
pub fn project( | ||
hyperparameters: DiffusionMapHyperParams, | ||
kernel: impl IntoKernel<A> | ||
) -> Self { | ||
// compute spectral embedding with diffusion map | ||
let (embedding, eigvals) = compute_diffusion_map(kernel.into_kernel(), hyperparameters.steps(), 0.0, hyperparameters.embedding_size(), None); | ||
|
||
DiffusionMap { | ||
hyperparameters, | ||
embedding, | ||
eigvals | ||
} | ||
} | ||
|
||
/// Return the hyperparameters used to train this spectral mode instance. | ||
pub fn hyperparameters(&self) -> &DiffusionMapHyperParams { | ||
&self.hyperparameters | ||
} | ||
|
||
/// Estimate the number of clusters in this embedding (very crude for now) | ||
pub fn estimate_clusters(&self) -> usize { | ||
let mean = self.eigvals.sum() / NumCast::from(self.eigvals.len()).unwrap(); | ||
self.eigvals.iter().filter(|x| *x > &mean).count() + 1 | ||
} | ||
|
||
/// Return a copy of the eigenvalues | ||
pub fn eigvals(&self) -> Array1<A> { | ||
self.eigvals.clone() | ||
} | ||
|
||
pub fn embedding(&self) -> Array2<A> { | ||
self.embedding.clone() | ||
} | ||
} | ||
|
||
fn compute_diffusion_map<A: Float>(kernel: impl Kernel<A>, steps: usize, alpha: f32, embedding_size: usize, guess: Option<Array2<A>>) -> (Array2<A>, Array1<A>) { | ||
assert!(embedding_size < kernel.size()); | ||
|
||
let d = kernel.sum() | ||
.mapv(|x| x.recip()); | ||
|
||
let d2 = d.mapv(|x| x.powf(NumCast::from(0.5 + alpha).unwrap())); | ||
|
||
// use full eigenvalue decomposition for small problem sizes | ||
let (vals, mut vecs) = if kernel.size() < 5 * embedding_size + 1 { | ||
let mut matrix = kernel.mul_similarity(&Array2::from_diag(&d).view()); | ||
matrix.gencolumns_mut().into_iter().zip(d.iter()).for_each(|(mut a,b)| a *= *b); | ||
|
||
let (vals, vecs) = matrix.eigh_into(UPLO::Lower).unwrap(); | ||
let (vals, vecs) = (vals.slice_move(s![..; -1]), vecs.slice_move(s![.., ..; -1])); | ||
( | ||
vals.slice_move(s![1..embedding_size+1]).mapv(|x| Scalar::from_real(x)), | ||
vecs.slice_move(s![..,1..embedding_size+1]) | ||
) | ||
} else { | ||
// calculate truncated eigenvalue decomposition | ||
let x = guess.unwrap_or( | ||
Array2::random((kernel.size(), embedding_size + 1), Uniform::new(0.0f64, 1.0)) | ||
.mapv(|x| NumCast::from(x).unwrap()) | ||
); | ||
|
||
let result = lobpcg::lobpcg(|y| { | ||
let mut y = y.to_owned(); | ||
y.genrows_mut().into_iter().zip(d2.iter()).for_each(|(mut a,b)| a *= *b); | ||
let mut y = kernel.mul_similarity(&y.view()); | ||
y.genrows_mut().into_iter().zip(d2.iter()).for_each(|(mut a,b)| a *= *b); | ||
|
||
y | ||
}, x, |_| {}, None, 1e-15, 200, TruncatedOrder::Largest); | ||
|
||
let (vals, vecs) = match result { | ||
LobpcgResult::Ok(vals, vecs, _) | LobpcgResult::Err(vals, vecs, _, _) => { (vals, vecs)}, | ||
_ => panic!("Eigendecomposition failed!") | ||
}; | ||
|
||
|
||
// cut away first eigenvalue/eigenvector | ||
( | ||
vals.slice_move(s![1..]), | ||
vecs.slice_move(s![..,1..]) | ||
) | ||
}; | ||
|
||
let d = d.mapv(|x| x.sqrt()); | ||
|
||
for (mut col, val) in vecs.genrows_mut().into_iter().zip(d.iter()) { | ||
col *= *val; | ||
} | ||
|
||
let steps = NumCast::from(steps).unwrap(); | ||
for (mut vec, val) in vecs.gencolumns_mut().into_iter().zip(vals.iter()) { | ||
vec *= val.powf(steps); | ||
} | ||
|
||
(vecs, vals) | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
pub struct DiffusionMapHyperParams { | ||
steps: usize, | ||
embedding_size: usize, | ||
} | ||
|
||
pub struct DiffusionMapHyperParamsBuilder { | ||
steps: usize, | ||
embedding_size: usize, | ||
} | ||
|
||
impl DiffusionMapHyperParamsBuilder { | ||
pub fn steps(mut self, steps: usize) -> Self { | ||
self.steps = steps; | ||
|
||
self | ||
} | ||
|
||
pub fn build(self) -> DiffusionMapHyperParams { | ||
DiffusionMapHyperParams::build( | ||
self.steps, | ||
self.embedding_size, | ||
) | ||
} | ||
} | ||
|
||
impl DiffusionMapHyperParams { | ||
pub fn new(embedding_size: usize) -> DiffusionMapHyperParamsBuilder { | ||
DiffusionMapHyperParamsBuilder { | ||
steps: 10, | ||
embedding_size | ||
} | ||
} | ||
|
||
pub fn steps(&self) -> usize { | ||
self.steps | ||
} | ||
|
||
pub fn embedding_size(&self) -> usize { | ||
self.embedding_size | ||
} | ||
|
||
pub fn build(steps: usize, embedding_size: usize) -> Self { | ||
DiffusionMapHyperParams { | ||
steps, | ||
embedding_size, | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
mod algorithms; | ||
mod hyperparameters; | ||
|
||
pub use algorithms::*; | ||
pub use hyperparameters::*; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
use num_traits::NumCast; | ||
use ndarray::{Array2, ArrayView1}; | ||
use sprs::CsMat; | ||
use std::iter::Sum; | ||
|
||
use crate::Float; | ||
use crate::kernel::{IntoKernel, dense_from_fn, sparse_from_fn}; | ||
|
||
fn kernel<A: Float>(a: ArrayView1<A>, b: ArrayView1<A>, eps: A) -> A { | ||
let distance = a.iter().zip(b.iter()).map(|(x,y)| (*x-*y)*(*x-*y)) | ||
.sum::<A>(); | ||
|
||
(-distance / eps).exp() | ||
} | ||
|
||
pub struct GaussianKernel<A> { | ||
pub data: Array2<A> | ||
} | ||
|
||
impl<A: Float + Sum<A>> GaussianKernel<A> { | ||
pub fn new(dataset: &Array2<A>, eps: f32) -> Self { | ||
let eps = NumCast::from(eps).unwrap(); | ||
|
||
GaussianKernel { | ||
data: dense_from_fn(dataset, |a,b| kernel(a,b,eps)) | ||
} | ||
} | ||
} | ||
|
||
impl<A: Float> IntoKernel<A> for GaussianKernel<A> { | ||
type IntoKer = Array2<A>; | ||
|
||
fn into_kernel(self) -> Self::IntoKer { | ||
self.data | ||
} | ||
} | ||
|
||
pub struct SparseGaussianKernel<A> { | ||
similarity: CsMat<A> | ||
} | ||
|
||
impl<A: Float> SparseGaussianKernel<A> { | ||
pub fn new(dataset: &Array2<A>, k: usize, eps: f32) -> Self { | ||
let eps = NumCast::from(eps).unwrap(); | ||
|
||
SparseGaussianKernel { | ||
similarity: sparse_from_fn(dataset, k, |a,b| kernel(a,b,eps)) | ||
} | ||
} | ||
} | ||
|
||
impl<A: Float> IntoKernel<A> for SparseGaussianKernel<A> { | ||
type IntoKer = CsMat<A>; | ||
|
||
fn into_kernel(self) -> Self::IntoKer { | ||
self.similarity | ||
} | ||
} | ||
|
Oops, something went wrong.