diff --git a/algorithms/linfa-clustering/benches/appx_dbscan.rs b/algorithms/linfa-clustering/benches/appx_dbscan.rs
index f20aa7398..cbd0e7d47 100644
--- a/algorithms/linfa-clustering/benches/appx_dbscan.rs
+++ b/algorithms/linfa-clustering/benches/appx_dbscan.rs
@@ -1,5 +1,5 @@
 use criterion::{
-    black_box, criterion_group, criterion_main, AxisScale, Criterion, ParameterizedBenchmark,
+    black_box, criterion_group, criterion_main, AxisScale, BenchmarkId, Criterion,
     PlotConfiguration,
 };
 use linfa::traits::Transformer;
@@ -19,28 +19,32 @@ fn appx_dbscan_bench(c: &mut Criterion) {
         /*(10000, 0.1),*/
     ];

-    let benchmark = ParameterizedBenchmark::new(
-        "appx_dbscan",
-        move |bencher, &cluster_size_and_slack| {
-            let min_points = 4;
-            let n_features = 3;
-            let tolerance = 0.3;
-            let centroids =
-                Array2::random_using((min_points, n_features), Uniform::new(-30., 30.), &mut rng);
-            let dataset = generate_blobs(cluster_size_and_slack.0, &centroids, &mut rng);
-            bencher.iter(|| {
-                black_box(
-                    AppxDbscan::params(min_points)
-                        .tolerance(tolerance)
-                        .slack(cluster_size_and_slack.1)
-                        .transform(&dataset),
-                )
-            });
-        },
-        cluster_sizes_and_slacks,
-    )
-    .plot_config(PlotConfiguration::default().summary_scale(AxisScale::Logarithmic));
-    c.bench("appx_dbscan", benchmark);
+    let mut benchmark = c.benchmark_group("appx_dbscan");
+    benchmark.plot_config(PlotConfiguration::default().summary_scale(AxisScale::Logarithmic));
+    for cluster_size_and_slack in cluster_sizes_and_slacks {
+        let rng = &mut rng;
+        benchmark.bench_with_input(
+            BenchmarkId::new("appx_dbscan", cluster_size_and_slack.0),
+            &cluster_size_and_slack,
+            move |bencher, &cluster_size_and_slack| {
+                let min_points = 4;
+                let n_features = 3;
+                let tolerance = 0.3;
+                let centroids =
+                    Array2::random_using((min_points, n_features), Uniform::new(-30., 30.), rng);
+                let dataset = generate_blobs(cluster_size_and_slack.0, &centroids, rng);
+                bencher.iter(|| {
+                    black_box(
+                        AppxDbscan::params(min_points)
+                            .tolerance(tolerance)
+                            .slack(cluster_size_and_slack.1)
+                            .transform(&dataset),
+                    )
+                });
+            },
+        );
+    }
+    benchmark.finish();
 }

 criterion_group! {
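The change above is the standard migration from Criterion's removed `ParameterizedBenchmark` to the 0.3 `BenchmarkGroup` API: the group owns the plot configuration, and each parameter value becomes a `bench_with_input` call keyed by a `BenchmarkId`. A minimal self-contained sketch of the pattern (the `sum_to` workload and group name are illustrative, not part of this diff):

```rust
use criterion::{
    black_box, criterion_group, criterion_main, AxisScale, BenchmarkId, Criterion,
    PlotConfiguration,
};

// Illustrative stand-in for the real workload being benchmarked.
fn sum_to(n: u64) -> u64 {
    (0..n).sum()
}

fn sum_bench(c: &mut Criterion) {
    let sizes = vec![10, 100, 1000];
    let mut group = c.benchmark_group("sum_to");
    // The log scale that ParameterizedBenchmark::plot_config used to set
    // now lives on the group.
    group.plot_config(PlotConfiguration::default().summary_scale(AxisScale::Logarithmic));
    for size in sizes {
        // Reported as "sum_to/10", "sum_to/100", ...
        group.bench_with_input(BenchmarkId::new("sum_to", size), &size, |bencher, &size| {
            bencher.iter(|| sum_to(black_box(size)));
        });
    }
    group.finish();
}

criterion_group!(benches, sum_bench);
criterion_main!(benches);
```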
diff --git a/algorithms/linfa-clustering/benches/dbscan.rs b/algorithms/linfa-clustering/benches/dbscan.rs
index cc085af4d..940b67435 100644
--- a/algorithms/linfa-clustering/benches/dbscan.rs
+++ b/algorithms/linfa-clustering/benches/dbscan.rs
@@ -1,5 +1,5 @@
 use criterion::{
-    black_box, criterion_group, criterion_main, AxisScale, Criterion, ParameterizedBenchmark,
+    black_box, criterion_group, criterion_main, AxisScale, BenchmarkId, Criterion,
     PlotConfiguration,
 };
 use linfa::traits::Transformer;
@@ -14,27 +14,31 @@ fn dbscan_bench(c: &mut Criterion) {
     let mut rng = Isaac64Rng::seed_from_u64(40);
     let cluster_sizes = vec![10, 100, 1000, 10000];

-    let benchmark = ParameterizedBenchmark::new(
-        "dbscan",
-        move |bencher, &cluster_size| {
-            let min_points = 4;
-            let n_features = 3;
-            let tolerance = 0.3;
-            let centroids =
-                Array2::random_using((min_points, n_features), Uniform::new(-30., 30.), &mut rng);
-            let dataset = generate_blobs(cluster_size, &centroids, &mut rng);
-            bencher.iter(|| {
-                black_box(
-                    Dbscan::params(min_points)
-                        .tolerance(tolerance)
-                        .transform(&dataset),
-                )
-            });
-        },
-        cluster_sizes,
-    )
-    .plot_config(PlotConfiguration::default().summary_scale(AxisScale::Logarithmic));
-    c.bench("dbscan", benchmark);
+    let mut benchmark = c.benchmark_group("dbscan");
+    benchmark.plot_config(PlotConfiguration::default().summary_scale(AxisScale::Logarithmic));
+    for cluster_size in cluster_sizes {
+        let rng = &mut rng;
+        benchmark.bench_with_input(
+            BenchmarkId::new("dbscan", cluster_size),
+            &cluster_size,
+            move |bencher, &cluster_size| {
+                let min_points = 4;
+                let n_features = 3;
+                let tolerance = 0.3;
+                let centroids =
+                    Array2::random_using((min_points, n_features), Uniform::new(-30., 30.), rng);
+                let dataset = generate_blobs(cluster_size, &centroids, rng);
+                bencher.iter(|| {
+                    black_box(
+                        Dbscan::params(min_points)
+                            .tolerance(tolerance)
+                            .transform(&dataset),
+                    )
+                });
+            },
+        );
+    }
+    benchmark.finish()
 }

 criterion_group! {
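A detail shared by each of these benchmark rewrites: the closures are `move`, so every loop iteration first reborrows the generator with `let rng = &mut rng;` and moves that short-lived `&mut` into the closure rather than the `Isaac64Rng` itself. A minimal sketch of why the reborrow matters (not from this diff; `gen` stands in for the dataset generation):

```rust
use ndarray_rand::rand::{Rng, SeedableRng};
use rand_isaac::Isaac64Rng;

fn main() {
    let mut rng = Isaac64Rng::seed_from_u64(40);
    for _ in 0..3 {
        // Without this reborrow, `move` would take ownership of `rng` on the
        // first iteration and the loop would fail to compile.
        let rng = &mut rng;
        let mut sample = move || rng.gen::<f64>();
        println!("{}", sample());
    }
}
```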
diff --git a/algorithms/linfa-clustering/benches/gaussian_mixture.rs b/algorithms/linfa-clustering/benches/gaussian_mixture.rs
index e47954747..5ec7934dd 100644
--- a/algorithms/linfa-clustering/benches/gaussian_mixture.rs
+++ b/algorithms/linfa-clustering/benches/gaussian_mixture.rs
@@ -1,5 +1,5 @@
 use criterion::{
-    black_box, criterion_group, criterion_main, AxisScale, Criterion, ParameterizedBenchmark,
+    black_box, criterion_group, criterion_main, AxisScale, BenchmarkId, Criterion,
     PlotConfiguration,
 };
 use linfa::traits::Fit;
@@ -15,30 +15,34 @@ fn gaussian_mixture_bench(c: &mut Criterion) {
     let mut rng = Isaac64Rng::seed_from_u64(40);
     let cluster_sizes = vec![10, 100, 1000, 10000];

-    let benchmark = ParameterizedBenchmark::new(
-        "gaussian_mixture",
-        move |bencher, &cluster_size| {
-            let n_clusters = 4;
-            let n_features = 3;
-            let centroids =
-                Array2::random_using((n_clusters, n_features), Uniform::new(-30., 30.), &mut rng);
-            let dataset: DatasetBase<_, _> =
-                (generate_blobs(cluster_size, &centroids, &mut rng)).into();
-            bencher.iter(|| {
-                black_box(
-                    GaussianMixtureModel::params(n_clusters)
-                        .with_rng(rng.clone())
-                        .with_tolerance(1e-3)
-                        .with_max_n_iterations(1000)
-                        .fit(&dataset)
-                        .expect("GMM fitting fail"),
-                )
-            });
-        },
-        cluster_sizes,
-    )
-    .plot_config(PlotConfiguration::default().summary_scale(AxisScale::Logarithmic));
-    c.bench("gaussian_mixture", benchmark);
+    let mut benchmark = c.benchmark_group("gaussian_mixture");
+    benchmark.plot_config(PlotConfiguration::default().summary_scale(AxisScale::Logarithmic));
+    for cluster_size in cluster_sizes {
+        let rng = &mut rng;
+        benchmark.bench_with_input(
+            BenchmarkId::new("gaussian_mixture", cluster_size),
+            &cluster_size,
+            move |bencher, &cluster_size| {
+                let n_clusters = 4;
+                let n_features = 3;
+                let centroids =
+                    Array2::random_using((n_clusters, n_features), Uniform::new(-30., 30.), rng);
+                let dataset: DatasetBase<_, _> =
+                    (generate_blobs(cluster_size, &centroids, rng)).into();
+                bencher.iter(|| {
+                    black_box(
+                        GaussianMixtureModel::params(n_clusters)
+                            .with_rng(rng.clone())
+                            .with_tolerance(1e-3)
+                            .with_max_n_iterations(1000)
+                            .fit(&dataset)
+                            .expect("GMM fitting fail"),
+                    )
+                });
+            },
+        );
+    }
+    benchmark.finish();
 }

 criterion_group! {
diff --git a/algorithms/linfa-clustering/benches/k_means.rs b/algorithms/linfa-clustering/benches/k_means.rs
index 5b38dd9d1..4a0c8fb44 100644
--- a/algorithms/linfa-clustering/benches/k_means.rs
+++ b/algorithms/linfa-clustering/benches/k_means.rs
@@ -1,5 +1,5 @@
 use criterion::{
-    black_box, criterion_group, criterion_main, AxisScale, Criterion, ParameterizedBenchmark,
+    black_box, criterion_group, criterion_main, AxisScale, BenchmarkId, Criterion,
     PlotConfiguration,
 };
 use linfa::traits::Fit;
@@ -15,27 +15,26 @@ fn k_means_bench(c: &mut Criterion) {
     let mut rng = Isaac64Rng::seed_from_u64(40);
     let cluster_sizes = vec![10, 100, 1000, 10000];

-    let benchmark = ParameterizedBenchmark::new(
-        "naive_k_means",
-        move |bencher, &cluster_size| {
-            let n_clusters = 4;
-            let n_features = 3;
-            let centroids =
-                Array2::random_using((n_clusters, n_features), Uniform::new(-30., 30.), &mut rng);
-            let dataset = DatasetBase::from(generate_blobs(cluster_size, &centroids, &mut rng));
+    let mut benchmark = c.benchmark_group("naive_k_means");
+    benchmark.plot_config(PlotConfiguration::default().summary_scale(AxisScale::Logarithmic));
+    for cluster_size in cluster_sizes {
+        let rng = &mut rng;
+        let n_clusters = 4;
+        let n_features = 3;
+        let centroids =
+            Array2::random_using((n_clusters, n_features), Uniform::new(-30., 30.), rng);
+        let dataset = DatasetBase::from(generate_blobs(cluster_size, &centroids, rng));
+        benchmark.bench_function(BenchmarkId::new("naive_k_means", cluster_size), |bencher| {
             bencher.iter(|| {
-                black_box(
-                    KMeans::params_with_rng(n_clusters, rng.clone())
-                        .max_n_iterations(1000)
-                        .tolerance(1e-3)
-                        .fit(&dataset),
-                )
+                KMeans::params_with_rng(black_box(n_clusters), black_box(rng.clone()))
+                    .max_n_iterations(black_box(1000))
+                    .tolerance(black_box(1e-3))
+                    .fit(&dataset)
             });
-        },
-        cluster_sizes,
-    )
-    .plot_config(PlotConfiguration::default().summary_scale(AxisScale::Logarithmic));
-    c.bench("naive_k_means", benchmark);
+        });
+    }
+
+    benchmark.finish();
 }

 criterion_group! {
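Note that the k_means benchmark also moves `black_box` from the fitted result to the call's inputs. Criterion's `Bencher::iter` already routes the closure's return value through `black_box`, so wrapping the output was largely redundant; hiding the inputs instead keeps the optimiser from constant-folding the parameters into the benchmarked call. A sketch of the two placements (with a hypothetical `fit` stand-in, not the real `KMeans::fit`):

```rust
use criterion::black_box;

// Hypothetical stand-in for an expensive, parameterised fit.
fn fit(n_clusters: usize, tolerance: f64) -> f64 {
    n_clusters as f64 * tolerance
}

fn main() {
    // Old placement: inputs visible to the optimiser, output hidden.
    let _old = black_box(fit(4, 1e-3));
    // New placement: inputs hidden, so the call cannot be specialised on
    // compile-time constants; inside `bencher.iter` the output is already
    // passed through `black_box` by Criterion itself.
    let _new = fit(black_box(4), black_box(1e-3));
}
```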
diff --git a/algorithms/linfa-clustering/src/k_means/algorithm.rs b/algorithms/linfa-clustering/src/k_means/algorithm.rs
index d8203b9cf..c236f24ac 100644
--- a/algorithms/linfa-clustering/src/k_means/algorithm.rs
+++ b/algorithms/linfa-clustering/src/k_means/algorithm.rs
@@ -1,13 +1,11 @@
 use crate::k_means::errors::{KMeansError, Result};
-use crate::k_means::helpers::IncrementalMean;
 use crate::k_means::hyperparameters::{KMeansHyperParams, KMeansHyperParamsBuilder};
 use linfa::{traits::*, DatasetBase, Float};
-use ndarray::{s, Array1, Array2, ArrayBase, Axis, Data, DataMut, Ix1, Ix2, Zip};
+use ndarray::{Array1, Array2, ArrayBase, Axis, Data, DataMut, Ix1, Ix2, Zip};
 use ndarray_rand::rand;
 use ndarray_rand::rand::Rng;
 use ndarray_stats::DeviationExt;
 use rand_isaac::Isaac64Rng;
-use std::collections::HashMap;

 #[cfg(feature = "serde")]
 use serde_crate::{Deserialize, Serialize};
@@ -258,54 +256,36 @@ fn compute_inertia(
 /// If you check the `compute_cluster_memberships` function,
 /// you can see that it expects to receive centroids as a 2-dimensional array.
 ///
-/// `compute_centroids` wraps our `compute_centroids_hashmap` to return a 2-dimensional array,
+/// `compute_centroids` returns a 2-dimensional array,
 /// where the i-th row corresponds to the i-th cluster.
 fn compute_centroids<F: Float>(
-    // The number of clusters could be inferred from `centroids_hashmap`,
-    // but it is indeed possible for a cluster to become empty during the
-    // multiple rounds of assignment-update optimisations
-    // This would lead to an underestimate of the number of clusters
-    // and several errors down the line due to shape mismatches
     n_clusters: usize,
    // (n_observations, n_features)
     observations: &ArrayBase<impl Data<Elem = F>, Ix2>,
     // (n_observations,)
     cluster_memberships: &ArrayBase<impl Data<Elem = usize>, Ix1>,
 ) -> Array2<F> {
-    let centroids_hashmap = compute_centroids_hashmap(&observations, &cluster_memberships);
     let (_, n_features) = observations.dim();
+    let mut counts: Array1<usize> = Array1::zeros(n_clusters);
     let mut centroids: Array2<F> = Array2::zeros((n_clusters, n_features));
-    for (centroid_index, centroid) in centroids_hashmap.into_iter() {
-        centroids
-            .slice_mut(s![centroid_index, ..])
-            .assign(&centroid.current_mean);
-    }
-    centroids
-}

-/// Iterate over our observations and capture in a HashMap the new centroids.
-/// The HashMap is a (cluster_index => new centroid) mapping.
-fn compute_centroids_hashmap<F: Float>(
-    // (n_observations, n_features)
-    observations: &ArrayBase<impl Data<Elem = F>, Ix2>,
-    // (n_observations,)
-    cluster_memberships: &ArrayBase<impl Data<Elem = usize>, Ix1>,
-) -> HashMap<usize, IncrementalMean<F>> {
-    let mut new_centroids: HashMap<usize, IncrementalMean<F>> = HashMap::new();
     Zip::from(observations.genrows())
         .and(cluster_memberships)
-        .apply(|observation, cluster_membership| {
-            if let Some(incremental_mean) = new_centroids.get_mut(cluster_membership) {
-                incremental_mean.update(&observation);
-            } else {
-                new_centroids.insert(
-                    *cluster_membership,
-                    IncrementalMean::new(observation.to_owned()),
-                );
+        .apply(|observation, &cluster_membership| {
+            let mut centroid = centroids.row_mut(cluster_membership);
+            centroid += &observation;
+            counts[cluster_membership] += 1;
+        });
+
+    Zip::from(centroids.genrows_mut())
+        .and(&counts)
+        .apply(|mut centroid, &cnt| {
+            if cnt != 0 {
+                centroid /= F::from(cnt).unwrap();
             }
         });
-    new_centroids
+
+    centroids
 }

 /// Given a matrix of centroids with shape (n_centroids, n_features)
@@ -351,11 +331,9 @@ fn closest_centroid(
     // (n_features)
     observation: &ArrayBase<impl Data<Elem = F>, Ix1>,
 ) -> usize {
-    let mut iterator = centroids.genrows().into_iter().peekable();
+    let iterator = centroids.genrows().into_iter();

-    let first_centroid = iterator
-        .peek()
-        .expect("There has to be at least one centroid");
+    let first_centroid = centroids.row(0);
     let (mut closest_index, mut minimum_distance) = (
         0,
         first_centroid
@@ -472,6 +450,15 @@ mod tests {
         assert_eq!(centroids.len_of(Axis(0)), 2);
     }

+    #[test]
+    fn test_compute_extra_centroids() {
+        let observations = array![[1.0, 2.0]];
+        let memberships = array![0];
+        // Should return an average of 0 for empty clusters
+        let centroids = compute_centroids(2, &observations, &memberships);
+        assert_abs_diff_eq!(centroids, array![[1.0, 2.0], [0.0, 0.0]]);
+    }
+
     #[test]
     // An observation is closest to itself.
     fn nothing_is_closer_than_self() {
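For readers who want to run the new centroid computation outside the crate, here is a standalone sketch of the same two-pass scheme: sum observations into per-cluster rows while counting memberships, then divide the non-empty rows. It uses concrete `f64` instead of the generic `F: Float`, and the ndarray 0.15+ names `rows`/`rows_mut`/`for_each` in place of the crate's `genrows`/`genrows_mut`/`apply`; the `main` demonstrates the empty-cluster behaviour that the new `test_compute_extra_centroids` test asserts.

```rust
use ndarray::{array, Array1, Array2, Axis, Zip};

fn compute_centroids(
    n_clusters: usize,
    observations: &Array2<f64>,  // (n_observations, n_features)
    memberships: &Array1<usize>, // (n_observations,)
) -> Array2<f64> {
    let n_features = observations.len_of(Axis(1));
    let mut counts = Array1::<usize>::zeros(n_clusters);
    let mut centroids = Array2::<f64>::zeros((n_clusters, n_features));
    // Pass 1: accumulate per-cluster sums and membership counts.
    Zip::from(observations.rows())
        .and(memberships)
        .for_each(|observation, &cluster| {
            let mut row = centroids.row_mut(cluster);
            row += &observation;
            counts[cluster] += 1;
        });
    // Pass 2: divide by the counts; empty clusters keep an all-zero centroid.
    Zip::from(centroids.rows_mut())
        .and(&counts)
        .for_each(|mut centroid, &count| {
            if count != 0 {
                centroid /= count as f64;
            }
        });
    centroids
}

fn main() {
    let observations = array![[1.0, 2.0], [3.0, 4.0]];
    let memberships = array![0, 0];
    // Cluster 0 averages the two points; cluster 1 is empty and stays zero.
    assert_eq!(
        compute_centroids(2, &observations, &memberships),
        array![[2.0, 3.0], [0.0, 0.0]]
    );
}
```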
diff --git a/algorithms/linfa-clustering/src/k_means/helpers.rs b/algorithms/linfa-clustering/src/k_means/helpers.rs
deleted file mode 100644
index 46e52fb85..000000000
--- a/algorithms/linfa-clustering/src/k_means/helpers.rs
+++ /dev/null
@@ -1,89 +0,0 @@
-use linfa::Float;
-use ndarray::{Array1, ArrayBase, Data, Ix1};
-
-/// The observation matrix will not be partitioned by cluster membership: we might have
-/// a bunch of observations belonging to the first cluster followed by one observation
-/// in the second cluster, and so on until the end of our data points.
-///
-/// It would be convenient if we could iterate over our observations,
-/// updating the relevant new centroid one observation at a time.
-///
-/// In other words, we want to compute **an incremental mean**.
-///
-/// The formula to compute the new mean based on the mean of `n` previous observations and
-/// a new observation is the following:
-///
-/// new_mean = current_mean + (new_observation - current_mean) / (n + 1)
-///
-/// Check https://math.stackexchange.com/questions/106700/incremental-averageing for
-/// a derivation (and a nicely formatted formula).
-///
-/// To do this successfully, we need to keep track of:
-/// - the current mean (`current_mean`);
-/// - the number of observations we have seen so far (`n`).
-///
-/// We can store this information in a struct:
-pub(crate) struct IncrementalMean<F: Float> {
-    pub current_mean: Array1<F>,
-    pub n_observations: usize,
-}
-
-impl<F: Float> IncrementalMean<F> {
-    pub fn new(first_observation: Array1<F>) -> Self {
-        Self {
-            current_mean: first_observation,
-            n_observations: 1,
-        }
-    }
-
-    pub fn update(&mut self, new_observation: &ArrayBase<impl Data<Elem = F>, Ix1>) {
-        self.n_observations += 1;
-        let shift = (new_observation - &self.current_mean)
-            .mapv_into(|x| x / F::from(self.n_observations).unwrap());
-        self.current_mean += &shift;
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use approx::assert_abs_diff_eq;
-    use ndarray::{Array, Array2, Axis};
-    use ndarray_rand::rand_distr::Uniform;
-    use ndarray_rand::RandomExt;
-
-    #[test]
-    fn incremental_mean() {
-        let n_observations = 100;
-        let observations: Array2<f64> =
-            Array::random((n_observations, 5), Uniform::new(-100., 100.));
-
-        // We need to initialise `incremental_mean` with the first observation
-        // We'll mark it as uninitialised using `None`
-        let mut incremental_mean: Option<IncrementalMean<f64>> = None;
-
-        for observation in observations.genrows().into_iter() {
-            // If it has already been initialised, update it
-            if let Some(mean) = incremental_mean.as_mut() {
-                mean.update(&observation);
-            // Otherwise, initialise it
-            // Given that this branch is used only once, this is quite wasteful,
-            // but it's easier to read... hence ¯\_(ツ)_/¯
-            } else {
-                // `.to_owned` takes `observation`, which has type `ArrayView1`,
-                // and returns an `Array1`, performing an allocation.
-                incremental_mean = Some(IncrementalMean::new(observation.to_owned()));
-            }
-        }
-
-        let incremental_mean = incremental_mean.unwrap();
-
-        assert_eq!(incremental_mean.n_observations, n_observations);
-        // No significant difference between computing the mean incrementally or in a single shot
-        assert_abs_diff_eq!(
-            incremental_mean.current_mean,
-            observations.mean_axis(Axis(0)).unwrap(),
-            epsilon = 1e-5
-        );
-    }
-}
diff --git a/algorithms/linfa-clustering/src/k_means/mod.rs b/algorithms/linfa-clustering/src/k_means/mod.rs
index ae97be3be..bddadf9ad 100644
--- a/algorithms/linfa-clustering/src/k_means/mod.rs
+++ b/algorithms/linfa-clustering/src/k_means/mod.rs
@@ -1,6 +1,5 @@
 mod algorithm;
 mod errors;
-mod helpers;
 mod hyperparameters;

 pub use algorithm::*;
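The deleted helper's update rule, new_mean = current_mean + (new_observation - current_mean) / (n + 1), is worth keeping on record. Below is a minimal scalar `f64` re-implementation for reference; the original was generic over `F: Float` and maintained `Array1` means rather than scalars.

```rust
/// Scalar sketch of the deleted `IncrementalMean`:
/// new_mean = current_mean + (new_observation - current_mean) / (n + 1)
struct IncrementalMean {
    current_mean: f64,
    n_observations: usize,
}

impl IncrementalMean {
    fn new(first_observation: f64) -> Self {
        Self {
            current_mean: first_observation,
            n_observations: 1,
        }
    }

    fn update(&mut self, new_observation: f64) {
        self.n_observations += 1;
        self.current_mean +=
            (new_observation - self.current_mean) / self.n_observations as f64;
    }
}

fn main() {
    let mut mean = IncrementalMean::new(1.0);
    for &x in &[2.0, 3.0, 4.0] {
        mean.update(x);
    }
    // (1 + 2 + 3 + 4) / 4 = 2.5
    assert!((mean.current_mean - 2.5).abs() < 1e-12);
}
```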