diff --git a/algorithms/linfa-clustering/Cargo.toml b/algorithms/linfa-clustering/Cargo.toml index 7861bb80b..e9bb03ab4 100644 --- a/algorithms/linfa-clustering/Cargo.toml +++ b/algorithms/linfa-clustering/Cargo.toml @@ -37,7 +37,7 @@ num-traits = "0.2" rand_xoshiro = "0.6" space = "0.12" thiserror = "1.0" -#partitions = "0.2.4" This one will break in a future version of Rust and has no replacement +disjoint = "0.6.0" linfa = { version = "0.6.1", path = "../.." } linfa-nn = { version = "0.6.1", path = "../linfa-nn" } noisy_float = "0.2.0" @@ -59,6 +59,10 @@ harness = false name = "dbscan" harness = false +[[bench]] +name = "appx_dbscan" +harness = false + [[bench]] name = "gaussian_mixture" harness = false diff --git a/algorithms/linfa-clustering/README.md b/algorithms/linfa-clustering/README.md index ee88a49a2..c3ef4aac6 100644 --- a/algorithms/linfa-clustering/README.md +++ b/algorithms/linfa-clustering/README.md @@ -14,13 +14,15 @@ You can find a roadmap (and a selection of good first issues) `linfa-clustering` currently provides implementation of the following clustering algorithms, in addition to a couple of helper functions: - K-Means - DBSCAN -- Approximated DBSCAN (Currently an alias for DBSCAN, due to its superior performance) +- Approximated DBSCAN - Gaussian Mixture Model Implementation choices, algorithmic details and a tutorial can be found [here](https://docs.rs/linfa-clustering). +**WARNING:** Currently the Approximated DBSCAN implementation is slower than the normal DBSCAN implementation. Therefore DBSCAN should always be used over Approximated DBSCAN. + ## BLAS/Lapack backend We found that the pure Rust implementation maintained similar performance to the BLAS/LAPACK version and have removed it with this [PR](https://github.com/rust-ml/linfa/pull/257). Thus, to reduce code complexity BLAS support has been removed for this module. 
diff --git a/algorithms/linfa-clustering/benches/appx_dbscan.rs b/algorithms/linfa-clustering/benches/appx_dbscan.rs new file mode 100644 index 000000000..51003fa93 --- /dev/null +++ b/algorithms/linfa-clustering/benches/appx_dbscan.rs @@ -0,0 +1,63 @@ +use criterion::{ + black_box, criterion_group, criterion_main, AxisScale, BenchmarkId, Criterion, + PlotConfiguration, +}; +use linfa::benchmarks::config; +use linfa::traits::Transformer; +use linfa_clustering::AppxDbscan; +use linfa_datasets::generate; +use ndarray::Array2; +use ndarray_rand::rand::SeedableRng; +use ndarray_rand::rand_distr::Uniform; +use ndarray_rand::RandomExt; +use rand_xoshiro::Xoshiro256Plus; + +fn appx_dbscan_bench(c: &mut Criterion) { + let mut rng = Xoshiro256Plus::seed_from_u64(40); + let cluster_sizes_and_slacks = vec![ + (10, 0.00001), + (100, 0.00001), + (1000, 0.00001), + /*(10000, 0.1),*/ + ]; + + let mut benchmark = c.benchmark_group("appx_dbscan"); + config::set_default_benchmark_configs(&mut benchmark); + benchmark.plot_config(PlotConfiguration::default().summary_scale(AxisScale::Logarithmic)); + + for cluster_size_and_slack in cluster_sizes_and_slacks { + let rng = &mut rng; + benchmark.bench_with_input( + BenchmarkId::new("appx_dbscan", cluster_size_and_slack.0), + &cluster_size_and_slack, + move |bencher, &cluster_size_and_slack| { + let min_points = 4; + let n_features = 3; + let tolerance = 0.3; + let centroids = + Array2::random_using((min_points, n_features), Uniform::new(-30., 30.), rng); + let dataset = generate::blobs(cluster_size_and_slack.0, &centroids, rng); + bencher.iter(|| { + black_box( + AppxDbscan::params(min_points) + .tolerance(tolerance) + .slack(cluster_size_and_slack.1) + .transform(&dataset), + ) + }); + }, + ); + } + benchmark.finish(); +} + +#[cfg(not(target_os = "windows"))] +criterion_group! 
{ + name = benches; + config = config::get_default_profiling_configs(); + targets = appx_dbscan_bench +} +#[cfg(target_os = "windows")] +criterion_group!(benches, appx_dbscan_bench); + +criterion_main!(benches); diff --git a/algorithms/linfa-clustering/examples/appx_dbscan.rs b/algorithms/linfa-clustering/examples/appx_dbscan.rs new file mode 100644 index 000000000..9e9ab634c --- /dev/null +++ b/algorithms/linfa-clustering/examples/appx_dbscan.rs @@ -0,0 +1,65 @@ +use linfa::dataset::{DatasetBase, Labels, Records}; +use linfa::metrics::SilhouetteScore; +use linfa::traits::Transformer; +use linfa_clustering::AppxDbscan; +use linfa_datasets::generate; +use ndarray::array; +use ndarray_npy::write_npy; +use ndarray_rand::rand::SeedableRng; +use rand_xoshiro::Xoshiro256Plus; + +// A routine AppxDBScan task: build a synthetic dataset, predict clusters for it +// and save both training data and predictions to disk. +fn main() { + // Our random number generator, seeded for reproducibility + let mut rng = Xoshiro256Plus::seed_from_u64(42); + + // Centroids around which the synthetic blobs are generated + let expected_centroids = array![[10., 10.], [1., 12.], [20., 30.], [-20., 30.],]; + let n = 1000; + // For each of our expected centroids, generate `n` data points around it (a "blob") + let dataset: DatasetBase<_, _> = generate::blobs(n, &expected_centroids, &mut rng).into(); + + // Configure our training algorithm + let min_points = 3; + + println!( + "Clustering #{} data points grouped in 4 clusters of {} points each", + dataset.nsamples(), + n + ); + + let cluster_memberships = AppxDbscan::params(min_points) + .tolerance(1.) 
+ .slack(1e-2) + .transform(dataset) + .unwrap(); + + // single target dataset + let label_count = cluster_memberships.label_count().remove(0); + + println!(); + println!("Result: "); + for (label, count) in label_count { + match label { + None => println!(" - {} noise points", count), + Some(i) => println!(" - {} points in cluster {}", count, i), + } + } + println!(); + + let silhouette_score = cluster_memberships.silhouette_score().unwrap(); + + println!("Silhouette score: {}", silhouette_score); + + let (records, cluster_memberships) = (cluster_memberships.records, cluster_memberships.targets); + + // Save to disk our dataset (and the cluster label assigned to each observation) + // We use the `npy` format for compatibility with NumPy + write_npy("clustered_dataset.npy", &records).expect("Failed to write .npy file"); + write_npy( + "clustered_memberships.npy", + &cluster_memberships.map(|&x| x.map(|c| c as i64).unwrap_or(-1)), + ) + .expect("Failed to write .npy file"); +} diff --git a/algorithms/linfa-clustering/src/appx_dbscan/cells_grid/cell.rs b/algorithms/linfa-clustering/src/appx_dbscan/cells_grid/cell.rs index df635e257..53545e510 100644 --- a/algorithms/linfa-clustering/src/appx_dbscan/cells_grid/cell.rs +++ b/algorithms/linfa-clustering/src/appx_dbscan/cells_grid/cell.rs @@ -1,9 +1,9 @@ +use super::CellVector; use crate::appx_dbscan::counting_tree::TreeStructure; use crate::AppxDbscanValidParams; use linfa::Float; use linfa_nn::distance::{Distance, L2Dist}; use ndarray::{Array1, ArrayView1, ArrayView2, ArrayViewMut1}; -use partitions::PartitionVec; #[derive(Clone, Debug, PartialEq, Eq)] /// A point in a D dimensional euclidean space that memorizes its @@ -124,7 +124,7 @@ impl Cell { pub fn label( &mut self, - cells: &PartitionVec>, + cells: &CellVector, points: ArrayView2, params: &AppxDbscanValidParams, ) { @@ -160,7 +160,7 @@ impl Cell { /// memorized in the cell fn label_sparse( &mut self, - cells: &PartitionVec>, + cells: &CellVector, points: 
ArrayView2, params: &AppxDbscanValidParams, ) { diff --git a/algorithms/linfa-clustering/src/appx_dbscan/cells_grid/mod.rs b/algorithms/linfa-clustering/src/appx_dbscan/cells_grid/mod.rs index 9d15746a1..3d48df3b4 100644 --- a/algorithms/linfa-clustering/src/appx_dbscan/cells_grid/mod.rs +++ b/algorithms/linfa-clustering/src/appx_dbscan/cells_grid/mod.rs @@ -4,14 +4,15 @@ use std::collections::HashMap; use crate::appx_dbscan::counting_tree::get_base_cell_index; use crate::AppxDbscanValidParams; +use disjoint::DisjointSetVec; use linfa::Float; use linfa_nn::{distance::L2Dist, NearestNeighbour}; use ndarray::{Array1, Array2, ArrayView1, ArrayView2, Axis}; -use partitions::PartitionVec; use cell::{Cell, StatusPoint}; -pub type CellVector = PartitionVec>; +pub type CellVector = DisjointSetVec>; + /// A structure that memorizes all non empty cells by their index's hash pub type CellTable = HashMap, usize>; @@ -31,7 +32,7 @@ impl CellsGrid { ) -> CellsGrid { let mut grid = CellsGrid { table: CellTable::with_capacity(points.dim().0), - cells: PartitionVec::with_capacity(points.dim().0), + cells: CellVector::with_capacity(points.dim().0), dimensionality: points.ncols(), labeled: false, }; @@ -105,11 +106,11 @@ impl CellsGrid { &self.labeled } - pub fn cells(&self) -> &PartitionVec> { + pub fn cells(&self) -> &CellVector { &self.cells } - pub fn cells_mut(&mut self) -> &mut PartitionVec> { + pub fn cells_mut(&mut self) -> &mut CellVector { &mut self.cells } @@ -141,12 +142,12 @@ impl CellsGrid { let neighbours_indexes = self.cells[*cell_i].neighbours_indexes().clone(); for n_index in neighbours_indexes { let neighbour = self.cells.get(n_index).unwrap(); - if !neighbour.is_core() || self.cells.same_set(*cell_i, n_index) { + if !neighbour.is_core() || self.cells.is_joined(*cell_i, n_index) { continue; } for point in curr_cell_points.iter().filter(|p| p.is_core()) { if neighbour.approximate_range_counting(points.row(point.index()), params) > 0 { - self.cells.union(*cell_i, 
n_index); + self.cells.join(*cell_i, n_index); break; } } diff --git a/algorithms/linfa-clustering/src/appx_dbscan/cells_grid/tests.rs b/algorithms/linfa-clustering/src/appx_dbscan/cells_grid/tests.rs index fd794c5ef..05e726fe0 100644 --- a/algorithms/linfa-clustering/src/appx_dbscan/cells_grid/tests.rs +++ b/algorithms/linfa-clustering/src/appx_dbscan/cells_grid/tests.rs @@ -43,9 +43,9 @@ fn label_points_test() { grid.label_points(points.view(), &params); assert_eq!(grid.cells().len(), 2); assert_eq!(grid.cells().iter().filter(|x| x.is_core()).count(), 2); - assert_eq!(grid.cells().all_sets().count(), 1); - for set in grid.cells().all_sets() { - assert_eq!(set.count(), 2); + assert_eq!(grid.cells().indices().sets().len(), 1); + for set in grid.cells().indices().sets() { + assert_eq!(set.len(), 2); } let all_points = vec![ 2.0 * l, @@ -62,7 +62,7 @@ fn label_points_test() { grid.label_points(points.view(), &params); assert_eq!(grid.cells().len(), 2); assert_eq!(grid.cells().iter().filter(|x| x.is_core()).count(), 1); - assert_eq!(grid.cells.all_sets().count(), 2); + assert_eq!(grid.cells.indices().sets().len(), 2); } #[test] diff --git a/algorithms/linfa-clustering/src/appx_dbscan/clustering/mod.rs b/algorithms/linfa-clustering/src/appx_dbscan/clustering/mod.rs index 041691c31..cd75c3139 100644 --- a/algorithms/linfa-clustering/src/appx_dbscan/clustering/mod.rs +++ b/algorithms/linfa-clustering/src/appx_dbscan/clustering/mod.rs @@ -29,11 +29,15 @@ impl AppxDbscanValidParams { } let mut labels = Array1::from_elem(observations.dim().0, None); let mut current_cluster_i: usize = 0; - for set in grid.cells_mut().all_sets_mut() { + for cell_indices_set in grid.cells_mut().indices().sets() { let mut core_cells_count = 0; - for cell in set.filter(|(_, c)| c.is_core()).map(|(_, c)| c) { - cell.assign_to_cluster(current_cluster_i, &mut labels.view_mut()); - core_cells_count += 1; + for cell_index in cell_indices_set { + let cell = &mut grid.cells_mut()[cell_index]; + + if 
cell.is_core() { + cell.assign_to_cluster(current_cluster_i, &mut labels.view_mut()); + core_cells_count += 1; + } } if core_cells_count > 0 { current_cluster_i += 1; diff --git a/algorithms/linfa-clustering/src/lib.rs b/algorithms/linfa-clustering/src/lib.rs index 40a5f3534..990f2d376 100644 --- a/algorithms/linfa-clustering/src/lib.rs +++ b/algorithms/linfa-clustering/src/lib.rs @@ -15,28 +15,20 @@ //! Right now `linfa-clustering` provides the following clustering algorithms: //! * [K-Means](KMeans) //! * [DBSCAN](Dbscan) -//! * [Approximated DBSCAN](AppxDbscan) (Currently an alias for DBSCAN, due to its superior -//! performance) +//! * [Approximated DBSCAN](AppxDbscan) //! * [Gaussian-Mixture-Model](GaussianMixtureModel) //! * [OPTICS](OpticsAnalysis) //! //! Implementation choices, algorithmic details and tutorials can be found in the page dedicated to the specific algorithms. +mod appx_dbscan; mod dbscan; mod gaussian_mixture; #[allow(clippy::new_ret_no_self)] mod k_means; mod optics; +pub use appx_dbscan::*; pub use dbscan::*; pub use gaussian_mixture::*; pub use k_means::*; pub use optics::*; - -// Approx DBSCAN is currently an alias for DBSCAN, due to the old Approx DBSCAN implementation's -// lower performance and outdated dependencies - -use linfa_nn::distance::L2Dist; -pub type AppxDbscanValidParams = DbscanValidParams; -pub type AppxDbscanParams = DbscanParams; -pub type AppxDbscanParamsError = DbscanParamsError; -pub type AppxDbscan = Dbscan;