Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reactivate appx DBSCAN impl with new disjoint set dependency #304

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
6 changes: 5 additions & 1 deletion algorithms/linfa-clustering/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ num-traits = "0.2"
rand_xoshiro = "0.6"
space = "0.12"
thiserror = "1.0"
#partitions = "0.2.4" This one will break in a future version of Rust and has no replacement
disjoint = "0.6.0"
linfa = { version = "0.6.1", path = "../.." }
linfa-nn = { version = "0.6.1", path = "../linfa-nn" }
noisy_float = "0.2.0"
Expand All @@ -59,6 +59,10 @@ harness = false
name = "dbscan"
harness = false

[[bench]]
name = "appx_dbscan"
harness = false

[[bench]]
name = "gaussian_mixture"
harness = false
4 changes: 3 additions & 1 deletion algorithms/linfa-clustering/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,15 @@ You can find a roadmap (and a selection of good first issues)
`linfa-clustering` currently provides implementation of the following clustering algorithms, in addition to a couple of helper functions:
- K-Means
- DBSCAN
- Approximated DBSCAN (Currently an alias for DBSCAN, due to its superior performance)
- Approximated DBSCAN
- Gaussian Mixture Model


Implementation choices, algorithmic details and a tutorial can be found
[here](https://docs.rs/linfa-clustering).

**WARNING:** Currently the Approximated DBSCAN implementation is slower than the normal DBSCAN implementation. Therefore DBSCAN should always be used over Approximated DBSCAN.

## BLAS/Lapack backend
We found that the pure Rust implementation maintained similar performance to the BLAS/LAPACK version and have removed it with this [PR](https://github.com/rust-ml/linfa/pull/257). Thus, to reduce code complexity BLAS support has been removed for this module.

Expand Down
63 changes: 63 additions & 0 deletions algorithms/linfa-clustering/benches/appx_dbscan.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
use criterion::{
black_box, criterion_group, criterion_main, AxisScale, BenchmarkId, Criterion,
PlotConfiguration,
};
use linfa::benchmarks::config;
use linfa::traits::Transformer;
use linfa_clustering::AppxDbscan;
use linfa_datasets::generate;
use ndarray::Array2;
use ndarray_rand::rand::SeedableRng;
use ndarray_rand::rand_distr::Uniform;
use ndarray_rand::RandomExt;
use rand_xoshiro::Xoshiro256Plus;

/// Criterion benchmark for the approximate DBSCAN implementation.
///
/// Clusters synthetic blob datasets of increasing size (10, 100 and 1000
/// points per generated blob) with `AppxDbscan`, plotting the results on a
/// logarithmic summary scale so the sizes are comparable on one chart.
fn appx_dbscan_bench(c: &mut Criterion) {
    // Fixed seed so the generated benchmark inputs are reproducible across runs.
    let mut rng = Xoshiro256Plus::seed_from_u64(40);
    // (points-per-blob, slack) pairs to benchmark; `slack` is the
    // approximation parameter passed to AppxDbscan below.
    let cluster_sizes_and_slacks = vec![
        (10, 0.00001),
        (100, 0.00001),
        (1000, 0.00001),
        // NOTE(review): the 10k case is left disabled — presumably it makes the
        // benchmark run too long; confirm before re-enabling.
        /*(10000, 0.1),*/
    ];

    let mut benchmark = c.benchmark_group("appx_dbscan");
    // Shared benchmark settings used across the linfa benches.
    config::set_default_benchmark_configs(&mut benchmark);
    benchmark.plot_config(PlotConfiguration::default().summary_scale(AxisScale::Logarithmic));

    for cluster_size_and_slack in cluster_sizes_and_slacks {
        // Reborrow so the `move` closure captures the reference rather than
        // taking ownership of the RNG itself.
        let rng = &mut rng;
        benchmark.bench_with_input(
            BenchmarkId::new("appx_dbscan", cluster_size_and_slack.0),
            &cluster_size_and_slack,
            move |bencher, &cluster_size_and_slack| {
                let min_points = 4;
                let n_features = 3;
                let tolerance = 0.3;
                // One random centroid per row: a (min_points x n_features) array,
                // i.e. 4 centroids in 3-D space. NOTE(review): reusing `min_points`
                // as the centroid count looks incidental — confirm it is intended.
                let centroids =
                    Array2::random_using((min_points, n_features), Uniform::new(-30., 30.), rng);
                // Dataset generation happens outside `iter`, so only the
                // clustering itself is timed.
                let dataset = generate::blobs(cluster_size_and_slack.0, &centroids, rng);
                bencher.iter(|| {
                    // black_box keeps the optimizer from eliding the clustering work.
                    black_box(
                        AppxDbscan::params(min_points)
                            .tolerance(tolerance)
                            .slack(cluster_size_and_slack.1)
                            .transform(&dataset),
                    )
                });
            },
        );
    }
    benchmark.finish();
}

// Use the project's profiling configuration everywhere except Windows.
// NOTE(review): presumably the profiling backend is unsupported on Windows —
// confirm; this mirrors the cfg split used by the other linfa benches.
#[cfg(not(target_os = "windows"))]
criterion_group! {
    name = benches;
    config = config::get_default_profiling_configs();
    targets = appx_dbscan_bench
}
// On Windows, fall back to Criterion's default configuration (no profiler).
#[cfg(target_os = "windows")]
criterion_group!(benches, appx_dbscan_bench);

criterion_main!(benches);
65 changes: 65 additions & 0 deletions algorithms/linfa-clustering/examples/appx_dbscan.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
use linfa::dataset::{DatasetBase, Labels, Records};
use linfa::metrics::SilhouetteScore;
use linfa::traits::Transformer;
use linfa_clustering::AppxDbscan;
use linfa_datasets::generate;
use ndarray::array;
use ndarray_npy::write_npy;
use ndarray_rand::rand::SeedableRng;
use rand_xoshiro::Xoshiro256Plus;

// A routine AppxDBScan task: build a synthetic dataset, predict clusters for it
// and save both training data and predictions to disk.
// Builds a synthetic 4-blob dataset, clusters it with approximate DBSCAN,
// reports per-cluster counts and a silhouette score, then saves the data and
// the predicted labels to disk as .npy files.
fn main() {
    // Our random number generator, seeded for reproducibility
    let mut rng = Xoshiro256Plus::seed_from_u64(42);

    // Infer an optimal set of centroids based on the training data distribution
    let expected_centroids = array![[10., 10.], [1., 12.], [20., 30.], [-20., 30.],];
    // Number of points generated around each centroid.
    let n = 1000;
    // For each of our expected centroids, generate `n` data points around it (a "blob")
    let dataset: DatasetBase<_, _> = generate::blobs(n, &expected_centroids, &mut rng).into();

    // Configure our training algorithm
    let min_points = 3;

    println!(
        "Clustering #{} data points grouped in 4 clusters of {} points each",
        dataset.nsamples(),
        n
    );

    // Run approximate DBSCAN: `tolerance` is the neighbourhood radius and
    // `slack` the allowed approximation error.
    let cluster_memberships = AppxDbscan::params(min_points)
        .tolerance(1.)
        .slack(1e-2)
        .transform(dataset)
        .unwrap();

    // single target dataset, so the first (and only) label count is taken
    let label_count = cluster_memberships.label_count().remove(0);

    // Report how many points landed in each cluster; `None` marks noise points.
    println!();
    println!("Result: ");
    for (label, count) in label_count {
        match label {
            None => println!(" - {} noise points", count),
            Some(i) => println!(" - {} points in cluster {}", count, i),
        }
    }
    println!();

    // Silhouette score gauges cluster cohesion vs. separation (range [-1, 1]).
    let silhouette_score = cluster_memberships.silhouette_score().unwrap();

    println!("Silhouette score: {}", silhouette_score);

    // Split the clustered dataset back into raw records and predicted targets.
    let (records, cluster_memberships) = (cluster_memberships.records, cluster_memberships.targets);

    // Save to disk our dataset (and the cluster label assigned to each observation)
    // We use the `npy` format for compatibility with NumPy
    write_npy("clustered_dataset.npy", &records).expect("Failed to write .npy file");
    // Noise points (label `None`) are encoded as -1 in the saved label array.
    write_npy(
        "clustered_memberships.npy",
        &cluster_memberships.map(|&x| x.map(|c| c as i64).unwrap_or(-1)),
    )
    .expect("Failed to write .npy file");
}
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
use super::CellVector;
use crate::appx_dbscan::counting_tree::TreeStructure;
use crate::AppxDbscanValidParams;
use linfa::Float;
use linfa_nn::distance::{Distance, L2Dist};
use ndarray::{Array1, ArrayView1, ArrayView2, ArrayViewMut1};
use partitions::PartitionVec;

#[derive(Clone, Debug, PartialEq, Eq)]
/// A point in a D dimensional euclidean space that memorizes its
Expand Down Expand Up @@ -124,7 +124,7 @@ impl<F: Float> Cell<F> {

pub fn label<N>(
&mut self,
cells: &PartitionVec<Cell<F>>,
cells: &CellVector<F>,
points: ArrayView2<F>,
params: &AppxDbscanValidParams<F, N>,
) {
Expand Down Expand Up @@ -160,7 +160,7 @@ impl<F: Float> Cell<F> {
/// memorized in the cell
fn label_sparse<N>(
&mut self,
cells: &PartitionVec<Cell<F>>,
cells: &CellVector<F>,
points: ArrayView2<F>,
params: &AppxDbscanValidParams<F, N>,
) {
Expand Down
15 changes: 8 additions & 7 deletions algorithms/linfa-clustering/src/appx_dbscan/cells_grid/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@ use std::collections::HashMap;

use crate::appx_dbscan::counting_tree::get_base_cell_index;
use crate::AppxDbscanValidParams;
use disjoint::DisjointSetVec;
use linfa::Float;
use linfa_nn::{distance::L2Dist, NearestNeighbour};
use ndarray::{Array1, Array2, ArrayView1, ArrayView2, Axis};
use partitions::PartitionVec;

use cell::{Cell, StatusPoint};

pub type CellVector<F> = PartitionVec<Cell<F>>;
pub type CellVector<F> = DisjointSetVec<Cell<F>>;

/// A structure that memorizes all non empty cells by their index's hash
pub type CellTable = HashMap<Array1<i64>, usize>;

Expand All @@ -31,7 +32,7 @@ impl<F: Float> CellsGrid<F> {
) -> CellsGrid<F> {
let mut grid = CellsGrid {
table: CellTable::with_capacity(points.dim().0),
cells: PartitionVec::with_capacity(points.dim().0),
cells: CellVector::with_capacity(points.dim().0),
dimensionality: points.ncols(),
labeled: false,
};
Expand Down Expand Up @@ -105,11 +106,11 @@ impl<F: Float> CellsGrid<F> {
&self.labeled
}

pub fn cells(&self) -> &PartitionVec<Cell<F>> {
pub fn cells(&self) -> &CellVector<F> {
&self.cells
}

pub fn cells_mut(&mut self) -> &mut PartitionVec<Cell<F>> {
pub fn cells_mut(&mut self) -> &mut CellVector<F> {
&mut self.cells
}

Expand Down Expand Up @@ -141,12 +142,12 @@ impl<F: Float> CellsGrid<F> {
let neighbours_indexes = self.cells[*cell_i].neighbours_indexes().clone();
for n_index in neighbours_indexes {
let neighbour = self.cells.get(n_index).unwrap();
if !neighbour.is_core() || self.cells.same_set(*cell_i, n_index) {
if !neighbour.is_core() || self.cells.is_joined(*cell_i, n_index) {
continue;
}
for point in curr_cell_points.iter().filter(|p| p.is_core()) {
if neighbour.approximate_range_counting(points.row(point.index()), params) > 0 {
self.cells.union(*cell_i, n_index);
self.cells.join(*cell_i, n_index);
break;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,9 @@ fn label_points_test() {
grid.label_points(points.view(), &params);
assert_eq!(grid.cells().len(), 2);
assert_eq!(grid.cells().iter().filter(|x| x.is_core()).count(), 2);
assert_eq!(grid.cells().all_sets().count(), 1);
for set in grid.cells().all_sets() {
assert_eq!(set.count(), 2);
assert_eq!(grid.cells().indices().sets().len(), 1);
for set in grid.cells().indices().sets() {
assert_eq!(set.len(), 2);
}
let all_points = vec![
2.0 * l,
Expand All @@ -62,7 +62,7 @@ fn label_points_test() {
grid.label_points(points.view(), &params);
assert_eq!(grid.cells().len(), 2);
assert_eq!(grid.cells().iter().filter(|x| x.is_core()).count(), 1);
assert_eq!(grid.cells.all_sets().count(), 2);
assert_eq!(grid.cells.indices().sets().len(), 2);
}

#[test]
Expand Down
12 changes: 8 additions & 4 deletions algorithms/linfa-clustering/src/appx_dbscan/clustering/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,15 @@ impl<F: Float, N: NearestNeighbour> AppxDbscanValidParams<F, N> {
}
let mut labels = Array1::from_elem(observations.dim().0, None);
let mut current_cluster_i: usize = 0;
for set in grid.cells_mut().all_sets_mut() {
for cell_indices_set in grid.cells_mut().indices().sets() {
let mut core_cells_count = 0;
for cell in set.filter(|(_, c)| c.is_core()).map(|(_, c)| c) {
cell.assign_to_cluster(current_cluster_i, &mut labels.view_mut());
core_cells_count += 1;
for cell_index in cell_indices_set {
let cell = &mut grid.cells_mut()[cell_index];

if cell.is_core() {
cell.assign_to_cluster(current_cluster_i, &mut labels.view_mut());
core_cells_count += 1;
}
}
if core_cells_count > 0 {
current_cluster_i += 1;
Expand Down
14 changes: 3 additions & 11 deletions algorithms/linfa-clustering/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,28 +15,20 @@
//! Right now `linfa-clustering` provides the following clustering algorithms:
//! * [K-Means](KMeans)
//! * [DBSCAN](Dbscan)
//! * [Approximated DBSCAN](AppxDbscan) (Currently an alias for DBSCAN, due to its superior
//! performance)
//! * [Approximated DBSCAN](AppxDbscan)
//! * [Gaussian-Mixture-Model](GaussianMixtureModel)
//! * [OPTICS](OpticsAnalysis)
//!
//! Implementation choices, algorithmic details and tutorials can be found in the page dedicated to the specific algorithms.
mod appx_dbscan;
mod dbscan;
mod gaussian_mixture;
#[allow(clippy::new_ret_no_self)]
mod k_means;
mod optics;

pub use appx_dbscan::*;
pub use dbscan::*;
pub use gaussian_mixture::*;
pub use k_means::*;
pub use optics::*;

// Approx DBSCAN is currently an alias for DBSCAN, due to the old Approx DBSCAN implementation's
// lower performance and outdated dependencies

use linfa_nn::distance::L2Dist;
pub type AppxDbscanValidParams<F, N> = DbscanValidParams<F, L2Dist, N>;
pub type AppxDbscanParams<F, N> = DbscanParams<F, L2Dist, N>;
pub type AppxDbscanParamsError = DbscanParamsError;
pub type AppxDbscan = Dbscan;