Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reactivate appx DBSCAN impl with new disjoint set dependency #304

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
6 changes: 5 additions & 1 deletion algorithms/linfa-clustering/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ num-traits = "0.2"
rand_xoshiro = "0.6"
space = "0.12"
thiserror = "1.0"
#partitions = "0.2.4" This one will break in a future version of Rust and has no replacement
disjoint = "0.6.0"
linfa = { version = "0.6.1", path = "../.." }
linfa-nn = { version = "0.6.1", path = "../linfa-nn" }
noisy_float = "0.2.0"
Expand All @@ -59,6 +59,10 @@ harness = false
name = "dbscan"
harness = false

[[bench]]
name = "appx_dbscan"
harness = false

[[bench]]
name = "gaussian_mixture"
harness = false
4 changes: 3 additions & 1 deletion algorithms/linfa-clustering/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,15 @@ You can find a roadmap (and a selection of good first issues)
`linfa-clustering` currently provides implementation of the following clustering algorithms, in addition to a couple of helper functions:
- K-Means
- DBSCAN
- Approximated DBSCAN (Currently an alias for DBSCAN, due to its superior performance)
- Approximated DBSCAN
- Gaussian Mixture Model


Implementation choices, algorithmic details and a tutorial can be found
[here](https://docs.rs/linfa-clustering).

**WARNING:** Currently the Approximated DBSCAN implementation is slower than the normal DBSCAN implementation. Therefore DBSCAN should always be used over Approximated DBSCAN.

## BLAS/Lapack backend
We found that the pure Rust implementation maintained similar performance to the BLAS/LAPACK version and have removed it with this [PR](https://github.com/rust-ml/linfa/pull/257). Thus, to reduce code complexity BLAS support has been removed for this module.

Expand Down
63 changes: 63 additions & 0 deletions algorithms/linfa-clustering/benches/appx_dbscan.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
use criterion::{
black_box, criterion_group, criterion_main, AxisScale, BenchmarkId, Criterion,
PlotConfiguration,
};
use linfa::benchmarks::config;
use linfa::traits::Transformer;
use linfa_clustering::AppxDbscan;
use linfa_datasets::generate;
use ndarray::Array2;
use ndarray_rand::rand::SeedableRng;
use ndarray_rand::rand_distr::Uniform;
use ndarray_rand::RandomExt;
use rand_xoshiro::Xoshiro256Plus;

/// Criterion benchmark for the approximate DBSCAN implementation.
///
/// Clusters synthetic blob datasets of increasing size (10, 100 and 1000
/// points per generated blob) with `AppxDbscan`, plotting the results on a
/// logarithmic summary scale so the sizes are comparable on one chart.
fn appx_dbscan_bench(c: &mut Criterion) {
    // Fixed seed so the generated benchmark inputs are reproducible across runs.
    let mut rng = Xoshiro256Plus::seed_from_u64(40);
    // (points-per-blob, slack) pairs to benchmark; `slack` is the
    // approximation parameter passed to AppxDbscan below.
    let cluster_sizes_and_slacks = vec![
        (10, 0.00001),
        (100, 0.00001),
        (1000, 0.00001),
        // NOTE(review): the 10k case is left disabled — presumably it makes the
        // benchmark run too long; confirm before re-enabling.
        /*(10000, 0.1),*/
    ];

    let mut benchmark = c.benchmark_group("appx_dbscan");
    // Shared benchmark settings used across the linfa benches.
    config::set_default_benchmark_configs(&mut benchmark);
    benchmark.plot_config(PlotConfiguration::default().summary_scale(AxisScale::Logarithmic));

    for cluster_size_and_slack in cluster_sizes_and_slacks {
        // Reborrow so the `move` closure captures the reference rather than
        // taking ownership of the RNG itself.
        let rng = &mut rng;
        benchmark.bench_with_input(
            BenchmarkId::new("appx_dbscan", cluster_size_and_slack.0),
            &cluster_size_and_slack,
            move |bencher, &cluster_size_and_slack| {
                let min_points = 4;
                let n_features = 3;
                let tolerance = 0.3;
                // One random centroid per row: a (min_points x n_features) array,
                // i.e. 4 centroids in 3-D space. NOTE(review): reusing `min_points`
                // as the centroid count looks incidental — confirm it is intended.
                let centroids =
                    Array2::random_using((min_points, n_features), Uniform::new(-30., 30.), rng);
                // Dataset generation happens outside `iter`, so only the
                // clustering itself is timed.
                let dataset = generate::blobs(cluster_size_and_slack.0, &centroids, rng);
                bencher.iter(|| {
                    // black_box keeps the optimizer from eliding the clustering work.
                    black_box(
                        AppxDbscan::params(min_points)
                            .tolerance(tolerance)
                            .slack(cluster_size_and_slack.1)
                            .transform(&dataset),
                    )
                });
            },
        );
    }
    benchmark.finish();
}

// Use the project's profiling configuration everywhere except Windows.
// NOTE(review): presumably the profiling backend is unsupported on Windows —
// confirm; this mirrors the cfg split used by the other linfa benches.
#[cfg(not(target_os = "windows"))]
criterion_group! {
    name = benches;
    config = config::get_default_profiling_configs();
    targets = appx_dbscan_bench
}
// On Windows, fall back to Criterion's default configuration (no profiler).
#[cfg(target_os = "windows")]
criterion_group!(benches, appx_dbscan_bench);

criterion_main!(benches);
65 changes: 65 additions & 0 deletions algorithms/linfa-clustering/examples/appx_dbscan.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
use linfa::dataset::{DatasetBase, Labels, Records};
use linfa::metrics::SilhouetteScore;
use linfa::traits::Transformer;
use linfa_clustering::AppxDbscan;
use linfa_datasets::generate;
use ndarray::array;
use ndarray_npy::write_npy;
use ndarray_rand::rand::SeedableRng;
use rand_xoshiro::Xoshiro256Plus;

// A routine AppxDBScan task: build a synthetic dataset, predict clusters for it
// and save both training data and predictions to disk.
// Builds a synthetic 4-blob dataset, clusters it with approximate DBSCAN,
// reports per-cluster counts and a silhouette score, then saves the data and
// the predicted labels to disk as .npy files.
fn main() {
    // Our random number generator, seeded for reproducibility
    let mut rng = Xoshiro256Plus::seed_from_u64(42);

    // Infer an optimal set of centroids based on the training data distribution
    let expected_centroids = array![[10., 10.], [1., 12.], [20., 30.], [-20., 30.],];
    // Number of points generated around each centroid.
    let n = 1000;
    // For each of our expected centroids, generate `n` data points around it (a "blob")
    let dataset: DatasetBase<_, _> = generate::blobs(n, &expected_centroids, &mut rng).into();

    // Configure our training algorithm
    let min_points = 3;

    println!(
        "Clustering #{} data points grouped in 4 clusters of {} points each",
        dataset.nsamples(),
        n
    );

    // Run approximate DBSCAN: `tolerance` is the neighbourhood radius and
    // `slack` the allowed approximation error.
    let cluster_memberships = AppxDbscan::params(min_points)
        .tolerance(1.)
        .slack(1e-2)
        .transform(dataset)
        .unwrap();

    // single target dataset, so the first (and only) label count is taken
    let label_count = cluster_memberships.label_count().remove(0);

    // Report how many points landed in each cluster; `None` marks noise points.
    println!();
    println!("Result: ");
    for (label, count) in label_count {
        match label {
            None => println!(" - {} noise points", count),
            Some(i) => println!(" - {} points in cluster {}", count, i),
        }
    }
    println!();

    // Silhouette score gauges cluster cohesion vs. separation (range [-1, 1]).
    let silhouette_score = cluster_memberships.silhouette_score().unwrap();

    println!("Silhouette score: {}", silhouette_score);

    // Split the clustered dataset back into raw records and predicted targets.
    let (records, cluster_memberships) = (cluster_memberships.records, cluster_memberships.targets);

    // Save to disk our dataset (and the cluster label assigned to each observation)
    // We use the `npy` format for compatibility with NumPy
    write_npy("clustered_dataset.npy", &records).expect("Failed to write .npy file");
    // Noise points (label `None`) are encoded as -1 in the saved label array.
    write_npy(
        "clustered_memberships.npy",
        &cluster_memberships.map(|&x| x.map(|c| c as i64).unwrap_or(-1)),
    )
    .expect("Failed to write .npy file");
}
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
use super::CellVector;
use crate::appx_dbscan::counting_tree::TreeStructure;
use crate::AppxDbscanValidParams;
use linfa::Float;
use linfa_nn::distance::{Distance, L2Dist};
use ndarray::{Array1, ArrayView1, ArrayView2, ArrayViewMut1};
use partitions::PartitionVec;

#[derive(Clone, Debug, PartialEq, Eq)]
/// A point in a D dimensional euclidean space that memorizes its
Expand Down Expand Up @@ -124,7 +124,7 @@ impl<F: Float> Cell<F> {

pub fn label<N>(
&mut self,
cells: &PartitionVec<Cell<F>>,
cells: &CellVector<F>,
points: ArrayView2<F>,
params: &AppxDbscanValidParams<F, N>,
) {
Expand Down Expand Up @@ -160,7 +160,7 @@ impl<F: Float> Cell<F> {
/// memorized in the cell
fn label_sparse<N>(
&mut self,
cells: &PartitionVec<Cell<F>>,
cells: &CellVector<F>,
points: ArrayView2<F>,
params: &AppxDbscanValidParams<F, N>,
) {
Expand Down
15 changes: 8 additions & 7 deletions algorithms/linfa-clustering/src/appx_dbscan/cells_grid/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@ use std::collections::HashMap;

use crate::appx_dbscan::counting_tree::get_base_cell_index;
use crate::AppxDbscanValidParams;
use disjoint::DisjointSetVec;
use linfa::Float;
use linfa_nn::{distance::L2Dist, NearestNeighbour};
use ndarray::{Array1, Array2, ArrayView1, ArrayView2, Axis};
use partitions::PartitionVec;

use cell::{Cell, StatusPoint};

pub type CellVector<F> = PartitionVec<Cell<F>>;
pub type CellVector<F> = DisjointSetVec<Cell<F>>;

/// A structure that memorizes all non empty cells by their index's hash
pub type CellTable = HashMap<Array1<i64>, usize>;

Expand All @@ -31,7 +32,7 @@ impl<F: Float> CellsGrid<F> {
) -> CellsGrid<F> {
let mut grid = CellsGrid {
table: CellTable::with_capacity(points.dim().0),
cells: PartitionVec::with_capacity(points.dim().0),
cells: CellVector::with_capacity(points.dim().0),
dimensionality: points.ncols(),
labeled: false,
};
Expand Down Expand Up @@ -105,11 +106,11 @@ impl<F: Float> CellsGrid<F> {
&self.labeled
}

pub fn cells(&self) -> &PartitionVec<Cell<F>> {
pub fn cells(&self) -> &CellVector<F> {
&self.cells
}

pub fn cells_mut(&mut self) -> &mut PartitionVec<Cell<F>> {
pub fn cells_mut(&mut self) -> &mut CellVector<F> {
&mut self.cells
}

Expand Down Expand Up @@ -141,12 +142,12 @@ impl<F: Float> CellsGrid<F> {
let neighbours_indexes = self.cells[*cell_i].neighbours_indexes().clone();
for n_index in neighbours_indexes {
let neighbour = self.cells.get(n_index).unwrap();
if !neighbour.is_core() || self.cells.same_set(*cell_i, n_index) {
if !neighbour.is_core() || self.cells.is_joined(*cell_i, n_index) {
continue;
}
for point in curr_cell_points.iter().filter(|p| p.is_core()) {
if neighbour.approximate_range_counting(points.row(point.index()), params) > 0 {
self.cells.union(*cell_i, n_index);
self.cells.join(*cell_i, n_index);
break;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,9 @@ fn label_points_test() {
grid.label_points(points.view(), &params);
assert_eq!(grid.cells().len(), 2);
assert_eq!(grid.cells().iter().filter(|x| x.is_core()).count(), 2);
assert_eq!(grid.cells().all_sets().count(), 1);
for set in grid.cells().all_sets() {
assert_eq!(set.count(), 2);
assert_eq!(grid.cells().indices().sets().len(), 1);
for set in grid.cells().indices().sets() {
assert_eq!(set.len(), 2);
}
let all_points = vec![
2.0 * l,
Expand All @@ -62,7 +62,7 @@ fn label_points_test() {
grid.label_points(points.view(), &params);
assert_eq!(grid.cells().len(), 2);
assert_eq!(grid.cells().iter().filter(|x| x.is_core()).count(), 1);
assert_eq!(grid.cells.all_sets().count(), 2);
assert_eq!(grid.cells.indices().sets().len(), 2);
}

#[test]
Expand Down
12 changes: 8 additions & 4 deletions algorithms/linfa-clustering/src/appx_dbscan/clustering/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,15 @@ impl<F: Float, N: NearestNeighbour> AppxDbscanValidParams<F, N> {
}
let mut labels = Array1::from_elem(observations.dim().0, None);
let mut current_cluster_i: usize = 0;
for set in grid.cells_mut().all_sets_mut() {
for cell_indices_set in grid.cells_mut().indices().sets() {
let mut core_cells_count = 0;
for cell in set.filter(|(_, c)| c.is_core()).map(|(_, c)| c) {
cell.assign_to_cluster(current_cluster_i, &mut labels.view_mut());
core_cells_count += 1;
for cell_index in cell_indices_set {
let cell = &mut grid.cells_mut()[cell_index];

if cell.is_core() {
cell.assign_to_cluster(current_cluster_i, &mut labels.view_mut());
core_cells_count += 1;
}
}
if core_cells_count > 0 {
current_cluster_i += 1;
Expand Down
14 changes: 3 additions & 11 deletions algorithms/linfa-clustering/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,28 +15,20 @@
//! Right now `linfa-clustering` provides the following clustering algorithms:
//! * [K-Means](KMeans)
//! * [DBSCAN](Dbscan)
//! * [Approximated DBSCAN](AppxDbscan) (Currently an alias for DBSCAN, due to its superior
//! performance)
//! * [Approximated DBSCAN](AppxDbscan)
//! * [Gaussian-Mixture-Model](GaussianMixtureModel)
//! * [OPTICS](OpticsAnalysis)
//!
//! Implementation choices, algorithmic details and tutorials can be found in the page dedicated to the specific algorithms.
mod appx_dbscan;
mod dbscan;
mod gaussian_mixture;
#[allow(clippy::new_ret_no_self)]
mod k_means;
mod optics;

pub use appx_dbscan::*;
pub use dbscan::*;
pub use gaussian_mixture::*;
pub use k_means::*;
pub use optics::*;

// Approx DBSCAN is currently an alias for DBSCAN, due to the old Approx DBSCAN implementation's
// lower performance and outdated dependencies

use linfa_nn::distance::L2Dist;
pub type AppxDbscanValidParams<F, N> = DbscanValidParams<F, L2Dist, N>;
pub type AppxDbscanParams<F, N> = DbscanParams<F, L2Dist, N>;
pub type AppxDbscanParamsError = DbscanParamsError;
pub type AppxDbscan = Dbscan;