-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
MinHashDeduper structure that computes disjoint-set
I had a really hard time making the generic stuff in IdContainer work along with the union-find, so I ended up just making the vec public and reading it directly
- Loading branch information
Showing
5 changed files
with
325 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -56,3 +56,4 @@ pub mod minhash; | |
pub mod simhash; | ||
pub mod text; | ||
pub mod clustering; | ||
pub mod unionfind; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
use std::vec::Vec; | ||
|
||
/// Disjoint-set (union-find) forest over the indices `0..length`.
///
/// Used to group connected duplicates: every index starts in its own set and
/// [`UnionFind::union`] merges the sets of two indices.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct UnionFind {
    /// `parents[i]` is the parent of node `i`; a node that is its own parent
    /// is the root (representative) of its set.
    pub parents: Vec<usize>,
    /// Number of nodes the structure was created with.
    pub length: usize,
}

impl UnionFind {
    /// Creates a forest of `length` singleton sets, where every node is its
    /// own parent.
    pub fn new(length: usize) -> Self {
        Self {
            parents: (0..length).collect(),
            length,
        }
    }

    /// Returns the root (representative) of the set containing `x`.
    ///
    /// After the root is located, every node visited on the way up is
    /// re-parented directly onto it (full path compression), so later finds
    /// on any of those nodes need a single jump. Takes `&mut self` because
    /// compression rewrites `parents`.
    ///
    /// # Panics
    ///
    /// Panics if `x` is not a valid index into `parents`.
    pub fn find(&mut self, x: usize) -> usize {
        // First pass: climb to the root.
        let mut root = x;
        while self.parents[root] != root {
            root = self.parents[root];
        }

        // Second pass: point every node on the walked path at the root.
        let mut node = x;
        while node != root {
            node = std::mem::replace(&mut self.parents[node], root);
        }

        root
    }

    /// Merges the set containing `y` into the set containing `x`; the root of
    /// `x`'s set becomes the root of the merged set.
    ///
    /// Does nothing when `x == y` or when both already share a root.
    ///
    /// # Panics
    ///
    /// Panics if `x != y` and either is not a valid index into `parents`.
    pub fn union(&mut self, x: usize, y: usize) {
        if x == y {
            return;
        }
        let root_x = self.find(x);
        let root_y = self.find(y);
        if root_x != root_y {
            self.parents[root_y] = root_x;
        }
    }
}
|
||
|
||
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the six-node forest both tests start from: node 2 is merged
    /// under 3, then that set is merged under 4.
    fn merged_forest() -> UnionFind {
        let mut forest = UnionFind::new(6);
        for (a, b) in [(3, 2), (4, 2)] {
            forest.union(a, b);
        }
        forest
    }

    // After the two unions, 2 hangs under 3 and 3 hangs under 4; every other
    // node is still its own root.
    #[test]
    fn union_find() {
        let forest = merged_forest();

        assert_eq!(forest.parents, [0, 1, 3, 4, 4, 5]);
    }

    // Looking up node 2 returns the root 4 and re-parents 2 directly onto it.
    #[test]
    fn union_find_path_compression() {
        let mut forest = merged_forest();

        let root = forest.find(2);

        assert_eq!(root, 4);
        assert_eq!(forest.parents, [0, 1, 4, 4, 4, 5]);
    }
}