Merge branch 'master' of github.com:dib-lab/sourmash into refactor/minhash
sourmash-bio · Feb 8, 2020 · c412cea · c412cea
2 parents d599f19 + a1eeab1
commit c412cea
Show file tree

Hide file tree

Showing 10 changed files with 202 additions and 74 deletions.
diff --git a/.github/workflows/hypothesis.yml b/.github/workflows/hypothesis.yml
@@ -0,0 +1,27 @@
+name: Hypothesis tests
+
+on:
+  push:
+    branches: [master]
+  pull_request:
+    branches: [master]
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v1
+
+      - name: Set up Python 3.7
+        uses: actions/setup-python@v1
+        with:
+          python-version: "3.7"
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install -e .[test]
+
+      - name: Run Hypothesis tests
+        run: |
+          python -m pytest --run-hypothesis --hypothesis-show-statistics --hypothesis-profile ci
diff --git a/doc/developer.md b/doc/developer.md
@@ -17,7 +17,7 @@ To install all of the necessary Python dependencies, do:
 ```
 pip install -r requirements.txt
 ```
-Briefly, we use `py.test` for testing, and `coverage` for code
+Briefly, we use `py.test` and `cargo test` for testing, and `coverage` for code
 coverage analysis.
 
 We suggest working on sourmash in a virtualenv; e.g. from within the
@@ -28,8 +28,9 @@ python -m virtualenv dev
 pip install -e .
 ```
 
-You can run tests by invoking `make test` or `python -m pytest` in the sourmash
-directory.
+You can run tests by invoking `make test` in the sourmash directory;
+`python -m pytest` will run the Python tests, and `cargo test` will
+run the Rust tests.
 
 ## Automated tests and code coverage calculation
 

diff --git a/include/sourmash.h b/include/sourmash.h
@@ -98,8 +98,6 @@ void computeparams_set_scaled(ComputeParameters *ptr, uint64_t scaled);
 
 uint64_t hash_murmur(const char *kmer, uint64_t seed);
 
-void kmerminhash_abunds_push(KmerMinHash *ptr, uint64_t val);
-
 void kmerminhash_add_from(KmerMinHash *ptr, const KmerMinHash *other);
 
 void kmerminhash_add_hash(KmerMinHash *ptr, uint64_t h);
@@ -162,8 +160,6 @@ void kmerminhash_merge(KmerMinHash *ptr, const KmerMinHash *other);
 
 bool kmerminhash_is_compatible(const KmerMinHash *ptr, const KmerMinHash *other);
 
-void kmerminhash_mins_push(KmerMinHash *ptr, uint64_t val);
-
 KmerMinHash *kmerminhash_new(uint32_t n,
                              uint32_t k,
                              bool prot,
@@ -181,6 +177,8 @@ void kmerminhash_remove_many(KmerMinHash *ptr, const uint64_t *hashes_ptr, uintp
 
 uint64_t kmerminhash_seed(KmerMinHash *ptr);
 
+void kmerminhash_set_abundances(KmerMinHash *ptr, const uint64_t *hashes_ptr, const uint64_t *abunds_ptr, uintptr_t insize);
+
 bool kmerminhash_track_abundance(KmerMinHash *ptr);
 
 bool nodegraph_count(Nodegraph *ptr, uint64_t h);

diff --git a/setup.py b/setup.py
@@ -77,7 +77,7 @@ def build_native(spec):
     "zip_safe": False,
     "platforms": "any",
     "extras_require": {
-        'test' : ['pytest', 'pytest-cov', 'recommonmark'],
+        'test' : ['pytest', 'pytest-cov', 'recommonmark', 'hypothesis'],
         'demo' : ['jupyter', 'jupyter_client', 'ipython'],
         'doc' : ['sphinx', 'recommonmark', 'alabaster',
                  "sphinxcontrib-napoleon", "nbsphinx"],

diff --git a/sourmash/_minhash.py b/sourmash/_minhash.py
@@ -442,15 +442,13 @@ def __iadd__(self, other):
 
     def set_abundances(self, values):
         if self.track_abundance:
-            added = 0
-
-            for k, v in sorted(values.items()):
-                if not self.max_hash or k <= self.max_hash:
-                    self._methodcall(lib.kmerminhash_mins_push, k)
-                    self._methodcall(lib.kmerminhash_abunds_push, v)
-                    added += 1
-                    if self.num > 0 and added >= self.num:
-                        break
+            hashes = []
+            abunds = []
+            for h, v in values.items():
+                hashes.append(h)
+                abunds.append(v)
+
+            self._methodcall(lib.kmerminhash_set_abundances, hashes, abunds, len(hashes))
         else:
             raise RuntimeError(
                 "Use track_abundance=True when constructing "

diff --git a/src/core/src/ffi/minhash.rs b/src/core/src/ffi/minhash.rs
@@ -252,13 +252,41 @@ pub unsafe extern "C" fn kmerminhash_get_mins_size(ptr: *mut KmerMinHash) -> usi
     mh.mins.len()
 }
 
-#[no_mangle]
-pub unsafe extern "C" fn kmerminhash_mins_push(ptr: *mut KmerMinHash, val: u64) {
+ffi_fn! {
+unsafe fn kmerminhash_set_abundances(
+    ptr: *mut KmerMinHash,
+    hashes_ptr: *const u64,
+    abunds_ptr: *const u64,
+    insize: usize,
+) -> Result<()> {
     let mh = {
         assert!(!ptr.is_null());
         &mut *ptr
     };
-    mh.mins.push(val)
+
+    let hashes = {
+        assert!(!hashes_ptr.is_null());
+        slice::from_raw_parts(hashes_ptr as *const u64, insize)
+    };
+
+    let abunds = {
+        assert!(!abunds_ptr.is_null());
+        slice::from_raw_parts(abunds_ptr as *const u64, insize)
+    };
+
+    let mut pairs: Vec<_> = hashes.iter().cloned().zip(abunds.iter().cloned()).collect();
+    pairs.sort();
+
+    // Reset the minhash
+    mh.mins.clear();
+    if let Some(ref mut abunds) = mh.abunds {
+        abunds.clear();
+    }
+
+    mh.add_many_with_abund(&pairs)?;
+
+    Ok(())
+}
 }
 
 ffi_fn! {
@@ -288,17 +316,6 @@ pub unsafe extern "C" fn kmerminhash_get_abunds_size(ptr: *mut KmerMinHash) -> u
     }
 }
 
-#[no_mangle]
-pub unsafe extern "C" fn kmerminhash_abunds_push(ptr: *mut KmerMinHash, val: u64) {
-    let mh = {
-        assert!(!ptr.is_null());
-        &mut *ptr
-    };
-    if let Some(ref mut abunds) = mh.abunds {
-        abunds.push(val)
-    }
-}
-
 #[no_mangle]
 pub unsafe extern "C" fn kmerminhash_is_protein(ptr: *mut KmerMinHash) -> bool {
     let mh = {

diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs
@@ -256,56 +256,74 @@ impl KmerMinHash {
     }
 
     pub fn add_hash(&mut self, hash: u64) {
+        self.add_hash_with_abundance(hash, 1);
+    }
+
+    pub fn add_hash_with_abundance(&mut self, hash: u64, abundance: u64) {
         let current_max = match self.mins.last() {
             Some(&x) => x,
             None => u64::max_value(),
         };
 
-        if hash <= self.max_hash || self.max_hash == 0 {
-            // empty? add it, if within range / no range specified.
-            if self.mins.is_empty() {
+        if hash > self.max_hash && self.max_hash != 0 {
+            // This is a scaled minhash, and we don't need to add the new hash
+            return;
+        }
+
+        if self.num == 0 && self.max_hash == 0 {
+            // why did you create this minhash? it will always be empty...
+            return;
+        }
+
+        if abundance == 0 {
+            // well, don't add it.
+            return;
+        }
+
+        // From this point on, hash is within scaled (or no scaled specified).
+
+        // empty mins? add it.
+        if self.mins.is_empty() {
+            self.mins.push(hash);
+            if let Some(ref mut abunds) = self.abunds {
+                abunds.push(abundance);
+            }
+            return;
+        }
+
+        if hash <= self.max_hash || hash <= current_max || (self.mins.len() as u32) < self.num {
+            // "good" hash - within range, smaller than current entry, or
+            // still have space available
+            let pos = match self.mins.binary_search(&hash) {
+                Ok(p) => p,
+                Err(p) => p,
+            };
+
+            if pos == self.mins.len() {
+                // at end - must still be growing, we know the list won't
+                // get too long
                 self.mins.push(hash);
                 if let Some(ref mut abunds) = self.abunds {
-                    abunds.push(1);
+                    abunds.push(abundance);
+                }
+            } else if self.mins[pos] != hash {
+                // didn't find hash in mins, so inserting somewhere
+                // in the middle; shrink list if needed.
+                self.mins.insert(pos, hash);
+                if let Some(ref mut abunds) = self.abunds {
+                    abunds.insert(pos, abundance);
                 }
-                return;
-            } else if hash <= self.max_hash
-                || current_max > hash
-                || (self.mins.len() as u32) < self.num
-            {
-                // "good" hash - within range, smaller than current entry, or
-                // still have space available
-                let pos = match self.mins.binary_search(&hash) {
-                    Ok(p) => p,
-                    Err(p) => p,
-                };
-
-                if pos == self.mins.len() {
-                    // at end - must still be growing, we know the list won't
-                    // get too long
-                    self.mins.push(hash);
-                    if let Some(ref mut abunds) = self.abunds {
-                        abunds.push(1);
-                    }
-                } else if self.mins[pos] != hash {
-                    // didn't find hash in mins, so inserting somewhere
-                    // in the middle; shrink list if needed.
-                    self.mins.insert(pos, hash);
-                    if let Some(ref mut abunds) = self.abunds {
-                        abunds.insert(pos, 1);
-                    }
 
-                    // is it too big now?
-                    if self.num != 0 && self.mins.len() > (self.num as usize) {
-                        self.mins.pop();
-                        if let Some(ref mut abunds) = self.abunds {
-                            abunds.pop();
-                        }
+                // is it too big now?
+                if self.num != 0 && self.mins.len() > (self.num as usize) {
+                    self.mins.pop();
+                    if let Some(ref mut abunds) = self.abunds {
+                        abunds.pop();
                     }
-                } else if let Some(ref mut abunds) = self.abunds {
-                    // pos == hash: hash value already in mins, inc count
-                    abunds[pos] += 1;
                 }
+            } else if let Some(ref mut abunds) = self.abunds {
+                // mins[pos] == hash: hash value already in mins, inc count by abundance
+                abunds[pos] += abundance;
             }
         }
     }
@@ -451,9 +469,7 @@ impl KmerMinHash {
 
     pub fn add_many_with_abund(&mut self, hashes: &[(u64, u64)]) -> Result<(), Error> {
         for item in hashes {
-            for _i in 0..item.1 {
-                self.add_hash(item.0);
-            }
+            self.add_hash_with_abundance(item.0, item.1);
         }
         Ok(())
     }

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -1,3 +1,6 @@
+import os
+
+from hypothesis import settings, Verbosity
 import pytest
 
 
@@ -36,10 +39,23 @@ def pytest_collection_modifyitems(items, config):
                 deselected_items.append(item)
         config.hook.pytest_deselected(items=deselected_items)
         items[:] = selected_items
+# --- END - Only run tests using a particular fixture --- #
 
 def pytest_addoption(parser):
     parser.addoption("--usesfixture",
                      action="store",
                      default=None,
                      help="just run tests that use a particular fixture")
-# --- END - Only run tests using a particular fixture --- #
+
+    parser.addoption("--run-hypothesis", action="store_true",
+                     help="run hypothesis tests")
+
+def pytest_runtest_setup(item):
+    if item.config.getoption("--run-hypothesis"):
+        if not any(mark for mark in item.iter_markers(name="hypothesis")):
+            pytest.skip("--run-hypothesis option set, running only hypothesis tests")
+
+settings.register_profile("ci", max_examples=1000)
+settings.register_profile("dev", max_examples=10)
+settings.register_profile("debug", max_examples=10, verbosity=Verbosity.verbose)
+settings.load_profile(os.getenv(u'HYPOTHESIS_PROFILE', 'default'))
diff --git a/tests/test__minhash.py b/tests/test__minhash.py
@@ -1138,6 +1138,14 @@ def test_reviving_minhash():
         mh.add_hash(m)
 
 
+def test_set_abundance_num():
+    a = MinHash(2, 10, track_abundance=True)
+
+    a.set_abundances({1: 3, 2: 4})
+
+    assert a.get_mins(with_abundance=True) == {1: 3, 2: 4}
+
+
 def test_mh_copy_and_clear(track_abundance):
     # test basic creation of new, empty MinHash
     a = MinHash(20, 10, track_abundance=track_abundance)

diff --git a/tests/test__minhash_hypothesis.py b/tests/test__minhash_hypothesis.py
@@ -0,0 +1,47 @@
+import pytest
+
+from hypothesis import given, example
+import hypothesis.strategies as st
+
+from sourmash import MinHash
+from sourmash._minhash import get_max_hash_for_scaled
+
+
+@given(st.lists(st.integers(min_value=0, max_value=2**64 - 1), min_size=10, max_size=1000),
+       st.lists(st.integers(min_value=0, max_value=2**64 - 1), min_size=10, max_size=1000),
+       st.integers(min_value=10, max_value=1000))
+@example([1, 2], [3, 4], 2)
+def test_set_abundance_num_hypothesis(hashes, abundances, sketch_size):
+    a = MinHash(sketch_size, 10, track_abundance=True)
+    oracle = dict(zip(hashes, abundances))
+
+    a.set_abundances(oracle)
+
+    mins = a.get_mins(with_abundance=True)
+    size = min(sum(1 for v in oracle.values() if v > 0), sketch_size)
+    assert len(mins) == size
+
+    for k, v in mins.items():
+        assert oracle[k] == v
+
+
+@given(st.lists(st.integers(min_value=0, max_value=2**64 - 1), min_size=10, max_size=1000),
+       st.lists(st.integers(min_value=0, max_value=2**64 - 1), min_size=10, max_size=1000),
+       st.integers(min_value=1000, max_value=10000))
+@example([0], [0], 1000)
+def test_set_abundance_scaled_hypothesis(hashes, abundances, scaled):
+    a = MinHash(0, 10, track_abundance=True, scaled=scaled)
+    oracle = dict(zip(hashes, abundances))
+
+    a.set_abundances(oracle)
+
+    max_hash = get_max_hash_for_scaled(scaled)
+    below_max_hash = sum(1 for (k, v) in oracle.items() if k <= max_hash and v > 0)
+
+    mins = a.get_mins(with_abundance=True)
+    assert len(mins) == below_max_hash
+
+    for k, v in mins.items():
+        assert oracle[k] == v
+        assert k <= max_hash
+        assert v > 0