Skip to content

Commit

Permalink
Merge branch 'master' of github.com:dib-lab/sourmash into refactor/minhash
Browse files Browse the repository at this point in the history
  • Loading branch information
ctb committed Feb 8, 2020
2 parents d599f19 + a1eeab1 commit c412cea
Show file tree
Hide file tree
Showing 10 changed files with 202 additions and 74 deletions.
27 changes: 27 additions & 0 deletions .github/workflows/hypothesis.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Runs the Hypothesis property-based tests on pushes to master and on pull
# requests that target master.
name: Hypothesis tests

on:
  push:
    branches: [master]
  pull_request:
    branches: [master]

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v1

      - name: Set up Python 3.7
        uses: actions/setup-python@v1
        with:
          python-version: "3.7"

      # The "test" extra pulls in pytest and hypothesis (see setup.py).
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install -e .[test]

      # --run-hypothesis restricts the run to hypothesis-marked tests; the
      # "ci" profile is registered in tests/conftest.py.
      - name: Run Hypothesis tests
        run: |
          python -m pytest --run-hypothesis --hypothesis-show-statistics --hypothesis-profile ci
7 changes: 4 additions & 3 deletions doc/developer.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ To install all of the necessary Python dependencies, do:
```
pip install -r requirements.txt
```
Briefly, we use `py.test` for testing, and `coverage` for code
Briefly, we use `py.test` and `cargo test` for testing, and `coverage` for code
coverage analysis.

We suggest working on sourmash in a virtualenv; e.g. from within the
Expand All @@ -28,8 +28,9 @@ python -m virtualenv dev
pip install -e .
```

You can run tests by invoking `make test` or `python -m pytest` in the sourmash
directory.
You can run tests by invoking `make test` in the sourmash directory;
`python -m pytest` will run the Python tests, and `cargo test` will
run the Rust tests.

## Automated tests and code coverage calculation

Expand Down
6 changes: 2 additions & 4 deletions include/sourmash.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,6 @@ void computeparams_set_scaled(ComputeParameters *ptr, uint64_t scaled);

uint64_t hash_murmur(const char *kmer, uint64_t seed);

void kmerminhash_abunds_push(KmerMinHash *ptr, uint64_t val);

void kmerminhash_add_from(KmerMinHash *ptr, const KmerMinHash *other);

void kmerminhash_add_hash(KmerMinHash *ptr, uint64_t h);
Expand Down Expand Up @@ -162,8 +160,6 @@ void kmerminhash_merge(KmerMinHash *ptr, const KmerMinHash *other);

bool kmerminhash_is_compatible(const KmerMinHash *ptr, const KmerMinHash *other);

void kmerminhash_mins_push(KmerMinHash *ptr, uint64_t val);

KmerMinHash *kmerminhash_new(uint32_t n,
uint32_t k,
bool prot,
Expand All @@ -181,6 +177,8 @@ void kmerminhash_remove_many(KmerMinHash *ptr, const uint64_t *hashes_ptr, uintp

uint64_t kmerminhash_seed(KmerMinHash *ptr);

void kmerminhash_set_abundances(KmerMinHash *ptr, const uint64_t *hashes_ptr, const uint64_t *abunds_ptr, uintptr_t insize);

bool kmerminhash_track_abundance(KmerMinHash *ptr);

bool nodegraph_count(Nodegraph *ptr, uint64_t h);
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def build_native(spec):
"zip_safe": False,
"platforms": "any",
"extras_require": {
'test' : ['pytest', 'pytest-cov', 'recommonmark'],
'test' : ['pytest', 'pytest-cov', 'recommonmark', 'hypothesis'],
'demo' : ['jupyter', 'jupyter_client', 'ipython'],
'doc' : ['sphinx', 'recommonmark', 'alabaster',
"sphinxcontrib-napoleon", "nbsphinx"],
Expand Down
16 changes: 7 additions & 9 deletions sourmash/_minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,15 +442,13 @@ def __iadd__(self, other):

def set_abundances(self, values):
if self.track_abundance:
added = 0

for k, v in sorted(values.items()):
if not self.max_hash or k <= self.max_hash:
self._methodcall(lib.kmerminhash_mins_push, k)
self._methodcall(lib.kmerminhash_abunds_push, v)
added += 1
if self.num > 0 and added >= self.num:
break
hashes = []
abunds = []
for h, v in values.items():
hashes.append(h)
abunds.append(v)

self._methodcall(lib.kmerminhash_set_abundances, hashes, abunds, len(hashes))
else:
raise RuntimeError(
"Use track_abundance=True when constructing "
Expand Down
45 changes: 31 additions & 14 deletions src/core/src/ffi/minhash.rs
Original file line number Diff line number Diff line change
Expand Up @@ -252,13 +252,41 @@ pub unsafe extern "C" fn kmerminhash_get_mins_size(ptr: *mut KmerMinHash) -> usi
mh.mins.len()
}

#[no_mangle]
pub unsafe extern "C" fn kmerminhash_mins_push(ptr: *mut KmerMinHash, val: u64) {
ffi_fn! {
unsafe fn kmerminhash_set_abundances(
ptr: *mut KmerMinHash,
hashes_ptr: *const u64,
abunds_ptr: *const u64,
insize: usize,
) -> Result<()> {
let mh = {
assert!(!ptr.is_null());
&mut *ptr
};
mh.mins.push(val)

let hashes = {
assert!(!hashes_ptr.is_null());
slice::from_raw_parts(hashes_ptr as *const u64, insize)
};

let abunds = {
assert!(!abunds_ptr.is_null());
slice::from_raw_parts(abunds_ptr as *const u64, insize)
};

let mut pairs: Vec<_> = hashes.iter().cloned().zip(abunds.iter().cloned()).collect();
pairs.sort();

// Reset the minhash
mh.mins.clear();
if let Some(ref mut abunds) = mh.abunds {
abunds.clear();
}

mh.add_many_with_abund(&pairs)?;

Ok(())
}
}

ffi_fn! {
Expand Down Expand Up @@ -288,17 +316,6 @@ pub unsafe extern "C" fn kmerminhash_get_abunds_size(ptr: *mut KmerMinHash) -> u
}
}

#[no_mangle]
pub unsafe extern "C" fn kmerminhash_abunds_push(ptr: *mut KmerMinHash, val: u64) {
let mh = {
assert!(!ptr.is_null());
&mut *ptr
};
if let Some(ref mut abunds) = mh.abunds {
abunds.push(val)
}
}

#[no_mangle]
pub unsafe extern "C" fn kmerminhash_is_protein(ptr: *mut KmerMinHash) -> bool {
let mh = {
Expand Down
100 changes: 58 additions & 42 deletions src/core/src/sketch/minhash.rs
Original file line number Diff line number Diff line change
Expand Up @@ -256,56 +256,74 @@ impl KmerMinHash {
}

pub fn add_hash(&mut self, hash: u64) {
self.add_hash_with_abundance(hash, 1);
}

pub fn add_hash_with_abundance(&mut self, hash: u64, abundance: u64) {
let current_max = match self.mins.last() {
Some(&x) => x,
None => u64::max_value(),
};

if hash <= self.max_hash || self.max_hash == 0 {
// empty? add it, if within range / no range specified.
if self.mins.is_empty() {
if hash > self.max_hash && self.max_hash != 0 {
// This is a scaled minhash, and we don't need to add the new hash
return;
}

if self.num == 0 && self.max_hash == 0 {
// why did you create this minhash? it will always be empty...
return;
}

if abundance == 0 {
// well, don't add it.
return;
}

// From this point on, hash is within scaled (or no scaled specified).

// empty mins? add it.
if self.mins.is_empty() {
self.mins.push(hash);
if let Some(ref mut abunds) = self.abunds {
abunds.push(abundance);
}
return;
}

if hash <= self.max_hash || hash <= current_max || (self.mins.len() as u32) < self.num {
// "good" hash - within range, smaller than current entry, or
// still have space available
let pos = match self.mins.binary_search(&hash) {
Ok(p) => p,
Err(p) => p,
};

if pos == self.mins.len() {
// at end - must still be growing, we know the list won't
// get too long
self.mins.push(hash);
if let Some(ref mut abunds) = self.abunds {
abunds.push(1);
abunds.push(abundance);
}
} else if self.mins[pos] != hash {
// didn't find hash in mins, so inserting somewhere
// in the middle; shrink list if needed.
self.mins.insert(pos, hash);
if let Some(ref mut abunds) = self.abunds {
abunds.insert(pos, abundance);
}
return;
} else if hash <= self.max_hash
|| current_max > hash
|| (self.mins.len() as u32) < self.num
{
// "good" hash - within range, smaller than current entry, or
// still have space available
let pos = match self.mins.binary_search(&hash) {
Ok(p) => p,
Err(p) => p,
};

if pos == self.mins.len() {
// at end - must still be growing, we know the list won't
// get too long
self.mins.push(hash);
if let Some(ref mut abunds) = self.abunds {
abunds.push(1);
}
} else if self.mins[pos] != hash {
// didn't find hash in mins, so inserting somewhere
// in the middle; shrink list if needed.
self.mins.insert(pos, hash);
if let Some(ref mut abunds) = self.abunds {
abunds.insert(pos, 1);
}

// is it too big now?
if self.num != 0 && self.mins.len() > (self.num as usize) {
self.mins.pop();
if let Some(ref mut abunds) = self.abunds {
abunds.pop();
}
// is it too big now?
if self.num != 0 && self.mins.len() > (self.num as usize) {
self.mins.pop();
if let Some(ref mut abunds) = self.abunds {
abunds.pop();
}
} else if let Some(ref mut abunds) = self.abunds {
// pos == hash: hash value already in mins, inc count
abunds[pos] += 1;
}
} else if let Some(ref mut abunds) = self.abunds {
// mins[pos] == hash: hash value already in mins, inc count by abundance
abunds[pos] += abundance;
}
}
}
Expand Down Expand Up @@ -451,9 +469,7 @@ impl KmerMinHash {

pub fn add_many_with_abund(&mut self, hashes: &[(u64, u64)]) -> Result<(), Error> {
for item in hashes {
for _i in 0..item.1 {
self.add_hash(item.0);
}
self.add_hash_with_abundance(item.0, item.1);
}
Ok(())
}
Expand Down
18 changes: 17 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import os

from hypothesis import settings, Verbosity
import pytest


Expand Down Expand Up @@ -36,10 +39,23 @@ def pytest_collection_modifyitems(items, config):
deselected_items.append(item)
config.hook.pytest_deselected(items=deselected_items)
items[:] = selected_items
# --- END - Only run tests using a particular fixture --- #

def pytest_addoption(parser):
    """Register the custom command-line options used by this test suite.

    --usesfixture NAME   stores a fixture name (default None); help text says
                         it restricts the run to tests using that fixture.
    --run-hypothesis     boolean flag; when set, only hypothesis tests run.
    """
    parser.addoption("--usesfixture",
                     action="store",
                     default=None,
                     help="just run tests that use a particular fixture")
    # --- END - Only run tests using a particular fixture --- #

    parser.addoption("--run-hypothesis", action="store_true",
                     help="run hypothesis tests")

def pytest_runtest_setup(item):
    """Skip non-hypothesis tests when --run-hypothesis is given.

    Without the flag this hook does nothing. With the flag, any test item
    that carries no marker named "hypothesis" is skipped before it runs.
    """
    if item.config.getoption("--run-hypothesis"):
        if not any(mark for mark in item.iter_markers(name="hypothesis")):
            pytest.skip("--run-hypothesis option set, running only hypothesis tests")

# Hypothesis profiles: "ci" runs many examples per test, while "dev" and
# "debug" keep runs short; "debug" additionally enables verbose output.
settings.register_profile("ci", max_examples=1000)
settings.register_profile("dev", max_examples=10)
settings.register_profile("debug", max_examples=10, verbosity=Verbosity.verbose)
# Load the profile named by the HYPOTHESIS_PROFILE environment variable,
# falling back to the profile named 'default' when the variable is unset.
settings.load_profile(os.getenv(u'HYPOTHESIS_PROFILE', 'default'))
8 changes: 8 additions & 0 deletions tests/test__minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -1138,6 +1138,14 @@ def test_reviving_minhash():
mh.add_hash(m)


def test_set_abundance_num():
    """set_abundances on a num=2 sketch keeps both hashes and their abundances."""
    a = MinHash(2, 10, track_abundance=True)

    a.set_abundances({1: 3, 2: 4})

    # Both hashes fit within num=2, so each must come back with the exact
    # abundance that was supplied.
    assert a.get_mins(with_abundance=True) == {1: 3, 2: 4}


def test_mh_copy_and_clear(track_abundance):
# test basic creation of new, empty MinHash
a = MinHash(20, 10, track_abundance=track_abundance)
Expand Down
47 changes: 47 additions & 0 deletions tests/test__minhash_hypothesis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import pytest

from hypothesis import given, example
import hypothesis.strategies as st

from sourmash import MinHash
from sourmash._minhash import get_max_hash_for_scaled


@given(st.lists(st.integers(min_value=0, max_value=2**64 - 1), min_size=10, max_size=1000),
       st.lists(st.integers(min_value=0, max_value=2**64 - 1), min_size=10, max_size=1000),
       st.integers(min_value=10, max_value=1000))
@example([1, 2], [3, 4], 2)
def test_set_abundance_num_hypothesis(hashes, abundances, sketch_size):
    """Property test: set_abundances on a num-bounded, abundance-tracking MinHash.

    For arbitrary 64-bit hashes and abundances, the sketch must hold
    min(number of hashes with non-zero abundance, sketch_size) entries, and
    every retained hash must report the abundance it was given.
    """
    a = MinHash(sketch_size, 10, track_abundance=True)
    # zip() stops at the shorter list and dict() collapses repeated hashes
    # (the last abundance wins), so the oracle is the de-duplicated input.
    oracle = dict(zip(hashes, abundances))

    a.set_abundances(oracle)

    mins = a.get_mins(with_abundance=True)
    # Hashes given an abundance of zero are expected to be left out.
    size = min(sum(1 for v in oracle.values() if v > 0), sketch_size)
    assert len(mins) == size

    for k, v in mins.items():
        assert oracle[k] == v


@given(st.lists(st.integers(min_value=0, max_value=2**64 - 1), min_size=10, max_size=1000),
       st.lists(st.integers(min_value=0, max_value=2**64 - 1), min_size=10, max_size=1000),
       st.integers(min_value=1000, max_value=10000))
@example([0], [0], 1000)
def test_set_abundance_scaled_hypothesis(hashes, abundances, scaled):
    """Property test: set_abundances on a scaled, abundance-tracking MinHash.

    For arbitrary 64-bit hashes and abundances, the sketch must hold exactly
    the hashes that are at or below the max hash for `scaled` and have a
    non-zero abundance, each with the abundance it was given.
    """
    a = MinHash(0, 10, track_abundance=True, scaled=scaled)
    # zip() stops at the shorter list and dict() collapses repeated hashes
    # (the last abundance wins), so the oracle is the de-duplicated input.
    oracle = dict(zip(hashes, abundances))

    a.set_abundances(oracle)

    max_hash = get_max_hash_for_scaled(scaled)
    # Only in-range hashes with a non-zero abundance are expected to be kept.
    below_max_hash = sum(1 for (k, v) in oracle.items() if k <= max_hash and v > 0)

    mins = a.get_mins(with_abundance=True)
    assert len(mins) == below_max_hash

    for k, v in mins.items():
        assert oracle[k] == v
        assert k <= max_hash
        assert v > 0

0 comments on commit c412cea

Please sign in to comment.