In [1]:
import shutil, os
from io import StringIO

import sourmash
from sourmash.sbt_storage import FSStorage, ZipStorage
from sourmash.manifest import CollectionManifest
from sourmash.signature import load_signatures
from sourmash.sourmash_args import SaveSignaturesToLocation

In [2]:
# load some demo signatures (k=31) from .sig files in the working directory
sig1, sig2, sig3 = (
    sourmash.load_one_signature(sig_path, ksize=31)
    for sig_path in (
        'akkermansia.fa.sig',
        'shew_os185.fa.sig',
        'shew_os223.fa.sig',
    )
)

In [3]:
# create a zipfile with the signatures

# remove any archive left behind by an earlier run; a missing file is fine
try:
    os.remove('/tmp/zip-store2.zip')
except FileNotFoundError:
    pass

with SaveSignaturesToLocation('/tmp/zip-store2.zip') as savesig:
    for demo_sig in (sig1, sig2, sig3):
        savesig.add(demo_sig)

    # report while the saver is still open
    print(f"saved {len(savesig)} signatures to {savesig.location}")

saved 3 signatures to /tmp/zip-store2.zip


In [4]:
#
# define two utility functions. the second one, search_list_of_sigs,
# is what we want to do in parallel and on demand; the first one, load_sig,
# is the new functionality that we need in rust - that is, support for loading
# from a Storage.
#

def load_sig(storage, iloc, md5sum):
    """Yield the signature(s) stored at `iloc` whose md5sum equals `md5sum`.

    storage -- object exposing `load(iloc)`; its return value is handed
               straight to `load_signatures`.
    iloc    -- location inside `storage` to read from.
    md5sum  -- md5 string that a signature's `md5sum()` must equal.

    This is a generator: nothing is read from `storage` until the first item
    is requested. A given md5sum _should_ match exactly one signature, but
    that is not 100% guaranteed, so every match is yielded.
    """
    # the storage.load() call is the only step not available in Rust
    raw_data = storage.load(iloc)
    yield from (
        candidate
        for candidate in load_signatures(raw_data)
        if candidate.md5sum() == md5sum
    )

def search_list_of_sigs(query, storage, list_of_sigs, threshold):
    """Yield the (iloc, md5sum) pairs whose signature is similar enough to `query`.

    query        -- signature to compare against, via `query.similarity(...)`.
    storage      -- storage object passed through to `load_sig`.
    list_of_sigs -- iterable of (iloc, md5sum) pairs naming the signatures
                    to load and test.
    threshold    -- a pair is yielded when similarity >= threshold.

    A pair is yielded once per matching signature loaded for it. In theory
    this could be done in parallel, but in Python there's not much point.
    """
    for location, digest in list_of_sigs:
        loaded = load_sig(storage, location, digest)
        passing = (
            candidate
            for candidate in loaded
            if query.similarity(candidate) >= threshold
        )
        for _match in passing:
            yield location, digest
                

In [5]:
# Build the list of signatures to search; each entry is an (iloc, md5sum) tuple.
# There are several ways to produce such a list, picklists among them.
# Here the manifest is used to take everything in the storage, but a subset
# would work just as well.

# open the zip archive as a storage and read its manifest
storage = ZipStorage('/tmp/zip-store2.zip')
manifest = CollectionManifest.load_from_storage(storage)

# one (internal_location, md5) tuple per manifest row
list_of_sigs = [
    (row['internal_location'], row['md5'])
    for row in manifest._select()
]

list_of_sigs

[('signatures/6822e0b7f2b21030699fbb98c698e71c.sig.gz',
  '6822e0b7f2b21030699fbb98c698e71c'),
 ('signatures/b47b13ef3781433fc3531fd502f723a4.sig.gz',
  'b47b13ef3781433fc3531fd502f723a4'),
 ('signatures/ae6659f6804482c9d5e739e554a48563.sig.gz',
  'ae6659f6804482c9d5e739e554a48563')]

In [6]:
# now, search this list of signatures, using sig2 as the query and keeping
# anything whose similarity is at least 0.01.
res = search_list_of_sigs(
    query=sig2,
    storage=storage,
    list_of_sigs=list_of_sigs,
    threshold=0.01,
)

# res is a generator; materialize it so the matches are displayed
list(res)

[('signatures/b47b13ef3781433fc3531fd502f723a4.sig.gz',
  'b47b13ef3781433fc3531fd502f723a4'),
 ('signatures/ae6659f6804482c9d5e739e554a48563.sig.gz',
  'ae6659f6804482c9d5e739e554a48563')]