In [1]:
import datasketch as ds

In [2]:
class DictListStorage(ds.storage.OrderedStorage):
    '''This is a wrapper class around ``defaultdict(list)`` enabling
    it to support an API consistent with `Storage`.

    Every method prints a short trace line before doing its work, so the
    calls made against this storage can be followed in the cell output.
    '''
    def __init__(self, config):
        '''Create the empty backing ``defaultdict(list)``.

        `config` is accepted to match the storage constructor signature
        but is not read here.
        '''
        self._dict = ds.storage.defaultdict(list)
        print("DLS - Initializing DLS")

    def keys(self):
        '''Return the keys view of the backing dict.'''
        print("DLS - Keys")
        return self._dict.keys()

    def get(self, key):
        '''Return the list stored under `key`, or a new empty list.

        ``dict.get`` is used, so looking up a missing key does not create
        an entry in the backing dict.
        '''
        print("DLS - Get")
        return self._dict.get(key, [])

    def remove(self, *keys):
        '''Delete every key in `keys`; a missing key raises ``KeyError``.'''
        print("DLS - Remove")
        for key in keys:
            del self._dict[key]

    def remove_val(self, key, val):
        '''Remove the first occurrence of `val` from the list under `key`.

        Indexing the ``defaultdict`` creates an empty list for a missing
        key before ``list.remove`` raises ``ValueError``.
        '''
        print("DLS - Remove_val")
        self._dict[key].remove(val)

    def insert(self, key, *vals, **kwargs):
        '''Append all of `vals` to the list under `key`; `kwargs` are ignored.'''
        print("DLS - Insert")
        self._dict[key].extend(vals)

    def size(self):
        '''Return the number of keys in the container.'''
        print("DLS - Size")
        return len(self._dict)

    def itemcounts(self, **kwargs):
        '''Returns a dict where the keys are the keys of the container.
        The values are the *lengths* of the value sequences stored
        in this container.
        '''
        # The docstring must be the first statement of the body; placed
        # after the print it was only a discarded string expression.
        print("Itemcounts")
        return {k: len(v) for k, v in self._dict.items()}

    def has_key(self, key):
        '''Return True when `key` is present in the container.'''
        print("DLS - Has_key")
        return key in self._dict

In [3]:
class DictSetStorage(ds.storage.UnorderedStorage, DictListStorage):
    '''Set-backed variant of `DictListStorage`: wraps ``defaultdict(set)``
    behind the `Storage` API and prints a trace line from each method it
    overrides.
    '''
    def __init__(self, config):
        # `config` is accepted but not read; the parent initialiser is
        # not invoked, so only the "Init DSS" trace line is printed.
        print("Init DSS")
        backing = ds.storage.defaultdict(set)
        self._dict = backing

    def get(self, key):
        '''Return the set stored under `key`, or a new empty set.'''
        print("DSS - Get")
        fallback = set()
        return self._dict.get(key, fallback)

    def insert(self, key, *vals, **kwargs):
        '''Add all of `vals` to the set under `key`; `kwargs` are ignored.'''
        print("DSS - Insert")
        bucket = self._dict[key]
        bucket.update(vals)

In [4]:
# Replace the storage classes exposed by `ds.storage` with the tracing
# versions defined in this notebook.
setattr(ds.storage, "DictListStorage", DictListStorage)
setattr(ds.storage, "DictSetStorage", DictSetStorage)

In [5]:
# Signature for the first sample set: every element is UTF-8 encoded and
# fed to the MinHash individually.
mh1 = ds.MinHash()
s1 = {'abc', 'dab', 'auwdhiuawd', 'w'}
for d in s1:
    mh1.update(d.encode('utf8'))

In [6]:
# Signature for the second sample set, which holds the same four strings
# as `s1`.
mh2 = ds.MinHash()
s2 = {'abc', 'dab', 'auwdhiuawd', 'w'}
for d in s2:
    mh2.update(d.encode('utf8'))

In [7]:
# Compare the two signatures; `s1` and `s2` were built from identical elements.
mh1.jaccard(mh2)

1.0

In [8]:
# Two short strings, each fed to its own MinHash one character at a time.
# NOTE: this rebinds `mh1` and `mh2` from the earlier cells.
a = 'hello world'
b = 'hewlokorlq'

mh1 = ds.MinHash()
for char in a:
    mh1.update(char.encode('utf8'))

mh2 = ds.MinHash()
for char in b:
    mh2.update(char.encode('utf8'))

# Compare the two character-level signatures.
mh1.jaccard(mh2)

0.5625

In [9]:
# Build an LSH index with threshold=0.5, then register `mh1` and `mh2`
# under string keys.
lsh = ds.MinHashLSH(threshold=0.5)
lsh.insert("1 aysavannna, iaodkajwd", mh1)
lsh.insert("2", mh2)

Init DSS
Init DSS
Init DSS
Init DSS
Init DSS
Init DSS
Init DSS
Init DSS
Init DSS
Init DSS
Init DSS
Init DSS
Init DSS
Init DSS
Init DSS
Init DSS
Init DSS
Init DSS
Init DSS
Init DSS
Init DSS
Init DSS
Init DSS
Init DSS
Init DSS
DLS - Initializing DLS
DLS - Has_key
DLS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DLS - Has_key
DLS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert
DSS - Insert


In [10]:
# Build a third character-level signature and look it up in the index.
c = 'hello woold'
mh3 = ds.MinHash()
for char in c:
    mh3.update(char.encode('utf8'))

lsh.query(mh3)

DSS - Get
DSS - Get
DSS - Get
DSS - Get
DSS - Get
DSS - Get
DSS - Get
DSS - Get
DSS - Get
DSS - Get
DSS - Get
DSS - Get
DSS - Get
DSS - Get
DSS - Get
DSS - Get
DSS - Get
DSS - Get
DSS - Get
DSS - Get
DSS - Get
DSS - Get
DSS - Get
DSS - Get
DSS - Get


['1 aysavannna, iaodkajwd']

In [10]:
# IPython introspection (`??`): show the signature and source of `lsh.query`.
lsh.query??

[0;31mSignature:[0m [0mlsh[0m[0;34m.[0m[0mquery[0m[0;34m([0m[0mminhash[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
    [0;32mdef[0m [0mquery[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mminhash[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0;34m'''[0m
[0;34m        Giving the MinHash of the query set, retrieve[0m
[0;34m        the keys that references sets with Jaccard[0m
[0;34m        similarities greater than the threshold.[0m
[0;34m[0m
[0;34m        Args:[0m
[0;34m            minhash (datasketch.MinHash): The MinHash of the query set.[0m
[0;34m[0m
[0;34m        Returns:[0m
[0;34m            `list` of unique keys.[0m
[0;34m        '''[0m[0;34m[0m
[0;34m[0m        [0;32mif[0m [0mlen[0m[0;34m([0m[0mminhash[0m[0;34m)[0m [0;34m!=[0m [0mself[0m[0;34m.[0m[0mh[0m[0;34m:[0m[0;34m[0m
[0;34m[0m            [0;32mraise[0m [0mValueError[0m[0;34m([0m[0;34m"Expecting minhash with length %d, got %d

In [16]:
# Display the `hashranges` attribute of the LSH index.
lsh.hashranges

[(0, 5),
 (5, 10),
 (10, 15),
 (15, 20),
 (20, 25),
 (25, 30),
 (30, 35),
 (35, 40),
 (40, 45),
 (45, 50),
 (50, 55),
 (55, 60),
 (60, 65),
 (65, 70),
 (70, 75),
 (75, 80),
 (80, 85),
 (85, 90),
 (90, 95),
 (95, 100),
 (100, 105),
 (105, 110),
 (110, 115),
 (115, 120),
 (120, 125)]

In [17]:
# Display the storage objects held in the index's `hashtables` attribute.
lsh.hashtables

[<datasketch.storage.DictSetStorage at 0x7f1e8fa1ee20>,
 <datasketch.storage.DictSetStorage at 0x7f1e8fa1e550>,
 <datasketch.storage.DictSetStorage at 0x7f1e8fa1ef10>,
 <datasketch.storage.DictSetStorage at 0x7f1e8fa1eca0>,
 <datasketch.storage.DictSetStorage at 0x7f1e8fa1edf0>,
 <datasketch.storage.DictSetStorage at 0x7f1e8fa1eee0>,
 <datasketch.storage.DictSetStorage at 0x7f1e8fa1ef70>,
 <datasketch.storage.DictSetStorage at 0x7f1e8fa1ee50>,
 <datasketch.storage.DictSetStorage at 0x7f1e8fa1e5b0>,
 <datasketch.storage.DictSetStorage at 0x7f1e8fa1e4f0>,
 <datasketch.storage.DictSetStorage at 0x7f1e8f9b0040>,
 <datasketch.storage.DictSetStorage at 0x7f1e8f9b00a0>,
 <datasketch.storage.DictSetStorage at 0x7f1e8f9b0100>,
 <datasketch.storage.DictSetStorage at 0x7f1e8f9b0160>,
 <datasketch.storage.DictSetStorage at 0x7f1e8f9b01c0>,
 <datasketch.storage.DictSetStorage at 0x7f1e8f9b0220>,
 <datasketch.storage.DictSetStorage at 0x7f1e8f9b0280>,
 <datasketch.storage.DictSetStorage at 0x7f1e8f9

In [28]:
# Length of the second key held in the first hash table's backing dict
# (reaches into the private `_dict` attribute for inspection only).
len(list(lsh.hashtables[0]._dict)[1])

40

In [9]:
# Display the full `hashvalues` array of `mh1`.
mh1.hashvalues

array([ 364990282,  678738577,   85449535,  574105082,  171543935,
         90794215, 1039617936,  692849170,  933848629,  479865199,
        125696125,  177928416,  184611626,   60243568,  101165625,
        225881735,  215173990,  262626311, 1262988712, 1215751963,
         41789668,  171205302,   46002067,  274448739,   85257387,
       1031868030,    3651271,  150570526,  449860825,  472642016,
        375109932,  992245193,  905135758,  167865696,  196082003,
       1244470369, 2456975410,   77884426,  130553716,  647163961,
        439486594,  249381847,  218145239,  186561769, 1178201266,
        663854654,  404239058,  200557755,  105274870,  123992087,
        304245968,   69986434,   25433287, 1774542206,  246122878,
        481187810,  609096839,  253496393,  927697339,  305156996,
         76600010,  398465001,   79527872, 1246375173,  913406597,
        446532106,  629430461,  782844498,  765711847,  184023092,
        573948482,   78195964,  894813567,   27583779,  505824

In [10]:
import io
import numpy as np
import datasketch.storage
# Wrap `mh1` in a LeanMinHash, allocate a buffer of `lmh.bytesize()` bytes,
# serialize into it, and display the buffer.
# NOTE(review): `b` was bound to a string in an earlier cell; it is rebound here.
lmh = ds.LeanMinHash(mh1)
b = bytearray(lmh.bytesize())
lmh.serialize(b)
b

bytearray(b'\x01\x00\x00\x00\x00\x00\x00\x00\x80\x00\x00\x00JO\xc1\x15\x91\xbat(?\xdb\x17\x05\xfa%8"\x7f\x8d9\n\xe7hi\x05\x90O\xf7=\x12\nL)5f\xa97o)\x9a\x1c}\xf8}\x07\xe0\xf8\x9a\n*\xf3\x00\x0bp>\x97\x039\xaa\x07\x06\x87\xaev\rfK\xd3\x0c\x07\\\xa7\x0f\xa8\xadGK\x1b\xe7vH\xe4\xa8}\x02\xb6b4\n\x93\xef\xbd\x02c\xc1[\x10\xab\xec\x14\x05~\x0e\x81=\xc7\xb67\x00\x1e\x86\xf9\x08\xd9T\xd0\x1a\xe0\xf1+\x1c,\xb9[\x16\xc9u$;\x8eF\xf35`m\x01\nS\xf9\xaf\x0ba\x1c-J2xr\x92\nl\xa4\x04t\x17\xc8\x079\xf0\x92&\x82\x082\x1a\xd7C\xdd\x0e\xd7\xa1\x00\r\xe9\xb4\x1e\x0b\xb2\xec9F>\x9e\x91\'\xd22\x18\x18\xbbD\xf4\x0b\xf6]F\x06\x17\xf8c\x07\xd0l"\x12\x82\xe8+\x04\xc7\x14\x84\x01~]\xc5i~\x89\xab\x0e\xe2W\xae\x1c\x87\x14N$I\x0c\x1c\x0f\xbb\x89K7\x84S0\x12\xca\xd2\x90\x04\xe9\x17\xc0\x17\xc0\x7f\xbd\x04\x05-JJ\x85zq6\n\x8a\x9d\x1a\xbdX\x84%RB\xa9.\xe7\xd5\xa3-4\xf8\xf7\nB\xc25"\xfc,\xa9\x04\x7f\xc5U5#\xe5\xa4\x01\x8bC&\x1e\xce\xf1ZD\xcc\xf7\x08l^|\xac(\xdd\x9a\xd8\x00\xeeJZ\x18\xcc;\xda\x18Uy\xa2\x12-\x1aD\x13u\xc2

In [11]:
# IPython introspection (`??`): show the source of the `ds.storage` module.
ds.storage??

[0;31mType:[0m        module
[0;31mString form:[0m <module 'datasketch.storage' from '/home/ubuntu/.local/lib/python3.8/site-packages/datasketch/storage.py'>
[0;31mFile:[0m        ~/.local/lib/python3.8/site-packages/datasketch/storage.py
[0;31mSource:[0m     
[0;32mfrom[0m [0mcollections[0m [0;32mimport[0m [0mdefaultdict[0m[0;34m[0m
[0;34m[0m[0;32mimport[0m [0mbinascii[0m[0;34m[0m
[0;34m[0m[0;32mimport[0m [0mcollections[0m[0;34m[0m
[0;34m[0m[0;32mimport[0m [0mitertools[0m[0;34m[0m
[0;34m[0m[0;32mimport[0m [0moperator[0m[0;34m[0m
[0;34m[0m[0;32mimport[0m [0mos[0m[0;34m[0m
[0;34m[0m[0;32mimport[0m [0mrandom[0m[0;34m[0m
[0;34m[0m[0;32mimport[0m [0mstring[0m[0;34m[0m
[0;34m[0m[0;32mfrom[0m [0mabc[0m [0;32mimport[0m [0mABCMeta[0m[0;34m,[0m [0mabstractmethod[0m[0;34m[0m
[0;34m[0m[0mABC[0m [0;34m=[0m [0mABCMeta[0m[0;34m([0m[0;34m'ABC'[0m[0;34m,[0m [0;34m([0m[0mobject[0m[0;34m,[0m[0;

In [12]:
# IPython introspection (`??`): show the source of `ds.storage.DictListStorage`.
ds.storage.DictListStorage??

[0;31mInit signature:[0m [0mds[0m[0;34m.[0m[0mstorage[0m[0;34m.[0m[0mDictListStorage[0m[0;34m([0m[0mconfig[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m        
[0;32mclass[0m [0mDictListStorage[0m[0;34m([0m[0mOrderedStorage[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34m'''This is a wrapper class around ``defaultdict(list)`` enabling[0m
[0;34m    it to support an API consistent with `Storage`[0m
[0;34m    '''[0m[0;34m[0m
[0;34m[0m    [0;32mdef[0m [0m__init__[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mconfig[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0mself[0m[0;34m.[0m[0m_dict[0m [0;34m=[0m [0mdefaultdict[0m[0;34m([0m[0mlist[0m[0;34m)[0m[0;34m[0m
[0;34m[0m[0;34m[0m
[0;34m[0m    [0;32mdef[0m [0mkeys[0m[0;34m([0m[0mself[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0;32mreturn[0m [0mself[0m[0;34m.[0m[0m_dict[0m[0;34m.[0m[0mkeys[0m[0;34m([0m[0;34m)[0

In [13]:
import Bio
import Bio.SeqIO

In [14]:
# Load the record from "sequence.gb" (GenBank format; path is relative to
# the working directory) and display it.
gen = Bio.SeqIO.read("sequence.gb", "genbank")
gen

SeqRecord(seq=Seq('NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...NNN'), id='CM000681.2', name='CM000681', description='Homo sapiens chromosome 19, GRCh38 reference primary assembly', dbxrefs=['BioProject:PRJNA31257'])

In [15]:
# Write the loaded record to "sequence.fasta" in FASTA format.
Bio.SeqIO.write(gen, "sequence.fasta", "fasta")

1