In [1]:
import datasketch as ds

https://github.com/ekzhu/datasketch/blob/master/datasketch/storage.py
https://raw.githubusercontent.com/ekzhu/datasketch/master/datasketch/storage.py

In [2]:
class DictListStorage(ds.storage.OrderedStorage):
    '''This is a wrapper class around ``defaultdict(list)`` enabling
    it to support an API consistent with `Storage`
    '''
    def __init__(self, config):
        self._dict = defaultdict(list)

    def keys(self):
        return self._dict.keys()

    def get(self, key):
        return self._dict.get(key, [])

    def remove(self, *keys):
        for key in keys:
            del self._dict[key]

    def remove_val(self, key, val):
        self._dict[key].remove(val)

    def insert(self, key, *vals, **kwargs):
        self._dict[key].extend(vals)

    def size(self):
        return len(self._dict)

    def itemcounts(self, **kwargs):
        '''Returns a dict where the keys are the keys of the container.
        The values are the *lengths* of the value sequences stored
        in this container.
        '''
        return {k: len(v) for k, v in self._dict.items()}

    def has_key(self, key):
        return key in self._dict

In [3]:
class DictSetStorage(ds.storage.UnorderedStorage, DictListStorage):
    '''This is a wrapper class around ``defaultdict(set)`` enabling
    it to support an API consistent with `Storage`
    '''
    def __init__(self, config):
        self._dict = defaultdict(set)

    def get(self, key):
        return self._dict.get(key, set())

    def insert(self, key, *vals, **kwargs):
        self._dict[key].update(vals)

## Minimum viable product:
* [ ] Init - DSS:
    Initializes/runs a lot (for each character added?)
* [ ] Init - DLS:
    Only runs once
* [ ] Has_key - DLS:
    Runs for each sequence/read
* [ ] Insert - DLS:
    Runs for each sequence/read
* [ ] Insert - DSS:
    Runs for each bp (each character)
* [ ] Get - DSS:
    Runs in query for each hash(ish)

In [4]:
import datasketch as ds
import happybase as hb

In [5]:
import base64

class HBaseDictListStorage(ds.storage.OrderedStorage):
    def __init__(self, config, name=None):
        self._name = b'' if name == None else name
        self._pool = config['hbase_pool']
        self._table = base64.b64encode(config['hbase_table'].encode('utf8') + self._name).decode('utf8').replace('=', '')
        self._keysize = config['hbase_keysize']
        self._recreate_table = config.get('hbase_recreate_table', False)
        
        with self._pool.connection() as c:
            families = {
                'fvalue': dict(),
            }
            
            tables = c.tables()
            if not self._table.encode('utf8') in tables:
                #print(f"Trying to create table with name: {self._table}, and type: {type(self._table)}")
                c.create_table(self._table, families)
            elif self._recreate_table:
                #print(f"Deleting table with name: {self._table}")
                c.delete_table(self._table, disable=True)
                #print(f"Trying to create table with name: {self._table}, and type: {type(self._table)}")
                c.create_table(self._table, families)
                

    def keys(self):
        raise ValueError('Not implemented')

    def get(self, key):
        raise ValueError('Not implemented')

    def remove(self, *keys):
        raise ValueError('Not implemented')

    def remove_val(self, key, val):
        raise ValueError('Not implemented')

    def insert(self, key, *vals, **kwargs):
        # Needs implementation
        
        
        
        #if len(key) != self._keysize:
        #    raise ValueError(f'Length of key must be equal to config parameter: "hbase_keysize", was {len(key)}, should be: {self._keysize}')
        
        binary_key = key.encode('utf8') if type(key) is str else key
        
        #print(f"Insert: key: {binary_key}, kt: {type(binary_key)}, vals: {vals}, vt: {type(vals[0])}")
        
        with self._pool.connection() as c:
            table = c.table(self._table)
            with table.batch() as batch:
                for value in vals:
                    value = value.encode('utf8') if type(value) is str else value
                    #print(f"Trying to insert values with key: {binary_key + value}")
                    batch.put(binary_key + value, {b'fvalue:value': value})


    def size(self):
        raise ValueError('Not implemented')

    def itemcounts(self, **kwargs):
        raise ValueError('Not implemented')
        
    def has_key(self, key):
        # Needs implementation
        with self._pool.connection() as c:
            table = c.table(self._table)
            row = table.row(key)
            #print(f"Checking whether key: {key} exists: {len(row) != 0}, (row: {row})")
            return len(row) != 0

In [6]:
class HBaseDictSetStorage(ds.storage.UnorderedStorage, HBaseDictListStorage):
    '''This is a wrapper class around ``defaultdict(set)`` enabling
    it to support an API consistent with `Storage`
    '''
    def __init__(self, config, name=None):
        HBaseDictListStorage.__init__(self, config, name=name)

    def get(self, key):
        with self._pool.connection() as c:
            table = c.table(self._table)
            values = [row[1][b'fvalue:value'] for row in table.scan(row_prefix=key)]
            return set(values)

    def insert(self, key, *vals, **kwargs):
        HBaseDictListStorage.insert(self, key, *vals, **kwargs)

In [7]:
def hbase_ordered_storage(config, name=None):
    #print(f"Overriden ordered storage ran with config: {config}")
    tp = config['type']
    if tp == 'hbase':
        return HBaseDictListStorage(config, name=name)
    else:
        return old_ordered_storage_function(config, name=name)

if ds.storage.ordered_storage != hbase_ordered_storage:
    old_ordered_storage_function = ds.storage.ordered_storage
    ds.storage.ordered_storage = hbase_ordered_storage

In [8]:
def hbase_unordered_storage(config, name=None):
    #print(f"Overriden unordered storage ran with config: {config}")
    tp = config['type']
    if tp == 'hbase':
        return HBaseDictSetStorage(config, name=name)
    else:
        return old_unordered_storage_function(config, name=name)

if ds.storage.unordered_storage != hbase_unordered_storage:
    old_unordered_storage_function = ds.storage.unordered_storage    
    ds.storage.unordered_storage = hbase_unordered_storage

In [9]:
import struct
import pickle

def override_lsh__init__(self, threshold=0.9, num_perm=128, weights=(0.5, 0.5),
                 params=None, storage_config=None, prepickle=None, hashfunc=None):
        print("Overriden constructor ran")
        storage_config = {'type': 'dict'} if not storage_config else storage_config
        self._buffer_size = 50000
        if threshold > 1.0 or threshold < 0.0:
            raise ValueError("threshold must be in [0.0, 1.0]")
        if num_perm < 2:
            raise ValueError("Too few permutation functions")
        if any(w < 0.0 or w > 1.0 for w in weights):
            raise ValueError("Weight must be in [0.0, 1.0]")
        if sum(weights) != 1.0:
            raise ValueError("Weights must sum to 1.0")
        self.h = num_perm
        if params is not None:
            self.b, self.r = params
            if self.b * self.r > num_perm:
                raise ValueError("The product of b and r in params is "
                        "{} * {} = {} -- it must be less than num_perm {}. "
                        "Did you forget to specify num_perm?".format(
                            self.b, self.r, self.b*self.r, num_perm))
        else:
            false_positive_weight, false_negative_weight = weights
            self.b, self.r = ds.lsh._optimal_param(threshold, num_perm,
                    false_positive_weight, false_negative_weight)

        self.prepickle = storage_config['type'] == 'redis' if prepickle is None else prepickle

        self.hashfunc = hashfunc
        if hashfunc:
            self._H = self._hashed_byteswap
        else:
            self._H = self._byteswap

        basename = storage_config.get('basename', ds.storage._random_name(11))
        self.hashtables = [
            ds.storage.unordered_storage(storage_config, name=b''.join([basename, b'_bucket_', struct.pack('>H', i)]))
            for i in range(self.b)]
        self.hashranges = [(i*self.r, (i+1)*self.r) for i in range(self.b)]
        self.keys = ds.storage.ordered_storage(storage_config, name=b''.join([basename, b'_keys']))
        
ds.lsh.MinHashLSH.__init__ = override_lsh__init__

In [10]:
values = ['hello', 'hewwoadwojaowjdpoawjd', 'nopeq', 'yesir', 'hotadwhawudagdwiuagwdel']
hashes = [ds.MinHash() for v in values]

for i, value in enumerate(values):
    for v in value:
        hashes[i].update(v.encode('utf8'))
        
lsh = ds.MinHashLSH(threshold=0.5)
for i, _hash in enumerate(hashes):
    lsh.insert(values[i], _hash)

Overriden constructor ran


In [11]:
[len(a) for a in list(lsh.hashtables[0]._dict.keys())]

[40, 40, 40, 40, 40]

In [12]:
pool = hb.ConnectionPool(10, host='namenode')
config = {'type': 'hbase', 'hbase_table': 'test_table_2', 'hbase_pool': pool, 'hbase_keysize': 40, 'basename': b'', 'hbase_recreate_table': True}
lsh_hbase = ds.MinHashLSH(threshold=0.5, storage_config=config)

Overriden constructor ran


In [13]:
with pool.connection() as c:
    tbs = c.tables()
tbs

[b'dGVzdF90YWJsZV8yX2J1Y2tldF8AAA',
 b'dGVzdF90YWJsZV8yX2J1Y2tldF8AAQ',
 b'dGVzdF90YWJsZV8yX2J1Y2tldF8AAg',
 b'dGVzdF90YWJsZV8yX2J1Y2tldF8AAw',
 b'dGVzdF90YWJsZV8yX2J1Y2tldF8ABA',
 b'dGVzdF90YWJsZV8yX2J1Y2tldF8ABQ',
 b'dGVzdF90YWJsZV8yX2J1Y2tldF8ABg',
 b'dGVzdF90YWJsZV8yX2J1Y2tldF8ABw',
 b'dGVzdF90YWJsZV8yX2J1Y2tldF8ACA',
 b'dGVzdF90YWJsZV8yX2J1Y2tldF8ACQ',
 b'dGVzdF90YWJsZV8yX2J1Y2tldF8ACg',
 b'dGVzdF90YWJsZV8yX2J1Y2tldF8ACw',
 b'dGVzdF90YWJsZV8yX2J1Y2tldF8ADA',
 b'dGVzdF90YWJsZV8yX2J1Y2tldF8ADQ',
 b'dGVzdF90YWJsZV8yX2J1Y2tldF8ADg',
 b'dGVzdF90YWJsZV8yX2J1Y2tldF8ADw',
 b'dGVzdF90YWJsZV8yX2J1Y2tldF8AEA',
 b'dGVzdF90YWJsZV8yX2J1Y2tldF8AEQ',
 b'dGVzdF90YWJsZV8yX2J1Y2tldF8AEg',
 b'dGVzdF90YWJsZV8yX2J1Y2tldF8AEw',
 b'dGVzdF90YWJsZV8yX2J1Y2tldF8AFA',
 b'dGVzdF90YWJsZV8yX2J1Y2tldF8AFQ',
 b'dGVzdF90YWJsZV8yX2J1Y2tldF8AFg',
 b'dGVzdF90YWJsZV8yX2J1Y2tldF8AFw',
 b'dGVzdF90YWJsZV8yX2J1Y2tldF8AGA',
 b'dGVzdF90YWJsZV8yX2tleXM',
 b'dGVzdF90YWJsZV9oYmFzZWFvaWpkd29pYWpkYWlqb2R3',
 b'dGVzdF90YWJsZV9oYm

In [14]:
'test_table_hbaseaoijdwoiajdaijodw'.encode('utf8') in tbs

True

In [15]:
hbase_unordered_storage(config, name=b'aoijdwoiajdaijodw')

<__main__.HBaseDictSetStorage at 0x7fe942310880>

In [16]:
ds.storage.ordered_storage(config, name=b''.join([config['basename'], b'_keys']))

<__main__.HBaseDictListStorage at 0x7fe94227dd60>

In [17]:
for i, _hash in enumerate(hashes):
    lsh_hbase.insert(values[i], _hash)

In [18]:
from datasketch.storage import (
    ordered_storage, unordered_storage, _random_name)

In [19]:
ordered_storage??

[0;31mSignature:[0m [0mordered_storage[0m[0;34m([0m[0mconfig[0m[0;34m,[0m [0mname[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m <no docstring>
[0;31mSource:[0m   
[0;32mdef[0m [0mhbase_ordered_storage[0m[0;34m([0m[0mconfig[0m[0;34m,[0m [0mname[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;31m#print(f"Overriden ordered storage ran with config: {config}")[0m[0;34m[0m
[0;34m[0m    [0mtp[0m [0;34m=[0m [0mconfig[0m[0;34m[[0m[0;34m'type'[0m[0;34m][0m[0;34m[0m
[0;34m[0m    [0;32mif[0m [0mtp[0m [0;34m==[0m [0;34m'hbase'[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0;32mreturn[0m [0mHBaseDictListStorage[0m[0;34m([0m[0mconfig[0m[0;34m,[0m [0mname[0m[0;34m=[0m[0mname[0m[0;34m)[0m[0;34m[0m
[0;34m[0m    [0;32melse[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0;32mreturn[0m [0mold_ordered_storage_function[0m[0;34m([0m[0mconfig[0m[

In [20]:
blerg

'dGVzdF90YWJsZV8yX2tleXM'

In [21]:
(b'hello').decode('utf8')

'hello'

In [22]:
w = base64.b64encode(b'hello').decode('utf8').replace('=', '')
w

'aGVsbG8'

In [23]:
base64.b64decode(w + '==')

b'hello'

In [24]:
test = ds.MinHash()
value = 'opel'
for v in value:
    test.update(v.encode('utf8'))
    
lsh_hbase.query(test)

[b'hello', b'nopeq', b'hewwoadwojaowjdpoawjd']

In [25]:
set({'h': True})

{'h'}