In [2]:
import datasketch as ds
import struct
import pickle
import base64
import time
import random
import pickle

In [2]:
import thrift as t
import hbase_thrift.hbase as hb

from hbase_thrift.hbase import THBaseService
from hbase_thrift.hbase.ttypes import *

from thrift.transport import *
from thrift.protocol import *

In [3]:
TSocket.TSocket??

[0;31mInit signature:[0m
[0mTSocket[0m[0;34m.[0m[0mTSocket[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mhost[0m[0;34m=[0m[0;34m'localhost'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mport[0m[0;34m=[0m[0;36m9090[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0munix_socket[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msocket_family[0m[0;34m=[0m[0;34m<[0m[0mAddressFamily[0m[0;34m.[0m[0mAF_UNSPEC[0m[0;34m:[0m [0;36m0[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msocket_keepalive[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m        
[0;32mclass[0m [0mTSocket[0m[0;34m([0m[0mTSocketBase[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34m"""Socket implementation of TTransport base."""[0m[0;34m[0m
[0;34m[0m[0;34m[0m
[0;34m[0m    [0;32mdef[0m [0m__init__[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mhost[0m[0;34m=[0m[0;34

In [5]:
class HbaseConnection:
    def __init__(self, host, port):
        self.host = host
        self.port = port
        
        self._socket = None
        self._client = None
        
    def _create_transport(self):
        if self._socket != None and self._socket.isOpen():
            try:
                self._socket.close()
            except:
                pass
        
        self._socket = TSocket.TSocket(self.host, self.port)
        self._transport = TTransport.TBufferedTransport(self._socket)
        self._protocol = TBinaryProtocol.TBinaryProtocol(self._transport)
        self._client = THBaseService.Client(self._protocol)
        self._transport.open()
        
    def __enter__(self):
        if self._client == None or not self._socket.isOpen():
            self._create_transport()
        
        return self._client
    
    def __exit__(self, _, __, ___):
        pass

In [6]:
def hbase_safe_b64_encode(data):
    return base64.b64encode(data).decode('utf8').replace('=', '').encode('utf8')

class HBaseDictListStorage(ds.storage.OrderedStorage):
    # Order may be important in insertion in this storage object, but I'm not sure. Let's assume not for now, and get burned by it later
    def __init__(self, config, name):
        self._name = name
        self._table = hbase_safe_b64_encode(name)
        self._pool = config['hbase_pool']
        
        self.buffer = []
        
        print(f"Initializing table with name: {self._name}, safe: {self._table}")
        
        needs_create_table = True
        
        retries = 0
        max_retries = 10
        
        while needs_create_table and retries < max_retries:
            try:
                with self._pool as c:
                    needs_create_table = not c.tableExists(TTableName(qualifier=self._table))
            except TIOError as e:
                message = e.message.decode('utf8')
                if not message.startswith('org.apache.hadoop.hbase.TableNotFoundException'):
                    raise e
                        
            if needs_create_table:
                print(f"Finished sleeping, attempting to create table ({retries}/{max_retries} retries): {self._name}")
                
                try:
                    with self._pool as c:
                        # Create table here:
                        key_family = TColumnFamilyDescriptor(name=b'key', bloomnFilterType=TBloomFilterType.ROWCOL)
                        value_family = TColumnFamilyDescriptor(name=b'value')
                        name = TTableName(qualifier=self._table)
                        newTable = TTableDescriptor(tableName=name, columns=[key_family, value_family])
                        c.createTable(newTable, None)
                        needs_create_table = False
                        print(f"Successfully create table: {self._name}")
                except BaseException as e:
                    retries += 1
                    print(f"Failed to create table: {self._name} with exception: {e}")
                    sleep_time = random.uniform(0.01, 2)
                    print(f"Need to create table:  {self._name} ({self._table}), sleeping {sleep_time} seconds")
                    time.sleep(sleep_time)
                    #raise e
                
        if needs_create_table:
            raise ValueError(f"Failed to create table {self._name} with {retries} retries")
        

    def keys(self):
        raise ValueError('Not implemented')

    def get(self, key):
        with self._pool as c:
            r = c.get(self._table, TGet(row=key))
            return [q.qualifier for q in r.columnValues]

    def remove(self, *keys):
        raise ValueError('Not implemented')

    def remove_val(self, key, val):
        raise ValueError('Not implemented')

    def empty_buffer(self):
        # Used to execute large batch
        if len(self.buffer) > 0:
            self._insert(self.buffer)
            self.buffer.clear()
    
    def _insert(self, values):
            with self._pool as c:
                c.putMultiple(self._table, values)
                
                
        
    def insert(self, key, *vals, **kwargs):
        # Needs implementation
        # Should check: kwargs['buffer'], if true, buffer untill empty_buffer is called
        
        cols = [TColumnValue(family=b'value', qualifier=v, value=b'') for v in vals]
        put = TPut(row=key, columnValues=cols, durability=TDurability.SKIP_WAL)
        
        if kwargs['buffer'] and kwargs['buffer'] == True:
            self.buffer.append(put)
        else:
            self._insert([put])
        

    def size(self):
        raise ValueError('Not implemented')

    def itemcounts(self, **kwargs):
        raise ValueError('Not implemented')
        
    def has_key(self, key):
        # Needs implementation
        with self._pool as c:
            return c.exists(self._table, TGet(row=key))

In [7]:
class HBaseDictSetStorage(ds.storage.UnorderedStorage, HBaseDictListStorage):
    '''This is a wrapper class around ``defaultdict(set)`` enabling
    it to support an API consistent with `Storage`
    '''
    def __init__(self, config, name=None):
        HBaseDictListStorage.__init__(self, config, name=name)

    def get(self, key):
        return set(HBaseDictListStorage.get(self, key))

    def insert(self, key, *vals, **kwargs):
        HBaseDictListStorage.insert(self, key, *vals, **kwargs)

In [8]:
def hbase_ordered_storage(config, name=None):
    #print(f"Overriden ordered storage ran with config: {config}")
    tp = config['type']
    if tp == 'hbase':
        return HBaseDictListStorage(config, name=name)
    else:
        return ds.storage.ordered_storage(config, name=name)


def hbase_unordered_storage(config, name=None):
    #print(f"Overriden unordered storage ran with config: {config}")
    tp = config['type']
    if tp == 'hbase':
        return HBaseDictSetStorage(config, name=name)
    else:
        return ds.storage.unordered_storage(config, name=name)

In [9]:
def override_lsh__init__(self, threshold=0.9, num_perm=128, weights=(0.5, 0.5),
                 params=None, storage_config=None, prepickle=None, hashfunc=None):
        storage_config = {'type': 'dict'} if not storage_config else storage_config
        self._buffer_size = 50000
        if threshold > 1.0 or threshold < 0.0:
            raise ValueError("threshold must be in [0.0, 1.0]")
        if num_perm < 2:
            raise ValueError("Too few permutation functions")
        if any(w < 0.0 or w > 1.0 for w in weights):
            raise ValueError("Weight must be in [0.0, 1.0]")
        if sum(weights) != 1.0:
            raise ValueError("Weights must sum to 1.0")
        self.h = num_perm
        if params is not None:
            self.b, self.r = params
            if self.b * self.r > num_perm:
                raise ValueError("The product of b and r in params is "
                        "{} * {} = {} -- it must be less than num_perm {}. "
                        "Did you forget to specify num_perm?".format(
                            self.b, self.r, self.b*self.r, num_perm))
        else:
            false_positive_weight, false_negative_weight = weights
            self.b, self.r = ds.lsh._optimal_param(threshold, num_perm,
                    false_positive_weight, false_negative_weight)

        self.prepickle = storage_config['type'] == 'redis' if prepickle is None else prepickle

        self.hashfunc = hashfunc
        if hashfunc:
            self._H = self._hashed_byteswap
        else:
            self._H = self._byteswap

        basename = storage_config.get('basename', ds.storage._random_name(11))
        self.hashtables = [
            hbase_unordered_storage(storage_config, name=b''.join([basename, b'_bucket_', struct.pack('>H', i)]))
            for i in range(self.b)]
        self.hashranges = [(i*self.r, (i+1)*self.r) for i in range(self.b)]
        self.keys = hbase_ordered_storage(storage_config, name=b''.join([basename, b'_keys']))
        
ds.lsh.MinHashLSH.__init__ = override_lsh__init__

In [8]:
pool = HbaseConnection(host="datanode1", port=9090)
lsh = ds.lsh.MinHashLSH(storage_config={'type': 'hbase', 'basename': b'test9', 'hbase_pool': pool}, prepickle=True)

Initializing table with name: b'test9_bucket_\x00\x00', safe: b'dGVzdDlfYnVja2V0XwAA'
Initializing table with name: b'test9_bucket_\x00\x01', safe: b'dGVzdDlfYnVja2V0XwAB'
Initializing table with name: b'test9_bucket_\x00\x02', safe: b'dGVzdDlfYnVja2V0XwAC'
Initializing table with name: b'test9_bucket_\x00\x03', safe: b'dGVzdDlfYnVja2V0XwAD'
Initializing table with name: b'test9_bucket_\x00\x04', safe: b'dGVzdDlfYnVja2V0XwAE'
Initializing table with name: b'test9_keys', safe: b'dGVzdDlfa2V5cw'


In [9]:
def create_hash(string):
    mh2 = ds.MinHash()
#    for d in string:
#        mh2.update(d.encode('utf8'))

    for i in range(2, len(string)):
        v = string[i-2:i]
        mh2.update(v.encode('utf8'))

    return mh2

In [10]:
test = create_hash("ha")
#lsh.insert("a", test)

In [11]:
#values = ['hello', 'home', 'hotel', 'ham', 'hamwich', 'handel', 'ada', 'lovelace', 'garmin', 'new york']
values = ['ATCGATCGATCG', 'ATCGATCGATCT'] 
hashes = [create_hash(v) for v in values]
#for key, h in zip(values, hashes):
#    lsh.insert(key, h)

In [12]:
lsh_d = ds.lsh.MinHashLSH()
for key, h in zip(values, hashes):
    lsh_d.insert(key, h)

In [13]:
query = create_hash("ATCGTGCGATCG")
lsh.query(query), lsh_d.query(query)

([], [])

In [14]:
hashes[0].jaccard(query)

0.515625

In [15]:
spark = True

In [1]:
import findspark
findspark.init()
findspark.find()

'/usr/local/spark/python/pyspark'

In [2]:
from pyspark.sql import SparkSession

spark = (SparkSession
         .builder
         .master("yarn")
         .appName("python-testing")
         .config("spark.executor.instances", 12)
         .config("spark.executor.memory", "800m")
         #.config("spark.dynamicAllocation.enabled", "true")
         #.config("spark.executor.cores", 1)
         #.config("spark.dynamicAllocation.minExecutors", 4)
         #.config("spark.dynamicAllocation.maxExecutors", 32)
         #.config("spark.shuffle.service.enabled", "true")
         #.config("spark.shuffle.service.port", 7337)
         .getOrCreate())
sc = spark.sparkContext
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2022-04-19 12:30:34,609 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2022-04-19 12:30:43,243 WARN yarn.Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [2]:
import sys
sys.path.append('he')

In [3]:
spark

In [4]:
sc.environment['PYTHONPATH'] = '/home/ubuntu/.local/lib/python3.8/site-packages'
sc.addPyFile('hbase_connector.py')

In [5]:
windows = sc.sequenceFile("hdfs:///files/windowed")
windows.take(1)

                                                                                

[(30126923,
  'CGCATCGTGGCTGGACCTGAGTCCATCTGCCCTGGTGCCTGCATGACTGGCCCTTCTCCTTCACAGACCATGGCCCCAGGCTCCCTTGCTTTCATTTCCCAGCCCGTTATTGGGGCAGGAGAGTAGCAAGCGGGGGAGTTTTGATGAGGCGAGGA')]

In [6]:
sample = windows.sample(fraction=0.0001, withReplacement=False, seed=1).repartition(16)
sample.count()

                                                                                

5752

In [7]:
#sample.saveAsPickleFile('hdfs:///files/hadoop_test')

In [8]:
def mapTobase64Pickle(values):
    import pickle
    import base64
    pickled = [pickle.dumps(v) for v in values]
    b64d = [base64.b64encode(p) for p in pickled]
    return b64d

b64pickled = sample.mapPartitions(mapTobase64Pickle)
b64pickled.take(2)


                                                                                

In [9]:
b64pickled.saveAsTextFile('hdfs:///files/hadoop_test')

Py4JJavaError: An error occurred while calling o106.saveAsTextFile.
: org.apache.hadoop.mapred.FileAlreadyExistsException: Output directory hdfs://namenode:9000/files/hadoop_test already exists
	at org.apache.hadoop.mapred.FileOutputFormat.checkOutputSpecs(FileOutputFormat.java:131)
	at org.apache.spark.internal.io.HadoopMapRedWriteConfigUtil.assertConf(SparkHadoopWriter.scala:299)
	at org.apache.spark.internal.io.SparkHadoopWriter$.write(SparkHadoopWriter.scala:71)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$saveAsHadoopDataset$1(PairRDDFunctions.scala:1090)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1088)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$saveAsHadoopFile$4(PairRDDFunctions.scala:1061)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1026)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$saveAsHadoopFile$3(PairRDDFunctions.scala:1008)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1007)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$saveAsHadoopFile$2(PairRDDFunctions.scala:964)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:962)
	at org.apache.spark.rdd.RDD.$anonfun$saveAsTextFile$2(RDD.scala:1578)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1578)
	at org.apache.spark.rdd.RDD.$anonfun$saveAsTextFile$1(RDD.scala:1564)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1564)
	at org.apache.spark.api.java.JavaRDDLike.saveAsTextFile(JavaRDDLike.scala:551)
	at org.apache.spark.api.java.JavaRDDLike.saveAsTextFile$(JavaRDDLike.scala:550)
	at org.apache.spark.api.java.AbstractJavaRDDLike.saveAsTextFile(JavaRDDLike.scala:45)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:748)


In [22]:
def foreach(inputs):
    import datasketch as ds
    import hbase_connector
    
    def create_hash(string):
        mh2 = ds.MinHash()
        for i in range(3, len(string)):
            v = string[i-3:i]
            mh2.update(v.encode('utf8'))

        return mh2

    pool = hbase_connector.HbaseConnection(host="localhost", port=9090)
    lsh = ds.lsh.MinHashLSH(storage_config={'type': 'hbase', 'basename': b'insert_9', 'hbase_pool': pool}, prepickle=True)
    
    with lsh.insertion_session() as sess:
        for key, value in inputs:
            sess.insert((key, value), create_hash(value))

In [None]:
sample.save

In [23]:
%%time
#sample.foreachPartition(foreach)

2022-04-19 09:55:09,467 WARN scheduler.TaskSetManager: Lost task 1.0 in stage 5.0 (TID 259) (datanode1 executor 2): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/tmp/hadoop-ubuntu/nm-local-dir/usercache/ubuntu/appcache/application_1650289273905_0011/container_1650289273905_0011_01_000003/pyspark.zip/pyspark/worker.py", line 619, in main
    process()
  File "/tmp/hadoop-ubuntu/nm-local-dir/usercache/ubuntu/appcache/application_1650289273905_0011/container_1650289273905_0011_01_000003/pyspark.zip/pyspark/worker.py", line 609, in process
    out_iter = func(split_index, iterator)
  File "/usr/local/spark/python/pyspark/rdd.py", line 2918, in pipeline_func
  File "/usr/local/spark/python/pyspark/rdd.py", line 2918, in pipeline_func
  File "/usr/local/spark/python/pyspark/rdd.py", line 2918, in pipeline_func
  File "/usr/local/spark/python/pyspark/rdd.py", line 417, in func
  File "/usr/local/spark/python/pyspark/rdd.py", line 933, in func
  File 

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 12 in stage 5.0 failed 4 times, most recent failure: Lost task 12.3 in stage 5.0 (TID 287) (datanode2 executor 8): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/tmp/hadoop-ubuntu/nm-local-dir/usercache/ubuntu/appcache/application_1650289273905_0011/container_1650289273905_0011_01_000009/pyspark.zip/pyspark/worker.py", line 619, in main
    process()
  File "/tmp/hadoop-ubuntu/nm-local-dir/usercache/ubuntu/appcache/application_1650289273905_0011/container_1650289273905_0011_01_000009/pyspark.zip/pyspark/worker.py", line 609, in process
    out_iter = func(split_index, iterator)
  File "/usr/local/spark/python/pyspark/rdd.py", line 2918, in pipeline_func
  File "/usr/local/spark/python/pyspark/rdd.py", line 2918, in pipeline_func
  File "/usr/local/spark/python/pyspark/rdd.py", line 2918, in pipeline_func
  File "/usr/local/spark/python/pyspark/rdd.py", line 417, in func
  File "/usr/local/spark/python/pyspark/rdd.py", line 933, in func
  File "/tmp/ipykernel_15846/3611185596.py", line 18, in foreach
  File "/home/ubuntu/.local/lib/python3.8/site-packages/datasketch/lsh.py", line 364, in insert
    self.lsh._insert(key, minhash, check_duplication=check_duplication,
  File "/home/ubuntu/.local/lib/python3.8/site-packages/datasketch/lsh.py", line 168, in _insert
    raise ValueError("The given key already exists")
ValueError: The given key already exists

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:545)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:703)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:685)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:498)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1030)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2254)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2403)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2352)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2351)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2351)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1109)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1109)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1109)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2591)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2533)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2522)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:898)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2214)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2235)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2254)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2279)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1030)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1029)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:180)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/tmp/hadoop-ubuntu/nm-local-dir/usercache/ubuntu/appcache/application_1650289273905_0011/container_1650289273905_0011_01_000009/pyspark.zip/pyspark/worker.py", line 619, in main
    process()
  File "/tmp/hadoop-ubuntu/nm-local-dir/usercache/ubuntu/appcache/application_1650289273905_0011/container_1650289273905_0011_01_000009/pyspark.zip/pyspark/worker.py", line 609, in process
    out_iter = func(split_index, iterator)
  File "/usr/local/spark/python/pyspark/rdd.py", line 2918, in pipeline_func
  File "/usr/local/spark/python/pyspark/rdd.py", line 2918, in pipeline_func
  File "/usr/local/spark/python/pyspark/rdd.py", line 2918, in pipeline_func
  File "/usr/local/spark/python/pyspark/rdd.py", line 417, in func
  File "/usr/local/spark/python/pyspark/rdd.py", line 933, in func
  File "/tmp/ipykernel_15846/3611185596.py", line 18, in foreach
  File "/home/ubuntu/.local/lib/python3.8/site-packages/datasketch/lsh.py", line 364, in insert
    self.lsh._insert(key, minhash, check_duplication=check_duplication,
  File "/home/ubuntu/.local/lib/python3.8/site-packages/datasketch/lsh.py", line 168, in _insert
    raise ValueError("The given key already exists")
ValueError: The given key already exists

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:545)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:703)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:685)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:498)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1030)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2254)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [10]:
pool = HbaseConnection(host="datanode1", port=9090)
lsh = ds.lsh.MinHashLSH(storage_config={'type': 'hbase', 'basename': b'hbase_insert_11', 'hbase_pool': pool}, prepickle=True)

Initializing table with name: b'hbase_insert_11_bucket_\x00\x00', safe: b'aGJhc2VfaW5zZXJ0XzExX2J1Y2tldF8AAA'
Initializing table with name: b'hbase_insert_11_bucket_\x00\x01', safe: b'aGJhc2VfaW5zZXJ0XzExX2J1Y2tldF8AAQ'
Initializing table with name: b'hbase_insert_11_bucket_\x00\x02', safe: b'aGJhc2VfaW5zZXJ0XzExX2J1Y2tldF8AAg'
Initializing table with name: b'hbase_insert_11_bucket_\x00\x03', safe: b'aGJhc2VfaW5zZXJ0XzExX2J1Y2tldF8AAw'
Initializing table with name: b'hbase_insert_11_bucket_\x00\x04', safe: b'aGJhc2VfaW5zZXJ0XzExX2J1Y2tldF8ABA'
Initializing table with name: b'hbase_insert_11_keys', safe: b'aGJhc2VfaW5zZXJ0XzExX2tleXM'


In [11]:
def create_hash(string):
    mh2 = ds.MinHash()
    for i in range(3, len(string)):
        v = string[i-3:i]
        mh2.update(v.encode('utf8'))

    return mh2

test = 'CGCATCGTGGCTGGACCTGAGTCCATCTGCCCTGGTGCCTGCATGACTGGCCCTTCTCCTTCACAGACCATGGCCCCAGGCTCCCTTGCTTTCATTTCCCAGCCCGTTATTGGGGCAGGAGAGTAGCAAGCGGGGGAGTTTTGATGAGGCGAGGA'

In [12]:
test2 = 'AGCATCGTGGCTGGACCTGAGTCCATCTGCCCTGGTGCCTGCATGACTGGCCCTTCTCCTTCACAGACCATGGCCCCAGGCTCCCTTGCTTTCATTTCCCAGCCCGTTATTGGGGCAGGAGAGTAGCAAGCGGGGGAGTTTTGATGAGGCGAGGA'
h = create_hash(test2)

In [14]:
%%time
lsh.query(h)

CPU times: user 2.38 ms, sys: 349 µs, total: 2.73 ms
Wall time: 15.7 ms


[(50785679,
  'GTAGGCCCGAGGCCGGGCGCAAGGGCAGTGGGTCGGCGGAGCAGGATGTGCGGATTCAGTTACATTTATCTGGCCTCAGAGCGATGGCAGAGGGAGAGGCAGGGCTCTTGTGCGGGGAGGTCCTGTGTCTTCTCACTGCTGAGCCTGAGAGAGGA'),
 (39073433,
  'TTTTGTATTTTTAGTAGAGACAGAGTTTCACCATGTTGGCCAGGCTGGTCTCGAACTCCTGACCTCAGGCGATCCACCCGCCTTGGCCTCCCAAAGTGCTGGGATTACAGGCATGAGCCACCGCACCTGGCTTAGCTATTCTTCTTACTGACAGT'),
 (52148903,
  'CAGCCTGGTGTCAAACTCCTCACCTCAAGTGATCTGCCACCTTGGCCTGCCAAAGTGCTGGGATTACAGGTGTGAGCCACCATGCCTGGCCTGTATGTGCTTTTAAAGTCCTTGTGTCACTAAGTTACAGAGCTTTGACGTCTGGGTCTAAAAAG'),
 (14778755,
  'GAGTGCAGGGTCACCATCACAGCTCACTGCAGCCTCAAACTTCAGGGCTCAAGGGATCCTCCTGCCTCAGCCTCCTGAGGAGCTGGGACTACAGCTGTGCACCATCACGTCTGGCTAGTTTTCTCCTTAAGTGTTTTTTTTTTTTGTAGAGACGA'),
 (17345024,
  'CCCAGCCTTTTTTTTTTTTTTCTTTAAGTAAAGATGGGGGTCTTGCTGTGTTGCCCAGGCTGGTCTCAAACTCTTGGCCTCTGGTGATCCTGAGACCTCAGCCTCCCAAAGTGCTGGGATCACATGTGTGAGCCACCGTGCCTGGCCTGATGGCT'),
 (7456904,
  'TTTGTATTTTTAGTAGAGACGGGGTTTCACCATGTTGGCCAGGCTGGTCTCCATCTTCTAACCTCAAGTGATCCTCCCGCCTTGGCCTCCCAAAGTGATGGGATTACAGTCGTGCG

In [28]:
sample.count()

                                                                                

58411

In [29]:
#sample.saveAsPickleFile('hdfs:///files/hadoop_test')

Py4JJavaError: An error occurred while calling o66.saveAsObjectFile.
: org.apache.hadoop.mapred.FileAlreadyExistsException: Output directory hdfs://namenode:9000/files/hadoop_test already exists
	at org.apache.hadoop.mapred.FileOutputFormat.checkOutputSpecs(FileOutputFormat.java:131)
	at org.apache.spark.internal.io.HadoopMapRedWriteConfigUtil.assertConf(SparkHadoopWriter.scala:299)
	at org.apache.spark.internal.io.SparkHadoopWriter$.write(SparkHadoopWriter.scala:71)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$saveAsHadoopDataset$1(PairRDDFunctions.scala:1090)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1088)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$saveAsHadoopFile$4(PairRDDFunctions.scala:1061)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1026)
	at org.apache.spark.rdd.SequenceFileRDDFunctions.$anonfun$saveAsSequenceFile$1(SequenceFileRDDFunctions.scala:66)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.SequenceFileRDDFunctions.saveAsSequenceFile(SequenceFileRDDFunctions.scala:51)
	at org.apache.spark.rdd.RDD.$anonfun$saveAsObjectFile$1(RDD.scala:1587)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.RDD.saveAsObjectFile(RDD.scala:1587)
	at org.apache.spark.api.java.JavaRDDLike.saveAsObjectFile(JavaRDDLike.scala:566)
	at org.apache.spark.api.java.JavaRDDLike.saveAsObjectFile$(JavaRDDLike.scala:565)
	at org.apache.spark.api.java.AbstractJavaRDDLike.saveAsObjectFile(JavaRDDLike.scala:45)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:748)
