In [3]:
from utils import helper_functions

In [1]:
# this is an example for cortex release 0.21 and may not deploy correctly on other releases of cortex
import os
import shutil
import glob
import math
import numpy as np
import scipy.spatial
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
from collections import OrderedDict 
from itertools import islice
import json
import ipdb

import boto3

import redis

from utils import helper_functions, redis_cache_mechanisms


class PythonPredictor:
    """Semantic-search predictor that answers queries from a redis cache when it can.

    On a cache miss the text and embeddings of the requested file are fetched
    (via ``helper_functions``), the query is embedded and ranked against them,
    and the ranked result is written to redis before being returned.
    """

    # Number of results computed and cached on top of what the client asked for,
    # so that slightly larger follow-up requests can still be served from redis.
    CACHE_EXTRA_RESULTS = 50

    def __init__(self):
        """Load the embedding model, create the S3 and redis clients, reset the scratch dir.

        Raises:
            SystemExit: if the S3 client cannot be created or redis does not answer a ping.
        """
        # information retrieval model trained on the MS-MARCO dataset, loaded from local disk
        # (hub name of the same model: 'distilroberta-base-msmarco-v2')
        self.embedder = SentenceTransformer('./models/distilroberta-base-msmarco-v2')

        # connection settings
        self.redis_host = '127.0.0.1'
        self.redis_port = 6379
        self.aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
        self.aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')

        # create the s3 client
        try:
            self.s3 = boto3.client(
                's3',
                aws_access_key_id=self.aws_access_key_id,
                aws_secret_access_key=self.aws_secret_access_key,
            )
            print('Connected to s3 bucket!')
        except Exception as ex:
            print('\n\naws client error:', ex)
            # `exit()` is an interactive helper injected by `site`; raise SystemExit directly
            raise SystemExit('Failed to connect to s3 bucket, terminating.') from ex

        # connect to the redis server used as the response cache
        try:
            self.r = redis.StrictRedis(host=self.redis_host, port=self.redis_port, decode_responses=True)
            self.r.ping()
            print('Connected to redis cache!')
        except Exception as ex:
            print('\n\nredis client error:', ex)
            raise SystemExit('Failed to connect to redis, terminating.') from ex

        # start from an empty local scratch directory for downloaded files
        self.dir = 'tmp'
        if os.path.exists(self.dir):
            shutil.rmtree(self.dir)
        os.makedirs(self.dir)

    def predict(self, payload):
        """Return the best matching lines of a file for a query.

        Args:
            payload: mapping with the keys
                ``uuid`` - identifier linking to an uploaded file's text, encodings and top words,
                ``text`` - the query,
                ``top`` - number of results requested,
                ``accuracyGreaterThan`` - passed through to ``helper_functions.cluster``;
                only needed when the answer is not already cached.

        Returns:
            On a cache miss, an ``OrderedDict`` holding the first ``top`` entries of the
            computed response. On a cache hit, whatever
            ``redis_cache_mechanisms.get_cache_data_from_redis`` returns.
            NOTE(review): the output recorded in this notebook shows the cached path
            returning a JSON-encoded string, i.e. a different type - confirm callers
            handle both.

        Raises:
            KeyError: if a required payload key is missing.
        """
        sess = payload["uuid"]
        query = payload["text"]
        requested_results = payload["top"]

        needs_caching = redis_cache_mechanisms.check_if_request_to_be_cached(self, sess, query, requested_results)
        print('are we caching the values:', needs_caching)

        if not needs_caching:
            # served straight from redis; the accuracy threshold is not used on this path
            print('file available in redis cache! 😇')
            return redis_cache_mechanisms.get_cache_data_from_redis(self, sess, query, requested_results)

        acc_greater_than = payload["accuracyGreaterThan"]

        # compute and cache more lines than requested, but return only what was asked for
        max_results = requested_results + self.CACHE_EXTRA_RESULTS

        # Plain directory test instead of glob: `sess` comes from the request, and glob
        # would treat characters such as '*' or '?' in it as wildcards.
        if os.path.isdir(os.path.join(self.dir, sess)):
            print('😉 got you\'ve covered, model alread encoded 🤘')
        else:
            helper_functions.download_text_file_and_embeddings_from_s3_bucket(self, sess)

        corpus, corpus_embeddings = helper_functions.load_text_file_and_embeddings(self, sess)

        queries = [str(query)]
        query_embeddings = self.embedder.encode(queries)

        response = helper_functions.cluster(
            self,
            (corpus, corpus_embeddings),
            (queries, query_embeddings),
            max_results,
            acc_greater_than,
        )

        # redis cache layer: store the enlarged response for later requests
        redis_cache_mechanisms.cache_response_to_redis(self, sess, query, response, max_results)

        return OrderedDict(islice(response.items(), 0, requested_results))



In [2]:
# Build the predictor. Per PythonPredictor.__init__ this loads the local sentence-transformer
# model, creates the S3 client, pings redis and recreates the local `tmp` directory.
pr = PythonPredictor()

Connected to s3 bucket!
Connected to redis cache!


In [3]:
!

In [19]:

# 0eb1f558-b4e0-4118-80f1-c7b3e1741fca
# 3dc5ea3d-d5e1-4946-91ed-be0d63af8a12
# c514e5d4-e0c9-4c6a-a35a-d3ad706c419b

# Example request for PythonPredictor.predict.
payload = {
    "uuid": "3dc5ea3d-d5e1-4946-91ed-be0d63af8a12",
    "text": "money",
    "top": 54,
    # predict() looks up this key; a payload without it can fail with KeyError.
    # NOTE(review): 0.0 presumably means "no minimum score" - confirm against
    # helper_functions.cluster, which receives the value.
    "accuracyGreaterThan": 0.0,
}


In [20]:
resp = pr.predict(payload)


in check for cache or not
query not present in cache:  False
number of requested lines exceed the ones in cache:  False
are we caching the values: False
file available in redis cache! 😇


In [21]:
resp

'{"Cybermoney will become the new money of the     Information Age, replacing the paper money of Industrialism": 0.5772, "Unique, anonymous, and verifiable, this money will accommodate the largest transactions": 0.5653, "Paper money in the West began as warehouse or safe-deposit receipts for quantities of precious metals": 0.5416, "This new digital form of money is destined to play a pivotal role in cybercommerce": 0.5248, "The new digital money of the Information Age will return control over the medium of exchange to the owners of wealth, who wish to preserve it, rather than to nation-states that wish to spirit it away": 0.5158, "\\"17          Digital money on global computer networks will make every object on Hayek\'s continuum of liquidity more liquid-except government paper": 0.4879, "The names of traditional currencies like the \\"pound\\" and the \\"peso\\" reflect the fact that they originated as measures of weight of specific quantities of precious metals": 0.4699, "Cash trans

In [7]:
a = 'Not only will they tend to have less recourse to banks for borrowing; firms in the Information Age are also likely to have fewer physical assets to capture'
b = 'Tax-free money already compounds far faster offshore than onshore funds still subject to the high tax burden imposed by the twentieth-century nation-state'

In [10]:
# Short ids for the two sample lines `a` and `b`.
# NOTE: get_sliced_hash is defined in a later cell of this notebook; that cell has to be
# run before this one (the execution counts show it was).
line_hash_id = get_sliced_hash(a)
print(line_hash_id)
print(get_sliced_hash(b))

b5f63b256df
6b942de42fa


In [9]:
import hashlib

def hash_it(string):
    '''
    Return the SHA-1 digest of `string` (encoded as UTF-8) as a 40-character hex string.

    Every hex character carries 4 bits (16 possible values), so keeping only the
    first n characters of the digest leaves 2^(4*n) possible ids.
    For n = 9 that is 2^36 ~= 6.8*10^10, i.e. about 68 billion strings.
    In practice the birthday problem applies: a collision should be expected after
    roughly 2^(4*n*0.5) hashed strings, the square root of the id space.
    '''

    # Line ids keep the first 11 hex characters, good for at least ~4 million lines (2^(4*0.5*11)).
    # An average 1MB book contains about 6000 strings, so even a 50MB book with
    # 300,000 lines leaves room for several million more.
    #
    # Query ids keep the first 11 hex characters as well (~4 million queries), enough to start with.

    digest = hashlib.sha1()
    digest.update(string.encode('utf-8'))
    return digest.hexdigest()


def get_sliced_hash(string, length=11):
    """Return a short id for `string`: the leading hex characters of its SHA-1 digest.

    Args:
        string: text to derive the id from (a line or a query).
        length: number of leading hex characters of the digest to keep. Defaults to 11,
            the value used for line and query ids; see hash_it for why only a prefix
            of the digest is kept.

    Returns:
        The first `length` characters of hash_it(string).
    """
    return hash_it(string)[:length]


In [None]:
resp

In [5]:
a = [1, 2, 3, 4, 5]
print(a[2:])

[3, 4, 5]


In [None]:
#og
resp

In [None]:
# Print every key/value pair of `a`.
# NOTE(review): the assignments to `a` visible in this notebook are strings or a list,
# none of which has .items(); presumably this was meant to iterate a dict
# (e.g. a parsed response) - confirm before running.
for l, b in list(a.items()):
    print(l, b)

In [None]:
resp = pr.predict(payload)

In [1]:
import redis
redis_host = "localhost"
redis_port = 6379

In [2]:
# Client for the local redis server; decode_responses=True makes replies come back as str
# instead of bytes (visible in the outputs recorded below).
r = redis.StrictRedis(host=redis_host, port=redis_port, decode_responses=True)

In [36]:
r.xrange('3dc5ea3d-d5e1-4946-91ed-be0d63af8a12:query_id:1613237734378-0:match_lines', '-', '+', count=3)

[('1613237734379-0', {'1613237734379-0': '0.6601'}),
 ('1613237734380-0', {'1613237734379-1': '0.6288'}),
 ('1613237734380-1', {'1613237734380-0': '0.5703'})]

In [55]:
# setbit user:user_id:bookmarks:query_id line_id 1
r.setbit('user:1:uuid:2:bookmarks:query_id', 113, 1)

0

In [56]:
r.get('user:1:uuid:2:bookmarks:query_id')

'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00@'

In [57]:
# Render the bookmark bitmap stored in redis as a string of '0'/'1' characters,
# eight per stored character, most significant bit first.
# A missing key (r.get returns None) is shown as an empty bitmap.
s = r.get('user:1:uuid:2:bookmarks:query_id') or ""

# format(..., '08b') zero-pads to 8 binary digits. The previous version kept the digits in
# a variable named `str`, which rebound the builtin and broke every later str(...) call
# made in the same kernel.
bitmap = "".join(format(ord(ch), '08b') for ch in s)
print(bitmap)

000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000


In [None]:
r.

In [None]:
#XADD mystream * name Sara surname OConnor

a = r.xadd('qid1:', {'content': 'enlightenment is the space between your thoughts', 'bookmark': 959 })


In [None]:
r.xlen('qid1:')

In [None]:
%time
r.xrange('qid1:', '-', '+')

In [16]:
%time
r.xrevrange('qid13:', '+', '-', count=1)

CPU times: user 6 µs, sys: 2 µs, total: 8 µs
Wall time: 14.5 µs


[]

In [59]:
query_hash_id = hash_it('violence')[:11]
print(hash_it('violence'))
print(query_hash_id)
query_hash_id[:2]



c8ceaa59d3031a316e2e212b2b1b80213368b10b
c8ceaa59d30


'c8'

In [29]:
# Look up the stored id of the query: the first two hex characters of the query hash select
# the redis hash '<uuid>:query_to_id:<prefix>', the remaining characters are the field name.
query_id = r.hget(payload['uuid']+':query_to_id:'+query_hash_id[:2], query_hash_id[2::])


1613237734378-0


In [20]:
# Load the downloaded text of the file as a list of lines.
# file.read() alone returns one big string, and iterating over that yields single
# characters rather than lines.
# NOTE(review): assumes one corpus entry per physical line in text_content.txt - confirm
# against the helper that writes/loads this file.
file_list = []
with open('tmp/3dc5ea3d-d5e1-4946-91ed-be0d63af8a12/text_content.txt', 'r') as file:
    file_list = file.read().splitlines()

In [8]:
import hashlib
import sys

def hash_it(string):
    """Return the hex SHA-1 digest (40 characters) of `string` encoded as UTF-8."""
    utf8_bytes = string.encode('utf-8')
    sha1_hash = hashlib.sha1(utf8_bytes)
    return sha1_hash.hexdigest()

# Hash a sample line and show how the digest is split into a 2-character prefix and the rest.
# NOTE: '\xe2\x80\x99' inside a str literal is three separate code points, not a single
# apostrophe; the encode() output recorded further down shows each of them being
# encoded to UTF-8 on its own.
a = hash_it('Missing files are the most common problems with Unix programs, so if the system log and other log information aren\xe2\x80\x99t very helpful and you have nowhere else to turn, strace can be of great use')
print(a)
print(a[:2], a[2::])

f816927ca5f3cbf165d67d8b263f0c55270656f1
f8 16927ca5f3cbf165d67d8b263f0c55270656f1


In [9]:
string = 'Missing files are the most common problems with Unix programs, so if the system log and other log information aren\xe2\x80\x99t very helpful and you have nowhere else to turn, strace can be of great use'

In [11]:
hash_it(string)

'f816927ca5f3cbf165d67d8b263f0c55270656f1'

In [10]:
string.encode('utf-8')

b'Missing files are the most common problems with Unix programs, so if the system log and other log information aren\xc3\xa2\xc2\x80\xc2\x99t very helpful and you have nowhere else to turn, strace can be of great use'

In [None]:
%time

maxi=0
maxline=''
for l in file_list:
    if len(l) > maxi:
        maxi=len(l)
        maxline=l
    line_id = r.hincrby('ids', 'si:lines', 1)
    hash_line = hash_it(l)
    r.hset('file:si:'+hash_line[:2], hash_line[3::], line_id)

In [None]:
%time
maxi=0
maxline=''
for l in file_list:
    if len(l) > maxi:
        maxi=len(l)
        maxline=l
    line_id = r.hincrby('ids', 'si:lines', 1)
    hash_line = hash_it(l)
    r.hset('file:si:', l, line_id)

In [58]:

import hashlib


def hash_it(string):
    '''
    Compute the SHA-1 digest of `string` (UTF-8 encoded) and return it as 40 hex characters.

    One hex character encodes 4 bits (16 values). Truncating the digest to its first
    n characters therefore leaves 2^(4*n) distinct ids; with n = 9 that is
    2^36 ~= 6.8*10^10, about 68 billion strings.
    Taking the birthday problem into account, a collision becomes likely after about
    2^(4*n*0.5) strings, the square root of the id space.
    '''

    # Line ids: first 10 hex characters, enough for at least 1,048,576 lines (2^(4*0.5*10)).
    # An average 1MB book contains about 6000 strings; a 50MB book with 300,000 lines
    # still leaves about 700,000 ids to spare.
    #
    # Query ids: first 11 hex characters, enough for at least ~4 million queries to start with.

    sha1 = hashlib.sha1
    encoded = string.encode(encoding='utf-8')
    return sha1(encoded).hexdigest()


def get_query_hash(string):
    """Return the query id of `string`: the first 11 hex characters of its SHA-1 digest.

    See hash_it for the reasoning behind keeping only a prefix of the digest.
    """
    # number of leading hex characters of the digest used as the query id
    QUERY_ID_HEX_CHARS = 11

    return hash_it(string)[0:QUERY_ID_HEX_CHARS]


In [28]:
query_hash_id = get_query_hash('violence')

In [36]:
query_id_val = r.hget(payload['uuid']+':query_to_id:'+query_hash_id[:2], query_hash_id[2::])

In [37]:
r.xlen(payload['uuid']+':query_id:'+str(query_id_val)+':match_lines')

148