In [1]:
from utils import helper_functions

In [10]:
# this is an example for cortex release 0.21 and may not deploy correctly on other releases of cortex
import os
import shutil
import glob
import math
import numpy as np
import scipy.spatial
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
from collections import OrderedDict 
from itertools import islice
import json

import boto3

import redis

from utils import helper_functions, redis_cache_mechanisms


class PythonPredictor:
    """Answer text queries against a previously uploaded file.

    A file is identified by its uuid. The predictor embeds the query with a
    sentence-transformer model, delegates ranking to ``helper_functions`` and
    caches responses through ``redis_cache_mechanisms``.
    """

    def __init__(self):
        """Load the embedding model and set up the S3 client, Redis client and local cache directory.

        Raises:
            SystemExit: if the Redis client cannot be created or the server does not answer a ping.
        """

        # information retrieval model trained on MS-MARCO dataset, loaded from a local copy
        # (pass 'distilroberta-base-msmarco-v2' instead of the path to download it)
        self.embedder = SentenceTransformer('./models/distilroberta-base-msmarco-v2')

        # redis connection settings
        self.redis_host = '127.0.0.1'
        self.redis_port = 6379

        # AWS credentials are taken from the environment (None when unset)
        self.aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
        self.aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')

        # establish connection with s3 bucket
        self.s3 = boto3.client(
            's3',
            aws_access_key_id=self.aws_access_key_id,
            aws_secret_access_key=self.aws_secret_access_key,
        )

        # establish connection to redis server to be used as data store persistence
        try:
            self.r = redis.StrictRedis(host=self.redis_host, port=self.redis_port, decode_responses=True)
            self.r.ping()
            print('Connected to redis cache!')
        except Exception as ex:
            print('\n\nredis client error:', ex)
            # Raise SystemExit directly instead of calling exit(): exit() is a
            # helper injected by `site` / the interactive shell and is not
            # guaranteed to raise in every environment.
            raise SystemExit('Failed to connect to redis, terminating.') from ex

        # local directory holding the per-file text and encodings
        self.dir = 'tmp'

        # exist_ok=True keeps a second instantiation (e.g. re-running the cell)
        # from failing with FileExistsError when the directory is already there.
        os.makedirs(self.dir, exist_ok=True)

    def predict(self, payload):
        """Return the best matches for a query inside one uploaded file.

        Args:
            payload: mapping with the keys
                "uuid" (identifier linking to the uploaded file's text file,
                encodings and top words), "text" (the query), "top" (number of
                results to return) and "accuracyGreaterThan" (forwarded to
                ``helper_functions.cluster``).

        Returns:
            On a cache miss, an ``OrderedDict`` with the first ``payload["top"]``
            items of the computed response. On a cache hit, whatever
            ``redis_cache_mechanisms.get_cache_data_from_redis`` returns.
            NOTE(review): the recorded notebook output suggests the cached value
            is a JSON string rather than a dict - confirm and align if needed.
        """

        # extract values from the request payload
        sess = payload["uuid"]
        query = payload["text"]
        top = payload["top"]
        acc_greater_than = payload["accuracyGreaterThan"]

        needs_caching = redis_cache_mechanisms.check_if_request_to_be_cached(self, sess, query, top)

        if not needs_caching:
            # return from redis cache!
            print('file available in redis cache! 😇')
            return redis_cache_mechanisms.get_cache_data_from_redis(self, sess, query, top)

        # as caching has to be done we request for 50 more lines and cache them
        # however we return the exact requested amount of lines to the client
        max_results = top + 50

        # Check the local disk for this file's folder. A plain existence test on
        # self.dir is used (not glob) so that wildcard characters in the uuid
        # are never interpreted as a pattern.
        if os.path.exists(os.path.join(self.dir, sess)):
            # accessing from already downloaded encodings and files from disk
            print('😉 got you\'ve covered, model alread encoded 🤘')
        else:
            # nothing on disk yet: fetch the text file and encodings first
            helper_functions.download_text_file_and_embeddings_from_s3_bucket(self, sess)

        corpus, corpus_embeddings = helper_functions.load_text_file_and_embeddings(self, sess)

        queries = [str(query)]
        query_embeddings = self.embedder.encode(queries)

        queries_and_embeddings = (queries, query_embeddings)
        corpus_and_embeddings = (corpus, corpus_embeddings)

        response = helper_functions.cluster(self, corpus_and_embeddings, queries_and_embeddings, max_results, acc_greater_than)

        # cache the enlarged response, but hand back only what was asked for
        redis_cache_mechanisms.cache_response_to_redis(self, sess, query, response)

        return OrderedDict(islice(response.items(), 0, top))



In [11]:
# Build the predictor: the constructor loads the embedding model, creates the
# S3 client, pings Redis and creates the local 'tmp' directory.
pr = PythonPredictor()

Connected to redis cache!


In [22]:

# 0eb1f558-b4e0-4118-80f1-c7b3e1741fca
# 3dc5ea3d-d5e1-4946-91ed-be0d63af8a12
# c514e5d4-e0c9-4c6a-a35a-d3ad706c419b

# Request for PythonPredictor.predict: which file to search, the query text,
# how many results to return and the threshold forwarded to the ranking helper.
payload = dict(
    uuid="3dc5ea3d-d5e1-4946-91ed-be0d63af8a12",
    text="love",
    top=10,
    accuracyGreaterThan=0.2,
)


In [23]:
# Send the request through the predictor and keep the response for inspection.
resp = pr.predict(payload)

in check for cachce or not
😉 got you've covered, model alread encoded 🤘
In the 1960s there was a new wave of free love, partly based on the apparent security of the female contraceptive pill, but also promoted by mood-changing drugs and pop music (Score: 0.3742)
He wrote: "Loyalty to one's country, on the other hand, is something we could do without (Score: 0.2976)
A shared morality in a tolerant society was the ideal of John Locke and of early philosophers of liberty (Score: 0.2912)
Respect for parents and faithfulness in marriage are the best ways to preserve family life; family life is the best way to bring up morally healthy children (Score: 0.2799)
The winners, the diarist wrote, were those who made love with those courtesans the greatest number of times (Score: 0.2694)
"My word is my bond" was for them an absolute principle (Score: 0.2694)
" That makes a very fine phrase, and a very fine aspiration, but "life, liberty and estate" is more down to earth than "life, liberty, and the

In [15]:
# Display the most recent response.
resp

'{"The new digital gold will overcome many of the practical                                                163   problems that inhibited direct use of gold as money in the past": 0.5343, "Early generations of \\"digital servants\\" already obey the commands of those who control the computers in which they are sealed much as genies were sealed in magic lamps": 0.4664, "The new digital money of the Information Age will return control over the medium of exchange to the owners of wealth, who wish to preserve it, rather than to nation-states that wish to spirit it away": 0.4642, "Contracting Leverage           The emergence of digital money will not only defeat inflation once and for all; it will also contract leverage in the banking systems of the world": 0.4323, "\\"17          Digital money on global computer networks will make every object on Hayek\'s continuum of liquidity more liquid-except government paper": 0.4276, "This new digital form of money is destined to play a pivotal role i

In [14]:
# Repeat the same request; per the recorded output below, this run was served
# from the Redis cache.
resp = pr.predict(payload)

in check for cachce or not
file available in redis cache! 😇


In [None]:
import redis
# Connection settings for the standalone Redis experiments below.
redis_host, redis_port = "localhost", 6379

In [None]:
# Standalone Redis client for the experiments below; decode_responses=True is
# passed so replies are returned as str.
r = redis.StrictRedis(
    host=redis_host,
    port=redis_port,
    decode_responses=True,
)

In [None]:
# Load the session's text as a list of lines. `file.read()` on its own returns
# a single string, and iterating over a string yields one character at a time,
# so the per-line loops in the cells below would have stored characters.
with open('tmp/3dc5ea3d-d5e1-4946-91ed-be0d63af8a12/text_content.txt', 'r') as file:
    file_list = file.read().splitlines()

In [None]:
import hashlib
import sys

def hash_it(string):
    """Return the SHA-1 hex digest (40 hex characters) of `string` encoded as UTF-8."""
    digest = hashlib.sha1()
    digest.update(string.encode('utf-8'))
    return digest.hexdigest()


# Hash a sample sentence and show the full digest followed by its first 13 characters.
a = hash_it(' You could have pocketed anaverage real return of more than 30 percent annually in U')
print(a, a[:13], sep='\n')

In [None]:
%time

maxi=0
maxline=''
for l in file_list:
    if len(l) > maxi:
        maxi=len(l)
        maxline=l
    line_id = r.hincrby('ids', 'si:lines', 1)
    hash_line = hash_it(l)
    r.hset('file:si:'+hash_line[:2], hash_line[3::], line_id)

In [None]:
%time
maxi=0
maxline=''
for l in file_list:
    if len(l) > maxi:
        maxi=len(l)
        maxline=l
    line_id = r.hincrby('ids', 'si:lines', 1)
    hash_line = hash_it(l)
    r.hset('file:si:', l, line_id)