In [1]:
from utils import helper_functions

In [1]:
# this is an example for cortex release 0.21 and may not deploy correctly on other releases of cortex
import os
import shutil
import glob
import math
import numpy as np
import scipy.spatial
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
from collections import OrderedDict 
from itertools import islice
import json

import boto3

import redis

from utils import helper_functions, redis_cache_mechanisms


class PythonPredictor:
    """Semantic-search predictor with an on-disk corpus cache and a Redis response cache.

    The constructor loads the sentence-embedding model, creates the S3 and
    Redis clients and (re)creates the local working directory ``self.dir``.
    ``predict`` answers a query against the text file identified by a uuid,
    either by ranking the file's lines against the query or by returning a
    previously cached response from Redis.
    """

    def __init__(self):
        """Load the model, connect to S3 and Redis, and reset the local cache directory.

        Raises:
            SystemExit: if the S3 client cannot be created or Redis does not
                answer a ping.
        """
        # information retrieval model trained on the MS-MARCO dataset, loaded from a local copy
        # (to download it instead: SentenceTransformer('distilroberta-base-msmarco-v2'))
        self.embedder = SentenceTransformer('./models/distilroberta-base-msmarco-v2')

        # connection settings
        self.redis_host = '127.0.0.1'
        self.redis_port = 6379

        self.aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
        self.aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')

        # establish connection with s3 bucket
        try:
            self.s3 = boto3.client(
                's3',
                aws_access_key_id=self.aws_access_key_id,
                aws_secret_access_key=self.aws_secret_access_key,
            )
            print('Connected to s3 bucket!')
        except Exception as ex:
            print('\n\naws client error:', ex)
            # Raise SystemExit directly rather than calling the interactive
            # `exit` helper, which is not guaranteed to exist (or to raise)
            # in every runtime.
            raise SystemExit('Failed to connect to s3 bucket, terminating.') from ex

        # establish connection to redis server to be used as data store persistence
        try:
            self.r = redis.StrictRedis(host=self.redis_host, port=self.redis_port, decode_responses=True)
            self.r.ping()
            print('Connected to redis cache!')
        except Exception as ex:
            print('\n\nredis client error:', ex)
            raise SystemExit('Failed to connect to redis, terminating.') from ex

        # local working directory for downloaded files; wiped on every start
        self.dir = 'tmp'
        if os.path.exists(self.dir):
            shutil.rmtree(self.dir)
        os.makedirs(self.dir)

    def predict(self, payload):
        """Answer a query against an uploaded file.

        Args:
            payload: mapping with the keys
                ``"uuid"`` (identifier linking to the uploaded file's text,
                encodings and top words), ``"text"`` (the query),
                ``"top"`` (number of results to return) and
                ``"accuracyGreaterThan"`` (forwarded unchanged to
                ``helper_functions.cluster``).

        Returns:
            When the request has to be computed: an ``OrderedDict`` holding
            the first ``payload["top"]`` items of the computed response.
            Otherwise: whatever
            ``redis_cache_mechanisms.get_cache_data_from_redis`` returns.

        Raises:
            KeyError: if one of the payload keys above is missing.
        """
        # sess stores a file's uuid
        sess = payload["uuid"]
        query = payload["text"]
        requested_top = payload["top"]
        acc_greater_than = payload["accuracyGreaterThan"]

        needs_compute = redis_cache_mechanisms.check_if_request_to_be_cached(self, sess, query, requested_top)

        if not needs_compute:
            # return from redis cache!
            print('file available in redis cache! 😇')
            return redis_cache_mechanisms.get_cache_data_from_redis(self, sess, query, requested_top)

        # as caching has to be done we request 50 more lines and cache them,
        # however we return the exact requested amount of lines to the client
        max_results = requested_top + 50

        # Check the working directory this instance actually created
        # (self.dir) with an exact path test; globbing a hard-coded 'tmp/'
        # would ignore self.dir and treat glob metacharacters in the uuid
        # as patterns.
        if not os.path.isdir(os.path.join(self.dir, sess)):
            # nothing on disk for this file yet: fetch text and embeddings from s3
            helper_functions.download_text_file_and_embeddings_from_s3_bucket(self, sess)
        else:
            # accessing already downloaded encodings and files from disk
            print('😉 got you\'ve covered, model alread encoded 🤘')

        corpus, corpus_embeddings = helper_functions.load_text_file_and_embeddings(self, sess)

        queries = [str(query)]
        query_embeddings = self.embedder.encode(queries)

        queries_and_embeddings = (queries, query_embeddings)
        corpus_and_embeddings = (corpus, corpus_embeddings)

        response = helper_functions.cluster(
            self, corpus_and_embeddings, queries_and_embeddings, max_results, acc_greater_than
        )

        # ============================= Redis cache layer =============================
        # the full (requested + 50) response is stored in the redis cache
        redis_cache_mechanisms.cache_response_to_redis(self, sess, query, response)
        # =============================================================================

        # trim to the amount the client asked for
        return OrderedDict(islice(response.items(), 0, requested_top))



In [2]:
# Instantiate the predictor: loads the embedding model, creates the S3 and
# Redis clients, and recreates the local 'tmp' working directory.
pr = PythonPredictor()

Connected to s3 bucket!
Connected to redis cache!


In [9]:

# Candidate file uuids to query (presumably previously uploaded files — confirm):
# 0eb1f558-b4e0-4118-80f1-c7b3e1741fca
# 3dc5ea3d-d5e1-4946-91ed-be0d63af8a12
# c514e5d4-e0c9-4c6a-a35a-d3ad706c419b

# Request payload with the four keys PythonPredictor.predict reads:
# file uuid, query text, number of results to return, and the
# "accuracyGreaterThan" value forwarded to helper_functions.cluster.
payload = {
    "uuid": "3dc5ea3d-d5e1-4946-91ed-be0d63af8a12", 
    "text": "sympathy",
    "top": 60,
    "accuracyGreaterThan": 0.2
}


In [10]:
# Run the query; per the output below this call computes the response and caches it.
resp = pr.predict(payload)

in check for cache or not
😉 got you've covered, model alread encoded 🤘
text files for 3dc5ea3d-d5e1-4946-91ed-be0d63af8a12 loaded succesfully
"Piety," like "compassion," was an almost superstitious invocation (Score: 0.4356)
Chivalry Yields to Citizenship          Chivalry faded away, to be replaced by citizenship, when megapolitical conditions changed and the military purpose of the vow to one's lord was antiquated (Score: 0.3801)
"53 Piety and Compassion The piety that rationalized the saturation of society by organized religion in the late Middle Ages served the same purpose as the "compassion" that is meant to justify the political domination of life today (Score: 0.3756)
Both chivalry and citizenship added an extra dimension to the simple calculus that would otherwise deter unindoctrinated human beings from going onto a battlefield and staying there when the going got rough (Score: 0.3699)
(Sacrifice that primarily benefits others may actually harm the inclusive fitness of the sel

response and query cached 🌻


In [7]:
# Display the response returned by predict().
resp

OrderedDict([('Never has there been so great a symbolic triumph of efficiency over power',
              '0.5449'),
             ('Mighty as they are, the power they retain is the power to obliterate, not to command',
              '0.4837'),
             ('This determines the importance of magnitude of firepower versus efficiency in employing resources',
              '0.4654'),
             ('209   great bargaining power', '0.4641'),
             ('Rather, they are artifacts of past and ongoing efforts to project power',
              '0.4592')])

In [36]:
# original response
# NOTE(review): the output below is a JSON string, not the OrderedDict shown by
# the other `resp` cell — presumably produced by an earlier version of predict(); confirm.
resp

'{"Never has there been so great a symbolic triumph of efficiency over power": 0.5449, "Mighty as they are, the power they retain is the power to obliterate, not to command": 0.4837, "This determines the importance of magnitude of firepower versus efficiency in employing resources": 0.4654, "209   great bargaining power": 0.4641, "Rather, they are artifacts of past and ongoing efforts to project power": 0.4592, "Another important factor that weighs in the    balance in determining whether ultimate power is exercised locally or from a    distance is the scale of the predominant enterprises in which people gain their    livelihoods": 0.459, "Power will once again be exercised on a small scale": 0.431, "They had no choice but to recognize the balance of raw power as they found it": 0.4261, "The United States is the world\'s leading technological power": 0.4238, "The Paradoxes of Power          The use of violence to protect against violence is fraught with paradoxes": 0.4152, "Not only is

In [14]:
# Repeat the same request; the output below shows it being served from the Redis cache.
resp = pr.predict(payload)

in check for cachce or not
file available in redis cache! 😇


In [5]:
# Snapshot the installed package versions into req.txt.
!pip freeze > req.txt

In [14]:
# Connection settings for a standalone Redis client used by the scratch cells below
# (independent of the client held by PythonPredictor).
import redis
redis_host = "localhost"
redis_port = 6379

In [15]:
# decode_responses=True makes replies come back as str instead of bytes.
r = redis.StrictRedis(host=redis_host, port=redis_port, decode_responses=True)

In [13]:
# SADD returns the number of members newly added; the 0 below means 'hello'
# was already in the set 'uuid:1:query'.
r.sadd('uuid:1:query', 'hello')

0

In [14]:
# Read the whole text file for this uuid into memory as ONE string
# (despite the name, `file_list` only becomes a list after
# payload_text_preprocess is applied to it).
with open('tmp/3dc5ea3d-d5e1-4946-91ed-be0d63af8a12/text_content.txt', 'r') as text_file:
    file_list = text_file.read()

In [15]:
def payload_text_preprocess(text, min_len=50):
    """Split raw text into sentence-like chunks and drop short fragments.

    Newlines are replaced by spaces, the text is split on '.', and only
    chunks whose length is at least ``min_len`` characters are kept.

    Args:
        text: the raw text as a single string.
        min_len: minimum chunk length (in characters) to keep. Defaults to
            50, the threshold this function always used.

    Returns:
        list[str]: the retained chunks, in their original order (without the
        '.' separators; surrounding whitespace is not stripped).
    """
    flattened = text.replace('\n', ' ')
    return [chunk for chunk in flattened.split('.') if len(chunk) >= min_len]


In [16]:
# Turn the raw text (str) into a list of lines, rebinding the same name.
# NOTE: this cell is not idempotent — running it a second time passes a list to
# payload_text_preprocess, which calls .replace() on it and raises AttributeError.
file_list = payload_text_preprocess(file_list)

In [19]:
# Mean number of characters per retained line.
avg_char_len_of_lines_in_book = sum(len(line) for line in file_list) / len(file_list)
print(avg_char_len_of_lines_in_book)

144.5735827171234


In [17]:
# Increment the ':query_ids' field of the 'unq_ids' hash by 1 and keep the new value
# (presumably a counter used to hand out unique query ids — confirm).
a = r.hincrby('unq_ids', ':query_ids', 1)

In [18]:
# Display the counter value returned by HINCRBY.
a

2

In [26]:
import hashlib
import sys

def hash_it(string):
    """Return the SHA-1 digest of ``string`` (UTF-8 encoded) as 40 hex characters."""
    digest = hashlib.sha1()
    digest.update(string.encode('utf-8'))
    return digest.hexdigest()

# Hash a sample line, then show the digest, its length, its 2-char prefix
# and the remainder after that prefix (one value per output line).
a = hash_it(' You could have pocketed anaverage real return of more than 30 percent annually in U')
print(a, len(a), a[:2], a[2:], sep='\n')

691eeed7504e4832bf76f1b49603edd816428804
40
69
1eeed7504e4832bf76f1b49603edd816428804


In [29]:
# Store the text under field 'content' of hash ':query_id:1'; HSET returns the
# number of fields newly created (1 below).
r.hset(':query_id:1', 'content', 'You could have pocketed anaverage r')

1

In [45]:
# Add three members to the sorted set ':query_id:' using a {member: score} mapping;
# ZADD returns how many members were newly added (3 below).
r.zadd(':query_id:', {'line': 4, 'score':5, 'sco': 2} )

3

In [46]:
# Read members back ordered by descending score (ranks 0..4); the output lists
# 'score' (5), 'line' (4), 'sco' (2).
r.zrevrange(':query_id:', 0 ,4)

['score', 'line', 'sco']

In [None]:
%time

maxi=0
maxline=''
for l in file_list:
    if len(l) > maxi:
        maxi=len(l)
        maxline=l
    line_id = r.hincrby('ids', 'si:lines', 1)
    hash_line = hash_it(l)
    r.hset('file:si:'+hash_line[:2], hash_line[3::], line_id)

In [None]:
%time
maxi=0
maxline=''
for l in file_list:
    if len(l) > maxi:
        maxi=len(l)
        maxline=l
    line_id = r.hincrby('ids', 'si:lines', 1)
    hash_line = hash_it(l)
    r.hset('file:si:', l, line_id)