In [45]:
import sys
import orjson
import requests
import urllib3
import os
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

headers = {
    "Content-Type": "text/plain",
}

class Classifier:
    def __init__(self, endpoint, data, column_to_classify):
        self._endpoint = endpoint
        self._data = data
        self._columns_to_classify = column_to_classify

    def classify_description(self):
        for id_row, row in enumerate(self._data["candidates"]):
            for columnn_to_classify in self._columns_to_classify:
                target = columnn_to_classify["target"]
                columns = columnn_to_classify["columns"]
                id_col = self._data["header"].index(target)
                candidates = row[id_col]
                if len(candidates) == 0:
                    continue
                candidate = candidates[0]        
                key = " ".join([candidate["id"]]+columns)
                categories = cache.get(key, None)
                     
                if categories is None:
                    if columns == ["name", "description"]:  
                        text = candidate["name"] + " " + candidate["description"]
                    else:
                        indexes = [self._data["header"].index(col) for col in columns]
                        text = " ".join([self._data["rows"][id_row]["data"][index] for index in indexes])
                    text = text.encode('utf-8')
                    categories = self._get_categories(text)["iptc_categories"]
                    #cache[key] = categories     
                candidate[" ".join(columns)] = categories

    def _get_categories(self, data):
        response = requests.post(self._endpoint, headers=headers, data=data, verify=False)
        result = {"iptc_categories": [], "geo_categories": []}
        if response.status_code == 200:
            #print("Request was successful")
            #print("Response JSON:")
            #print(response.json())
            result = response.json()
            result = {"iptc_categories":result["iptc_categories"], "geo_categories":result["geo_categories"]}
        else:
            print(f"Failed to retrieve data. HTTP Status code: {response.status_code}")
        return result
    
    

start = time.time()   
print("Start classifier")

filename_path = sys.argv[1]
# Reading
with open("data/output5.json", "rb") as f:
    input_data = orjson.loads(f.read())

with(open("s7_expert_ai_api/cache.json", "rb")) as f:
    cache = orjson.loads(f.read())
cache = {}
CLASSIFIER_ENDPOINT = os.environ["CLASSIFIER_ENDPOINT"]

try:
    classifier = Classifier(CLASSIFIER_ENDPOINT, input_data, input_data["services"]["classifier"])
    classifier.classify_description()
except Exception as e:
    print("Error with classifier, details:", str(e))

print("End classifier")

# Writing
with open("/tmp/output.json", "wb") as f:
    f.write(orjson.dumps(input_data, option=orjson.OPT_INDENT_2))

print("End writing")
print(time.time()-start)

Start classifier
End classifier
End writing
56.789581298828125


In [95]:
import sys
import orjson
import requests
import urllib3
import os
import aiohttp
import ssl
import asyncio
import time
import httpx
import traceback 

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

headers = {
    "Content-Type": "text/plain",
}

class Classifier:
    def __init__(self, endpoint, data, columns_to_classify):
        self._endpoint = endpoint
        self._data = data
        self._columns_to_classify = columns_to_classify
        self._lock = asyncio.Lock()  # Initialize a new lock

    async def classify_description(self):
        tasks = []
        for id_row, row in enumerate(self._data["candidates"]):
            for columnn_to_classify in self._columns_to_classify:
                target = columnn_to_classify["target"]
                columns = columnn_to_classify["columns"]
                id_col = self._data["header"].index(target)
                candidates = row[id_col]
                if len(candidates) == 0:
                    continue
                candidate = candidates[0]        
                key = " ".join([candidate["id"]]+columns)
               
                if columns == ["name", "description"]:  
                    text = candidate["name"] + " " + candidate["description"]
                else:
                    indexes = [self._data["header"].index(col) for col in columns]
                    text = " ".join([self._data["rows"][id_row]["data"][index] for index in indexes])
                text = text.encode('utf-8')
                tasks.append((text, candidate, columns))
                
        # Check if tasks list is empty, and skip running requests if it is
        if not tasks:
            return []
        print(len(tasks))
        responses = await self.run_all_requests(tasks)
        return responses

    async def send_request(self, session, candidate, columns, data):
        key = " ".join([candidate["id"]] + columns + [self._generate_short_fingerprint(str(data))])
        categories = cache.get(key)
        if categories is None:
            async with session.post(self._endpoint, headers=headers, data=data, ssl=False) as response:
                response_json = await response.json()
                # Update the cache while the lock is held
                async with self._lock:  # Acquire the lock before accessing the cache
                    cache[key] = response_json["iptc_categories"]
            candidate[" ".join(columns)] = self._cache[key]
        else:
            candidate[" ".join(columns)] = categories

    async def run_all_requests(self, tasks_data):
        async with aiohttp.ClientSession() as session:
            tasks = [self.send_request(session, candidate, columns, text) for text, candidate, columns in tasks_data]
            responses = await asyncio.gather(*tasks, return_exceptions=True)
            return responses


    def _generate_short_fingerprint(self, text, length=8):
        # We're using MD5 here for simplicity. For more collision resistance you can use SHA256
        hash_object = hashlib.md5(text.encode('utf-8'))
        # Truncate the hash to the desired length
        return hash_object.hexdigest()[:length]
    
start = time.time() 
print("Start classifier")

filename_path = sys.argv[1]
# Reading
with open("data/output5.json", "rb") as f:
    input_data = orjson.loads(f.read())

with(open("./s7_expert_ai_api/cache.json", "rb")) as f:
    cache = orjson.loads(f.read())
cache = {}
CLASSIFIER_ENDPOINT = os.environ["CLASSIFIER_ENDPOINT"]

try:
    classifier = Classifier(CLASSIFIER_ENDPOINT, input_data, input_data["services"]["classifier"])
    responses = await classifier.classify_description()
except Exception as e:
    print("Error with classifier, details:", str(e), traceback.print_exc())

print("End classifier")

# Writing
with open("/tmp/output.json", "wb") as f:
    f.write(orjson.dumps(input_data, option=orjson.OPT_INDENT_2))

print("End writing")
print(time.time()-start)

Start classifier
312
End classifier
End writing
11.277249336242676


In [100]:
import metrics as metrics
import os
from lamAPI import LamAPI
import sys
import orjson
import time

cache_obj = {}
cache_lit = {}

class FeaturesExtraction:
    def __init__(self, data, lamAPI):
        self._data = data
        self._lamAPI = lamAPI
    
    
    def compute_features(self):
        rows = self._data["rows"]
        target = self._data["target"]
        for index, row in enumerate(rows):
            cells = row["data"]
            for id_col_ne_subj in target["NE"]:
                for id_col_ne_obj in target["NE"]:
                    if id_col_ne_subj == id_col_ne_obj:
                        continue
                    self._compute_similarity_between_ne_cells(index, id_col_ne_subj, id_col_ne_obj)
                for id_col_lit_obj in target["LIT"]:
                    lit_cell_obj = cells[id_col_lit_obj]
                    self._match_lit_cells(index, id_col_ne_subj, id_col_ne_obj, lit_cell_obj, target["LIT_DATATYPE"][str(id_col_lit_obj)])
        self._extract_features()
        
        
    def _extract_features(self):
        features = [[] for id_col in range(len(self._data["metadata"]["column"]))]
        for row in self._data["candidates"]:
            for id_col, candidates in enumerate(row):
                for candidate in candidates:
                    features[id_col].append(list(candidate["features"].values()))
        self._data["features"] = features
    
    def _compute_similarity_between_ne_cells(self, id_row, id_col_subj_cell, id_col_obj_cell):
        subj_candidates = self._data["candidates"][id_row][id_col_subj_cell]
        obj_candidates = self._data["candidates"][id_row][id_col_obj_cell]
        subj_id_candidates = [candidate["id"] for candidate in subj_candidates if candidate["id"] not in cache_obj]
        obj_id_candidates = [candidate["id"] for candidate in obj_candidates]
        
        if len(subj_id_candidates) > 0:
            prin(len(subj_id_candidates))
            subjects_objects = self._lamAPI.objects(subj_id_candidates)

        object_rel_score_buffer = {}

        for subj_candidate in subj_candidates:
            id_subject = subj_candidate["id"]
            #subj_candidate_objects = subjects_objects.get(id_subject, {}).get("objects", {})
            #cache_obj[id_subject] = subj_candidate_objects
            if id_subject not in cache_obj:
                subj_candidate_objects = subjects_objects.get(id_subject, {}).get("objects", {})
            else:    
                subj_candidate_objects = cache_obj.get(id_subject, {})
            objects_set = set(subj_candidate_objects.keys())
            #subj_candidate["matches"][str(id_col_obj_cell)] = []
            #subj_candidate["pred"][str(id_col_obj_cell)] = {}
              
            objects_itersection = objects_set.intersection(set(obj_id_candidates))
            #print(objects_itersection)
            obj_score_max = 0
            for obj_candidate in obj_candidates:
                id_object = obj_candidate["id"]  
                if id_object not in objects_itersection:
                    continue
                              
                score = obj_candidate["features"]["ed_score"]
                if score > obj_score_max:
                    obj_score_max = score
                   
                if id_object not in object_rel_score_buffer:
                    object_rel_score_buffer[id_object] = 0
                score_rel = subj_candidate["features"]["ed_score"]
                if score_rel > object_rel_score_buffer[id_object]:
                    object_rel_score_buffer[id_object] = score_rel
                for predicate in subj_candidate_objects[id_object]:
                    subj_candidate["matches"][str(id_col_obj_cell)].append({
                        "p": predicate,
                        "o": id_object,
                        "s": round(score, 3)
                    })
                    subj_candidate["predicates"][str(id_col_obj_cell)][predicate] = score
            subj_candidate["features"]["p_subj_ne"] += obj_score_max          
        
        for obj_candidate in obj_candidates:
            id_object = obj_candidate["id"]  
            if id_object not in object_rel_score_buffer:
                continue
            obj_candidate["features"]["p_obj_ne"] += object_rel_score_buffer[id_object]    
        
      
    def _match_lit_cells(self, id_row, id_col_subj_cell, id_col_obj_col, obj_cell, obj_cell_datatype):
    
        def get_score_based_on_datatype(valueInCell, valueFromKG, datatype):
            score = 0
            if datatype == "NUMBER":
                score = metrics.compute_similarty_between_numbers(valueInCell, valueFromKG.lower())
            elif datatype == "DATETIME":
                score = metrics.compute_similarity_between_dates(valueInCell, valueFromKG.lower())
            elif datatype == "STRING":
                score = metrics.compute_similarity_between_string(valueInCell, valueFromKg.lower())
            return score
        
        subj_candidates = self._data["candidates"][id_row][id_col_subj_cell]
        subj_id_candidates = [candidate["id"] for candidate in subj_candidates if candidate["id"] not in cache_lit]
        if len(subj_id_candidates) > 0:
            cand_lamapi_literals = self._lamAPI.literals(subj_id_candidates)
            if len(cand_lamapi_literals) == 0:
                return
        
        datatype = obj_cell_datatype
        
        for subj_candidate in subj_candidates:
            id_subject = subj_candidate["id"]
            #literals = cand_lamapi_literals[id_subject]
            if id_subject not in cache_lit:
                prin("NOT cache!")
                literals = cand_lamapi_literals.get(id_subject, {})
            else:   
                literals = cache_lit.get(id_subject, {})
            if "literals" in literals:
                literals = literals['literals']    
            #cache_lit[id_subject] = literals    
            if len(literals[datatype]) == 0:
                continue
            #subj_candidate["matches"][str(id_col_obj_col)] = []
            #subj_candidate["pred"][str(id_col_obj_col)] = {}
            #subj_cell.candidates_entities()[subject]["match_count"]["lit"] += 1
            max_score = 0
            for predicate in literals[datatype]:
                for valueFromKg in literals[datatype][predicate]:
                    score = get_score_based_on_datatype(obj_cell, valueFromKg, datatype)
                    score = round(score, 3)
                    if score > 0:
                        subj_candidate["matches"][str(id_col_obj_col)].append({
                            "p": predicate,
                            "o": valueFromKg,
                            "s": round(score, 3)
                        })  
                        if score > max_score:
                            max_score = score
                        if predicate not in subj_candidate["predicates"][str(id_col_obj_col)]:
                            subj_candidate["predicates"][str(id_col_obj_col)][predicate] = 0
                        if score > subj_candidate["predicates"][str(id_col_obj_col)][predicate]:
                            subj_candidate["predicates"][str(id_col_obj_col)][predicate] = score    
                            
            subj_candidate["features"]["p_subj_lit"] += max_score
            subj_candidate["features"]["p_subj_lit"] = round(subj_candidate["features"]["p_subj_lit"], 3)


start = time.time()
print("Start features extraction")

LAMAPI_HOST, LAMAPI_PORT = os.environ["LAMAPI_ENDPOINT"].split(":")
LAMAPI_TOKEN = os.environ["LAMAPI_TOKEN"]
lamAPI = LamAPI(LAMAPI_HOST, LAMAPI_PORT, LAMAPI_TOKEN)
filename_path = sys.argv[1]

# Reading
with open("data/output1.json", "rb") as f:
    input_data = orjson.loads(f.read())

with(open("./s3_features_extraction/cache_obj.json", "rb")) as f:
    cache_obj = orjson.loads(f.read())

with(open("./s3_features_extraction/cache_lit.json", "rb")) as f:
    cache_lit = orjson.loads(f.read())

FeaturesExtraction(input_data, lamAPI).compute_features()

print("Finish features extraction")

# Writing
with open("/tmp/output.json", "wb") as f:
    f.write(orjson.dumps(input_data, option=orjson.OPT_INDENT_2))

print("Finish writing")
print(time.time()-start)

Start features extraction
Finish features extraction
Finish writing
6.943553447723389


In [96]:
len(cache)

110

In [88]:
with open("cache.json", "wb") as f:
    f.write(orjson.dumps(cache, option=orjson.OPT_INDENT_2))

In [84]:
import hashlib

def generate_short_fingerprint(text, length=8):
    # We're using MD5 here for simplicity. For more collision resistance you can use SHA256
    hash_object = hashlib.md5(text.encode('utf-8'))
    # Truncate the hash to the desired length
    return hash_object.hexdigest()[:length]

" ".join([generate_short_fingerprint("batman begins")])

'7cd540d1'

In [None]:
input_data["candidates"][0][0]

In [55]:
import sys
import orjson
import requests
import urllib3
import os
import aiohttp
import ssl
import asyncio
import time
import httpx
import traceback 

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

headers = {
    "Content-Type": "text/plain",
}

class Classifier:
    def __init__(self, endpoint, data, columns_to_classify):
        self._endpoint = endpoint
        self._data = data
        self._columns_to_classify = columns_to_classify

    async def classify_description(self):
        tasks = []
        for id_row, row in enumerate(self._data["candidates"]):
            for columnn_to_classify in self._columns_to_classify:
                target = columnn_to_classify["target"]
                columns = columnn_to_classify["columns"]
                id_col = self._data["header"].index(target)
                candidates = row[id_col]
                if len(candidates) == 0:
                    continue
                candidate = candidates[0]        
                key = " ".join([candidate["id"]]+columns)
                categories = cache.get(key, None)
                     
                if categories is None:
                    if columns == ["name", "description"]:  
                        text = candidate["name"] + " " + candidate["description"]
                    else:
                        indexes = [self._data["header"].index(col) for col in columns]
                        text = " ".join([self._data["rows"][id_row]["data"][index] for index in indexes])
                    text = text.encode('utf-8')
                    tasks.append(asyncio.create_task(self._compute_categories(text, candidate, columns)))
                else:
                    candidate[" ".join(columns)] = categories
                    
        responses = await asyncio.gather(*tasks)

                    
    async def _compute_categories(self, data, candidate, columns): 
        async with aiohttp.ClientSession() as session:
            try:
                async with session.post(self._endpoint, headers=headers, data=data, ssl=False) as response:
                    key = " ".join([candidate["id"]]+columns)
                    categories = cache.get(key)
                    if categories is None:
                        response_json = await response.json()
                        cache[" ".join([candidate["id"]]+columns)] = response_json["iptc_categories"]
                        candidate[" ".join(columns)] = response_json["iptc_categories"]
                    else:
                        candidate[" ".join(columns)] = categories
                    return    
            except Exception as e:
                print(f"Request failed: {e}")
                return None
    
    

start = time.time() 
print("Start classifier")

filename_path = sys.argv[1]
# Reading
with open("data/output5.json", "rb") as f:
    input_data = orjson.loads(f.read())

with(open("./s7_expert_ai_api/cache.json", "rb")) as f:
    cache = orjson.loads(f.read())
cache = {}
CLASSIFIER_ENDPOINT = os.environ["CLASSIFIER_ENDPOINT"]

try:
    classifier = Classifier(CLASSIFIER_ENDPOINT, input_data, input_data["services"]["classifier"])
    await classifier.classify_description()
except Exception as e:
    print("Error with classifier, details:", str(e), traceback.print_exc())

print("End classifier")

# Writing
with open("/tmp/output.json", "wb") as f:
    f.write(orjson.dumps(input_data, option=orjson.OPT_INDENT_2))

print("End writing")
print(time.time()-start)

Start classifier
End classifier
End writing
19.25173568725586


In [12]:
import os
from lamAPI3 import LamAPI
import sys
import json 
import metrics as metrics
import utils as utils
import time 
import asyncio
import traceback 

cache = {}
class Lookup:
    def __init__(self, data:object, lamAPI):
        self._header = data.get("header", [])
        self._table_name = data["name"]
        self._target = data["target"]
        self._kg_ref = data["kg_reference"]
        self._limit = data["limit"]
        self._lamAPI = lamAPI
        self._rows = []
        self._cache = {}

        """
        for row in data["rows"]:
            row = self._build_row(row["data"])
            self._rows.append(row)
        """
        
    async def test(self, data):
        tasks = []
        for row in data["rows"]:
            tasks.append(asyncio.create_task(self._build_row(row["data"])))

        # Use asyncio.gather() to run _build_row() concurrently
        responses = await asyncio.gather(*tasks)

        # Collect the results into self._rows
        for response in responses:
            self._rows.append(response)

            
    async def _build_row(self, cells):
        row_candidates = []
        features = ["ntoken", "popularity", "pos_score", "es_score", "es_diff_score", 
                    "ed_score", "jaccard_score", "jaccardNgram_score", "cosine_similarity",
                    "p_subj_ne", "p_subj_lit", "p_obj_ne", "desc", "descNgram", 
                    "cpa", "cpaMax", "cta", "ctaMax", "rho", "diff"]
        row_content_norm = utils.clean_str(" ".join(cells))
        for i, cell in enumerate(cells):
            new_candidites = []
            if i in self._target["NE"]:
                if cell not in self._cache:
                    candidates = await self._get_candidates(cell)
                    self._cache[cell] = candidates
                else:
                    candidates = self._cache[cell]
                #print("len", len(candidates))
                #candidates = cache.get(cell, [])
                #print(cell)
                #cache[cell] = candidates
                for candidate in candidates:
                    item = {
                        "id": candidate["id"],
                        "name": candidate["name"],
                        "description": candidate["description"],
                        "types": candidate["types"],
                        "features": {feature:candidate.get(feature, 0) for feature in features},
                        "matches": {str(id_col):[] for id_col in range(len(cells))},
                        "predicates": {str(id_col):{} for id_col in range(len(cells))}
                    }
                    new_candidites.append(item)
                    desc_norm = utils.clean_str(candidate["description"])
                    desc_score = round(metrics.compute_similarity_between_string(desc_norm, row_content_norm), 3)
                    desc_score_ngram = round(metrics.compute_similarity_between_string(desc_norm, row_content_norm, 3), 3)
                    item["features"]["desc"] = desc_score
                    item["features"]["descNgram"] = desc_score_ngram
            row_candidates.append(new_candidites)
        return row_candidates


    async def _get_candidates(self, cell):
        print("Try lookup for cell:", cell)
        candidates = []
        types = None
        result = None
        try:
            result = await self._lamAPI.lookup(cell)
            #print(len(result))
            #print(result)
            if cell not in result:
                raise Exception("Error from lamAPI")
            candidates = result[cell]        
        except Exception as e:
            print(str(e), traceback.print_exc())
            return []
            
        return candidates


start_time = time.time()

SAMPLE_SIZE = 25
LAMAPI_HOST, LAMAPI_PORT = os.environ["LAMAPI_ENDPOINT"].split(":")
LAMAPI_TOKEN = os.environ["LAMAPI_TOKEN"]
lamAPI = LamAPI(LAMAPI_HOST, LAMAPI_PORT, LAMAPI_TOKEN)
filename_path = sys.argv[1]

with open("./data/output_l.json") as f:
    input = json.loads(f.read())

with(open("./cache.json")) as f:
    cache = json.loads(f.read())
    
p1 = Lookup(input, lamAPI)
await p1.test(input)
input["candidates"] = p1._rows

with open("./output_l.json", "w") as f:
    f.write(json.dumps(input, indent=4))
    
#with(open("./cache.json", "w")) as f:
#    f.write(json.dumps(cache))

print("--- %s seconds ---" % (time.time() - start_time))

Try lookup for cell: allerdale borough council
Try lookup for cell: be first (regeneration) limited
Try lookup for cell: birmingham city council
Try lookup for cell: birmingham city council
Try lookup for cell: birmingham city council
Try lookup for cell: birmingham city council
Try lookup for cell: birmingham city council
Try lookup for cell: blackpool council
Try lookup for cell: bolsover district council
Try lookup for cell: bradford metropolitan district council
Try lookup for cell: city of stoke-on-trent
Try lookup for cell: city of york council
Try lookup for cell: cleeve school & sixth form centre of excellence
Try lookup for cell: cpd - construction division
Try lookup for cell: cpd - construction division
Try lookup for cell: defra network etendering portal
Try lookup for cell: derby city council
Try lookup for cell: derby city council
Try lookup for cell: derby city council
Try lookup for cell: derby city council
Try lookup for cell: derbyshire county council
Try lookup for c

In [24]:
import os
from lamAPI import LamAPI
import sys
import orjson
import metrics as metrics
import utils as utils
import time

class Lookup:
    def __init__(self, data:object, lamAPI):
        self._header = data.get("header", [])
        self._table_name = data["name"]
        self._target = data["target"]
        self._kg_ref = data["kg_reference"]
        self._limit = data["limit"]
        self._lamAPI = lamAPI
        self._setNE = set(self._target["NE"])
        self._rows = []
        for row in data["rows"]:
            row = self._build_row(row["data"])
            self._rows.append(row)


    def _build_row(self, cells):
        row_candidates = []
        features = ["ntoken", "popularity", "pos_score", "es_score", "es_diff_score", 
                    "ed_score", "jaccard_score", "jaccardNgram_score", "cosine_similarity",
                    "p_subj_ne", "p_subj_lit", "p_obj_ne", "desc", "descNgram", 
                    "cpa", "cpaMax", "cta", "ctaMax", "rho", "diff"]
        
        #row_content_norm = utils.clean_str(" ".join(cells))
        cells_to_consider = []
        for i, cell in enumerate(cells):
            if i not in self._target.get("NO_ANN", []):
                cells_to_consider.append(cell)
        row_content_norm = utils.clean_str(" ".join(cells_to_consider))        
        for i, cell in enumerate(cells):
            new_candidites = []
            if i in self._setNE:
                #candidates = self._get_candidates(cell)
                #print("Lookup for cell", cell)
                if cell in cache:
                    candidates = cache.get(cell, [])
                else:
                    candidates = self._get_candidates(cell)    

                for candidate in candidates:
                    item = {
                        "id": candidate["id"],
                        "name": candidate["name"],
                        "description": candidate["description"],
                        "types": candidate["types"],
                        "features": {feature:candidate.get(feature, 0) for feature in features},
                        "matches": {str(id_col):[] for id_col in range(len(cells))},
                        "predicates": {str(id_col):{} for id_col in range(len(cells))}
                    }
                    new_candidites.append(item)
                    desc_norm = utils.clean_str(candidate["description"])
                    desc_score = round(metrics.compute_similarity_between_string(desc_norm, row_content_norm), 3)
                    desc_score_ngram = round(metrics.compute_similarity_between_string(desc_norm, row_content_norm, 3), 3)
                    item["features"]["desc"] = desc_score
                    item["features"]["descNgram"] = desc_score_ngram
            row_candidates.append(new_candidites)
        return row_candidates


    def _get_candidates(self, cell):
        print("Try lookup for cell:", cell)
        candidates = []
        types = None
        result = None
        try:
            result = self._lamAPI.lookup(cell, fuzzy=False, types=types, kg=self._kg_ref, limit=self._limit)
            if cell not in result:
                raise Exception("Error from lamAPI")
            candidates = result[cell]    
        except Exception as e:
            print(str(e))
            return []
            
        return candidates

start_time = time.time()

print("Start lookup")

SAMPLE_SIZE = 25
LAMAPI_HOST, LAMAPI_PORT = os.environ["LAMAPI_ENDPOINT"].split(":")
LAMAPI_TOKEN = os.environ["LAMAPI_TOKEN"]
lamAPI = LamAPI(LAMAPI_HOST, LAMAPI_PORT, LAMAPI_TOKEN)
filename_path = sys.argv[1]

# Reading
with open("./data/output1.json", "rb") as f:
    input_data = orjson.loads(f.read())

with(open("./s2_lookup/cache.json", "rb")) as f:
    cache = orjson.loads(f.read())

p1 = Lookup(input_data, lamAPI)
input_data["candidates"] = p1._rows

print("End lookup")

# Writing
with open("./data/output.json", "wb") as f:
    f.write(orjson.dumps(input_data, option=orjson.OPT_INDENT_2))

print("End writing")
print("--- %s seconds ---" % (time.time() - start_time))

Start lookup
End lookup
End writing
--- 4.374894380569458 seconds ---


In [None]:
import pandas as pd
pd.read_csv("s1_pre_processing/SN_latest.csv")

In [None]:
import os
from lamAPI3 import LamAPI
SAMPLE_SIZE = 25
LAMAPI_HOST, LAMAPI_PORT = os.environ["LAMAPI_ENDPOINT"].split(":")
LAMAPI_TOKEN = os.environ["LAMAPI_TOKEN"]
lamAPI = LamAPI(LAMAPI_HOST, LAMAPI_PORT, LAMAPI_TOKEN)
await lamAPI.lookup("paris")

--- 1125.024539232254 seconds ---


In [None]:
--- 10.024539232254 seconds ---

In [1]:
from keras.models import load_model
import tensorflow as tf
import sys
import json
import time



class Prediction:
    def __init__(self, data, model):
        self._data = data
        self._model = model
        
    def compute_prediction(self, feature_name):
        prediction = []
        indexes = []
        for column_features in self._data["features"]:
            pred = [] 
            if len(column_features) > 0:
                pred = self._model.predict(column_features)
            prediction.append(pred)
            indexes.append(0)
        
        for row in self._data["candidates"]:
            for id_col, candidates in enumerate(row):
                for candidate in candidates:
                    index = indexes[id_col]
                    indexes[id_col] += 1
                    feature = round(float(prediction[id_col][index][1]), 3)
                    if feature_name == "rho2": 
                        candidate[feature_name] = feature
                    else:
                        candidate["features"][feature_name] = feature    
                if feature_name == "rho2":        
                    candidates.sort(key=lambda x:x[feature_name], reverse=True)       
                else:
                    candidates.sort(key=lambda x:x["features"][feature_name], reverse=True)    

start_time = time.time()

filename_path = sys.argv[1]
feature_name = sys.argv[2]

with open("output_feat_ex.json") as f:
    input = json.loads(f.read())
print("The file has been read correctly")

model = load_model("neural_network.h5")
print("The NN has been read correctly")

Prediction(input, model).compute_prediction(feature_name)
print("The NN has been applied correctly")

with open("/tmp/output.json", "w") as f:
    f.write(json.dumps(input, indent=4))
print("The file has been saved correctly")
#print(json.dumps(input), flush=True)
print("--- %s seconds ---" % (time.time() - start_time))

2023-10-20 11:34:36.220231: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-20 11:34:36.240973: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-20 11:34:36.408991: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-20 11:34:36.410143: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


The file has been read correctly
The NN has been read correctly
The NN has been applied correctly
The file has been saved correctly
--- 5.965457439422607 seconds ---


In [20]:
import tensorflow

print(tensorflow.__version__)


2.13.0


In [3]:
tf.config.threading.get_inter_op_parallelism_threads(), tf.config.threading.get_intra_op_parallelism_threads

(0,
 <function tensorflow.python.framework.config.get_intra_op_parallelism_threads()>)

In [None]:
import sys
import orjson
import requests
import urllib3
import os
import time

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

headers = {
    "Content-Type": "text/plain",
}

class Classifier:
    def __init__(self, endpoint, data):
        self._endpoint = endpoint
        self._data = data

    def classify_description(self):
        for row in self._data["candidates"]:
            for candidates in row:
                if len(candidates) > 0:
                    candidate = candidates[0]
                    if candidate["id"] not in cache:
                        temp = candidate["name"] + " " + candidate["description"]
                        temp = temp.encode('utf-8')
                        categories = self._get_categories(temp)["iptc_categories"]
                        cache[candidate["id"]] = categories
                    else:
                        categories = cache.get(candidate["id"], [])
                    candidate["categories"] = categories
                    #candidate["iptc_categories"] = categories["iptc_categories"]
                    #candidate["geo_categories"] = categories["geo_categories"]

    def _get_categories(self, data):
        response = requests.post(self._endpoint, headers=headers, data=data, verify=False)
        result = {"iptc_categories": [], "geo_categories": []}
        if response.status_code == 200:
            print("Request was successful")
            print("Response JSON:")
            print(response.json())
            result = response.json()
            result = {"iptc_categories":result["iptc_categories"], "geo_categories":result["geo_categories"]}
        else:
            print(f"Failed to retrieve data. HTTP Status code: {response.status_code}")
        return result
    
    

start_time = time.time()
print("Start classifier")

filename_path = sys.argv[1]
# Reading
with open("output_pred2.json", "rb") as f:
    input_data = orjson.loads(f.read())

with(open("./s7_expert_ai_api/cache.json", "rb")) as f:
    cache = orjson.loads(f.read())

CLASSIFIER_ENDPOINT = os.environ["CLASSIFIER_ENDPOINT"]

try:
    classifier = Classifier(CLASSIFIER_ENDPOINT, input_data)
    classifier.classify_description()
except Exception as e:
    print("Error with classifier, details:", str(e))

print("End classifier")

# Writing
with open("/tmp/output.json", "wb") as f:
    f.write(orjson.dumps(input_data, option=orjson.OPT_INDENT_2))

print("End writing")
print("--- %s seconds ---" % (time.time() - start_time))

In [2]:
import orjson
with(open("./s7_expert_ai_api/cache.json", "rb")) as f:
    cache = orjson.loads(f.read())
len(cache)

165

In [11]:
pip install aiohttp

Note: you may need to restart the kernel to use updated packages.


In [5]:
import json
with open("./s7_expert_ai_api/cache.json", "w") as f:
    f.write(json.dumps(cache, indent=4))