In [11]:
import os
import numpy as np
import pandas as pd
import math
import time
from lamAPI import LamAPI
from s1_pre_processing.data_preparation import DataPreparation


def clean_str(value):
    value = str(value)
    stop_charaters = ["_"]
    for char in stop_charaters:
        value = value.replace(char, " ")
    value = " ".join(value.split()).lower()
    return value

def format_table(table_df):
    rows = []
    for id_row, row in enumerate(table_df):
        rows.append({"idRow":id_row+1, "data":[clean_str(cell) for cell in row]})
  
    return rows


LAMAPI_HOST, LAMAPI_PORT = os.environ["LAMAPI_ENDPOINT"].split(":")
LAMAPI_TOKEN = os.environ["LAMAPI_TOKEN"]
lamAPI = LamAPI(LAMAPI_HOST, LAMAPI_PORT, LAMAPI_TOKEN)
time = time
name = "film.csv"
input_file = pd.read_csv(name)
rows = format_table(input_file.values.tolist())
header = list(input_file.columns)
kg_reference = "wikidata"
column_metadata = {}
target = None
dp = DataPreparation(rows, lamAPI)

output = {
    "name": name,
    "header": header,
    "rows": rows,
    "metadata": None,
    "target": None,
    "kg_reference": kg_reference,
    "limit": 100,
    "status": "DONE", 
    "time": time.time()
}

if len(column_metadata) == 0:
    column_metadata, target = dp.compute_datatype()
    column_metadata[str(target["SUBJ"])] = "SUBJ"
    #output["column"] = column_metadata
    output["metadata"] = {
        "column": [{"idColumn": int(id_col), "tag": column_metadata[id_col]} for id_col in column_metadata]
    }
    output["target"] = target
dp.rows_normalization()     

In [None]:
output

In [12]:
import os
import numpy as np
import pandas as pd
import math
import time
from lamAPI import LamAPI
from s2_process.lookup import Lookup

SAMPLE_SIZE = 25
LAMAPI_HOST, LAMAPI_PORT = os.environ["LAMAPI_ENDPOINT"].split(":")
LAMAPI_TOKEN = os.environ["LAMAPI_TOKEN"]
lamAPI = LamAPI(LAMAPI_HOST, LAMAPI_PORT, LAMAPI_TOKEN)
p1 = Lookup(output, lamAPI)
output["candidates"] = p1._rows

In [51]:
len(output["candidates"][3][1])

65

In [15]:
"Q5145625" in temp["wikidata"]["Q3512046"]["objects"]

True

In [4]:
import metrics as metrics


class FeaturesExtraction:
    def __init__(self, data, lamAPI):
        self._data = data
        self._lamAPI = lamAPI
    
    
    def compute_features(self):
        rows = self._data["rows"]
        target = self._data["target"]
        for index, row in enumerate(rows):
            cells = row["data"]
            for id_col_ne_subj in target["NE"]:
                ne_cell_subj = cells[id_col_ne_subj]
                for id_col_ne_obj in target["NE"]:
                    if id_col_ne_subj == id_col_ne_obj:
                        continue
                    ne_cell_obj = cells[id_col_ne_obj]
                    self._compute_similarity_between_ne_cells(index, id_col_ne_subj, id_col_ne_obj)
                for id_col_lit_obj in target["LIT"]:
                    lit_cell_obj = cells[id_col_lit_obj]
                    self._match_lit_cells(index, id_col_ne_subj, id_col_ne_obj, lit_cell_obj, target["LIT_DATATYPE"][str(id_col_lit_obj)])
        self._extract_features()
        
        
    def _extract_features(self):
        features = [[] for id_col in range(len(self._data["metadata"]["column"]))]
        for row in self._data["candidates"]:
            for id_col, candidates in enumerate(row):
                for candidate in candidates:
                    features[id_col].append(list(candidate["features"].values()))
        self._data["features"] = features
    
    def _compute_similarity_between_ne_cells(self, id_row, id_col_subj_cell, id_col_obj_cell):
        subj_candidates = self._data["candidates"][id_row][id_col_subj_cell]
        obj_candidates = self._data["candidates"][id_row][id_col_obj_cell]
        subj_id_candidates = [candidate["id"] for candidate in subj_candidates]
        obj_id_candidates = [candidate["id"] for candidate in obj_candidates]
        
        subjects_objects = self._lamAPI.objects(subj_id_candidates)
        object_rel_score_buffer = {}

        for subj_candidate in subj_candidates:
            id_subject = subj_candidate["id"]
            subj_candidate_objects = subjects_objects.get(id_subject, {}).get("objects", {})
            objects_set = set(subj_candidate_objects.keys())
            #subj_candidate["matches"][str(id_col_obj_cell)] = []
            #subj_candidate["pred"][str(id_col_obj_cell)] = {}
              
            objects_itersection = objects_set.intersection(set(obj_id_candidates))
            #print(objects_itersection)
            obj_score_max = 0
            for obj_candidate in obj_candidates:
                id_object = obj_candidate["id"]  
                if id_object not in objects_itersection:
                    continue
                              
                score = obj_candidate["features"]["ed_score"]
                if score > obj_score_max:
                    obj_score_max = score
                   
                if id_object not in object_rel_score_buffer:
                    object_rel_score_buffer[id_object] = 0
                score_rel = subj_candidate["features"]["ed_score"]
                if score_rel > object_rel_score_buffer[id_object]:
                    object_rel_score_buffer[id_object] = score_rel
                for predicate in subj_candidate_objects[id_object]:
                    subj_candidate["matches"][str(id_col_obj_cell)].append({
                        "p": predicate,
                        "o": id_object,
                        "s": round(score, 3)
                    })
                    subj_candidate["predicates"][str(id_col_obj_cell)][predicate] = score
            subj_candidate["features"]["p_subj_ne"] += obj_score_max          
        
        for obj_candidate in obj_candidates:
            id_object = obj_candidate["id"]  
            if id_object not in object_rel_score_buffer:
                continue
            obj_candidate["features"]["p_obj_ne"] += object_rel_score_buffer[id_object]    
        
      
    def _match_lit_cells(self, id_row, id_col_subj_cell, id_col_obj_col, obj_cell, obj_cell_datatype):
    
        def get_score_based_on_datatype(valueInCell, valueFromKG, datatype):
            score = 0
            if datatype == "NUMBER":
                score = metrics.compute_similarty_between_numbers(valueInCell, valueFromKG.lower())
            elif datatype == "DATETIME":
                score = metrics.compute_similarity_between_dates(valueInCell, valueFromKG.lower())
            elif datatype == "STRING":
                score = metrics.compute_similarity_between_string(valueInCell, valueFromKg.lower())
            return score
        
        subj_candidates = self._data["candidates"][id_row][id_col_subj_cell]
        subj_id_candidates = [candidate["id"] for candidate in subj_candidates]
        cand_lamapi_literals = self._lamAPI.literals(subj_id_candidates)
        datatype = obj_cell_datatype
        
        for subj_candidate in subj_candidates:
            id_subject = subj_candidate["id"]
            literals = cand_lamapi_literals[id_subject]
            if "literals" in literals:
                literals = literals['literals']
            if len(literals[datatype.lower()]) == 0:
                continue
            #subj_candidate["matches"][str(id_col_obj_col)] = []
            #subj_candidate["pred"][str(id_col_obj_col)] = {}
            #subj_cell.candidates_entities()[subject]["match_count"]["lit"] += 1
            max_score = 0
            for predicate in literals[datatype.lower()]:
                for valueFromKg in literals[datatype.lower()][predicate]:
                    score = get_score_based_on_datatype(obj_cell, valueFromKg, datatype)
                    score = round(score, 3)
                    if score > 0:
                        subj_candidate["matches"][str(id_col_obj_col)].append({
                            "p": predicate,
                            "o": valueFromKg,
                            "s": round(score, 3)
                        })  
                        if score > max_score:
                            max_score = score
                        if predicate not in subj_candidate["predicates"][str(id_col_obj_col)]:
                            subj_candidate["predicates"][str(id_col_obj_col)][predicate] = 0
                        if score > subj_candidate["predicates"][str(id_col_obj_col)][predicate]:
                            subj_candidate["predicates"][str(id_col_obj_col)][predicate] = score    
                            
            subj_candidate["features"]["p_subj_lit"] += round(max_score, 3) 

In [5]:
FeaturesExtraction(output, lamAPI).compute_features()

In [None]:
output["candidates"][2][1][0:2]

In [None]:
output["features"]

In [None]:
output["candidates"][0][2]

In [None]:
for row in output["candidates"]:
        for id_col, candidates in enumerate(row):

In [6]:
class Prediction:
    def __init__(self, data, model):
        self._data = data
        self._model = model
        
    def compute_prediction(self, feature_name):
        prediction = []
        indexes = []
        for column_features in self._data["features"]:
            pred = [] 
            if len(column_features) > 0:
                pred = self._model.predict(column_features)
            prediction.append(pred)
            indexes.append(0)
        
        for id_row, row in enumerate(self._data["candidates"]):
            for id_col, candidates in enumerate(row):
                for candidate in candidates:
                    index = indexes[id_col]
                    indexes[id_col] += 1
                    feature = round(prediction[id_col][index][1], 3)
                    if feature_name == "score": 
                        candidate[feature_name] = feature
                    else:
                        candidate["features"][feature_name] = feature    
                if feature_name == "score":        
                    candidates.sort(key=lambda x:x[feature_name], reverse=True)       
                else:
                    candidates.sort(key=lambda x:x["features"][feature_name], reverse=True)       

In [None]:
from keras.models import load_model
model = load_model("neural_network.h5")
Prediction(output, model).compute_prediction("cea")

In [8]:
class FeaturesExtractionRevision:
    def __init__(self, data):
        self._data = data
        self._cta = {str(id_col):{} for id_col in range(len(self._data["metadata"]["column"]))}
        self._cpa = {str(id_col):{} for id_col in range(len(self._data["metadata"]["column"]))}
        self._compute_cta_and_cpa_freq()
        
        
    def compute_features(self):
        features = [[] for id_col in range(len(self._data["metadata"]["column"]))]
        for id_row, row in enumerate(self._data["candidates"]):
            for id_col, candidates in enumerate(row):
                id_col = str(id_col)
                for candidate in candidates:
                    (cta, ctaMax) = (0, 0)
                    for t in candidate["types"]:
                        if t["id"] in self._cta[id_col]:
                            cta += self._cta[id_col][t["id"]]
                            if self._cta[id_col][t["id"]] > ctaMax:
                                ctaMax = self._cta[id_col][t["id"]]
                    
                    (cpa, cpaMax) = (0, 0)
                    total_predicates = 0
                    for id_col_pred in candidate["predicates"]:
                        for id_predicate in candidate["predicates"][id_col_pred]:
                            if id_predicate in self._cpa[id_col]:
                                total_predicates += 1
                                cpa += self._cpa[id_col][id_predicate]
                                if self._cpa[id_col][id_predicate] > cpaMax:
                                    cpaMax = self._cpa[id_col][id_predicate]
                                
                    cta /= len(candidate["types"]) if len(candidate["types"]) > 0 else 1
                    candidate["features"]["cta"] = round(cta, 2)
                    candidate["features"]["ctaMax"] = ctaMax
                    
                    cpa /= total_predicates if total_predicates > 0 else 1
                    candidate["features"]["cpa"] = round(cpa, 2)
                    candidate["features"]["cpaMax"] = cpaMax
                    
                    candidate["features"]["diff"] = candidates[0]["features"]["cea"] - candidate["features"]["cea"]
                    
                    features[int(id_col)].append(list(candidate["features"].values()))
        self._data["features"] = features          
                    
            
    def _compute_cta_and_cpa_freq(self):
        for id_row, row in enumerate(self._data["candidates"]):
            for id_col, candidates in enumerate(row):
                id_col = str(id_col)
                history = set()
                for candidate in candidates[0:3]:
                    types = candidate["types"]
                    for t in types:
                        id_type = t["id"]
                        if id_type in history:
                            continue
                        if id_type not in self._cta[id_col]:
                            self._cta[id_col][id_type] = 0
                        self._cta[id_col][id_type] += 1
                        history.add(id_type)
                    
                    predicates = candidate["predicates"]
                    for id_col_rel in predicates:
                        for id_predicate in predicates[id_col_rel]:
                            if id_predicate in history:
                                continue
                            if id_predicate not in self._cpa[id_col]:
                                self._cpa[id_col][id_predicate] = 0
                            self._cpa[id_col][id_predicate] += 1
                            history.add(id_predicate)
        
        n_rows = len(self._data["rows"])
        for id_col in self._cta:
            for id_type in self._cta[id_col]:
                self._cta[id_col][id_type] = round(self._cta[id_col][id_type]/n_rows, 2)
            for id_predicate in self._cpa[id_col]:
                self._cpa[id_col][id_predicate] = round(self._cpa[id_col][id_predicate]/n_rows, 2)    

In [9]:
fe_revsion = FeaturesExtractionRevision(output)
fe_revsion.compute_features()
fe_revsion._cpa

{'0': {'P57': 1.0,
  'P161': 0.25,
  'P58': 0.75,
  'P750': 0.5,
  'P2047': 1.0,
  'P2142': 1.0,
  'P2130': 1.0,
  'P272': 1.0,
  'P8687': 0.5,
  'P162': 0.75,
  'P179': 0.25,
  'P170': 0.25,
  'P1040': 0.25},
 '1': {'P8687': 0.75,
  'P2044': 0.25,
  'P1082': 0.25,
  'P2046': 0.25,
  'P4080': 0.25,
  'P1538': 0.25,
  'P2927': 0.25,
  'P2067': 0.25,
  'P1971': 0.25,
  'P138': 0.25,
  'P4969': 0.25,
  'P364': 0.25,
  'P179': 0.25,
  'P1434': 0.25,
  'P272': 0.25,
  'P2047': 0.25,
  'P2130': 0.25,
  'P2142': 0.25},
 '2': {},
 '3': {'P8687': 0.5,
  'P1128': 0.25,
  'P2137': 0.25,
  'P2403': 0.25,
  'P2226': 0.25,
  'P3362': 0.25,
  'P2295': 0.25},
 '4': {},
 '5': {}}

In [10]:
from keras.models import load_model
model = load_model("neural_network.h5")
Prediction(output, model).compute_prediction("score")



In [40]:
len(output["features"][0])

236

In [43]:
output["features"][1][0]

[2,
 26,
 0.01,
 66.21,
 0.018181,
 1.0,
 1.0,
 1.0,
 1.0,
 0,
 0.035,
 2.0,
 0,
 0,
 0.75,
 0.75,
 1.0,
 1.0,
 1.0,
 0]

In [81]:
output["candidates"][0][0].sort(key=lambda x:x["features"]["cea"], reverse=True)

In [None]:
output["candidates"][3][0]

In [56]:
output["candidates"][3][0]

[{'id': 'Q24871',
  'name': 'Avatar',
  'descritpion': '2009 American epic science fiction film directed by James Cameron',
  'types': [{'id': 'Q229390', 'name': '3D film'},
   {'id': 'Q25110269', 'name': 'live-action/animated film'}],
  'features': {'ntoken': 1,
   'popularity': 117,
   'pos_score': 0.09,
   'es_score': 36.17,
   'es_diff_score': 0.000959,
   'ed_score': 1.0,
   'jaccard_score': 1.0,
   'jaccardNgram_score': 1.0,
   'cosine_similarity': 1.0,
   'p_subj_ne': 2.0,
   'p_subj_lit': 2.081,
   'p_obj_ne': 0,
   'desc': 0,
   'descNgram': 0,
   'cpa': 0,
   'cpaMax': 0,
   'cta': 0,
   'ctaMax': 0,
   'cea': 0.996,
   'diff': 0},
  'matches': {'0': [],
   '1': [{'p': 'P57', 'o': 'Q42574', 's': 1.0},
    {'p': 'P162', 'o': 'Q42574', 's': 1.0},
    {'p': 'P58', 'o': 'Q42574', 's': 1.0},
    {'p': 'P1040', 'o': 'Q42574', 's': 1.0}],
   '2': [],
   '3': [{'p': 'P272', 'o': 'Q434841', 's': 1.0},
    {'p': 'P2047', 'o': '+162', 's': 0.081},
    {'p': 'P8687', 'o': '+148542', 's':

In [None]:
output["candidates"][0][1]

In [None]:
output["candidates"][3][0]

In [4]:
pip install orjson

Collecting orjson
  Obtaining dependency information for orjson from https://files.pythonhosted.org/packages/bd/9f/67ac40f606f36acd1716179b9c7c4943cf5e1c2c15663fdf04374be922a5/orjson-3.9.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading orjson-3.9.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.3/49.3 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading orjson-3.9.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (138 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.7/138.7 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: orjson
Successfully installed orjson-3.9.9
Note: you may need to restart the kernel to use updated packages.


In [2]:
import metrics as metrics
import os
from lamAPI import LamAPI
import sys
import json 
import time
import orjson

# Reading
with open(filename_path, "rb") as f:
    input_data = orjson.loads(f.read())

# Writing
with open("/tmp/output.json", "wb") as f:
    f.write(orjson.dumps(input_data))



class FeaturesExtraction:
    def __init__(self, data, lamAPI):
        self._data = data
        self._lamAPI = lamAPI
    
    
    def compute_features(self):
        rows = self._data["rows"]
        target = self._data["target"]
        for index, row in enumerate(rows):
            print("row", row)
            cells = row["data"]
            for id_col_ne_subj in target["NE"]:
                for id_col_ne_obj in target["NE"]:
                    if id_col_ne_subj == id_col_ne_obj:
                        continue
                    self._compute_similarity_between_ne_cells(index, id_col_ne_subj, id_col_ne_obj)
                for id_col_lit_obj in target["LIT"]:
                    lit_cell_obj = cells[id_col_lit_obj]
                    self._match_lit_cells(index, id_col_ne_subj, id_col_ne_obj, lit_cell_obj, target["LIT_DATATYPE"][str(id_col_lit_obj)])
        self._extract_features()
        
        
    def _extract_features(self):
        features = [[] for id_col in range(len(self._data["metadata"]["column"]))]
        for row in self._data["candidates"]:
            for id_col, candidates in enumerate(row):
                for candidate in candidates:
                    features[id_col].append(list(candidate["features"].values()))
        self._data["features"] = features
    
    def _compute_similarity_between_ne_cells(self, id_row, id_col_subj_cell, id_col_obj_cell):
        subj_candidates = self._data["candidates"][id_row][id_col_subj_cell]
        obj_candidates = self._data["candidates"][id_row][id_col_obj_cell]
        subj_id_candidates = [candidate["id"] for candidate in subj_candidates]
        obj_id_candidates = [candidate["id"] for candidate in obj_candidates]
        
        #subjects_objects = self._lamAPI.objects(subj_id_candidates)
        object_rel_score_buffer = {}

        for subj_candidate in subj_candidates:
            id_subject = subj_candidate["id"]
            subj_candidate_objects = cache_obj.get(id_subject, {})
            #subj_candidate_objects = subjects_objects.get(id_subject, {}).get("objects", {})
            #cache_obj[id_subject] = subj_candidate_objects
            objects_set = set(subj_candidate_objects.keys())
            #subj_candidate["matches"][str(id_col_obj_cell)] = []
            #subj_candidate["pred"][str(id_col_obj_cell)] = {}
              
            objects_itersection = objects_set.intersection(set(obj_id_candidates))
            #print(objects_itersection)
            obj_score_max = 0
            for obj_candidate in obj_candidates:
                id_object = obj_candidate["id"]  
                if id_object not in objects_itersection:
                    continue
                              
                score = obj_candidate["features"]["ed_score"]
                if score > obj_score_max:
                    obj_score_max = score
                   
                if id_object not in object_rel_score_buffer:
                    object_rel_score_buffer[id_object] = 0
                score_rel = subj_candidate["features"]["ed_score"]
                if score_rel > object_rel_score_buffer[id_object]:
                    object_rel_score_buffer[id_object] = score_rel
                for predicate in subj_candidate_objects[id_object]:
                    subj_candidate["matches"][str(id_col_obj_cell)].append({
                        "p": predicate,
                        "o": id_object,
                        "s": round(score, 3)
                    })
                    subj_candidate["predicates"][str(id_col_obj_cell)][predicate] = score
            subj_candidate["features"]["p_subj_ne"] += obj_score_max          
        
        for obj_candidate in obj_candidates:
            id_object = obj_candidate["id"]  
            if id_object not in object_rel_score_buffer:
                continue
            obj_candidate["features"]["p_obj_ne"] += object_rel_score_buffer[id_object]    
        
      
    def _match_lit_cells(self, id_row, id_col_subj_cell, id_col_obj_col, obj_cell, obj_cell_datatype):
    
        def get_score_based_on_datatype(valueInCell, valueFromKG, datatype):
            score = 0
            if datatype == "NUMBER":
                score = metrics.compute_similarty_between_numbers(valueInCell, valueFromKG.lower())
            elif datatype == "DATETIME":
                score = metrics.compute_similarity_between_dates(valueInCell, valueFromKG.lower())
            elif datatype == "STRING":
                score = metrics.compute_similarity_between_string(valueInCell, valueFromKg.lower())
            return score
        
        subj_candidates = self._data["candidates"][id_row][id_col_subj_cell]
        subj_id_candidates = [candidate["id"] for candidate in subj_candidates]
        #cand_lamapi_literals = self._lamAPI.literals(subj_id_candidates)
        
        #if len(cand_lamapi_literals) == 0:
        #    return
        
        datatype = obj_cell_datatype
        
        for subj_candidate in subj_candidates:
            id_subject = subj_candidate["id"]
            #literals = cand_lamapi_literals[id_subject]
            literals = cache_lit.get(id_subject, {})
            if "literals" in literals:
                literals = literals['literals']
            #cache_lit[id_subject] = literals    
           
            if len(literals[datatype]) == 0:
                continue
            #subj_candidate["matches"][str(id_col_obj_col)] = []
            #subj_candidate["pred"][str(id_col_obj_col)] = {}
            #subj_cell.candidates_entities()[subject]["match_count"]["lit"] += 1
            max_score = 0
            for predicate in literals[datatype]:
                for valueFromKg in literals[datatype][predicate]:
                    score = get_score_based_on_datatype(obj_cell, valueFromKg, datatype)
                    score = round(score, 3)
                    if score > 0:
                        subj_candidate["matches"][str(id_col_obj_col)].append({
                            "p": predicate,
                            "o": valueFromKg,
                            "s": round(score, 3)
                        })  
                        if score > max_score:
                            max_score = score
                        if predicate not in subj_candidate["predicates"][str(id_col_obj_col)]:
                            subj_candidate["predicates"][str(id_col_obj_col)][predicate] = 0
                        if score > subj_candidate["predicates"][str(id_col_obj_col)][predicate]:
                            subj_candidate["predicates"][str(id_col_obj_col)][predicate] = score    
                            
            subj_candidate["features"]["p_subj_lit"] += max_score
            subj_candidate["features"]["p_subj_lit"] = round(subj_candidate["features"]["p_subj_lit"], 3)


start_time = time.time()

LAMAPI_HOST, LAMAPI_PORT = os.environ["LAMAPI_ENDPOINT"].split(":")
LAMAPI_TOKEN = os.environ["LAMAPI_TOKEN"]
lamAPI = LamAPI(LAMAPI_HOST, LAMAPI_PORT, LAMAPI_TOKEN)
filename_path = sys.argv[1]

with open("output2.json") as f:
    input = json.loads(f.read())

with(open("./cache_obj.json")) as f:
    cache_obj = json.loads(f.read())

with(open("./cache_lit.json")) as f:
    cache_lit = json.loads(f.read())

FeaturesExtraction(input, lamAPI).compute_features()

"""
with(open("./cache_obj.json", "w")) as f:
    f.write(json.dumps(cache_obj))

with(open("./cache_lit.json", "w")) as f:
    f.write(json.dumps(cache_lit))
"""

with open("./output_test.json", "w") as f:
    f.write(json.dumps(input, indent=4))

print("--- %s seconds ---" % (time.time() - start_time))

row {'idRow': 1, 'data': ['allerdale borough council', 'allerdale borough council', 'http://www.allerdale.gov.uk/', 'workington', 'cumbria', 'england', 'united kingdom']}
row {'idRow': 2, 'data': ['be first (regeneration) limited', 'be first', 'http://www.befirst.london/', 'barking', 'greater london', 'england', 'united kingdom']}
row {'idRow': 3, 'data': ['birmingham city council', 'birmingham city council', 'http://www.birmingham.gov.uk/', 'birmingham', 'west midlands', 'england', 'united kingdom']}
row {'idRow': 4, 'data': ['birmingham city council', 'birmingham city council.', 'http://birmingham.gov.uk/', 'birmingham', 'west midlands', 'england', 'united kingdom']}
row {'idRow': 5, 'data': ['birmingham city council', 'birmingham city council', 'http://www.birmingham.gov.uk/', 'birmingham', 'west midlands', 'england', 'united kingdom']}
row {'idRow': 6, 'data': ['birmingham city council', 'birmingham city council.', 'http://birmingham.gov.uk/', 'birmingham', 'west midlands', 'englan

In [13]:
import metrics as metrics
import os
from lamAPI import LamAPI
import sys
import orjson

cache_obj = {}
cache_lit = {}

class FeaturesExtraction:
    def __init__(self, data, lamAPI):
        self._data = data
        self._lamAPI = lamAPI
    
    
    def compute_features(self):
        rows = self._data["rows"]
        target = self._data["target"]
        for index, row in enumerate(rows):
            cells = row["data"]
            for id_col_ne_subj in target["NE"]:
                for id_col_ne_obj in target["NE"]:
                    if id_col_ne_subj == id_col_ne_obj:
                        continue
                    self._compute_similarity_between_ne_cells(index, id_col_ne_subj, id_col_ne_obj)
                for id_col_lit_obj in target["LIT"]:
                    lit_cell_obj = cells[id_col_lit_obj]
                    self._match_lit_cells(index, id_col_ne_subj, id_col_ne_obj, lit_cell_obj, target["LIT_DATATYPE"][str(id_col_lit_obj)])
        self._extract_features()
        
        
    def _extract_features(self):
        features = [[] for id_col in range(len(self._data["metadata"]["column"]))]
        for row in self._data["candidates"]:
            for id_col, candidates in enumerate(row):
                for candidate in candidates:
                    features[id_col].append(list(candidate["features"].values()))
        self._data["features"] = features
    
    def _compute_similarity_between_ne_cells(self, id_row, id_col_subj_cell, id_col_obj_cell):
        subj_candidates = self._data["candidates"][id_row][id_col_subj_cell]
        obj_candidates = self._data["candidates"][id_row][id_col_obj_cell]
        subj_id_candidates = [candidate["id"] for candidate in subj_candidates if candidate["id"] not in cache_obj]
        obj_id_candidates = [candidate["id"] for candidate in obj_candidates]
        
        if len(subj_id_candidates) > 0:
            subjects_objects = self._lamAPI.objects(subj_id_candidates)

        object_rel_score_buffer = {}

        for subj_candidate in subj_candidates:
            id_subject = subj_candidate["id"]
            #subj_candidate_objects = subjects_objects.get(id_subject, {}).get("objects", {})
            #cache_obj[id_subject] = subj_candidate_objects
            if id_subject not in cache_obj:
                subj_candidate_objects = subjects_objects.get(id_subject, {}).get("objects", {})
            else:    
                subj_candidate_objects = cache_obj.get(id_subject, {})
            objects_set = set(subj_candidate_objects.keys())
            #subj_candidate["matches"][str(id_col_obj_cell)] = []
            #subj_candidate["pred"][str(id_col_obj_cell)] = {}
              
            objects_itersection = objects_set.intersection(set(obj_id_candidates))
            #print(objects_itersection)
            obj_score_max = 0
            for obj_candidate in obj_candidates:
                id_object = obj_candidate["id"]  
                if id_object not in objects_itersection:
                    continue
                              
                score = obj_candidate["features"]["ed_score"]
                if score > obj_score_max:
                    obj_score_max = score
                   
                if id_object not in object_rel_score_buffer:
                    object_rel_score_buffer[id_object] = 0
                score_rel = subj_candidate["features"]["ed_score"]
                if score_rel > object_rel_score_buffer[id_object]:
                    object_rel_score_buffer[id_object] = score_rel
                for predicate in subj_candidate_objects[id_object]:
                    subj_candidate["matches"][str(id_col_obj_cell)].append({
                        "p": predicate,
                        "o": id_object,
                        "s": round(score, 3)
                    })
                    subj_candidate["predicates"][str(id_col_obj_cell)][predicate] = score
            subj_candidate["features"]["p_subj_ne"] += obj_score_max          
        
        for obj_candidate in obj_candidates:
            id_object = obj_candidate["id"]  
            if id_object not in object_rel_score_buffer:
                continue
            obj_candidate["features"]["p_obj_ne"] += object_rel_score_buffer[id_object]    
        
      
    def _match_lit_cells(self, id_row, id_col_subj_cell, id_col_obj_col, obj_cell, obj_cell_datatype):
    
        def get_score_based_on_datatype(valueInCell, valueFromKG, datatype):
            score = 0
            if datatype == "NUMBER":
                score = metrics.compute_similarty_between_numbers(valueInCell, valueFromKG.lower())
            elif datatype == "DATETIME":
                score = metrics.compute_similarity_between_dates(valueInCell, valueFromKG.lower())
            elif datatype == "STRING":
                score = metrics.compute_similarity_between_string(valueInCell, valueFromKg.lower())
            return score
        
        subj_candidates = self._data["candidates"][id_row][id_col_subj_cell]
        subj_id_candidates = [candidate["id"] for candidate in subj_candidates if candidate["id"] not in cache_lit]
        if len(subj_id_candidates) > 0:
            cand_lamapi_literals = self._lamAPI.literals(subj_id_candidates)
            if len(cand_lamapi_literals) == 0:
                return
        
        datatype = obj_cell_datatype
        
        for subj_candidate in subj_candidates:
            id_subject = subj_candidate["id"]
            #literals = cand_lamapi_literals[id_subject]
            if id_subject not in cache_lit:
                literals = cand_lamapi_literals.get(id_subject, {})
            else:   
                literals = cache_lit.get(id_subject, {})
            if "literals" in literals:
                literals = literals['literals']    
            #cache_lit[id_subject] = literals    
            if len(literals[datatype]) == 0:
                continue
            #subj_candidate["matches"][str(id_col_obj_col)] = []
            #subj_candidate["pred"][str(id_col_obj_col)] = {}
            #subj_cell.candidates_entities()[subject]["match_count"]["lit"] += 1
            max_score = 0
            for predicate in literals[datatype]:
                for valueFromKg in literals[datatype][predicate]:
                    score = get_score_based_on_datatype(obj_cell, valueFromKg, datatype)
                    score = round(score, 3)
                    if score > 0:
                        subj_candidate["matches"][str(id_col_obj_col)].append({
                            "p": predicate,
                            "o": valueFromKg,
                            "s": round(score, 3)
                        })  
                        if score > max_score:
                            max_score = score
                        if predicate not in subj_candidate["predicates"][str(id_col_obj_col)]:
                            subj_candidate["predicates"][str(id_col_obj_col)][predicate] = 0
                        if score > subj_candidate["predicates"][str(id_col_obj_col)][predicate]:
                            subj_candidate["predicates"][str(id_col_obj_col)][predicate] = score    
                            
            subj_candidate["features"]["p_subj_lit"] += max_score
            subj_candidate["features"]["p_subj_lit"] = round(subj_candidate["features"]["p_subj_lit"], 3)

start_time = time.time()
print("Start features extraction")

LAMAPI_HOST, LAMAPI_PORT = os.environ["LAMAPI_ENDPOINT"].split(":")
LAMAPI_TOKEN = os.environ["LAMAPI_TOKEN"]
lamAPI = LamAPI(LAMAPI_HOST, LAMAPI_PORT, LAMAPI_TOKEN)
filename_path = sys.argv[1]

# Reading
with open("./output2.json", "rb") as f:
    input_data = orjson.loads(f.read())

with(open("./cache_obj.json", "rb")) as f:
    cache_obj = orjson.loads(f.read())

with(open("./cache_lit.json", "rb")) as f:
    cache_lit = orjson.loads(f.read())

FeaturesExtraction(input, lamAPI).compute_features()

print("Finish features extraction")

# Writing
with open("/tmp/output.json", "wb") as f:
    f.write(orjson.dumps(input_data, option=orjson.OPT_INDENT_2))

print("Finish writing")
print("--- %s seconds ---" % (time.time() - start_time))

Start features extraction
Finish features extraction
Finish writing
--- 2.7441582679748535 seconds ---


In [None]:
import os
from lamAPI import LamAPI
import sys
import json 
import metrics as metrics
import utils as utils
import time 

cache = {}
class Lookup:
    def __init__(self, data:object, lamAPI):
        self._header = data.get("header", [])
        self._table_name = data["name"]
        self._target = data["target"]
        self._kg_ref = data["kg_reference"]
        self._limit = data["limit"]
        self._lamAPI = lamAPI
        self._rows = []
        for row in data["rows"]:
            row = self._build_row(row["data"])
            self._rows.append(row)


    def _build_row(self, cells):
        row_candidates = []
        features = ["ntoken", "popularity", "pos_score", "es_score", "es_diff_score", 
                    "ed_score", "jaccard_score", "jaccardNgram_score", "cosine_similarity",
                    "p_subj_ne", "p_subj_lit", "p_obj_ne", "desc", "descNgram", 
                    "cpa", "cpaMax", "cta", "ctaMax", "rho", "diff"]
        row_content_norm = utils.clean_str(" ".join(cells))
        for i, cell in enumerate(cells):
            new_candidites = []
            if i in self._target["NE"]:
                #candidates = self._get_candidates(cell)
                candidates = cache.get(cell, [])
                print(cell)
                #cache[cell] = candidates
                for candidate in candidates:
                    item = {
                        "id": candidate["id"],
                        "name": candidate["name"],
                        "description": candidate["description"],
                        "types": candidate["types"],
                        "features": {feature:candidate.get(feature, 0) for feature in features},
                        "matches": {str(id_col):[] for id_col in range(len(cells))},
                        "predicates": {str(id_col):{} for id_col in range(len(cells))}
                    }
                    new_candidites.append(item)
                    desc_norm = utils.clean_str(candidate["description"])
                    desc_score = round(metrics.compute_similarity_between_string(desc_norm, row_content_norm), 3)
                    desc_score_ngram = round(metrics.compute_similarity_between_string(desc_norm, row_content_norm, 3), 3)
                    item["features"]["desc"] = desc_score
                    item["features"]["descNgram"] = desc_score_ngram
            row_candidates.append(new_candidites)
        return row_candidates


    def _get_candidates(self, cell):
        print("Try lookup for cell:", cell)
        candidates = []
        types = None
        result = None
        try:
            result = self._lamAPI.lookup(cell, fuzzy=False, types=types, kg=self._kg_ref, limit=self._limit)
            if cell not in result:
                raise Exception("Error from lamAPI")
            candidates = result[cell]    
        except Exception as e:
            print(str(e))
            return []
            
        return candidates


start_time = time.time()

SAMPLE_SIZE = 25
LAMAPI_HOST, LAMAPI_PORT = os.environ["LAMAPI_ENDPOINT"].split(":")
LAMAPI_TOKEN = os.environ["LAMAPI_TOKEN"]
lamAPI = LamAPI(LAMAPI_HOST, LAMAPI_PORT, LAMAPI_TOKEN)
filename_path = sys.argv[1]

with open("./output_l.json") as f:
    input = json.loads(f.read())

with(open("./cache.json")) as f:
    cache = json.loads(f.read())
    
p1 = Lookup(input, lamAPI)
input["candidates"] = p1._rows

with open("./output_l.json", "w") as f:
    f.write(json.dumps(input, indent=4))
    
#with(open("./cache.json", "w")) as f:
#    f.write(json.dumps(cache))

print("--- %s seconds ---" % (time.time() - start_time))

--- 1125.024539232254 seconds ---


In [None]:
--- 10.024539232254 seconds ---

In [1]:
from keras.models import load_model
import tensorflow as tf
import sys
import json
import time



class Prediction:
    def __init__(self, data, model):
        self._data = data
        self._model = model
        
    def compute_prediction(self, feature_name):
        prediction = []
        indexes = []
        for column_features in self._data["features"]:
            pred = [] 
            if len(column_features) > 0:
                pred = self._model.predict(column_features)
            prediction.append(pred)
            indexes.append(0)
        
        for row in self._data["candidates"]:
            for id_col, candidates in enumerate(row):
                for candidate in candidates:
                    index = indexes[id_col]
                    indexes[id_col] += 1
                    feature = round(float(prediction[id_col][index][1]), 3)
                    if feature_name == "rho2": 
                        candidate[feature_name] = feature
                    else:
                        candidate["features"][feature_name] = feature    
                if feature_name == "rho2":        
                    candidates.sort(key=lambda x:x[feature_name], reverse=True)       
                else:
                    candidates.sort(key=lambda x:x["features"][feature_name], reverse=True)    

start_time = time.time()

filename_path = sys.argv[1]
feature_name = sys.argv[2]

with open("output_feat_ex.json") as f:
    input = json.loads(f.read())
print("The file has been read correctly")

model = load_model("neural_network.h5")
print("The NN has been read correctly")

Prediction(input, model).compute_prediction(feature_name)
print("The NN has been applied correctly")

with open("/tmp/output.json", "w") as f:
    f.write(json.dumps(input, indent=4))
print("The file has been saved correctly")
#print(json.dumps(input), flush=True)
print("--- %s seconds ---" % (time.time() - start_time))

2023-10-20 11:34:36.220231: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-20 11:34:36.240973: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-20 11:34:36.408991: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-20 11:34:36.410143: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


The file has been read correctly
The NN has been read correctly
The NN has been applied correctly
The file has been saved correctly
--- 5.965457439422607 seconds ---


In [20]:
import tensorflow

print(tensorflow.__version__)


2.13.0


In [3]:
tf.config.threading.get_inter_op_parallelism_threads(), tf.config.threading.get_intra_op_parallelism_threads

(0,
 <function tensorflow.python.framework.config.get_intra_op_parallelism_threads()>)

In [14]:
import sys
import orjson
import requests
import urllib3
import os
import time

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

headers = {
    "Content-Type": "text/plain",
}

class Classifier:
    def __init__(self, endpoint, data):
        self._endpoint = endpoint
        self._data = data

    def classify_description(self):
        for row in self._data["candidates"]:
            for candidates in row:
                if len(candidates) > 0:
                    candidate = candidates[0]
                    if candidate["id"] not in cache:
                        temp = candidate["name"] + " " + candidate["description"]
                        temp = temp.encode('utf-8')
                        categories = self._get_categories(temp)["iptc_categories"]
                    else:
                        categories = cache.get(candidate["id"], [])
                    candidate["categories"] = categories
                    #candidate["iptc_categories"] = categories["iptc_categories"]
                    #candidate["geo_categories"] = categories["geo_categories"]

    def _get_categories(self, data):
        response = requests.post(self._endpoint, headers=headers, data=data, verify=False)
        result = {"iptc_categories": [], "geo_categories": []}
        if response.status_code == 200:
            print("Request was successful")
            print("Response JSON:")
            print(response.json())
            result = response.json()
            result = {"iptc_categories":result["iptc_categories"], "geo_categories":result["geo_categories"]}
        else:
            print(f"Failed to retrieve data. HTTP Status code: {response.status_code}")
        return result
    
    

start_time = time.time()
print("Start classifier")

filename_path = sys.argv[1]
# Reading
with open("output_pred2.json", "rb") as f:
    input_data = orjson.loads(f.read())

with(open("./cache.json", "rb")) as f:
    cache = orjson.loads(f.read())

CLASSIFIER_ENDPOINT = os.environ["CLASSIFIER_ENDPOINT"]

try:
    classifier = Classifier(CLASSIFIER_ENDPOINT, input_data)
    classifier.classify_description()
except Exception as e:
    print("Error with classifier, details:", str(e))

print("End classifier")

# Writing
with open("/tmp/output.json", "wb") as f:
    f.write(orjson.dumps(input_data, option=orjson.OPT_INDENT_2))

print("End writing")
print("--- %s seconds ---" % (time.time() - start_time))

Start classifier


KeyError: 'CLASSIFIER_ENDPOINT'