In [1]:
import os
import numpy as np
import pandas as pd
import math
import time
from lamAPI import LamAPI


SAMPLE_SIZE = 25
LAMAPI_HOST, LAMAPI_PORT = os.environ["LAMAPI_ENDPOINT"].split(":")
LAMAPI_TOKEN = os.environ["LAMAPI_TOKEN"]

def clean_str(value):
    value = str(value)
    stop_charaters = ["_"]
    for char in stop_charaters:
        value = value.replace(char, " ")
    value = " ".join(value.split()).lower()
    return value


def format_table(table_df):
    rows = []

    for id_row, row in enumerate(table_df):
        rows.append({"idRow":id_row+1, "data":[clean_str(cell) for cell in row]})
    #buffer = []
    #rows = np.array_split(rows, max(math.floor(len(rows)/SAMPLE_SIZE), 1))
        
    return rows


def compute_datatype(rows, lamapi_wrapper):
    column_metadata = {}
    columns_data = {str(i):[] for i in range(0, len(rows[0]['data']))}
    target = {"SUBJ": 0, "NE": [], "LIT": [], "LIT_DATATYPE": {}}
    for row in rows:
        for id_col, cell in enumerate(row["data"]):
            columns_data[str(id_col)].append(str(cell))
    
    first_NE_column = False     
    for id_col in columns_data:
        metadata = lamapi_wrapper.literal_recognizer(columns_data[id_col])
        max_datatype = max(metadata, key=metadata.get)
        if max_datatype == "ENTITY":
            column_metadata[id_col] = "NE"
            target['NE'].append(int(id_col)) 
            if not first_NE_column:
                target["SUBJ"] = int(id_col)
            first_NE_column = True
        else:
            column_metadata[id_col] = "LIT"
            target['LIT'].append(int(id_col))
            target['LIT_DATATYPE'][str(id_col)] = max_datatype
            
    return column_metadata, target        


lamAPI = LamAPI(LAMAPI_HOST, LAMAPI_PORT, LAMAPI_TOKEN)
time = time
name = "film.csv"
input_file = pd.read_csv(name)
rows = format_table(input_file.values.tolist())
header = list(input_file.columns)
kg_reference = "wikidata"
column_metadata = {}
target = None


output = {
    "name": name,
    "header": header,
    "rows": rows,
    "metadata": None,
    "target": None,
    "kg_reference": kg_reference,
    "limit": 100,
    "status": "DONE", 
    "time": time.time()
}

if len(column_metadata) == 0:
        column_metadata, target = compute_datatype(rows, lamAPI)
        column_metadata[str(target["SUBJ"])] = "SUBJ"
        #output["column"] = column_metadata
        output["metadata"] = {
            "column": [{"idColumn": int(id_col), "tag": column_metadata[id_col]} for id_col in column_metadata]
        }
        output["target"] = target

In [2]:
output

{'name': 'film.csv',
 'header': ['title',
  'director',
  'release year',
  'domestic distributor',
  'length in min',
  'worldwide gross'],
 'rows': [{'idRow': 1,
   'data': ['jurassic world',
    'colin trevorrow',
    '2015',
    'universal pictures',
    '124',
    '1670400637']},
  {'idRow': 2,
   'data': ['superman returns',
    'bryan singer',
    '2006',
    'warner bros.',
    '154',
    '391081192']},
  {'idRow': 3,
   'data': ['batman begins',
    'christopher nolan',
    '2005',
    'warner bros.',
    '140',
    '371853783']},
  {'idRow': 4,
   'data': ['avatar',
    'james cameron',
    '2009',
    '20 century fox',
    '162',
    '2744336793']}],
 'metadata': {'column': [{'idColumn': 0, 'tag': 'SUBJ'},
   {'idColumn': 1, 'tag': 'NE'},
   {'idColumn': 2, 'tag': 'LIT'},
   {'idColumn': 3, 'tag': 'NE'},
   {'idColumn': 4, 'tag': 'LIT'},
   {'idColumn': 5, 'tag': 'LIT'}]},
 'target': {'SUBJ': 0,
  'NE': [0, 1, 3],
  'LIT': [2, 4, 5],
  'LIT_DATATYPE': {'2': 'NUMBER', '4': 'N

In [4]:
import os
import numpy as np
import pandas as pd
import math
import time
from lamAPI import LamAPI
from s2_process.process import Process

SAMPLE_SIZE = 25
LAMAPI_HOST, LAMAPI_PORT = os.environ["LAMAPI_ENDPOINT"].split(":")
LAMAPI_TOKEN = os.environ["LAMAPI_TOKEN"]
lamAPI = LamAPI(LAMAPI_HOST, LAMAPI_PORT, LAMAPI_TOKEN)
p1 = Process(output, lamAPI)
output["candidates"] = p1._rows

In [14]:
len(output["candidates"][3][1])

65

In [15]:
"Q5145625" in temp["wikidata"]["Q3512046"]["objects"]

True

In [7]:
import metrics as metrics


class FeaturesExtraction:
    def __init__(self, data, lamAPI):
        self._data = data
        self._lamAPI = lamAPI
    
    
    def compute_features(self):
        rows = self._data["rows"]
        target = self._data["target"]
        for index, row in enumerate(rows):
            cells = row["data"]
            for id_col_ne_subj in target["NE"]:
                ne_cell_subj = cells[id_col_ne_subj]
                for id_col_ne_obj in target["NE"]:
                    if id_col_ne_subj == id_col_ne_obj:
                        continue
                    ne_cell_obj = cells[id_col_ne_obj]
                    self._compute_similarity_between_ne_cells(index, id_col_ne_subj, id_col_ne_obj)
                for id_col_lit_obj in target["LIT"]:
                    lit_cell_obj = cells[id_col_lit_obj]
                    self._match_lit_cells(index, id_col_ne_subj, id_col_ne_obj, lit_cell_obj, target["LIT_DATATYPE"][str(id_col_lit_obj)])
        self._extract_features()
        
        
    def _extract_features(self):
        features = [[] for id_col in range(len(self._data["metadata"]["column"]))]
        for row in self._data["candidates"]:
            for id_col, candidates in enumerate(row):
                for candidate in candidates:
                    features[id_col].append(list(candidate["features"].values()))
        self._data["features"] = features
    
    def _compute_similarity_between_ne_cells(self, id_row, id_col_subj_cell, id_col_obj_cell):
        subj_candidates = self._data["candidates"][id_row][id_col_subj_cell]
        obj_candidates = self._data["candidates"][id_row][id_col_obj_cell]
        subj_id_candidates = [candidate["id"] for candidate in subj_candidates]
        obj_id_candidates = [candidate["id"] for candidate in obj_candidates]
        
        subjects_objects = self._lamAPI.objects(subj_id_candidates)
        object_rel_score_buffer = {}

        for subj_candidate in subj_candidates:
            id_subject = subj_candidate["id"]
            subj_candidate_objects = subjects_objects.get(id_subject, {}).get("objects", {})
            objects_set = set(subj_candidate_objects.keys())
            #subj_candidate["matches"][str(id_col_obj_cell)] = []
            #subj_candidate["pred"][str(id_col_obj_cell)] = {}
              
            objects_itersection = objects_set.intersection(set(obj_id_candidates))
            #print(objects_itersection)
            obj_score_max = 0
            for obj_candidate in obj_candidates:
                id_object = obj_candidate["id"]  
                if id_object not in objects_itersection:
                    continue
                              
                score = obj_candidate["features"]["ed_score"]
                if score > obj_score_max:
                    obj_score_max = score
                   
                if id_object not in object_rel_score_buffer:
                    object_rel_score_buffer[id_object] = 0
                score_rel = subj_candidate["features"]["ed_score"]
                if score_rel > object_rel_score_buffer[id_object]:
                    object_rel_score_buffer[id_object] = score_rel
                for predicate in subj_candidate_objects[id_object]:
                    subj_candidate["matches"][str(id_col_obj_cell)].append({
                        "p": predicate,
                        "o": id_object,
                        "s": round(score, 3)
                    })
                    subj_candidate["predicates"][str(id_col_obj_cell)][predicate] = score
            subj_candidate["features"]["p_subj_ne"] += obj_score_max          
        
        for obj_candidate in obj_candidates:
            id_object = obj_candidate["id"]  
            if id_object not in object_rel_score_buffer:
                continue
            obj_candidate["features"]["p_obj_ne"] += object_rel_score_buffer[id_object]    
        
      
    def _match_lit_cells(self, id_row, id_col_subj_cell, id_col_obj_col, obj_cell, obj_cell_datatype):
    
        def get_score_based_on_datatype(valueInCell, valueFromKG, datatype):
            score = 0
            if datatype == "NUMBER":
                score = metrics.compute_similarty_between_numbers(valueInCell, valueFromKG.lower())
            elif datatype == "DATETIME":
                score = metrics.compute_similarity_between_dates(valueInCell, valueFromKG.lower())
            elif datatype == "STRING":
                score = metrics.compute_similarity_between_string(valueInCell, valueFromKg.lower())
            return score
        
        subj_candidates = self._data["candidates"][id_row][id_col_subj_cell]
        subj_id_candidates = [candidate["id"] for candidate in subj_candidates]
        cand_lamapi_literals = self._lamAPI.literals(subj_id_candidates)
        datatype = obj_cell_datatype
        
        for subj_candidate in subj_candidates:
            id_subject = subj_candidate["id"]
            literals = cand_lamapi_literals[id_subject]
            if "literals" in literals:
                literals = literals['literals']
            if len(literals[datatype.lower()]) == 0:
                continue
            #subj_candidate["matches"][str(id_col_obj_col)] = []
            #subj_candidate["pred"][str(id_col_obj_col)] = {}
            #subj_cell.candidates_entities()[subject]["match_count"]["lit"] += 1
            max_score = 0
            for predicate in literals[datatype.lower()]:
                for valueFromKg in literals[datatype.lower()][predicate]:
                    score = get_score_based_on_datatype(obj_cell, valueFromKg, datatype)
                    score = round(score, 3)
                    if score > 0:
                        subj_candidate["matches"][str(id_col_obj_col)].append({
                            "p": predicate,
                            "o": valueFromKg,
                            "s": round(score, 3)
                        })  
                        if score > max_score:
                            max_score = score
                        if predicate not in subj_candidate["predicates"][str(id_col_obj_col)]:
                            subj_candidate["predicates"][str(id_col_obj_col)][predicate] = 0
                        if score > subj_candidate["predicates"][str(id_col_obj_col)][predicate]:
                            subj_candidate["predicates"][str(id_col_obj_col)][predicate] = score    
                            
            subj_candidate["features"]["p_subj_lit"] += round(max_score, 3) 

In [8]:
FeaturesExtraction(output, lamAPI).compute_features()

In [None]:
output["candidates"][2][1][0:2]

In [None]:
output["features"]

In [None]:
output["candidates"][0][2]

In [None]:
for row in output["candidates"]:
        for id_col, candidates in enumerate(row):

In [9]:
class Prediction:
    def __init__(self, data, model):
        self._data = data
        self._model = model
        
    def compute_prediction(self, feature_name):
        prediction = []
        indexes = []
        for column_features in self._data["features"]:
            pred = [] 
            if len(column_features) > 0:
                pred = self._model.predict(column_features)
            prediction.append(pred)
            indexes.append(0)
        
        for id_row, row in enumerate(self._data["candidates"]):
            for id_col, candidates in enumerate(row):
                for candidate in candidates:
                    index = indexes[id_col]
                    indexes[id_col] += 1
                    feature = round(prediction[id_col][index][1], 3)
                    candidate["features"][feature_name] = feature
                candidates.sort(key=lambda x:x["features"][feature_name], reverse=True)       
            #new_candidates = sorted(candidates, key=lambda x:x["features"]["cea"], reverse=True)
            #self._data["candidates"][id_row][id_col] = new_candidates

In [10]:
from keras.models import load_model
model = load_model("neural_network.h5")
Prediction(output, model).compute_prediction("cea")

2023-03-18 16:09:26.201542: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-18 16:09:26.536105: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-03-18 16:09:26.536126: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-03-18 16:09:27.641428: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-



In [24]:
class FeaturesExtractionRevision:
    def __init__(self, data):
        self._data = data
        self._cta = {str(id_col):{} for id_col in range(len(self._data["metadata"]["column"]))}
        self._cpa = {str(id_col):{} for id_col in range(len(self._data["metadata"]["column"]))}
    
    def compute_features(self):
        for id_row, row in enumerate(self._data["candidates"]):
            for id_col, candidates in enumerate(row):
                id_col = str(id_col)
                history = set()
                for candidate in candidates[0:3]:
                    types = candidate["types"]
                    for t in types:
                        id_type = t["id"]
                        if id_type in history:
                            continue
                        if id_type not in self._cta[id_col]:
                            self._cta[id_col][id_type] = 0
                        self._cta[id_col][id_type] += 1
                        history.add(id_type)
                    
                    predicates = candidate["predicates"]
                    for id_col_rel in predicates:
                        for id_predicate in predicates[id_col_rel]:
                            if id_predicate in history:
                                continue
                            if id_predicate not in self._cpa[id_col]:
                                self._cpa[id_col][id_predicate] = 0
                            self._cpa[id_col][id_predicate] += 1
                            history.add(id_predicate)
        
        n_rows = len(self._data["rows"])
        for id_col in self._cta:
            for id_type in self._cta[id_col]:
                self._cta[id_col][id_type] = round(self._cta[id_col][id_type]/n_rows, 2)
            for id_predicate in self._cpa[id_col]:
                self._cpa[id_col][id_predicate] = round(self._cpa[id_col][id_predicate]/n_rows, 2)    
        
        
        for id_row, row in enumerate(self._data["candidates"]):
            for id_col, candidates in enumerate(row):
                id_col = str(id_col)
                for candidate in candidates:
                    (cta, ctaMax) = (0, 0)
                    for t in candidate["types"]:
                        if t["id"] in self._cta[id_col]:
                            cta += self._cta[id_col][t["id"]]
                            if self._cta[id_col][t["id"]] > ctaMax:
                                ctaMax = self._cta[id_col][t["id"]]
                    
                    (cpa, cpaMax) = (0, 0)
                    total_predicates = 0
                    for id_col_pred in candidate["predicates"]:
                        for id_predicate in candidate["predicates"][id_col_pred]:
                            if id_predicate in self._cpa[id_col]:
                                total_predicates += 1
                                cpa += self._cpa[id_col][id_predicate]
                                if self._cpa[id_col][id_predicate] > cpaMax:
                                    cpaMax = self._cpa[id_col][id_predicate]
                                
                    cta /= len(candidate["types"]) if len(candidate["types"]) > 0 else 1
                    candidate["features"]["cta"] = round(cta, 2)
                    candidate["features"]["ctaMax"] = ctaMax
                    
                    cpa /= total_predicates if total_predicates > 0 else 1
                    candidate["features"]["cpa"] = round(cpa, 2)
                    candidate["features"]["cpaMax"] = cpaMax

In [None]:
fe_revsion = FeaturesExtractionRevision(output)
fe_revsion.compute_features()
fe_revsion._cpa

In [20]:
len(output["features"][0])

236

In [81]:
output["candidates"][0][0].sort(key=lambda x:x["features"]["cea"], reverse=True)

In [28]:
output["candidates"][3][3]

[{'id': 'Q434841',
  'name': '20 Century Fox',
  'descritpion': 'American film studio',
  'types': [{'id': 'Q375336', 'name': 'film studio'},
   {'id': 'Q1762059', 'name': 'film production company'},
   {'id': 'Q10689397', 'name': 'television production company'}],
  'features': {'ntoken': 3,
   'popularity': 82,
   'pos_score': 0.01,
   'es_score': 68.88,
   'es_diff_score': 0.004774,
   'ed_score': 1.0,
   'jaccard_score': 1.0,
   'jaccardNgram_score': 1.0,
   'cosine_similarity': 1.0,
   'p_subj_ne': 0,
   'p_subj_lit': 0.946,
   'p_obj_ne': 1.65,
   'desc': 0,
   'descNgram': 0,
   'cpa': 0.38,
   'cpaMax': 0.5,
   'cta': 0.58,
   'ctaMax': 1.0,
   'cea': 0.999,
   'diff': 0},
  'matches': {'0': [],
   '1': [],
   '2': [],
   '3': [{'p': 'P1128', 'o': '+2300', 's': 0.874},
    {'p': 'P8687', 'o': '+4441100', 's': 0.001},
    {'p': 'P8687', 'o': '+4202335', 's': 0.001},
    {'p': 'P8687', 'o': '+199000', 's': 0.01},
    {'p': 'P1128', 'o': '+2300', 's': 0.07},
    {'p': 'P8687', 'o'

In [123]:
output["candidates"][3][0]

[{'id': 'Q24871',
  'name': 'Avatar',
  'descritpion': '2009 American epic science fiction film directed by James Cameron',
  'types': [{'id': 'Q229390', 'name': '3D film'},
   {'id': 'Q25110269', 'name': 'live-action/animated film'}],
  'features': {'ntoken': 1,
   'popularity': 117,
   'pos_score': 0.09,
   'es_score': 36.17,
   'es_diff_score': 0.000959,
   'ed_score': 1.0,
   'jaccard_score': 1.0,
   'jaccardNgram_score': 1.0,
   'cosine_similarity': 1.0,
   'p_subj_ne': 2.0,
   'p_subj_lit': 2.081,
   'p_obj_ne': 0,
   'desc': 0,
   'descNgram': 0,
   'cpa': 0,
   'cpaMax': 0,
   'cta': 0,
   'ctaMax': 0,
   'cea': 0.996,
   'diff': 0},
  'matches': {'0': [],
   '1': [{'p': 'P57', 'o': 'Q42574', 's': 1.0},
    {'p': 'P162', 'o': 'Q42574', 's': 1.0},
    {'p': 'P58', 'o': 'Q42574', 's': 1.0},
    {'p': 'P1040', 'o': 'Q42574', 's': 1.0}],
   '2': [],
   '3': [{'p': 'P272', 'o': 'Q434841', 's': 1.0},
    {'p': 'P2047', 'o': '+162', 's': 0.081},
    {'p': 'P8687', 'o': '+148542', 's':