In [1]:
import os
import numpy as np
import pandas as pd
import math
import time
from lamAPI import LamAPI


SAMPLE_SIZE = 25
LAMAPI_HOST, LAMAPI_PORT = os.environ["LAMAPI_ENDPOINT"].split(":")
LAMAPI_TOKEN = os.environ["LAMAPI_TOKEN"]

def clean_str(value):
    value = str(value)
    stop_charaters = ["_"]
    for char in stop_charaters:
        value = value.replace(char, " ")
    value = " ".join(value.split()).lower()
    return value


def format_table(table_df):
    rows = []

    for id_row, row in enumerate(table_df):
        rows.append({"idRow":id_row+1, "data":[clean_str(cell) for cell in row]})
    #buffer = []
    #rows = np.array_split(rows, max(math.floor(len(rows)/SAMPLE_SIZE), 1))
        
    return rows


def compute_datatype(rows, lamapi_wrapper):
    column_metadata = {}
    columns_data = {str(i):[] for i in range(0, len(rows[0]['data']))}
    target = {"SUBJ": 0, "NE": [], "LIT": [], "LIT_DATATYPE": {}}
    for row in rows:
        for id_col, cell in enumerate(row["data"]):
            columns_data[str(id_col)].append(str(cell))
    
    first_NE_column = False     
    for id_col in columns_data:
        metadata = lamapi_wrapper.literal_recognizer(columns_data[id_col])
        max_datatype = max(metadata, key=metadata.get)
        if max_datatype == "ENTITY":
            column_metadata[id_col] = "NE"
            target['NE'].append(int(id_col)) 
            if not first_NE_column:
                target["SUBJ"] = int(id_col)
            first_NE_column = True
        else:
            column_metadata[id_col] = "LIT"
            target['LIT'].append(int(id_col))
            target['LIT_DATATYPE'][str(id_col)] = max_datatype
            
    return column_metadata, target        


lamAPI = LamAPI(LAMAPI_HOST, LAMAPI_PORT, LAMAPI_TOKEN)
time = time
name = "film.csv"
input_file = pd.read_csv(name)
rows = format_table(input_file.values.tolist())
header = list(input_file.columns)
kg_reference = "wikidata"
column_metadata = {}
target = None


output = {
    "name": name,
    "header": header,
    "rows": rows,
    "metadata": None,
    "target": None,
    "kg_reference": kg_reference,
    "limit": 100,
    "status": "DONE", 
    "time": time.time()
}

if len(column_metadata) == 0:
        column_metadata, target = compute_datatype(rows, lamAPI)
        column_metadata[str(target["SUBJ"])] = "SUBJ"
        #output["column"] = column_metadata
        output["metadata"] = {
            "column": [{"idColumn": int(id_col), "tag": column_metadata[id_col]} for id_col in column_metadata]
        }
        output["target"] = target

In [2]:
output

{'name': 'film.csv',
 'header': ['title',
  'director',
  'release year',
  'domestic distributor',
  'length in min',
  'worldwide gross'],
 'rows': [{'idRow': 1,
   'data': ['jurassic world',
    'colin trevorrow',
    '2015',
    'universal pictures',
    '124',
    '1670400637']},
  {'idRow': 2,
   'data': ['superman returns',
    'bryan singer',
    '2006',
    'warner bros.',
    '154',
    '391081192']},
  {'idRow': 3,
   'data': ['batman begins',
    'christopher nolan',
    '2005',
    'warner bros.',
    '140',
    '371853783']},
  {'idRow': 4,
   'data': ['avatar',
    'james cameron',
    '2009',
    '20 century fox',
    '162',
    '2744336793']}],
 'metadata': {'column': [{'idColumn': 0, 'tag': 'SUBJ'},
   {'idColumn': 1, 'tag': 'NE'},
   {'idColumn': 2, 'tag': 'LIT'},
   {'idColumn': 3, 'tag': 'NE'},
   {'idColumn': 4, 'tag': 'LIT'},
   {'idColumn': 5, 'tag': 'LIT'}]},
 'target': {'SUBJ': 0,
  'NE': [0, 1, 3],
  'LIT': [2, 4, 5],
  'LIT_DATATYPE': {'2': 'NUMBER', '4': 'N

In [2]:
import os
import numpy as np
import pandas as pd
import math
import time
from lamAPI import LamAPI
from s2_process.process import Process

SAMPLE_SIZE = 25
LAMAPI_HOST, LAMAPI_PORT = os.environ["LAMAPI_ENDPOINT"].split(":")
LAMAPI_TOKEN = os.environ["LAMAPI_TOKEN"]
lamAPI = LamAPI(LAMAPI_HOST, LAMAPI_PORT, LAMAPI_TOKEN)
p1 = Process(output, lamAPI)
output["candidates"] = p1._rows

In [3]:
output["candidates"][0][0][0:2]

[{'id': 'Q3512046',
  'name': 'Jurassic world',
  'descritpion': '2015 American science fiction adventure film directed by Colin Trevorrow',
  'types': [{'id': 'Q229390', 'name': '3D film'},
   {'id': 'Q11424', 'name': 'film'},
   {'id': 'Q261636', 'name': 'sequel'}],
  'features': {'ntoken': 2,
   'popularity': 60,
   'pos_score': 0.01,
   'es_score': 54.63,
   'es_diff_score': 0.006261,
   'ed_score': 1.0,
   'jaccard_score': 1.0,
   'jaccardNgram_score': 1.0,
   'cosine_similarity': 1.0,
   'p_subj_ne': 0,
   'p_subj_lit': 0,
   'p_obj_ne': 0,
   'desc': 0,
   'descNgram': 0,
   'cpa': 0,
   'cpaMax': 0,
   'cta': 0,
   'ctaMax': 0,
   'cea': 0,
   'diff': 0},
  'matches': {'0': [], '1': [], '2': [], '3': [], '4': [], '5': []},
  'pred': {'0': {}, '1': {}, '2': {}, '3': {}, '4': {}, '5': {}}},
 {'id': 'Q21877685',
  'name': 'Jurassic World',
  'descritpion': '2018 5th Jurassic Park film directed by Juan Antonio Bayona',
  'types': [{'id': 'Q11424', 'name': 'film'}],
  'features': {'

In [15]:
"Q5145625" in temp["wikidata"]["Q3512046"]["objects"]

True

In [4]:
import metrics as metrics


class FeaturesExtraction:
    def __init__(self, data, lamAPI):
        self._data = data
        self._lamAPI = lamAPI
    
    
    def compute_features(self):
        rows = self._data["rows"]
        target = self._data["target"]
        for index, row in enumerate(rows):
            cells = row["data"]
            for id_col_ne_subj in target["NE"]:
                ne_cell_subj = cells[id_col_ne_subj]
                for id_col_ne_obj in target["NE"]:
                    if id_col_ne_subj == id_col_ne_obj:
                        continue
                    ne_cell_obj = cells[id_col_ne_obj]
                    self._compute_similarity_between_ne_cells(index, id_col_ne_subj, id_col_ne_obj)
                for id_col_lit_obj in target["LIT"]:
                    lit_cell_obj = cells[id_col_lit_obj]
                    self._match_lit_cells(index, id_col_ne_subj, id_col_ne_obj, lit_cell_obj, target["LIT_DATATYPE"][str(id_col_lit_obj)])
                    
        
    def compute_relationship_score(self):
        ne_cells = self._row.get_ne_cells()
        cells = self._row.get_cells()
        for ne_cell in ne_cells:
            for cell in cells:
                if cell == ne_cell:
                    continue
                elif cell.is_lit_cell:
                    self._match_lit_cells(ne_cell, cell)
                else:
                    self._compute_similarity_between_ne_cells(ne_cell, cell)
                

    def _compute_similarity_between_ne_cells(self, id_row, id_col_subj_cell, id_col_obj_cell):
        subj_candidates = self._data["candidates"][id_row][id_col_subj_cell]
        obj_candidates = self._data["candidates"][id_row][id_col_obj_cell]
        subj_id_candidates = [candidate["id"] for candidate in subj_candidates]
        obj_id_candidates = [candidate["id"] for candidate in obj_candidates]
        
        subjects_objects = self._lamAPI.objects(subj_id_candidates)
        object_rel_score_buffer = {}

        for subj_candidate in subj_candidates:
            id_subject = subj_candidate["id"]
            subj_candidate_objects = subjects_objects.get(id_subject, {}).get("objects", {})
            objects_set = set(subj_candidate_objects.keys())
            #subj_candidate["matches"][str(id_col_obj_cell)] = []
            #subj_candidate["pred"][str(id_col_obj_cell)] = {}
              
            objects_itersection = objects_set.intersection(set(obj_id_candidates))
            #print(objects_itersection)
            obj_score_max = 0
            for obj_candidate in obj_candidates:
                id_object = obj_candidate["id"]  
                if id_object not in objects_itersection:
                    continue
                              
                score = obj_candidate["features"]["ed_score"]
                if score > obj_score_max:
                    obj_score_max = score
                   
                if id_object not in object_rel_score_buffer:
                    object_rel_score_buffer[id_object] = 0
                score_rel = subj_candidate["features"]["ed_score"]
                if score_rel > object_rel_score_buffer[id_object]:
                    object_rel_score_buffer[id_object] = score_rel
                for predicate in subj_candidate_objects[id_object]:
                    subj_candidate["matches"][str(id_col_obj_cell)].append({
                        "p": predicate,
                        "o": id_object,
                        "s": round(score, 3)
                    })
                    subj_candidate["pred"][str(id_col_obj_cell)][predicate] = score
            subj_candidate["features"]["p_subj_ne"] += obj_score_max          
        
        for obj_candidate in obj_candidates:
            id_object = obj_candidate["id"]  
            if id_object not in object_rel_score_buffer:
                continue
            obj_candidate["features"]["p_obj_ne"] += object_rel_score_buffer[id_object]    
        
      
        """
        for subject, objects in subjects_objects.items():
            if "objects" in objects:
                objects = objects["objects"]
            subj_candidates[subject]["matches"][str(id_col_obj_cell)] = []
            subj_candidates[subject]["pred"][str(id_col_obj_cell)] = {}
            objects_set = set(objects.keys())
            obj_score_max = 0
            for candidate in objects_set.intersection(set(obj_id_candidates)):
                score = obj_cell.candidates_entities()[candidate]["features"]["ed"]
                if score > obj_score_max:
                    obj_score_max = score
                   
                if candidate not in object_rel_score_buffer:
                    object_rel_score_buffer[candidate] = 0
                score_rel = subj_candidates[subject]["features"]["ed"]
                if score_rel > object_rel_score_buffer[candidate]:
                    object_rel_score_buffer[candidate] = score_rel
                for predicate in objects[candidate]:
                    subj_cell.candidates_entities()[subject]["matches"][str(obj_cell._id_col)].append({
                        "p": predicate,
                        "o": candidate,
                        "s": round(score, 3)
                    })
                    subj_cell.candidates_entities()[subject]["pred"][str(obj_cell._id_col)][predicate] = score
            subj_cell.candidates_entities()[subject]["features"]["p_subj_ne"] += obj_score_max
              
        for candidate in object_rel_score_buffer:
            temp = obj_cell.candidates_entities().get(candidate)
            if temp is not None:
                temp["features"]["p_obj_ne"] += object_rel_score_buffer[candidate]
        """    
    
    def _match_lit_cells(self, id_row, id_col_subj_cell, id_col_obj_col, obj_cell, obj_cell_datatype):
    
        def get_score_based_on_datatype(valueInCell, valueFromKG, datatype):
            score = 0
            if datatype == "NUMBER":
                score = metrics.compute_similarty_between_numbers(valueInCell, valueFromKG.lower())
            elif datatype == "DATETIME":
                score = metrics.compute_similarity_between_dates(valueInCell, valueFromKG.lower())
            elif datatype == "STRING":
                score = metrics.compute_similarity_between_string(valueInCell, valueFromKg.lower())
            return score
        
        subj_candidates = self._data["candidates"][id_row][id_col_subj_cell]
        subj_id_candidates = [candidate["id"] for candidate in subj_candidates]
        cand_lamapi_literals = self._lamAPI.literals(subj_id_candidates)
        datatype = obj_cell_datatype
        
        for subj_candidate in subj_candidates:
            id_subject = subj_candidate["id"]
            literals = cand_lamapi_literals[id_subject]
            if "literals" in literals:
                literals = literals['literals']
            if len(literals[datatype.lower()]) == 0:
                continue
            #subj_candidate["matches"][str(id_col_obj_col)] = []
            #subj_candidate["pred"][str(id_col_obj_col)] = {}
            #subj_cell.candidates_entities()[subject]["match_count"]["lit"] += 1
            max_score = 0
            for predicate in literals[datatype.lower()]:
                for valueFromKg in literals[datatype.lower()][predicate]:
                    score = get_score_based_on_datatype(obj_cell, valueFromKg, datatype)
                    score = round(score, 3)
                    if score > 0:
                        subj_candidate["matches"][str(id_col_obj_col)].append({
                            "p": predicate,
                            "o": valueFromKg,
                            "s": round(score, 3)
                        })  
                        if score > max_score:
                            max_score = score
                        if predicate not in subj_candidate["pred"][str(id_col_obj_col)]:
                            subj_candidate["pred"][str(id_col_obj_col)][predicate] = 0
                        if score > subj_candidate["pred"][str(id_col_obj_col)][predicate]:
                            subj_candidate["pred"][str(id_col_obj_col)][predicate] = score    
                            
            subj_candidate["features"]["p_subj_lit"] += round(max_score, 3) 

In [5]:
FeaturesExtraction(output, lamAPI).compute_features()

In [6]:
output["candidates"][0][0][0:2]

[{'id': 'Q3512046',
  'name': 'Jurassic world',
  'descritpion': '2015 American science fiction adventure film directed by Colin Trevorrow',
  'types': [{'id': 'Q229390', 'name': '3D film'},
   {'id': 'Q11424', 'name': 'film'},
   {'id': 'Q261636', 'name': 'sequel'}],
  'features': {'ntoken': 2,
   'popularity': 60,
   'pos_score': 0.01,
   'es_score': 54.63,
   'es_diff_score': 0.006261,
   'ed_score': 1.0,
   'jaccard_score': 1.0,
   'jaccardNgram_score': 1.0,
   'cosine_similarity': 1.0,
   'p_subj_ne': 2.0,
   'p_subj_lit': 2.061,
   'p_obj_ne': 0,
   'desc': 0,
   'descNgram': 0,
   'cpa': 0,
   'cpaMax': 0,
   'cta': 0,
   'ctaMax': 0,
   'cea': 0,
   'diff': 0},
  'matches': {'0': [],
   '1': [{'p': 'P57', 'o': 'Q5145625', 's': 1.0},
    {'p': 'P161', 'o': 'Q5145625', 's': 1.0},
    {'p': 'P58', 'o': 'Q5145625', 's': 1.0}],
   '2': [],
   '3': [{'p': 'P750', 'o': 'Q168383', 's': 1.0},
    {'p': 'P2047', 'o': '+124', 's': 0.061},
    {'p': 'P2047', 'o': '+124', 's': 1.0},
    {'p