In [None]:
import os
import sys
import time
import traceback

import joblib
import redis
from keras.models import load_model

from phases.cea import CEAProcess
from phases.cpa import CPAProcess
from phases.cta import CTAProcess
from phases.data_preparation import compute_datatype, pre_processing
from phases.revision import RevisionProcess
from wrapper.lamAPI import LamAPI
from wrapper.logistic_regressor import LogisticRegressor
from wrapper.mongodb_conn import get_collection
from wrapper.neural_network import NeuralNetwork

neural1_path = "./ml_models/neural_network1.h5"
neural2_path = "./ml_models/neural_network2.h5"
logistic1_path = "./ml_models/logistic1.pkl"
logistic2_path = "./ml_models/logistic2.pkl"

rankers = [
                NeuralNetwork(load_model(neural1_path), "neural1"), 
                NeuralNetwork(load_model(neural2_path), "neural2"), 
                LogisticRegressor(joblib.load(logistic1_path), "logistic1"),
                LogisticRegressor(joblib.load(logistic2_path), "logistic2")
        ]


start = time.time()

REDIS_ENDPOINT = os.environ["REDIS_ENDPOINT"]
REDIS_JOB_DB = int(os.environ["REDIS_JOB_DB"])
LAMAPI_HOST, LAMAPI_PORT = os.environ["LAMAPI_ENDPOINT"].split(":")
LAMAPI_TOKEN = os.environ["LAMAPI_TOKEN"]

lamAPI = LamAPI(LAMAPI_HOST, LAMAPI_PORT, LAMAPI_TOKEN)
job_active = redis.Redis(host=REDIS_ENDPOINT, db=REDIS_JOB_DB)

row_c = get_collection('row')
log_c = get_collection('log')
cea_init_c = get_collection('ceaInit')
header_cea_c = get_collection('ceaHeader')
cea_c = get_collection('cea')
cpa_c = get_collection('cpa')
cta_c = get_collection('cta')
header_candidate_scored_c = get_collection('headerCandidateScored')
candidate_scored_c = get_collection('candidateScored')
data = row_c.find_one({})


if data is None:
    job_active.set("STOP", "")
    sys.exit(0)

header = data.get("header", [])
rows = data["rows"]
kg_reference = data["kgReference"]
candidate_size = data["candidateSize"]
column_metadata = data["column"]
target = data["target"]
type = data["types"]
_id = data["_id"]
dataset_name = data["datasetName"]
table_name = data["tableName"]

types_weights = data["typesWeights"]
predicates_weights = data["predicatesWeights"]
obj_row_update = {"status": "DONE", "time": None}

if len(column_metadata) == 0:
    column_metadata, target = compute_datatype(rows, lamAPI)
    column_metadata[str(target["SUBJ"])] = "SUBJ"
    obj_row_update["column"] = column_metadata
    obj_row_update["metadata"] = {
        "column": [{"idColumn": int(id_col), "tag": column_metadata[id_col]} for id_col in column_metadata]
    }
    obj_row_update["target"] = target

cells_set = pre_processing(header, rows, column_metadata, type, candidate_size)
cea = CEAProcess(data, lamAPI, rankers, target, log_c, type, kg_ref=kg_reference, size=candidate_size)
(results_cea, candidates) = cea.compute()
winning_candidates = [row["winningCandidates"] for row in results_cea]
keys = {"datasetName":dataset_name, "tableName":table_name}
cpa = CPAProcess(winning_candidates, keys, target, len(rows[0]["data"]), kg_ref=kg_reference)
results_cpa = cpa.compute()
cta = CTAProcess(winning_candidates, keys, target, len(rows[0]["data"]), kg_ref=kg_reference)
results_cta = cta.compute()
cpa_data, cta_data = (cpa._cpa, cta._cta)
if len(types_weights) > 0:
    cta_data = types_weights
if len(predicates_weights) > 0:
    cpa_data = predicates_weights     

revision = RevisionProcess(rankers, results_cea, candidates, cpa._cpa, cta._cta, target["SUBJ"])
revision.compute()
end = time.time()
execution_time = round(end - start, 2)
execution_time

In [1]:
import os
import sys
import time
import traceback
import joblib
import redis
import utils.utils as utils
from keras.models import load_model
from phases.data_preparation import DataPreparation
from wrapper.lamAPI import LamAPI
from wrapper.mongodb_conn import get_collection
from phases.lookup import Lookup
from phases.feauturesExtraction import FeauturesExtraction
from phases.prediction import Prediction
from phases.featuresExtractionRevision import FeaturesExtractionRevision
from phases.storage import Storage


neural2_path = "./ml_models/neural_network2.h5"


model = load_model(neural2_path)
               

start = time.time()

REDIS_ENDPOINT = os.environ["REDIS_ENDPOINT"]
REDIS_JOB_DB = int(os.environ["REDIS_JOB_DB"])
LAMAPI_HOST, LAMAPI_PORT = os.environ["LAMAPI_ENDPOINT"].split(":")
LAMAPI_TOKEN = os.environ["LAMAPI_TOKEN"]

lamAPI = LamAPI(LAMAPI_HOST, LAMAPI_PORT, LAMAPI_TOKEN)
job_active = redis.Redis(host=REDIS_ENDPOINT, db=REDIS_JOB_DB)

row_c = get_collection('row')
log_c = get_collection('log')
cea_prelinking_c = get_collection('ceaPrelinking')
header_cea_c = get_collection('ceaHeader')
cea_c = get_collection('cea')
cpa_c = get_collection('cpa')
cta_c = get_collection('cta')
header_candidate_scored_c = get_collection('headerCandidateScored')
candidate_scored_c = get_collection('candidateScored')
data = row_c.find_one({"datasetName": "HardTableR3-2021_100_m3", "tableName": "0B5GPQG2"})


if data is None:
    job_active.set("STOP", "")
    sys.exit(0)

header = data.get("header", [])
rows = data["rows"]
kg_reference = data["kgReference"]
limit = data["candidateSize"]
column_metadata = data["column"]
target = data["target"]
_type = data["types"]
_id = data["_id"]
dataset_name = data["datasetName"]
table_name = data["tableName"]

types_weights = data["typesWeights"]
predicates_weights = data["predicatesWeights"]
obj_row_update = {"status": "DONE", "time": None}
dp = DataPreparation(rows, lamAPI)
if len(column_metadata) == 0:
    column_metadata, target = dp.compute_datatype()
    column_metadata[str(target["SUBJ"])] = "SUBJ"
    obj_row_update["column"] = column_metadata
    obj_row_update["metadata"] = {
        "column": [{"idColumn": int(id_col), "tag": column_metadata[id_col]} for id_col in column_metadata]
    }
    obj_row_update["target"] = target
    
metadata = {
    "datasetName": dataset_name,
    "tableName": table_name,
    "kgReference": kg_reference
}

collections = {
    "ceaPrelinking": cea_prelinking_c,
    "cea": cea_c,
    "cta": cta_c,
    "cpa": cpa_c,
    "candidateScored": candidate_scored_c
}

dp.rows_normalization()
l = Lookup(data, lamAPI, target, log_c, kg_reference, limit)
rows = l.get_rows()
features = FeauturesExtraction(rows, lamAPI).compute_feautures()
Prediction(rows, features, model).compute_prediction("cea")
cea_preliking_data = utils.get_cea_pre_linking_data(metadata, rows)
revision = FeaturesExtractionRevision(rows)
features = revision.compute_features()
Prediction(rows, features, model).compute_prediction("score")
storage = Storage(metadata, cea_preliking_data, rows, revision._cta, revision._cpa, collections)

2023-03-29 09:07:29.137494: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-29 09:07:29.281783: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-03-29 09:07:29.286805: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-03-29 09:07:29.286831: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore 



In [2]:
rows[0].get_subject_cell()._id_col

0

In [7]:
target

{'SUBJ': 0, 'NE': [1, 2], 'LIT': [0], 'LIT_DATATYPE': {'0': 'DATETIME'}}

In [4]:
revision._cpa_weights

{'0': {'P2048': 46.02499999999999,
  'P2049': 35.444,
  'P1082': 0.002,
  'P2046': 0.014,
  'P1081': 0.048,
  'P2044': 0.216},
 '1': {}}

In [2]:
revision._cpa

{'0': {'P2048': 0.97,
  'P2049': 0.74,
  'P1082': 0.0,
  'P2046': 0.0,
  'P1081': 0.0,
  'P2044': 0.01},
 '1': {}}

In [2]:
cta = revision._cta
cta

{'0': {'Q18965': 1.0}, '1': {}}

In [2]:
cta = storage.store_cta_data()

In [3]:
max(revision._cta['0'])

'Q930752'

In [47]:
import heapq
from operator import itemgetter

n = 5

items = revision._cta['0']

topitems = heapq.nlargest(n, items.items(), key=itemgetter(1))  # Use .iteritems() on Py2
topitemsasdict = dict(topitems)
topitemsasdict

{'Q429785': 0.33,
 'Q108586636': 0.27,
 'Q43229': 0.27,
 'Q16521': 0.27,
 'Q11424': 0.27}

In [None]:
rows[9].get_cells()[0].candidates()

In [None]:
rows[1].get_cells()[0].candidates()

In [6]:
import json
print(json.dumps(rows[9].get_cells()[0].candidates()))

[{"id": "Q55213748", "name": "planta dos terrenos de anna luiza do espirito santo", "description": "work by joaquim rodrigues antunes junior 2/2", "types": [{"id": "Q18965", "name": "floor plan"}], "features": {"ntoken": 9, "popularity": 1, "pos_score": 0.03, "es_score": 200.64, "es_diff_score": 0.851958, "ed": 1.0, "jaccard": 1.0, "jaccardNgram": 1.0, "cosine_similarity": 1.0, "p_subj_ne": 0, "p_subj_lit": 1.0, "p_obj_ne": 0, "desc": 0.0, "descNgram": 0.038, "cpa": 0.92, "cpaMax": 1.0, "cta": 1.0, "ctaMax": 1.0, "cea": 0.967, "diff": 0.007000000000000006}, "matches": {"0": [], "1": [{"p": "P2048", "o": "+53", "s": 0.835}, {"p": "P2049", "o": "+63.5", "s": 1.0}]}, "predicates": {"0": {}, "1": {"P2048": 0.835, "P2049": 1.0}}, "match": false, "score": 0.98}, {"id": "Q55213747", "name": "planta dos terrenos de anna luiza do espirito santo", "description": "work by joaquim rodrigues antunes junior 1/2", "types": [{"id": "Q18965", "name": "floor plan"}], "features": {"ntoken": 9, "popularit

In [None]:
cea_prelinking_c.insert_one(cea_preliking_data[0])

In [15]:
type(cea_preliking_data[0]["winningCandidates"][0][0]["types"])

list

In [9]:
type(cea_preliking_data[0]["winningCandidates"][0]["types"])

TypeError: list indices must be integers or slices, not str

In [6]:
type(cea_preliking_data[0]["types"])

KeyError: 'types'

In [None]:
cea_preliking_data[0]

In [3]:
l.get_rows()[1].get_cells()[1].candidates()

[]

In [5]:
import pickle
# Load the Model back from file
logistic_r1_path = "logistic1_r1.pkl"
with open(logistic_r1_path, 'rb') as file:  
    logistic_r1_path = pickle.load(file)

In [2]:
pip install tensorflow

Successfully installed absl-py-1.4.0 astunparse-1.6.3 cachetools-5.3.0 flatbuffers-23.3.3 gast-0.4.0 google-auth-2.16.3 google-auth-oauthlib-0.4.6 google-pasta-0.2.0 grpcio-1.53.0 jax-0.4.7 keras-2.12.0 libclang-16.0.0 markdown-3.4.3 ml_dtypes-0.0.4 opt-einsum-3.3.0 pyasn1-0.4.8 pyasn1-modules-0.2.8 requests-oauthlib-1.3.1 rsa-4.9 tensorboard-2.12.0 tensorboard-data-server-0.7.0 tensorboard-plugin-wit-1.8.1 tensorflow-2.12.0 tensorflow-estimator-2.12.0 tensorflow-io-gcs-filesystem-0.31.0 termcolor-2.2.0 werkzeug-2.2.3
Note: you may need to restart the kernel to use updated packages.


In [None]:
header_results

In [None]:
header_candidates

In [2]:
cta_data

{'0': {'Q33999': 0.667,
  'Q488205': 0.667,
  'Q639669': 0.667,
  'Q2405480': 0.667,
  'Q177220': 0.667,
  'Q36834': 0.333,
  'Q10798782': 0.667,
  'Q10800557': 0.667,
  'Q9648008': 0.333,
  'Q55960555': 0.667,
  'Q5': 1.0,
  'Q486748': 0.333,
  'Q855091': 0.333,
  'Q1327329': 0.333,
  'Q1028181': 0.333,
  'Q482980': 0.333,
  'Q3282637': 0.333,
  'Q970153': 0.333,
  'Q2259451': 0.333},
 '1': {'Q13218391': 0.333,
  'Q62049': 0.333,
  'Q1093829': 0.667,
  'Q1549591': 1.0,
  'Q50330360': 0.333,
  'Q515': 1.0,
  'Q1388464': 0.333,
  'Q21507383': 0.333,
  'Q208511': 0.333,
  'Q2264924': 0.333,
  'Q51929311': 0.333,
  'Q15063611': 0.333,
  'Q1637706': 0.333},
 '2': {'Q3624078': 1.0,
  'Q1520223': 0.667,
  'Q5255892': 0.667,
  'Q512187': 0.667,
  'Q1489259': 0.667,
  'Q6256': 1.0,
  'Q99541706': 0.667,
  'Q223832': 0.333,
  'Q202686': 0.333,
  'Q43702': 0.333}}

In [None]:
result = lamAPI.lookup("wikidata")
result

In [None]:
Pickled_LR_Model.pre([])

In [19]:
results_cea[0]["winningCandidates"][0][0]['features'].keys()

dict_keys(['ntoken', 'popularity', 'pos_score', 'es_score', 'es_diff_score', 'ed', 'jaccard', 'jaccardNgram', 'cosine_similarity', 'p_subj_ne', 'p_subj_lit', 'p_obj_ne', 'desc', 'descNgram', 'cea', 'cpa', 'cpaMax', 'cta', 'ctaMax'])

In [21]:
features = ['ntoken', 'popularity', 'pos_score', 'es_score', 'es_diff_score', 'ed', 'jaccard', 'jaccardNgram', 'cosine_similarity', 'p_subj_ne', 'p_subj_lit', 'p_obj_ne', 'desc', 'descNgram']

In [58]:
temp = results_cea[0]["winningCandidates"][1][0]['features']
Xnew = [temp[feature] for feature in features]

In [59]:
import numpy as np
predictions = Pickled_LR_Model.predict([Xnew])
predictions



array([[0.06421909, 0.93578094]], dtype=float32)

In [60]:
predictions[0][1]

0.93578094

In [52]:
seq_predictions=Pickled_LR_Model.predict([Xnew])
print('Outputs shape')    
print(seq_predictions.shape) # prints (n,1) but  need (n,)
seq_predictions=np.transpose(seq_predictions)[0]  # transformation to get (n,)
print(seq_predictions.shape)  # now the shape is (n,)
# Applying transformation to get binary values predictions with 0.5 as thresold
seq_predictions = list(map(lambda x: 0 if x<0.5 else 1, seq_predictions))

Outputs shape
(1, 2)
(1,)


In [2]:
results_cea[0]

{'datasetName': 'Dataset1',
 'tableName': 'Test1',
 'row': 1,
 'data': ['zooey deschanel', 'los angeles', 'united states'],
 'winningCandidates': [[{'id': 'Q191719',
    'name': 'zooey deschanel',
    'description': 'american actress, model, and singer-songwriter',
    'types': [{'id': 'Q33999', 'name': 'actor'},
     {'id': 'Q488205', 'name': 'singer-songwriter'},
     {'id': 'Q639669', 'name': 'musician'},
     {'id': 'Q2405480', 'name': 'voice actor'},
     {'id': 'Q177220', 'name': 'singer'},
     {'id': 'Q36834', 'name': 'composer'},
     {'id': 'Q10798782', 'name': 'television actor'},
     {'id': 'Q10800557', 'name': 'film actor'},
     {'id': 'Q9648008', 'name': 'banjoist'},
     {'id': 'Q55960555', 'name': 'recording artist'},
     {'id': 'Q5', 'name': 'human'}],
    'match_count': {'obj': 2, 'lit': 0, 'rel': 0},
    'matches': {'1': [{'p': 'P19', 'o': 'Q65', 's': 1.0}],
     '2': [{'p': 'P27', 'o': 'Q30', 's': 1.0}]},
    'features': {'ntoken': 2,
     'popularity': 71,
     

In [None]:
ynew = Pickled_LR_Model.predict([Xnew])
# show the inputs and predicted outputs
for i in range(len(Xnew)):
    print("X=%s, Predicted=%s" % (Xnew[i], ynew[i]))

In [10]:
results_cea[0]

{'datasetName': 'Dataset1',
 'tableName': 'Test1',
 'row': 1,
 'data': ['zooey deschanel', 'los angeles', 'united states'],
 'winningCandidates': [[{'id': 'Q191719',
    'name': 'zooey deschanel',
    'description': 'american actress, model, and singer-songwriter',
    'types': [{'id': 'Q33999', 'name': 'actor'},
     {'id': 'Q488205', 'name': 'singer-songwriter'},
     {'id': 'Q639669', 'name': 'musician'},
     {'id': 'Q2405480', 'name': 'voice actor'},
     {'id': 'Q177220', 'name': 'singer'},
     {'id': 'Q36834', 'name': 'composer'},
     {'id': 'Q10798782', 'name': 'television actor'},
     {'id': 'Q10800557', 'name': 'film actor'},
     {'id': 'Q9648008', 'name': 'banjoist'},
     {'id': 'Q55960555', 'name': 'recording artist'},
     {'id': 'Q5', 'name': 'human'}],
    'match_count': {'obj': 2, 'lit': 0, 'rel': 0},
    'matches': {'1': [{'p': 'P19', 'o': 'Q65', 's': 1.0}],
     '2': [{'p': 'P27', 'o': 'Q30', 's': 1.0}]},
    'features': {'ntoken': 2,
     'popularity': 71,
     

In [2]:
from wrapper.lamAPI import LamAPI
from wrapper.mongodb_conn import get_collection
import os

LAMAPI_HOST, LAMAPI_PORT = os.environ["LAMAPI_ENDPOINT"].split(":")
LAMAPI_TOKEN = os.environ["LAMAPI_TOKEN"]


lamAPI = LamAPI(LAMAPI_HOST, LAMAPI_PORT, LAMAPI_TOKEN, kg="crunchbase")
lamAPI.literals(["fe41b3a3-3ad1-4b78-ba21-765d2339a787"])

{'fe41b3a3-3ad1-4b78-ba21-765d2339a787': {'literals': {'STRING': {'facebook_url': ['https://www.facebook.com/OfficialTfGM'],
    'primary_role': ['company'],
    'short_description': ['Transport for Greater Manchester is UK based company that oversees transport and travel in Greater Manchester.'],
    'email': ['customer.relations@tfgm.com'],
    'homepage_url': ['https://tfgm.com'],
    'type': ['organization'],
    'country_code': ['GBR'],
    'category_groups_list': ['Transportation,Travel and Tourism'],
    'category_list': ['Public Transportation,Transportation,Travel'],
    'alias1': ['tfgm'],
    'phone': ['44 161 244 1000'],
    'roles': ['company'],
    'cb_url': ['https://www.crunchbase.com/organization/transport-for-greater-manchester'],
    'uuid': ['fe41b3a3-3ad1-4b78-ba21-765d2339a787'],
    'linkedin_url': ['https://www.linkedin.com/company/transportforgreatermanchester'],
    'logo_url': ['https://res.cloudinary.com/crunchbase-production/image/upload/odejrfpbnyfhfrhonwc

In [None]:
from wrapper.lamAPI import LamAPI
from wrapper.mongodb_conn import get_collection
import os

LAMAPI_HOST, LAMAPI_PORT = os.environ["LAMAPI_ENDPOINT"].split(":")
LAMAPI_TOKEN = os.environ["LAMAPI_TOKEN"]


lamAPI = LamAPI(LAMAPI_HOST, LAMAPI_PORT, LAMAPI_TOKEN, kg="wikidata")
lamAPI.literals(["Q31"])

In [None]:
lamAPI_wrapper.literal_recognizer([20, 50])

In [None]:
import os
from wrapper.lamAPI import LamAPI


LAMAPI_HOST = os.environ["LAMAPI_ENDPOINT"]
LAMAPI_TOKEN = os.environ["LAMAPI_TOKEN"]
lamapi_wrapper = LamAPI(LAMAPI_HOST, LAMAPI_TOKEN)
subject_objects = lamapi_wrapper.objects(["Q30"])
subject_objects

In [5]:
lamapi_wrapper.lookup("united states")

{'united states': [{'id': 'Q35657',
   'name': 'United States states',
   'description': 'constituent political entity sharing sovereignty as the United States of America',
   'types': [{'id': 'Q28872924',
     'name': 'designation for an administrative territorial entity of a single country'}],
   'ambiguity_mention': 0.747,
   'corrects_tokens': 1.0,
   'ntoken_mention': 2,
   'ntoken_entity': 3,
   'length_mention': 13,
   'length_entity': 20,
   'popularity': 0.14,
   'pos_score': 0.01,
   'es_score': 1.0,
   'ed_score': 0.65,
   'jaccard_score': 1.0,
   'jaccardNgram_score': 1.0,
   'cosine_similarity': 0.94},
  {'id': 'Q30',
   'name': 'United States',
   'description': 'sovereign state in North America',
   'types': [{'id': 'Q3624078', 'name': 'sovereign state'},
    {'id': 'Q1520223', 'name': 'constitutional republic'},
    {'id': 'Q5255892', 'name': 'democratic republic'},
    {'id': 'Q512187', 'name': 'federal republic'},
    {'id': 'Q1489259', 'name': 'superpower'},
    {'id

In [None]:
column_metadata

In [None]:
header_results[0]["winningCandidates"]

In [None]:
lamapi_wrapper.literal_recognizer(["ROME", "usA", "belgium"])

In [None]:
compute_datatype(rows, lamapi_wrapper)

In [None]:
cpa._cpa

In [3]:
cta._cta

{'0': {'Q33999': 0.667,
  'Q488205': 0.667,
  'Q639669': 0.333,
  'Q2405480': 0.333,
  'Q177220': 0.667,
  'Q36834': 0.333,
  'Q10798782': 0.667,
  'Q10800557': 0.667,
  'Q9648008': 0.333,
  'Q55960555': 0.667,
  'Q5': 1.0,
  'Q486748': 0.333,
  'Q855091': 0.333,
  'Q1327329': 0.333,
  'Q1028181': 0.333,
  'Q482980': 0.333,
  'Q3282637': 0.333,
  'Q2259451': 0.333},
 '1': {'Q13218391': 0.333,
  'Q62049': 0.333,
  'Q1093829': 0.667,
  'Q1549591': 1.0,
  'Q50330360': 0.333,
  'Q515': 1.0,
  'Q1388464': 0.333,
  'Q21507383': 0.333,
  'Q208511': 0.333,
  'Q2264924': 0.333,
  'Q51929311': 0.333,
  'Q15063611': 0.333,
  'Q1637706': 0.333},
 '2': {'Q3624078': 1.0,
  'Q1520223': 0.667,
  'Q5255892': 0.667,
  'Q512187': 0.667,
  'Q1489259': 0.667,
  'Q6256': 1.0,
  'Q99541706': 0.667,
  'Q223832': 0.333,
  'Q202686': 0.333,
  'Q43702': 0.333}}

In [None]:
results_cea

In [None]:
rows

In [None]:
weights = {"ed": 8, "jaccard": 7, "jaccardNgram": 5, "p_subj_ne": 5, "p_subj_lit": 7, "p_obj_ne": 4, "desc":2, "descNgram": 3} 
candidates = results_cea[0]["winningCandidates"][0]
for candidate in candidates:
    score = 0
    for feature in weights:     
        score += candidate["features"][feature] * weights[feature]
    print(candidate, score)    

In [None]:
results_cea[0]["winningCandidates"][1]

In [None]:
results_cea[2]

In [None]:
candidates[3]['candidates'][0]

In [None]:
import pandas as pd
from tqdm import tqdm
def get_cea_ann(cea_gt_path):
    gt_files = pd.read_csv(cea_gt_path)
    target = {}
    for row in tqdm(gt_files.itertuples(), total=len(gt_files)):
        id_table, id_row, id_col, gt = (row[i] for i in range(1, 5))
        key = f"{id_table} {id_row} {id_col}"
        target[key] = gt
    return target 

cea_target_path = '../data/2T/DataSets/ToughTablesR2-WD/Valid/gt/cea_gt.csv'
cpa_target_path = './data/HardTableR2/targets/HardTable_CPA_WD_Round2_Targets.csv'
cta_target_path = './data/2T/DataSets/ToughTablesR2-WD/Valid/gt/cta_gt.csv'

cea_gt = get_cea_ann(cea_target_path)
my_cea = { }
for row in results_cea:
    id_row = row["row"]
    id_table = row["tableName"]
    for id_col in row["cea"]:
        if row["winningCandidates"][int(id_col)][0]["score"] > 1 or True:
            my_cea[f"{id_table} {id_row} {id_col}"] = row["cea"][id_col]
sum([1 for key in my_cea if my_cea[key] in cea_gt.get(key, [])]) / len(my_cea)        

In [None]:
len(my_cea)

In [None]:
[(key, my_cea[key], cea_gt.get(key)) for key in my_cea if my_cea[key] not in cea_gt.get(key, [])]

In [None]:
pip install nltk

In [None]:
import os
import time
import traceback

from elasticsearch import Elasticsearch
from mongodb_conn import get_collection
from body import body
import copy

ELASTIC_ENDPOINT = os.environ['ELASTIC_ENDPOINT']
ELASTIC_ENDPOINT_PORT = os.environ['ELASTIC_ENDPOINT_PORT']
ELASTIC_INDEX_NAME = os.environ['ELASTIC_INDEX_NAME']
ELASTICSEARCH_USERNAME = os.environ['ELASTICSEARCH_USERNAME']
ELASTICSEARCH_PASSWORD = os.environ['ELASTICSEARCH_PASSWORD']



es = Elasticsearch(hosts=f'http://{ELASTIC_ENDPOINT}:{ELASTIC_ENDPOINT_PORT}', request_timeout=300, max_retries=10, 
                    retry_on_timeout=True, http_auth=(ELASTICSEARCH_USERNAME, ELASTICSEARCH_PASSWORD))
es.info()

In [None]:
f'http://{ELASTIC_ENDPOINT}:{ELASTIC_ENDPOINT_PORT}', ELASTICSEARCH_USERNAME, ELASTICSEARCH_PASSWORD

In [None]:
import os
import time
import traceback

from elasticsearch import Elasticsearch
from mongodb_conn import get_collection
from body import body
import copy
import requests

headers = {
    'accept': 'application/json',
    # Already added when you pass json= but not when you pass data=
    # 'Content-Type': 'application/json',
}

params = {
    'token': 'ee4ba0c4f8db0eb3580cb3b7b5536c54',
}
ELASTIC_ENDPOINT = os.environ['ELASTIC_ENDPOINT']
ELASTIC_ENDPOINT_PORT = os.environ['ELASTIC_ENDPOINT_PORT']


(cells_c, candidate_c, log_c, missing_c) = [get_collection(name) for name in ['cell', 'candidate', 'log', 'missing_c']]


def get_query(cell, type=None):
    if type is None:
        query = copy.deepcopy(body['token+ngrams'])
    else:
        query = copy.deepcopy(body['token+ngrams+type'])
        query['bool']['should'][2]['match']['type']['query'] = type
    query['bool']['should'][0]['match']['name']['query'] = cell
    query['bool']['should'][1]['match']['name.ngrams']['query'] = cell    
    query['bool']['must'][0]['range']['ntoken']['gte'] = len(cell.split(' ')) - 3
    query['bool']['must'][0]['range']['ntoken']['lte'] = len(cell.split(' ')) + 3    
    return query

query = get_query('paris')
candidates = requests.post(f'http://{ELASTIC_ENDPOINT}:{ELASTIC_ENDPOINT_PORT}/lookupES', 
                                        params=params, headers=headers, 
                                        json={'json': {"query": query}})

In [None]:
candidates.json()

In [1]:
from keras.models import load_model



neural2_path = "./ml_models/neural_network2.h5"


model = load_model(neural2_path)

2023-05-30 09:03:18.308273: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-30 09:03:18.689361: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-05-30 09:03:18.689381: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-05-30 09:03:20.036941: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

In [13]:
featues = { 
    "ntoken" : 3, 
    "popularity" : 100, 
    "pos_score" : 0.02, 
    "es_score" : 52.99, 
    "es_diff_score" : 0.072337, 
    "ed" : 0.48, 
    "jaccard" : 0.5, 
    "jaccardNgram" : 0.72, 
    "cosine_similarity" : 0.65, 
    "p_subj_ne" : 0, 
    "p_subj_lit" : 0, 
    "p_obj_ne" : 0, 
    "desc" : 0.125, 
    "descNgram" : 0.277, 
    "cpa" : 0.0, 
    "cpaMax" : 0, 
    "cta" : 1.0, 
    "ctaMax" : 1.0, 
    "cea" : 1.0, 
    "diff" : 0.0 
}

[[3,
  998617,
  0.02,
  52.99,
  0.072337,
  0.48,
  0.5,
  0.72,
  0.65,
  0,
  0,
  0,
  0.125,
  0.277,
  0.0,
  0,
  1.0,
  1.0,
  1.0,
  0.0]]

In [14]:
model.predict([list(featues.values())])



array([[0.3178707, 0.6821293]], dtype=float32)