In [191]:
import json
import pandas as pd
from haversine import haversine
from pygeohash import decode_exactly
from tqdm import tqdm

In [168]:
from sklearn.metrics.pairwise import cosine_similarity

In [161]:
def haversine_from_geohash(hash1:str, hash2:str) -> float:
    """
    function to estimate haversine distance from geohash strings
    :param hash1: first loaction encoded in geohash
    :param hash2: second location encoded in geohash
    :return: estimated distance between locations in km
    """
    # only take first two parts of tuples, rest are error estimations
    hd = haversine(decode_exactly(hash1)[:2], decode_exactly(hash2)[:2])
    return hd

In [162]:
with open('geohash_precision.json', 'r') as f:
    geohash_precision_map = json.load(f)

In [211]:
candidates = pd.read_parquet('candidates_embedding.parquet.gzip')
subjects = pd.read_parquet('subjects_embedding.parquet.gzip')

In [214]:
with open('predicate_map.json', 'r') as f:
    predicate_map = json.load(f)
with open('literal_map.json', 'r') as f:
    literal_map = json.load(f)
with open('type_map.json', 'r') as f:
    type_map = json.load(f)

In [215]:
predicates = pd.DataFrame.from_dict(predicate_map, orient='index')
literals = pd.DataFrame.from_dict(literal_map, orient='index')
types = pd.DataFrame.from_dict(type_map, orient='index')

In [216]:
cos_sim_literal = cosine_similarity(candidates.loc[:, [f'label_emb{i}' for i in range(300)]], literals.values)
literal_cossim_df = pd.DataFrame(cos_sim_literal, index=candidates.index.to_list(), columns=literals.index)

In [217]:
cos_sim_predicate = cosine_similarity(types.values, predicates.values)
predicate_cossim_df = pd.DataFrame(cos_sim_predicate, index=types.index, columns=predicates.index)

In [250]:
for precision in set(geohash_precision_map.values()):
    print(f"distance matrix prec = {precision}: ({len(candidates.apply(lambda row: row['geohash'][:precision], axis=1).unique())}, {len(subjects.apply(lambda row: row['geohash'][:precision], axis=1).unique())})")

distance matrix prec = 1: (2, 1)
distance matrix prec = 3: (8, 6)
distance matrix prec = 4: (64, 55)


In [221]:
distance_matrices = {}
for p in tqdm(set(geohash_precision_map.values())):
    dist_mat = {hash1: {hash2: haversine_from_geohash(hash1, hash2) for hash2 in candidates.apply(lambda row: row['geohash'][:p], axis=1).unique()} for hash1 in subjects.apply(lambda row: row['geohash'][:p], axis=1).unique()}
    distance_matrices.update({p: dist_mat})

100%|██████████| 3/3 [00:48<00:00, 16.31s/it]


In [268]:
matched = {}
pairs = []
count = 0
num_subjects = 2000
for index, row in tqdm(subjects[:num_subjects].iterrows(), total=num_subjects):
    precision = geohash_precision_map[row['predicate']]
    gh = row['geohash'][:precision]
    pred = row['predicate']
    lit = row['literal']
    if (gh, pred, lit) in matched:  # only consider new constellations
        count += 1
        pairs.append((row['uri'], row['predicate'], matched[(gh, pred, lit)]))
    else:
        dm_precision = distance_matrices[precision][gh]
        cossim_matching_predicate= predicate_cossim_df.loc[:, pred]
        cossim_matching_literal = literal_cossim_df.loc[:, lit]
        distances = candidates.apply(lambda cand: dm_precision[cand['geohash'][:precision]], axis=1)
        max_val = max(distances)
        distances = 1 - (distances / max_val)
        embed_sim = candidates.apply(lambda cand: cossim_matching_predicate.loc[cand['type']] + cossim_matching_literal.loc[cand.name], axis=1)
        candidate_eval = distances + embed_sim
        best_candidate = candidate_eval.idxmax()
        best_candidate_uri = candidates.loc[best_candidate, 'uri']
        pairs.append((row['uri'], row['predicate'], best_candidate_uri))
        matched.update({(gh, pred, lit): best_candidate_uri})
print(f'{count/num_subjects*100:.2f}% of matches saved by checking')
# 2.7 s by doing embedding simultaneously
# 2.5 s by removing repetitive calls
# 2.5 s for adding matched into dictionary
# 2.5 for checking about the same if 10000 entries
# 2.4 s removed repetitive call for dist_mat
# 2.3 s removing repeated calls on cossim matrices
# with match saving 6:30 for 2000
# interpolation for 8200 subjects a 3.6s: 8.2 h

100%|██████████| 2000/2000 [06:34<00:00,  5.07it/s]

91.45% of matches saved by checking



