In [270]:
import json
import pandas as pd
from haversine import haversine
from pygeohash import decode_exactly
from tqdm import tqdm
import csv
from sklearn.metrics.pairwise import cosine_similarity

In [161]:
def haversine_from_geohash(hash1:str, hash2:str) -> float:
    """
    function to estimate haversine distance from geohash strings
    :param hash1: first loaction encoded in geohash
    :param hash2: second location encoded in geohash
    :return: estimated distance between locations in km
    """
    # only take first two parts of tuples, rest are error estimations
    hd = haversine(decode_exactly(hash1)[:2], decode_exactly(hash2)[:2])
    return hd

In [162]:
with open('geohash_precision.json', 'r') as f:
    geohash_precision_map = json.load(f)

In [211]:
candidates = pd.read_parquet('candidates_embedding.parquet.gzip')
subjects = pd.read_parquet('subjects_embedding.parquet.gzip')

In [214]:
with open('predicate_map.json', 'r') as f:
    predicate_map = json.load(f)
with open('literal_map.json', 'r') as f:
    literal_map = json.load(f)
with open('type_map.json', 'r') as f:
    type_map = json.load(f)

In [215]:
predicates = pd.DataFrame.from_dict(predicate_map, orient='index')
literals = pd.DataFrame.from_dict(literal_map, orient='index')
types = pd.DataFrame.from_dict(type_map, orient='index')

In [216]:
cos_sim_literal = cosine_similarity(candidates.loc[:, [f'label_emb{i}' for i in range(300)]], literals.values)
literal_cossim_df = pd.DataFrame(cos_sim_literal, index=candidates.index.to_list(), columns=literals.index)

In [217]:
cos_sim_predicate = cosine_similarity(types.values, predicates.values)
predicate_cossim_df = pd.DataFrame(cos_sim_predicate, index=types.index, columns=predicates.index)

In [250]:
for precision in set(geohash_precision_map.values()):
    print(f"distance matrix prec = {precision}: ({len(candidates.apply(lambda row: row['geohash'][:precision], axis=1).unique())}, {len(subjects.apply(lambda row: row['geohash'][:precision], axis=1).unique())})")

distance matrix prec = 1: (2, 1)
distance matrix prec = 3: (8, 6)
distance matrix prec = 4: (64, 55)


In [288]:
distance_matrices = {}
for p in tqdm(set(geohash_precision_map.values())):
    dist_mat = {hash1: {hash2: haversine_from_geohash(hash1, hash2) for hash2 in candidates.apply(lambda row: row['geohash'][:p], axis=1).unique()} for hash1 in subjects.apply(lambda row: row['geohash'][:p], axis=1).unique()}
    distance_frame = pd.DataFrame.from_dict(dist_mat, orient='index')
    max_val = distance_frame.values.max()
    distance_frame = pd.DataFrame(1 - (distance_frame.values / max_val), columns=distance_frame.columns, index=distance_frame.index)
    distance_matrices.update({p: distance_frame})

100%|██████████| 3/3 [00:46<00:00, 15.64s/it]


In [296]:
matched = {}
pairs = []
num_subjects = 20
for index, row in tqdm(subjects[:num_subjects].iterrows(), total=len(subjects), desc='- Finding matches'):
    # save core properties for repetitive access
    precision = geohash_precision_map[row['predicate']]
    gh = row['geohash'][:precision]
    pred = row['predicate']
    lit = row['literal']
    if (gh, pred, lit) in matched:  # only consider new constellations for subjects
        # if these three values are the same, the uslp-score will also be the same
        pairs.append((row['uri'], row['predicate']) + matched[(gh, pred, lit)])
    else:
        # prepare similarity and distance matrices for repetitive access
        dm_precision = distance_matrices[precision].loc[gh, :]
        cossim_matching_predicate = predicate_cossim_df.loc[:, pred]
        cossim_matching_literal = literal_cossim_df.loc[:, lit]

        # iterate through candidates and compute uslp-score
        candidate_eval = candidates.apply(
            lambda cand: dm_precision.loc[cand['geohash'][:precision]]
                         + cossim_matching_predicate.loc[cand['type']]
                         + cossim_matching_literal.loc[cand.name],
            axis=1
        )

        # find best candidate
        best_candidate = candidate_eval.idxmax()
        best_candidate_uri = candidates.loc[best_candidate, 'uri']
        best_candidate_score = candidate_eval[best_candidate]

        # store matches and save constellation
        pairs.append((row['uri'], row['predicate'], best_candidate_uri, candidate_eval[best_candidate]))
        matched.update({(gh, pred, lit): (best_candidate_uri, best_candidate_score)})
# 2.7 s by doing embedding simultaneously
# 2.5 s by removing repetitive calls
# 2.5 s for adding matched into dictionary
# 2.5 for checking about the same if 10000 entries
# 2.4 s removed repetitive call for dist_mat
# 2.3 s removing repeated calls on cossim matrices
# with match saving 6:30 for 2000
# interpolation for 8200 subjects a 3.6s: 8.2 h

- Finding matches:   0%|          | 20/8231 [00:04<28:38,  4.78it/s] 


In [271]:
with open('uslp-triplets.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    for s, p, o in pairs:
        writer.writerow([s, p, o])

In [287]:
for precision, distances in distance_matrices.items():
    distance_frame = pd.DataFrame.from_dict(distances, orient='index')
    max_val = distance_frame.values.max()
    distance_frame = pd.DataFrame(1 - (distance_frame.values / max_val), columns=distance_frame.columns, index=distance_frame.index)

{'u': {'u': 0.0, 's': 5003.778610508981}}

In [297]:
pairs

[('wkg:1197611988', 'wkgs:isInCountry', 'wkg:208592566', 2.0793118916642017),
 ('wkg:1214729180', 'wkgs:isInCountry', 'wkg:208592566', 2.2529707538465242),
 ('wkg:1216078565', 'wkgs:isInCountry', 'wkg:208592566', 2.2529707538465242),
 ('wkg:1216078570', 'wkgs:isInCountry', 'wkg:208592566', 2.2529707538465242),
 ('wkg:1216078808', 'wkgs:isInCountry', 'wkg:208592566', 2.2529707538465242),
 ('wkg:1216078898', 'wkgs:isInCountry', 'wkg:208592566', 2.2529707538465242),
 ('wkg:1403075421', 'wkgs:isInCountry', 'wkg:208592566', 2.2529707538465242),
 ('wkg:1438157550', 'wkgs:isInCountry', 'wkg:208592566', 2.2529707538465242),
 ('wkg:1438321931', 'wkgs:isInCountry', 'wkg:208592566', 2.2529707538465242),
 ('wkg:1438321938', 'wkgs:isInCountry', 'wkg:208592566', 2.2529707538465242),
 ('wkg:1438321941', 'wkgs:isInCountry', 'wkg:208592566', 2.2529707538465242),
 ('wkg:1438321978', 'wkgs:isInCountry', 'wkg:208592566', 2.2529707538465242),
 ('wkg:1438321984', 'wkgs:isInCountry', 'wkg:208592566', 2.25297