In [1]:
import json
import pandas as pd
from haversine import haversine
from pygeohash import decode_exactly
from tqdm import tqdm
import csv
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def haversine_from_geohash(hash1:str, hash2:str) -> float:
    """
    function to estimate haversine distance from geohash strings
    :param hash1: first loaction encoded in geohash
    :param hash2: second location encoded in geohash
    :return: estimated distance between locations in km
    """
    # only take first two parts of tuples, rest are error estimations
    hd = haversine(decode_exactly(hash1)[:2], decode_exactly(hash2)[:2])
    return hd

In [3]:
with open('geohash_precision.json', 'r') as f:
    geohash_precision_map = json.load(f)

In [4]:
candidates = pd.read_parquet('candidates_embedding.parquet.gzip')
subjects = pd.read_parquet('subjects_embedding.parquet.gzip')

In [5]:
with open('predicate_map.json', 'r') as f:
    predicate_map = json.load(f)
with open('literal_map.json', 'r') as f:
    literal_map = json.load(f)
with open('type_map.json', 'r') as f:
    type_map = json.load(f)

In [6]:
predicates = pd.DataFrame.from_dict(predicate_map, orient='index')
literals = pd.DataFrame.from_dict(literal_map, orient='index')
types = pd.DataFrame.from_dict(type_map, orient='index')

In [21]:
is_in_routine = True
if is_in_routine:
    contains_map = {}
    for predicate in tqdm(predicate_map.keys()):
        similarities = {}
        for t in type_map.keys():
            similarity = 0
            if t != '<UNK>':
                if str(t) in predicate:
                    similarity = 0.5
            similarities.update({t: similarity})
        contains_map.update({predicate: similarities})
    type_contained = pd.DataFrame.from_dict(contains_map, orient='index')

100%|██████████| 11/11 [00:00<00:00, 3677.45it/s]


In [23]:
cos_sim_literal = cosine_similarity(candidates.loc[:, [f'label_emb{i}' for i in range(300)]], literals.values)
literal_cossim_df = pd.DataFrame(cos_sim_literal, index=candidates.index.to_list(), columns=literals.index)

In [24]:
cos_sim_predicate = cosine_similarity(types.values, predicates.values)
predicate_cossim_df = pd.DataFrame(cos_sim_predicate, index=types.index, columns=predicates.index)

In [25]:
for precision in set(geohash_precision_map.values()):
    print(f"distance matrix prec = {precision}: ({len(candidates.apply(lambda row: row['geohash'][:precision], axis=1).unique())}, {len(subjects.apply(lambda row: row['geohash'][:precision], axis=1).unique())})")

distance matrix prec = 1: (2, 1)
distance matrix prec = 3: (8, 6)
distance matrix prec = 4: (64, 55)


In [26]:
distance_matrices = {}
for p in tqdm(set(geohash_precision_map.values())):
    dist_mat = {hash1: {hash2: haversine_from_geohash(hash1, hash2) for hash2 in candidates.apply(lambda row: row['geohash'][:p], axis=1).unique()} for hash1 in subjects.apply(lambda row: row['geohash'][:p], axis=1).unique()}
    distance_frame = pd.DataFrame.from_dict(dist_mat, orient='index')
    max_val = distance_frame.values.max()
    distance_frame = pd.DataFrame(1 - (distance_frame.values / max_val), columns=distance_frame.columns, index=distance_frame.index)
    distance_matrices.update({p: distance_frame})

100%|██████████| 3/3 [00:48<00:00, 16.12s/it]


In [47]:
matched = {}
pairs = []
# num_subjects = 20 # [:num_subjects]
for index, row in tqdm(subjects.iterrows(), total=len(subjects), desc='- Finding matches'):
    # save core properties for repetitive access
    precision = geohash_precision_map[row['predicate']]
    gh = row['geohash'][:precision]
    pred = row['predicate']
    lit = row['literal']
    if (gh, pred, lit) in matched:  # only consider new constellations for subjects
        # if these three values are the same, the uslp-score will also be the same
        pairs.append((row['uri'], row['predicate']) + matched[(gh, pred, lit)])
    else:
        # prepare similarity and distance matrices for repetitive access
        dm_precision = distance_matrices[precision].loc[gh, :]
        cossim_matching_predicate = predicate_cossim_df.loc[:, pred]
        cossim_matching_literal = literal_cossim_df.loc[:, lit]

        # iterate through candidates and compute uslp-score
        candidate_eval = candidates.apply(
            lambda cand: dm_precision.loc[cand['geohash'][:precision]]
                         + cossim_matching_predicate.loc[cand['type']]
                         + cossim_matching_literal.loc[cand.name],
            axis=1
        )

        # find best candidate
        best_candidate = candidate_eval.idxmax()
        best_candidate_uri = candidates.loc[best_candidate, 'uri']
        best_candidate_score = candidate_eval[best_candidate]

        # store matches and save constellation
        pairs.append((row['uri'], row['predicate'], best_candidate_uri, candidate_eval[best_candidate]))
        matched.update({(gh, pred, lit): (best_candidate_uri, best_candidate_score)})
# 2.7 s by doing embedding simultaneously
# 2.5 s by removing repetitive calls
# 2.5 s for adding matched into dictionary
# 2.5 for checking about the same if 10000 entries
# 2.4 s removed repetitive call for dist_mat
# 2.3 s removing repeated calls on cossim matrices
# with match saving 6:30 for 2000
# interpolation for 8200 subjects a 3.6s: 8.2 h

- Finding matches: 100%|██████████| 8231/8231 [12:52<00:00, 10.65it/s] 


In [30]:
matched = {}
pairs = []
# num_subjects = 20 # [:num_subjects]
for index, row in tqdm(subjects.iterrows(), total=len(subjects), desc='- Finding matches'):
    # save core properties for repetitive access
    precision = geohash_precision_map[row['predicate']]
    gh = row['geohash'][:precision]
    pred = row['predicate']
    lit = row['literal']
    if (gh, pred, lit) in matched:  # only consider new constellations for subjects
        # if these three values are the same, the uslp-score will also be the same
        pairs.append((row['uri'], row['predicate']) + matched[(gh, pred, lit)])
    else:
        # prepare similarity and distance matrices for repetitive access
        dm_precision = distance_matrices[precision].loc[gh, :]
        cossim_matching_predicate = predicate_cossim_df.loc[:, pred]
        cossim_matching_literal = literal_cossim_df.loc[:, lit]
        contains_matching_predicate = type_contained.loc[pred, :]

        # iterate through candidates and compute uslp-score
        candidate_eval = candidates.apply(
            lambda cand: dm_precision.loc[cand['geohash'][:precision]]
                         + cossim_matching_predicate.loc[cand['type']]
                         + cossim_matching_literal.loc[cand.name] 
                         + contains_matching_predicate.loc[cand['type']],
            axis=1
        )

        # find best candidate
        best_candidate = candidate_eval.idxmax()
        best_candidate_uri = candidates.loc[best_candidate, 'uri']
        best_candidate_score = candidate_eval[best_candidate]

        # store matches and save constellation
        pairs.append((row['uri'], row['predicate'], best_candidate_uri, candidate_eval[best_candidate]))
        matched.update({(gh, pred, lit): (best_candidate_uri, best_candidate_score)})
# 2.7 s by doing embedding simultaneously
# 2.5 s by removing repetitive calls
# 2.5 s for adding matched into dictionary
# 2.5 for checking about the same if 10000 entries
# 2.4 s removed repetitive call for dist_mat
# 2.3 s removing repeated calls on cossim matrices
# with match saving 6:30 for 2000
# interpolation for 8200 subjects a 3.6s: 8.2 h
# 16 minutes with contains

- Finding matches: 100%|██████████| 8231/8231 [16:09<00:00,  8.49it/s]  


In [32]:
pairs

[('wkg:1197611988', 'wkgs:isInCountry', 'wkg:208592566', 2.739311891664202),
 ('wkg:1214729180', 'wkgs:isInCountry', 'wkg:208592566', 2.9129707538465244),
 ('wkg:1216078565', 'wkgs:isInCountry', 'wkg:208592566', 2.9129707538465244),
 ('wkg:1216078570', 'wkgs:isInCountry', 'wkg:208592566', 2.9129707538465244),
 ('wkg:1216078808', 'wkgs:isInCountry', 'wkg:208592566', 2.9129707538465244),
 ('wkg:1216078898', 'wkgs:isInCountry', 'wkg:208592566', 2.9129707538465244),
 ('wkg:1403075421', 'wkgs:isInCountry', 'wkg:208592566', 2.9129707538465244),
 ('wkg:1438157550', 'wkgs:isInCountry', 'wkg:208592566', 2.9129707538465244),
 ('wkg:1438321931', 'wkgs:isInCountry', 'wkg:208592566', 2.9129707538465244),
 ('wkg:1438321938', 'wkgs:isInCountry', 'wkg:208592566', 2.9129707538465244),
 ('wkg:1438321941', 'wkgs:isInCountry', 'wkg:208592566', 2.9129707538465244),
 ('wkg:1438321978', 'wkgs:isInCountry', 'wkg:208592566', 2.9129707538465244),
 ('wkg:1438321984', 'wkgs:isInCountry', 'wkg:208592566', 2.912970

In [48]:
# i suggest cutoff at 2.0 score. Worse results seem really bad
with open('uslp-triplets.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    for s, p, o, val in pairs:
        writer.writerow([s, p, o, val])

In [287]:
for precision, distances in distance_matrices.items():
    distance_frame = pd.DataFrame.from_dict(distances, orient='index')
    max_val = distance_frame.values.max()
    distance_frame = pd.DataFrame(1 - (distance_frame.values / max_val), columns=distance_frame.columns, index=distance_frame.index)

{'u': {'u': 0.0, 's': 5003.778610508981}}

In [49]:
predictions = pd.DataFrame(pairs, columns=['s', 'p', 'o', 'score'])

In [37]:
predictions.sort_values(by='score', ascending=False)

Unnamed: 0,s,p,o,score
63,wkg:3723320619,wkgs:isInCountry,wkg:208592566,2.912971
80,wkg:414949756,wkgs:isInCountry,wkg:208592566,2.912971
93,wkg:4762715686,wkgs:isInCountry,wkg:208592566,2.912971
92,wkg:4759852818,wkgs:isInCountry,wkg:208592566,2.912971
91,wkg:4759852817,wkgs:isInCountry,wkg:208592566,2.912971
...,...,...,...,...
6993,wkg:8357456906,wkgs:country,wkg:208592566,1.632498
7029,wkg:8866929385,wkgs:country,wkg:208592566,1.611731
7028,wkg:8866562785,wkgs:country,wkg:208592566,1.594955
7038,wkg:9554528548,wkgs:country,wkg:208592566,1.593100


In [38]:
candid_data = candidates[['uri', 'label', 'type']]
subject_info = subjects[['uri', 'literal']]

In [50]:
#predictions = pd.read_csv('uslp-triplets.csv', header=None, names=['s', 'p', 'o', 'score'])
predictions = predictions.merge(candid_data, left_on='s', right_on='uri')
predictions = predictions.merge(candid_data, left_on='o', right_on='uri')
predictions = predictions.merge(subject_info, left_on='s', right_on='uri')
predictions = predictions.drop(columns=['uri_x', 'uri_y'])
predictions = predictions[['s', 'label_x', 'type_x', 'p', 'type_y', 'literal', 'o', 'label_y', 'score']]


In [51]:
predictions.to_csv('predictions_annotated_simple.csv')
predictions.sort_values(by='score', ascending=True)

Unnamed: 0,s,label_x,type_x,p,type_y,literal,o,label_y,score
150,wkg:9554528548,Embassy of Ukraine in the Republic of Slovenia,Diplomatic,wkgs:country,Country,UA,wkg:208592566,Slovenija,1.593100
151,wkg:9554594660,Konzulat Ukrajine v Kranju,Diplomatic,wkgs:country,Country,UA,wkg:208592566,Slovenija,1.593100
147,wkg:8866562785,Veleposlaništvo Gruzije,Diplomatic,wkgs:country,Country,GE,wkg:208592566,Slovenija,1.594955
148,wkg:8866929385,Veleposlaništvo Kosova,Diplomatic,wkgs:country,Country,XK,wkg:208592566,Slovenija,1.611731
517,wkg:1339589275,Mercator Beli Križ,ShopSupermarket,wkgs:addrState,Studio,SI,wkg:5249405282,Radio SI,1.670489
...,...,...,...,...,...,...,...,...,...
984,wkg:3889214477,Kmetija Fornazarič,Alcohol,wkgs:addrPlace,MotorwayJunction,Vogrsko,wkg:1362760677,Vogrsko,2.391819
985,wkg:6387582479,Frlanova Kmetija,<UNK>,wkgs:addrPlace,MotorwayJunction,Vogrsko,wkg:1362760677,Vogrsko,2.391819
986,wkg:8014566124,Mehanika RG,<UNK>,wkgs:addrPlace,MotorwayJunction,Vogrsko,wkg:1362760677,Vogrsko,2.391819
996,wkg:2476370197,Picerija Etna,Restaurant,wkgs:addrPlace,MotorwayJunction,Divača,wkg:288288440,Divača,2.391819


In [44]:
len(predictions[predictions['score'] >= 2])

1018

In [45]:
len(predictions)

1188