In [69]:
# %pip install haversine==2.8.0  # one-time setup; uncomment to install into the kernel's environment

Collecting haversine
  Downloading haversine-2.8.0-py2.py3-none-any.whl (7.7 kB)
Installing collected packages: haversine
Successfully installed haversine-2.8.0


In [5]:
# %pip install pygeohash==1.2.0  # one-time setup; uncomment to install into the kernel's environment

Collecting pygeohash
  Downloading pygeohash-1.2.0.tar.gz (5.0 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: pygeohash
  Building wheel for pygeohash (setup.py): started
  Building wheel for pygeohash (setup.py): finished with status 'done'
  Created wheel for pygeohash: filename=pygeohash-1.2.0-py2.py3-none-any.whl size=6178 sha256=f560e3a3a0ca51e9a40983dcbffbf17cfe1a11d3c22b6eb692f2bbdfe205f0e4
  Stored in directory: c:\users\morit\appdata\local\pip\cache\wheels\95\22\7a\35719e5f20cdc599cc837c67031a3ec2f011e1d418f57a37ce
Successfully built pygeohash
Installing collected packages: pygeohash
Successfully installed pygeohash-1.2.0


In [1]:
import pandas as pd
import numpy as np
import pygeohash
import re
from haversine import haversine
import gensim
import requests
import os
import json

In [2]:
def wkt_to_geohash(wkt:str) -> str:
    """
    function to convert a WKT point literal into a geohash string
    :param wkt: point literal of the form 'Point(<lon> <lat>)'
    :return: geohash of precision 6, or the sentinel '000000' if no valid point could be parsed
    """
    # missing values (None / NaN) are not strings and would make re.match raise;
    # treat them like any other unparseable input
    if not isinstance(wkt, str):
        return '000000'
    m = re.match(r'Point\((.*) (.*)\)', wkt)
    if not m:
        return '000000'
    try:
        # first captured value is used as longitude, second as latitude
        lon = float(m.group(1))
        lat = float(m.group(2))
    except ValueError:
        # pattern matched, but the captured coordinates are not numeric
        return '000000'
    return pygeohash.encode(longitude=lon, latitude=lat, precision = 6)

In [3]:
def haversine_from_geohash(hash1:str, hash2:str) -> float:
    """
    approximate the haversine distance between two geohash-encoded locations
    :param hash1: geohash string of the first location
    :param hash2: geohash string of the second location
    :return: estimated distance between the two decoded locations in km
    """
    # decode_exactly returns more than two values; only the leading pair is the
    # point itself, the remainder are error estimations and are dropped
    point1 = pygeohash.decode_exactly(hash1)[:2]
    point2 = pygeohash.decode_exactly(hash2)[:2]
    return haversine(point1, point2)

In [41]:
def generate_embedding(sentence:str, model:"gensim.models.fasttext.FastText") -> list:
    """
    generate average embedding over the space-separated words of a string
    :param sentence: string to embed
    :param model: model to generate embeddings from; only its ``wv`` keyed vectors are used
    :return: average embedding of all words, converted to a plain list
    """
    keyed_vectors = model.wv
    # take the width of the zero vector from the model so that models with a
    # dimensionality other than 300 can be averaged; fall back to 300 otherwise
    dim = getattr(keyed_vectors, 'vector_size', 300)
    embeddings = []
    words = sentence.split(' ')  # splitting to avoid parsing as subwords
    for word in words:
        if word in keyed_vectors:
            embeddings.append(keyed_vectors[word])
        else:
            # words unknown to the model contribute a zero vector to the average
            # NOTE(review): gensim documents that FastText keyed vectors with
            # subword buckets report every word as present - confirm whether
            # this branch is reachable for the loaded model
            embeddings.append(np.zeros(dim))
    return np.mean(embeddings, axis=0).tolist()

In [66]:
def generate_tail_label_embedding(label:str, name:str, model:gensim.models.fasttext) -> np.array:
    """
    build the embedding of a tail candidate from its label and, when known, its name
    :param label: string containing label
    :param name: string containing name; the placeholder '<UNK>' marks a missing name
    :param model: fasttext model used to generate embeddings
    :return: the label embedding as returned by generate_embedding if name is '<UNK>',
             otherwise the element-wise mean of label and name embeddings (numpy array)
    """
    label_vector = generate_embedding(label, model)
    if name == '<UNK>':
        # no usable name: fall back to the label embedding alone
        return label_vector
    name_vector = generate_embedding(name, model)
    return np.mean([label_vector, name_vector], axis=0)

In [6]:
# need geohash distance based on type
# need cosine between literal and embedded labels
# need cosine between relation and type

In [39]:
# Directory holding the pretrained fastText vectors. The raw string keeps the
# backslash literal ('\D' is not a valid escape sequence and triggers a warning).
location = r'E:\Datasets/Embeddings'
model_path = os.path.join(location, 'cc.en.300.bin.gz')
if os.path.exists(model_path):
    print('file present')
else:
    os.makedirs(location, exist_ok=True)
    # Download into a temporary file and rename it only after success, so an
    # interrupted download is never mistaken for a complete model on the next run.
    partial_path = model_path + '.part'
    # stream=True avoids holding the whole archive in memory at once
    with requests.get('https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz', stream=True, timeout=60) as r:
        # fail loudly on HTTP errors instead of saving an error page as the model
        r.raise_for_status()
        with open(partial_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
    os.replace(partial_path, model_path)
ft_model = gensim.models.fasttext.load_facebook_model(model_path)

In [68]:
# Load the tail candidates and extend them with a geohash column ('geoh') and
# one column per dimension of the label embedding ('label_emb<i>').
candidates = pd.read_parquet('candidates_type.parquet.gzip')
# width of the embeddings, taken from the loaded model instead of hard-coding it
emb_dim = ft_model.wv.vector_size
col_names = list(candidates.columns) + ['geoh'] + [f'label_emb{i}' for i in range(emb_dim)]
geoh = candidates.apply(lambda row: wkt_to_geohash(row['location']), axis=1)
label_emb = candidates.apply(lambda row: generate_tail_label_embedding(row['label'], row['label_en'], ft_model), axis=1, result_type='expand')
candidates = pd.concat([candidates, geoh, label_emb], axis=1)
candidates.columns = col_names
# The '<UNK>' type maps to a zero vector. It has to be an actual list of numbers
# (not the np.zeros function object) so that type_map can be written as JSON.
type_map = {obj_type: np.zeros(emb_dim).tolist() if obj_type == '<UNK>' else generate_embedding(obj_type, ft_model) for obj_type in candidates['type'].unique()}

In [70]:
# Persist the enriched candidates table and the type -> embedding lookup.
candidates.to_parquet(
    'candidates_embedding.parquet.gzip',
    engine='pyarrow',
    compression='gzip',
)
with open('type_map.json', 'w') as type_map_file:
    json.dump(type_map, type_map_file)

In [60]:
# Load the subjects, add their geohash, and build embedding lookups for every
# distinct predicate (only the part after the last ':') and every distinct literal.
subjects = pd.read_parquet('subjects_type.parquet.gzip')
subjects['geohash'] = subjects.apply(lambda record: wkt_to_geohash(record['location']), axis=1)
predicate_map = dict(
    (predicate, generate_embedding(predicate.rsplit(':', 1)[-1], ft_model))
    for predicate in subjects['predicate'].unique()
)
literal_map = dict(
    (text, generate_embedding(text, ft_model))
    for text in subjects['literal'].unique()
)

In [71]:
# Persist the enriched subjects table, then each embedding lookup as its own JSON file.
subjects.to_parquet(
    'subjects_embedding.parquet.gzip',
    engine='pyarrow',
    compression='gzip',
)
with open('predicate_map.json', 'w') as predicate_file:
    json.dump(predicate_map, predicate_file)
with open('literal_map.json', 'w') as literal_file:
    json.dump(literal_map, literal_file)