In [1]:
import pandas as pd

In [2]:
# Load the two input tables from their gzip-compressed parquet files.
candidates, subjects = (
    pd.read_parquet(path)
    for path in ('candidates.parquet.gzip', 'subjects.parquet.gzip')
)

In [3]:
# Preview the first five candidate rows to check the loaded columns.
candidates.head(n=5)

Unnamed: 0,predicate,label,location
0,wkg:10000310952,Planina Grohat,Point(14.7443577 46.423256)
1,wkg:10000464002,Bar Sokol,Point(14.5208932 46.03545)
2,wkg:10000633532,Oton Župančič,Point(14.5049273 46.0542961)
3,wkg:10001176746,Slovenska Bistrica,Point(15.5529495 46.3925384)
4,wkg:10001265524,Bar Platana,Point(14.5048953 46.0500597)


In [5]:
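# One-time setup (already executed; its output is kept below).
# Uncomment to install the geohash dependency into the kernel's environment:
# %pip install pygeohash==1.2.0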

Collecting pygeohash
  Downloading pygeohash-1.2.0.tar.gz (5.0 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: pygeohash
  Building wheel for pygeohash (setup.py): started
  Building wheel for pygeohash (setup.py): finished with status 'done'
  Created wheel for pygeohash: filename=pygeohash-1.2.0-py2.py3-none-any.whl size=6178 sha256=f560e3a3a0ca51e9a40983dcbffbf17cfe1a11d3c22b6eb692f2bbdfe205f0e4
  Stored in directory: c:\users\morit\appdata\local\pip\cache\wheels\95\22\7a\35719e5f20cdc599cc837c67031a3ec2f011e1d418f57a37ce
Successfully built pygeohash
Installing collected packages: pygeohash
Successfully installed pygeohash-1.2.0


In [4]:
import pygeohash
import re

In [6]:
# Round-trip sanity check: encode a known coordinate (library-default precision),
# then decode it to inspect the recovered (lat, lon, lat_err, lon_err) tuple.
test_string = pygeohash.encode(latitude=46.4232566666, longitude=14.744357766666)
print(pygeohash.decode_exactly(test_string))

(46.42325663007796, 14.74435793235898, 8.381903171539307e-08, 1.6763806343078613e-07)


In [7]:
def wkt_to_geohash(wkt:str, precision:int = 6) -> str:
    """
    function to convert a WKT point literal into a geohash string
    :param wkt: 2D point in WKT notation with longitude first, e.g. 'Point(14.52 46.03)';
                the keyword is matched case-insensitively and extra whitespace is tolerated
    :param precision: number of geohash characters to produce (default 6)
    :return: geohash of the point, or a placeholder made of `precision` zeros
             ('000000' by default) when wkt is not a string or not a parsable 2D point
    """
    # placeholder keeps the same length as a real hash of this precision
    # NOTE(review): an all-zero string is itself a syntactically valid geohash,
    # so callers have to treat it as a sentinel rather than decode it
    placeholder = '0' * precision

    # missing values (None / NaN) would otherwise make re.match raise a TypeError
    if not isinstance(wkt, str):
        return placeholder

    # strict numeric token: malformed coordinates fall through to the placeholder
    # instead of raising a ValueError inside float()
    number = r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?'
    m = re.match(rf'\s*point\s*\(\s*({number})\s+({number})\s*\)', wkt, flags=re.IGNORECASE)
    if m is None:
        return placeholder

    # WKT stores x (longitude) before y (latitude)
    lon, lat = float(m.group(1)), float(m.group(2))
    return pygeohash.encode(latitude=lat, longitude=lon, precision=precision)

In [8]:
# Derive a geohash for every candidate from its WKT 'location' column.
# Only one column is needed, so map over that Series instead of running a
# row-wise DataFrame.apply(axis=1), which builds a full row object per record.
candidates['geohash'] = candidates['location'].map(wkt_to_geohash)

In [9]:
# Preview the first five rows again to inspect the 'geohash' column.
candidates.head(n=5)

Unnamed: 0,predicate,label,location,geohash
0,wkg:10000310952,Planina Grohat,Point(14.7443577 46.423256),u262pe
1,wkg:10000464002,Bar Sokol,Point(14.5208932 46.03545),u24mfk
2,wkg:10000633532,Oton Župančič,Point(14.5049273 46.0542961),u24mfp
3,wkg:10001176746,Slovenska Bistrica,Point(15.5529495 46.3925384),u25pcv
4,wkg:10001265524,Bar Platana,Point(14.5048953 46.0500597),u24mfp


In [18]:
from haversine import haversine
import numpy as np

In [23]:
def haversine_from_geohash(hash1:str, hash2:str) -> float:
    """
    estimate the haversine distance between two geohash-encoded locations
    :param hash1: geohash string of the first location
    :param hash2: geohash string of the second location
    :return: estimated distance between the two locations in km
    """
    # decode_exactly yields (lat, lon, lat_err, lon_err); the error terms are dropped
    points = [pygeohash.decode_exactly(geohash)[:2] for geohash in (hash1, hash2)]
    return haversine(*points)

In [26]:
# Compare the first two candidates at every geohash prefix length from 1 to 6
# to see how the estimated distance changes as the prefix gets longer.
full_h1, full_h2 = candidates.loc[0, 'geohash'], candidates.loc[1, 'geohash']
for i in range(1, 7):
    h1, h2 = full_h1[:i], full_h2[:i]
    print(f'precision {i} \t{h1} \t{h2}')
    print(f'haversine: {haversine_from_geohash(h1, h2)}')

precision 1 	u 	u
haversine: 0.0
precision 2 	u2 	u2
haversine: 0.0
precision 3 	u26 	u24
haversine: 156.3680815784051
precision 4 	u262 	u24m
haversine: 58.63803059190192
precision 5 	u262p 	u24mf
haversine: 47.11436456470272
precision 6 	u262pe 	u24mfk
haversine: 46.85819773667396


In [36]:
# Spot-check a single value: the geohash stored for the row labelled 4.
candidates.loc[4, 'geohash']

'u24mfp'

In [69]:
! pip install haversine

Collecting haversine
  Downloading haversine-2.8.0-py2.py3-none-any.whl (7.7 kB)
Installing collected packages: haversine
Successfully installed haversine-2.8.0


In [None]:
# TODO: features that still need to be implemented
# - geohash distance, depending on the type
# - cosine similarity between the embedded labels
# - cosine similarity between the types