In [2]:
import json
import math
import numpy as np
from pyproj import Transformer

# Coordinate transformer from EPSG:31370 to EPSG:4326.
# NOTE(review): `always_xy` is not passed, so the order of the returned
# ordinates follows the axis order pyproj associates with the target CRS --
# confirm whether consumers expect (lat, lon) or (lon, lat).
transformer = Transformer.from_crs(31370, 4326)

# Parsed JSON document; a later cell iterates over its 'features' list.
with open('data/segments.json') as segments_file:
    data = json.load(segments_file)

def convert(feature):
    """Return ``(oidn, transformed_point)`` for one feature.

    The representative point is the per-axis mean of the vertices in the
    FIRST part of ``feature['geometry']['coordinates']``; any further parts
    are ignored. The averaged ordinates are then passed through the
    module-level ``transformer``.

    Only the first part is turned into an array. Stacking every part into one
    array fails as soon as the parts have different vertex counts (the nested
    list is ragged), even though only part 0 contributes to the result.

    Parameters
    ----------
    feature : dict
        Must provide ``feature['geometry']['coordinates']`` as a nested
        sequence (parts -> vertices -> ordinates) and
        ``feature['properties']['oidn']``.

    Returns
    -------
    tuple
        ``(oidn, coords)`` where ``coords`` is whatever
        ``transformer.transform`` returns for the averaged ordinates.
    """
    first_part = np.asarray(feature['geometry']['coordinates'][0])
    # Mean over the vertex axis -> one value per ordinate (x, y, ...).
    centroid = first_part.mean(axis=0)
    gps_coord = transformer.transform(*centroid)
    return feature['properties']['oidn'], gps_coord
# One (oidn, transformed coordinate) pair per feature, in file order.
gps_segments = list(map(convert, data['features']))

In [3]:
def bbox(lon, lat, d=600):
    """Return ``(xmin, xmax, ymin, ymax)`` around a point, in degrees.

    ``d`` is divided by 1000 and then by ``6378`` (the Earth radius used
    here, presumably in km, which makes ``d`` a distance in metres) and
    converted from radians to degrees to obtain the half-width ``delta``.

    The first pair returned is ``lat -/+ delta``; the second pair is
    ``lon -/+ delta / cos(lat)``, where ``lat`` is interpreted in degrees.

    NOTE(review): the x bounds are derived from ``lat`` and the y bounds from
    ``lon``. Confirm this matches both the argument order used by callers and
    the axis convention of whatever consumes the box.
    """
    earth_radius = 6378
    # Angular half-width of the box, in degrees.
    delta_deg = (d / 1000.0 / earth_radius) * (180 / math.pi)
    # Same half-width stretched by 1 / cos(lat) for the other axis.
    delta_stretched = delta_deg / math.cos(lat * math.pi / 180)
    return (
        lat - delta_deg,
        lat + delta_deg,
        lon - delta_stretched,
        lon + delta_stretched,
    )

In [41]:
import sqlite3
# Open the SQLite database and load the mod_spatialite extension into this
# connection.
# NOTE(review): the extension path is an absolute, machine-specific location;
# consider making it configurable.
conn = sqlite3.connect("belgium.sqlite")
conn.enable_load_extension(True)
conn.load_extension("/opt/homebrew/lib/mod_spatialite.dylib")
# Debug aid: conn.set_trace_callback(print) echoes every executed statement.

# Selects the (k, v) tag pairs of all nodes whose idx_osm_nodes_Geometry
# entry lies strictly inside the box bound to the four placeholders, in the
# order (xmin, xmax, ymin, ymax).
sql = """
SELECT k,v FROM osm_node_tags WHERE osm_node_tags.node_id in (
    SELECT pkid FROM idx_osm_nodes_Geometry
    WHERE xmin > ? AND xmax < ? AND ymin > ? AND ymax < ?
);
"""

def query(lon, lat):
    """Yield lower-cased tag keys and values found in the box around a point.

    The box comes from ``bbox(lon, lat)`` and is bound, in order, to the four
    placeholders of the module-level ``sql`` statement executed on ``conn``.
    Every cell of every returned (k, v) row is yielded in lower case, except
    cells for which ``str.isnumeric()`` is true, which are skipped.

    This is a generator: the statement is executed when iteration starts.
    """
    x_lo, x_hi, y_lo, y_hi = bbox(lon, lat)
    rows = conn.execute(sql, (x_lo, x_hi, y_lo, y_hi)).fetchall()
    for row in rows:
        for cell in row:
            if cell.isnumeric():
                # Purely numeric keys/values are dropped.
                continue
            yield cell.lower()

In [42]:
# Distinct tag tokens returned for the first segment's converted coordinate.
first_segment_coord = gps_segments[0][1]
set(query(*first_segment_coord))

{'access',
 'addr:housename',
 'addr:postcode',
 'addr:street',
 'alma',
 'alt_name',
 'amenity',
 'backrest',
 'backward',
 'barrier',
 'bench',
 'bockor;rodenbach',
 'bollard',
 'both',
 'brewery',
 'bump',
 'bus',
 'bus_stop',
 'cafe',
 'covered',
 'crossing',
 'crossing:island',
 'cycle_barrier',
 'de cantor',
 'de lijn',
 'de zweetkelder',
 'description',
 'diet:vegetarian',
 'direction',
 'dlwv',
 'double',
 'entrance',
 'fixed',
 'forward',
 'free',
 'from 30 to 57 rechts 77 rechtdoor 80',
 'from 77 to 57 links 30 rechts 80',
 'from 80 to 57 rechtdoor 30 links 77',
 'give_way',
 'highway',
 'hump',
 'kortrijk lange munte',
 'kortrijk m.p.i.',
 'kortrijk studentencentrum',
 'lange munteplein',
 'leiestreek',
 'leisure',
 'library',
 'main',
 'marked',
 'material',
 'name',
 'network',
 'network:type',
 'nl:alma (leuven)',
 'no',
 'node_network',
 'onthaal',
 'operator',
 'outside',
 'paving_stones',
 'permissive',
 'picnic_table',
 'pitch',
 'platform',
 'pub',
 'public_transport

In [43]:
# One space-separated string of tag tokens per segment id; runs one query per
# segment.
tags = {
    seg_id: ' '.join(query(*coord))
    for seg_id, coord in gps_segments
}

In [44]:
# Persist the per-segment tag strings as indented JSON.
with open('tags.json', 'w') as tags_file:
    json.dump(tags, tags_file, indent=2)

In [45]:
# One document per segment, in the iteration order of `tags`.
corpus = [*tags.values()]

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF matrix of the corpus; min_df / max_df restrict the vocabulary by
# document frequency (floats are treated as proportions of documents by
# scikit-learn).
vectorizer = TfidfVectorizer(max_df=0.95, min_df=0.05)
X = vectorizer.fit_transform(corpus)
X.shape

(5015, 981)

In [47]:
# Show the vocabulary terms retained by the fitted vectorizer.
vectorizer.get_feature_names_out()

array(['00', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10',
       '11', '12', '13', '14', '15', '15a', '16', '17', '18', '19', '1a',
       '20', '2014', '2015', '2016', '2017', '2018', '2019', '2020',
       '2021', '2022', '21', '22', '23', '230', '24', '25', '26', '27',
       '28', '29', '2a', '2b', '30', '31', '32', '33', '34', '35', '36',
       '37', '38', '39', '3a', '40', '41', '42', '43', '44', '45', '46',
       '47', '470', '472', '473', '475', '479', '48', '484', '485', '49',
       '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60',
       '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71',
       '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82',
       '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93',
       '94', '95', '96', '97', '98', '99', 'aan', 'access', 'addr',
       'advertising', 'al', 'albert', 'alcohol', 'aldi', 'allego', 'alpr',
       'alt_name', 'amenity', 'an', 'and', 'antenna', 'an