In [127]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN

In [45]:
import json

# Parse the exported GeoJSON file into plain Python objects (dicts / lists).
with open('export.geojson', mode='r', encoding='utf-8') as geojson_file:
    dataset = json.load(geojson_file)


In [162]:
# Property keys whose mere presence makes a feature relevant, whatever the value.
# Currently disabled; a previous experiment used:
#good_keys = ['shop']
good_keys = []

# Broader rule set kept for reference (not active):
# specific_rules = {"leisure" : ["fitness_centre", "sport_centre", "stadium"],
#                   "tourism" : ["museum", "gallery", "hotel"],
#                   "shop" : ["supermarket", "mall", "hairdresser"],
#                   "amenity": [
#                         "cinema", "hospital", "clinic", "dentist", "university", "bank", "restaurant",
#                         "fast_food", "cafe", "marketplace", "library", "theatre", "parking"
#                         ],
#                   "building": ["office"],
#                   "aerodrome": ["international"],
#                   "historic": ["manor"]}


# Property key -> accepted values. A feature is relevant when the value it
# carries for one of these keys equals one of the listed values.
specific_rules = {
    # The documented OpenStreetMap value is "sports_centre". The previous
    # spelling "sport_centre" is kept as well so that any features tagged that
    # way keep matching.
    "leisure": ["fitness_centre", "sports_centre", "sport_centre", "stadium"],
    "tourism": ["museum"],
    "shop": ["supermarket", "mall"],
    "amenity": [
        "cinema", "hospital", "university",
        "marketplace", "library", "theatre", "parking",
    ],
    "building": ["office"],
    "aerodrome": ["international"],
    "historic": ["manor"],
}

In [None]:
# Select the features that satisfy the rules. Every feature is appended at
# most once: previously a feature matching several rule keys was appended once
# per matching key, producing duplicate entries downstream.
relevant_objects = []
for obj in dataset['features']:
    props = obj['properties']

    if any(key in props for key in good_keys):
        # Matched on key presence alone; no 'extra' value is recorded for
        # these features.
        relevant_objects.append(obj)
        continue

    for key, allowed_values in specific_rules.items():
        if key in props and props[key] in allowed_values:
            # Remember which tag value caused the match. This mutates the
            # feature dict inside `dataset` in place.
            obj['extra'] = props[key]
            relevant_objects.append(obj)
            break  # first matching rule wins; do not append the feature again

# with open('relevant_objects.jsonl', 'w', encoding = 'utf-8') as f:
#     for line in relevant_objects:
#         f.write(json.dumps({'name' : line['properties']['name'], 'lat' : line['geometry']['coordinates'][1], 'lon' : line['geometry']['coordinates'][0], 'extra' : line['extra']}, ensure_ascii=False) + '\n')

In [187]:
from sklearn.cluster import DBSCAN
import numpy as np
import pandas as pd

def merge_close_points(
    df: pd.DataFrame,
    lat_col: str = "lat",
    lon_col: str = "lon",
    name_col: str = "name",
    extra_col: str = "extra",
    threshold_m: float = 300.0,
) -> pd.DataFrame:
    """Merge points that lie close to each other into one row per cluster.

    Points are clustered with DBSCAN (haversine metric, ``min_samples=1``), so
    every point belongs to exactly one cluster and none is treated as noise.
    Because clusters are built transitively, two points of the same cluster
    can be further apart than ``threshold_m`` when they are linked through
    intermediate points.

    Parameters
    ----------
    df : pd.DataFrame
        Input points. Must contain ``lat_col``, ``lon_col``, ``name_col`` and
        ``extra_col``. Coordinates are converted with ``np.radians``, i.e.
        they are expected in degrees. The frame is not modified.
    lat_col, lon_col : str
        Names of the latitude / longitude columns.
    name_col : str
        Column with the point names. Values are converted with
        ``astype(str)`` before joining.
    extra_col : str
        Column with the per-point type; used for ``object_type`` of
        single-point clusters.
    threshold_m : float
        Neighbourhood radius in metres (converted to radians by dividing by
        the Earth radius).

    Returns
    -------
    pd.DataFrame
        One row per cluster with columns ``name_col`` (distinct names joined
        by ``" + "`` in order of first appearance), ``lat_col`` / ``lon_col``
        (arithmetic mean of the members), ``count`` (cluster size) and
        ``object_type`` (the ``extra_col`` value for single-point clusters,
        ``"merged"`` otherwise). An empty input yields an empty frame with
        these columns.
    """
    EARTH_RADIUS_M = 6_371_008.8  # Earth radius in metres

    # Select the coordinate columns first so a missing column still raises
    # KeyError, even for an empty frame.
    coords_deg = df[[lat_col, lon_col]]

    if len(df) == 0:
        # DBSCAN refuses an empty sample array; return an empty result with
        # the same column layout as the regular path instead of raising.
        return pd.DataFrame(
            columns=[name_col, lat_col, lon_col, "count", "object_type"]
        )

    # The haversine metric works on [lat, lon] pairs expressed in radians and
    # returns distances in radians, hence eps is also converted to radians.
    coords_rad = np.radians(coords_deg.to_numpy())
    eps = threshold_m / EARTH_RADIUS_M

    db = DBSCAN(eps=eps, min_samples=1, metric="haversine")
    labels = db.fit_predict(coords_rad)

    tmp = df.copy()
    tmp["_cluster"] = labels

    def concat_names(s: pd.Series) -> str:
        """Join the distinct names of a cluster, keeping first-seen order."""
        # dict.fromkeys de-duplicates while preserving insertion order.
        return " + ".join(dict.fromkeys(s.astype(str)))

    merged = (
        tmp.groupby("_cluster", as_index=False)
           .agg(
               **{
                   name_col: (name_col, concat_names),
                   lat_col: (lat_col, "mean"),
                   lon_col: (lon_col, "mean"),
                   "count": ("_cluster", "size"),
                   "_extra_first": (extra_col, "first"),
               }
           )
           .drop(columns=["_cluster"])
    )

    # Single-point clusters keep their own type; anything larger is "merged".
    merged["object_type"] = np.where(
        merged["count"] == 1, merged["_extra_first"], "merged"
    )
    merged = merged.drop(columns=["_extra_first"])
    return merged



In [188]:
# Load the previously filtered objects (JSON Lines: one record per line).
# NOTE: the code that writes this file is commented out in the filtering cell,
# so the file has to exist on disk already.
df = pd.read_json(path_or_buf='relevant_objects.jsonl', lines=True)

In [189]:
# Merge objects using a 50 m threshold, then list the largest clusters first.
grouped_relevant_objects = (
    merge_close_points(df, threshold_m=50)
    .sort_values(by='count', ascending=False)
    .reset_index(drop=True)
)

In [190]:
# Persist the merged clusters as JSON Lines, leaving non-ASCII characters unescaped.
grouped_relevant_objects.to_json(
    'grouped_relevant_objects.jsonl',
    orient='records',
    lines=True,
    force_ascii=False,
)