In [5]:
import sys
# Add the parent directory to sys.path so the local `src` package can be imported
sys.path.append('..')

from src.core.point_unification import (
    unify_points,
    compute_cluster_centroids,
)

from tqdm import tqdm

import pandas as pd
import geopandas as gpd
from shapely import wkt
import numpy as np
from shapely.ops import nearest_points

def gen_clusters(panoramas, eps, min_samples=1):
    """Cluster `panoramas` with DBSCAN and derive the cluster centroids.

    Thin wrapper that chains `unify_points` and `compute_cluster_centroids`
    (both imported from `src.core.point_unification`).

    Parameters
    ----------
    panoramas : GeoDataFrame
        Points to cluster; forwarded unchanged to `unify_points`.
    eps : float
        DBSCAN `eps`, forwarded to `unify_points`
        (units are whatever `unify_points` expects -- TODO confirm).
    min_samples : int, default 1
        DBSCAN `min_samples`, forwarded to `unify_points`.

    Returns
    -------
    tuple
        `(clustered, centroids)`: the value returned by `unify_points` and the
        value returned by `compute_cluster_centroids` for that result.
    """
    print(f"Applying DBSCAN clustering with eps={eps}, min_samples={min_samples}")
    clustered = unify_points(panoramas, eps=eps, min_samples=min_samples)
    return clustered, compute_cluster_centroids(clustered)

def evaluate_dbscan_clusters(clusters_gdf, points_gdf, disable_tqdm=False):
    """
    Evaluate DBSCAN clustering results with simple, interpretable metrics.

    Parameters
    ----------
    clusters_gdf : GeoDataFrame
        DataFrame with cluster centroids. Must have columns:
        - 'cluster_id'
        - 'geometry' (Point)

    points_gdf : GeoDataFrame
        DataFrame with points assigned to clusters. Must have columns:
        - 'cluster_id' (DBSCAN labels, -1 = noise)
        - 'geometry' (Point)

    disable_tqdm : bool, default False
        When True, the two progress bars shown while computing the
        within-cluster and between-cluster distances are suppressed.

    Returns
    -------
    metrics : dict
        Dictionary with the keys 'n_clusters', 'noise_ratio',
        'avg_cluster_size', 'cluster_size_stats' (the `describe()` of the
        cluster sizes as a dict), 'avg_within_distance',
        'avg_between_distance' and 'sep_coh_ratio'.

    Raises
    ------
    IndexError
        If a non-noise cluster present in `points_gdf` has no centroid row in
        `clusters_gdf`.

    Notes
    -----
    Both inputs are reprojected to EPSG:3857 before measuring. Its units are
    metres, but Web Mercator distances are inflated by roughly
    1 / cos(latitude) away from the equator, so the distance metrics are only
    approximate ground distances; they remain comparable between runs over the
    same area.
    """

    # Work in a projected CRS so distances are planar (see Notes for caveat).
    clusters_gdf = clusters_gdf.to_crs(3857)
    points_gdf = points_gdf.to_crs(3857)

    # Exclude noise (-1) for cluster-based calculations
    clustered_points = points_gdf[points_gdf["cluster_id"] != -1]
    valid_clusters = clusters_gdf[clusters_gdf["cluster_id"] != -1]

    # Number of clusters (as reported by the centroid table)
    n_clusters = valid_clusters["cluster_id"].nunique()

    # Share of all points labelled as noise
    noise_ratio = (points_gdf["cluster_id"] == -1).mean()

    # Cluster size distribution
    cluster_sizes = clustered_points.groupby("cluster_id").size()
    avg_cluster_size = cluster_sizes.mean()
    cluster_size_stats = cluster_sizes.describe().to_dict()

    # Build the cluster_id -> centroid lookup once instead of scanning the
    # whole centroid table with a boolean mask for every cluster.
    # setdefault keeps the first row per id, matching a `.values[0]` lookup.
    centroid_by_id = {}
    for cid, geom in zip(valid_clusters["cluster_id"], valid_clusters["geometry"]):
        centroid_by_id.setdefault(cid, geom)

    # Within-cluster average distance (mean of the per-cluster means)
    within_distances = []
    point_groups = clustered_points.groupby("cluster_id")
    # total is the number of groups actually iterated, not the centroid count
    for cid, group in tqdm(point_groups, total=len(cluster_sizes), disable=disable_tqdm):
        if cid not in centroid_by_id:
            raise IndexError(f"no centroid found in clusters_gdf for cluster_id {cid!r}")
        centroid = centroid_by_id[cid]
        within_distances.append(
            np.mean([pt.distance(centroid) for pt in group["geometry"]])
        )
    avg_within_distance = np.mean(within_distances) if within_distances else np.nan

    # Between-cluster distances (pairwise between centroids).
    # Centroids are Points, so the planar distance is the Euclidean distance
    # between their x/y coordinates. Each row is handled with one vectorised
    # numpy call and only a running sum is kept, instead of one Python-level
    # shapely call per pair and a list holding every pairwise distance.
    centroid_xy = np.array(
        [(geom.x, geom.y) for geom in valid_clusters["geometry"]], dtype=float
    ).reshape(-1, 2)
    n_centroids = len(centroid_xy)
    distance_sum = 0.0
    for i in tqdm(range(n_centroids), total=n_centroids, disable=disable_tqdm):
        offsets = centroid_xy[i + 1:] - centroid_xy[i]
        distance_sum += np.hypot(offsets[:, 0], offsets[:, 1]).sum()
    n_pairs = n_centroids * (n_centroids - 1) // 2
    avg_between_distance = distance_sum / n_pairs if n_pairs else np.nan

    # Separation / Cohesion ratio; undefined (NaN) when cohesion is 0 or NaN
    sep_coh_ratio = (
        avg_between_distance / avg_within_distance
        if avg_within_distance and not np.isnan(avg_within_distance) else np.nan
    )

    return {
        "n_clusters": n_clusters,
        "noise_ratio": noise_ratio,
        "avg_cluster_size": avg_cluster_size,
        "cluster_size_stats": cluster_size_stats,
        "avg_within_distance": avg_within_distance,
        "avg_between_distance": avg_between_distance,
        "sep_coh_ratio": sep_coh_ratio
    }

def run_dbscan(points_gdf, eps_values, min_samples_values, disable_tqdm=True):
    """
    Run DBSCAN for a grid of eps and min_samples values,
    evaluate results with evaluate_dbscan_clusters,
    and return a DataFrame with all metrics.

    Parameters
    ----------
    points_gdf : GeoDataFrame
        Input points with geometry (no cluster_id column yet).
    eps_values : list
        List of eps values to try.
    min_samples_values : list
        List of min_samples values to try.
    disable_tqdm : bool, default True
        Forwarded to evaluate_dbscan_clusters to silence its per-run progress
        bars. The two grid-level progress bars of this function are always
        shown.

    Returns
    -------
    results_df : DataFrame
        One row per (eps, min_samples) combination: the metrics returned by
        evaluate_dbscan_clusters plus the 'eps' and 'min_samples' columns.
    """

    results = []

    for eps in tqdm(eps_values):
        for ms in tqdm(min_samples_values):

            dbscan_results, centroids = gen_clusters(points_gdf, eps=eps, min_samples=ms)

            # Evaluate the labelled output of this run. The raw input frame
            # is documented as having no 'cluster_id' column, so it must not
            # be the frame that gets scored.
            metrics = evaluate_dbscan_clusters(centroids, dbscan_results, disable_tqdm=disable_tqdm)
            metrics["eps"] = eps
            metrics["min_samples"] = ms

            results.append(metrics)

    return pd.DataFrame(results)


def df_to_gdf(df):
    """Turn a DataFrame with a WKT `geometry` column into a GeoDataFrame.

    Each value of the `geometry` column is parsed with `shapely.wkt.loads`,
    and the result is wrapped in a GeoDataFrame with CRS EPSG:4326. The input
    frame is left untouched; a new object is returned.
    """
    parsed = df.assign(geometry=df.geometry.apply(wkt.loads))
    return gpd.GeoDataFrame(parsed, geometry='geometry', crs=4326)

In [6]:
# Bounding polygon used further down to clip `panos`. It is compared against
# `panos` while that frame is in EPSG:4326, so the coordinates are lon/lat.
extent = wkt.loads(
    "POLYGON((-58.58198 -34.582224, -58.515231 -34.582224, -58.515231 -34.635357, -58.58198 -34.635357, -58.58198 -34.582224))"
)

In [7]:
# Load panoramas and RENABAP polygons from CSV (WKT geometry -> EPSG:4326).
panos = df_to_gdf(pd.read_csv('../data/tresdefebrero/panos_enriched.csv'))
rbp = df_to_gdf(pd.read_csv('../data/tresdefebrero/renabap_intersected.csv'))

# Attach the nearest barrio to every panorama. The join runs in EPSG:3857 so
# the "distance" column is in projected units, then the frame is brought back
# to EPSG:4326 and the join bookkeeping column is dropped.
panos = (
    gpd.sjoin_nearest(
        panos.to_crs(3857),
        rbp[["id_renabap", "geometry"]].to_crs(3857),
        how="left",
        distance_col="distance",
    )
    .to_crs(4326)
    .rename(columns={"id_renabap": "closest_barrio"})
    .drop(columns=["index_right"])
)

In [8]:
# Keep only the panoramas whose geometry intersects the `extent` polygon.
panos = panos[panos.intersects(extent)]

In [9]:
# Preview the clipped panoramas, including the joined barrio id and distance.
panos.head()

Unnamed: 0,pano_id,lat,lon,geometry,closest_barrio,distance
0,--4tdN434sL2rFkixRxoDw,-34.620094,-58.549171,POINT (-58.54917 -34.62009),1122,100.484082
3,--f8BTEjiIsPZir88hHt9w,-34.617903,-58.553512,POINT (-58.55351 -34.6179),313,411.591546
4,--mAOMLBMdhkoEMbHqDhYA,-34.624371,-58.545562,POINT (-58.54556 -34.62437),1122,444.202933
5,--r2r0tM84f7arlojji25w,-34.621858,-58.57735,POINT (-58.57735 -34.62186),2929,465.321905
7,-07N8ueih88oii9NUZAj8w,-34.627565,-58.574763,POINT (-58.57476 -34.62756),1107,489.06796


In [10]:
# (rows, columns) of the panorama table after clipping to `extent`.
panos.shape

(45234, 6)

In [14]:
# Cluster the clipped panoramas with one fixed (eps, min_samples) setting.
# NOTE(review): this rebinds `panos` to the clustered output, so re-running
# the cell feeds already-clustered data back into gen_clusters; binding the
# result to a separate name would keep the cell idempotent.
panos, centroids = gen_clusters(panos, eps=2.5, min_samples=2)

In [15]:
# (rows, columns) of the centroid table.
# NOTE(review): presumably one row per cluster label, possibly including the
# noise label -1 -- TODO confirm against compute_cluster_centroids.
centroids.shape

(9676, 3)

In [20]:
# NOTE(review): this import belongs with the other imports in the first cell.
import ast
# Load a previously saved metrics table.
# NOTE(review): no visible cell writes 'results_eps_5_2p5.csv' (presumably a
# saved `run_dbscan` output -- TODO confirm), so a fresh Restart & Run All
# depends on that file already existing next to the notebook.
results = pd.read_csv('results_eps_5_2p5.csv')
# The column is read from the CSV as text; literal_eval parses each value back
# into a Python object.
results['cluster_size_stats'] = results.cluster_size_stats.apply(ast.literal_eval)

In [21]:
# Display the loaded metrics table (one row per eps / min_samples pair).
results

Unnamed: 0,n_clusters,noise_ratio,avg_cluster_size,cluster_size_stats,avg_within_distance,avg_between_distance,sep_coh_ratio,eps,min_samples
0,6052,0.010567,7.395241,"{'count': 6052.0, 'mean': 7.395241242564442, '...",2.352114,2129.594252,905.395692,5.0,2
1,5673,0.027325,7.755685,"{'count': 5673.0, 'mean': 7.755684822845056, '...",2.433513,2150.874245,883.855768,5.0,3
2,4932,0.086859,8.374899,"{'count': 4932.0, 'mean': 8.374898621248986, '...",2.535986,2212.723579,872.529937,5.0,4
3,9675,0.119269,4.117726,"{'count': 9675.0, 'mean': 4.1177260981912145, ...",1.088648,2117.585934,1945.15236,2.5,2
4,6941,0.240151,4.95188,"{'count': 6941.0, 'mean': 4.951880132545742, '...",1.271799,2159.173864,1697.731419,2.5,3
5,4584,0.42919,5.632635,"{'count': 4584.0, 'mean': 5.632635253054101, '...",1.327328,2185.643171,1646.648828,2.5,4
6,8334,0.539351,2.50024,"{'count': 8334.0, 'mean': 2.500239980801536, '...",0.364241,2177.134155,5977.184399,1.0,2
7,2694,0.788721,3.547513,"{'count': 2694.0, 'mean': 3.5475129918337047, ...",0.478422,2211.393479,4622.267304,1.0,3
8,739,0.925963,4.5318,"{'count': 739.0, 'mean': 4.531799729364005, 's...",0.517529,2220.82577,4291.209456,1.0,4
