In [1]:
import os, sys
# Add src to path to import our modules
sys.path.append('..')

from src.core.point_unification import (
    unify_points,
    compute_cluster_centroids,
)

from src.api.streetview import get_panorama_metadata

from tqdm import tqdm

import pandas as pd
import geopandas as gpd
from shapely import wkt
import numpy as np
from shapely.ops import nearest_points

def gen_clusters(panoramas, eps, min_samples=1):
    print(f"Applying DBSCAN clustering with eps={eps}, min_samples={min_samples}")
    # Apply DBSCAN clustering
    dbscan_results = unify_points(panoramas, eps=eps, min_samples=min_samples)
    
    # Compute centroids
    #print("Computing cluster centroids")
    centroids = compute_cluster_centroids(dbscan_results)
    return dbscan_results, centroids

def evaluate_dbscan_clusters(clusters_gdf, points_gdf, disable_tqdm=False):
    """
    Evaluate DBSCAN clustering results with simple, interpretable metrics.
    
    Parameters
    ----------
    clusters_gdf : GeoDataFrame
        DataFrame with cluster centroids. Must have columns:
        - 'cluster_id'
        - 'geometry' (Point)
        
    points_gdf : GeoDataFrame
        DataFrame with points assigned to clusters. Must have columns:
        - 'cluster_id' (DBSCAN labels, -1 = noise)
        - 'geometry' (Point)
        
    Returns
    -------
    metrics : dict
        Dictionary with evaluation metrics.
    """

    # Transform to EPSG:3857 to get metrics in meters
    clusters_gdf = clusters_gdf.to_crs(3857)
    points_gdf = points_gdf.to_crs(3857)
    
    # Exclude noise for cluster-based calculations
    clustered_points = points_gdf[points_gdf["cluster_id"] != -1]
    valid_clusters = clusters_gdf[clusters_gdf["cluster_id"] != -1]
    
    # Number of clusters
    n_clusters = valid_clusters["cluster_id"].nunique()
    
    # Noise ratio
    noise_ratio = (points_gdf["cluster_id"] == -1).mean()
    
    # Cluster size distribution
    cluster_sizes = clustered_points.groupby("cluster_id").size()
    avg_cluster_size = cluster_sizes.mean()
    cluster_size_stats = cluster_sizes.describe().to_dict()
    
    # Within-cluster average distance
    within_distances = []
    for cid, group in tqdm(clustered_points.groupby("cluster_id"), total=n_clusters, disable=disable_tqdm):
        centroid = valid_clusters.loc[valid_clusters["cluster_id"] == cid, "geometry"].values[0]
        dists = group["geometry"].apply(lambda g: g.distance(centroid)).values
        within_distances.append(np.mean(dists))
    avg_within_distance = np.mean(within_distances) if within_distances else np.nan
    
    # Between-cluster distances (pairwise between centroids)
    between_distances = []
    centroids = valid_clusters["geometry"].values
    for i in tqdm(range(len(centroids)), total=len(centroids), disable=disable_tqdm):
        for j in range(i+1, len(centroids)):
            between_distances.append(centroids[i].distance(centroids[j]))
    avg_between_distance = np.mean(between_distances) if between_distances else np.nan
    
    # Separation / Cohesion ratio
    sep_coh_ratio = (
        avg_between_distance / avg_within_distance
        if avg_within_distance and not np.isnan(avg_within_distance) else np.nan
    )
    
    return {
        "n_clusters": n_clusters,
        "noise_ratio": noise_ratio,
        "avg_cluster_size": avg_cluster_size,
        "cluster_size_stats": cluster_size_stats,
        "avg_within_distance": avg_within_distance,
        "avg_between_distance": avg_between_distance,
        "sep_coh_ratio": sep_coh_ratio
    }

def run_dbscan(points_gdf, eps_values, min_samples_values, disable_tqdm=True):
    """
    Run DBSCAN for a grid of eps and min_samples values,
    evaluate results with evaluate_dbscan_clusters,
    and return a DataFrame with all metrics.
    
    Parameters
    ----------
    points_gdf : GeoDataFrame
        Input points with geometry (no cluster_id column yet).
    eps_values : list
        List of eps values to try.
    min_samples_values : list
        List of min_samples values to try.
    metric : str
        Distance metric for DBSCAN (default = 'euclidean').
    
    Returns
    -------
    results_df : DataFrame
        Evaluation results for each (eps, min_samples).
    """
    
    results = []

    for eps in tqdm(eps_values):
        for ms in tqdm(min_samples_values):
            
            dbscan_results, centroids = gen_clusters(points_gdf, eps=eps, min_samples=ms)
            
            # Evaluate
            metrics = evaluate_dbscan_clusters(centroids, points_gdf, disable_tqdm=disable_tqdm)
            metrics["eps"] = eps
            metrics["min_samples"] = ms
            
            results.append(metrics)
    
    return pd.DataFrame(results)


def df_to_gdf(df):
    gdf = df.copy()
    gdf.loc[:, "geometry"] = gdf.geometry.apply(wkt.loads)
    gdf = gpd.GeoDataFrame(gdf, geometry='geometry', crs=4326)
    return gdf

In [None]:
extent = wkt.loads(
    "POLYGON((-58.58198 -34.582224, -58.515231 -34.582224, -58.515231 -34.635357, -58.58198 -34.635357, -58.58198 -34.582224))"
)

panos = df_to_gdf(pd.read_csv('../data/tresdefebrero/panos_enriched.csv'))

rbp = df_to_gdf(pd.read_csv('../data/tresdefebrero/renabap_intersected.csv'))

panos = gpd.sjoin_nearest(
    panos.to_crs(3857),
    rbp[["id_renabap", "geometry"]].to_crs(3857),
    how="left",
    distance_col="distance",
).to_crs(4326)
panos = panos.rename(columns={"id_renabap": "closest_barrio"}).drop(
    columns=["index_right"]
)

In [4]:
panos = panos[panos.intersects(extent)]

In [5]:
panos.head()

Unnamed: 0,pano_id,lat,lon,geometry,closest_barrio,distance
0,--4tdN434sL2rFkixRxoDw,-34.620094,-58.549171,POINT (-58.54917 -34.62009),1122,100.484082
3,--f8BTEjiIsPZir88hHt9w,-34.617903,-58.553512,POINT (-58.55351 -34.6179),313,411.591546
4,--mAOMLBMdhkoEMbHqDhYA,-34.624371,-58.545562,POINT (-58.54556 -34.62437),1122,444.202933
5,--r2r0tM84f7arlojji25w,-34.621858,-58.57735,POINT (-58.57735 -34.62186),2929,465.321905
7,-07N8ueih88oii9NUZAj8w,-34.627565,-58.574763,POINT (-58.57476 -34.62756),1107,489.06796


In [6]:
panos.shape

(45234, 6)

In [7]:
panos, centroids = gen_clusters(panos, eps=2.5, min_samples=2)

Applying DBSCAN clustering with eps=2.5, min_samples=2


In [8]:
centroids.shape

(9676, 3)

In [9]:
cluster_ids = [x for x in centroids.cluster_id.unique() if x != -1]

In [10]:
SAMPLE_N = 5000

import random
random.seed(32)

smp = random.sample(cluster_ids, SAMPLE_N)
random.seed(None)

In [11]:
sample_centroids = centroids[centroids.cluster_id.isin(smp)]
sample_panos = panos[panos.cluster_id.isin(smp)]

In [12]:
sample_panos.shape

(20582, 7)

In [13]:
api_key = os.getenv("GOOGLE_STREET_VIEW_API_KEY")

In [14]:
sample_centroids[sample_centroids.point_count==4].head()

Unnamed: 0,cluster_id,point_count,geometry
1,0,4,POINT (-58.54917 -34.62009)
16,15,4,POINT (-58.57215 -34.6273)
27,26,4,POINT (-58.57233 -34.62511)
75,74,4,POINT (-58.57252 -34.62001)
78,77,4,POINT (-58.53976 -34.60231)


In [15]:
pano_id = '0nzArjiiKDwZRcHn16nxLA'

In [16]:
pano_ids = sample_panos.pano_id.unique().tolist()

In [17]:
len(pano_ids)

20582

In [19]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

def fetch_metadata_parallel(pano_ids, api_key, max_workers=10):
    """
    Fetch panorama metadata in parallel.
    
    Parameters
    ----------
    pano_ids : list
        List of panorama IDs.
    api_key : str
        API key for the metadata function.
    max_workers : int
        Number of parallel workers (default=10).
        
    Returns
    -------
    metadata_dates : dict
        Mapping pano_id -> date.
    failed_set : list
        List of pano_ids that failed.
    """

    metadata_dates = {}
    failed_set = []

    def task(pid):
        """Wrapper to fetch metadata for one pano_id."""
        try:
            metadata = get_panorama_metadata(pid, api_key)
            return pid, metadata['date'], None
        except Exception as e:
            return pid, None, e

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(task, pid): pid for pid in pano_ids}
        for future in tqdm(as_completed(futures), total=len(futures)):
            pid, date, error = future.result()
            if error:
                failed_set.append(pid)
            else:
                metadata_dates[pid] = date

    return metadata_dates, failed_set


In [20]:
metadata_dates, failed_set = fetch_metadata_parallel(pano_ids, api_key, max_workers=20)

print("Fetched:", len(metadata_dates))
print("Failed:", len(failed_set))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20582/20582 [05:50<00:00, 58.72it/s]


Fetched: 20537
Failed: 45


In [26]:
sample_panos.loc[:, 'date'] = sample_panos.pano_id.map(metadata_dates)

sample_panos.date.isna().sum()

np.int64(45)

In [22]:
sample_panos.head()

Unnamed: 0,pano_id,lat,lon,geometry,closest_barrio,distance,cluster_id,date
0,--4tdN434sL2rFkixRxoDw,-34.620094,-58.549171,POINT (-58.54917 -34.62009),1122,100.484082,0,2023-10
4,--mAOMLBMdhkoEMbHqDhYA,-34.624371,-58.545562,POINT (-58.54556 -34.62437),1122,444.202933,2,2024-05
20,-1EOix82q4dncp7HIBr8lA,-34.626108,-58.574256,POINT (-58.57426 -34.62611),1107,284.166719,10,2025-07
24,-27xWrpGhLE5e2v9PNJxzg,-34.620992,-58.534424,POINT (-58.53442 -34.62099),5384,249.655819,11,2023-04
26,-2C15MRBcSt9w83IKqdzww,-34.630305,-58.552834,POINT (-58.55283 -34.6303),5385,260.989342,12,2024-05


In [23]:
dates_per_cluster_count_dict = sample_panos.groupby('cluster_id').date.size().to_dict()

In [24]:
sample_centroids.loc[:, 'dates_count'] = sample_centroids.cluster_id.map(dates_per_cluster_count_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


#### Number of dates inside cluster should match the cluster size, meaning that each panorama inside a cluster should represent a different date.

In [25]:
matches = (sample_centroids.point_count == sample_centroids.dates_count).sum() / len(sample_centroids)
print(f"{matches * 100:.1f}% of cluster points have cluster_size == n_dates")

100.0% of cluster points have cluster_size == n_dates
