Rough example of doing cosine similarity search based on a few known examples of log yards

In [1]:
import os

import dask.array as da
import dask.dataframe as dd
import geopandas as gpd
import numpy as np
import pandas as pd
import shapely
from tqdm import tqdm


In [2]:
# Your local path to the NAIP embeddings data (or a subset/subdirectory of it)
root = "/mnt/c/data/clay_naip/wa"

In [3]:
# Load example log yards
log_yard_paths = [
    "/mnt/c/data/clay_naip_results/labels/log_yard_1.gpkg",
    "/mnt/c/data/clay_naip_results/labels/log_yard_2.gpkg",
    "/mnt/c/data/clay_naip_results/labels/log_yard_3.gpkg",
]
# load with geopandas
log_yard_gdfs = []
for path in log_yard_paths:
    gdf = gpd.read_file(path)
    log_yard_gdfs.append(gdf)
log_yard_gdf = pd.concat(log_yard_gdfs, ignore_index=True)
# convert to points (centroids)
log_yard_gdf.crs = "EPSG:4326"
log_yard_gdf["geometry"] = log_yard_gdf.geometry.centroid


  log_yard_gdf["geometry"] = log_yard_gdf.geometry.centroid


In [4]:
# Load all parquet files to get embeddings of log yards and all geometries for visualisation

def read_parquet_files(root, intersecting_gdf=None):
    """
    Reads a list of parquet files and processes their geometries and embeddings.
    Returns embeddings for rows that intersect with the provided GeoDataFrame.
    Returns all geometries in a GeoDataFrame.

    Args:
        files (list): List of file paths to parquet files.
        intersecting_gdf (GeoDataFrame, optional): A GeoDataFrame to check for intersections.
            If provided, only geometries and embeddings intersecting with this GeoDataFrame
            will be included in the second returned GeoDataFrame.

    Returns:
        tuple: A tuple containing:
            - main_gdf (GeoDataFrame): A GeoDataFrame containing all geometries from the parquet files.
            - select_embeddings (GeoDataFrame or None): A GeoDataFrame containing rows with geometries
              and embeddings that intersect with the provided GeoDataFrame. Returns None if no
              intersecting_gdf is provided or no intersections are found.
    """

    files = []
    for root, dirs, filenames in os.walk(root):
        for filename in filenames:
            if filename.endswith(".parquet"):
                files.append(os.path.join(root, filename))

    geoms = []
    select_embeddings = []
    for i, file in enumerate(tqdm(files)):

        # read the parquet file
        df = gpd.read_parquet(file)
        # append the geometry and embedding to the lists
        geoms.extend(df.geometry.tolist())
        if intersecting_gdf is not None:
            # check for intersection with the provided gdf
            # use sjoin
            df.crs = "EPSG:4326"
            intersecting_rows = gpd.sjoin(
                df, intersecting_gdf, how="inner", predicate="intersects"
            )
            if len(intersecting_rows) > 0:
                select_embeddings.append(intersecting_rows)

    # return the 2 gdfs
    main_gdf = gpd.GeoDataFrame(geometry=geoms, crs="EPSG:4326")
    if len(select_embeddings) > 0:
        # concatenate the list of dataframes into a single dataframe
        select_embeddings = pd.concat(select_embeddings)

    return main_gdf, select_embeddings

# read the parquet files
main_gdf, logyard_embeddings_gdf = read_parquet_files(root, log_yard_gdf)

100%|██████████| 3184/3184 [04:37<00:00, 11.46it/s]


In [5]:
# Calculate mean embedding for log yards
# Warning: see Readme - this doesn't work well for many examples!

# Define cosine similarity function
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

query_embeddings = logyard_embeddings_gdf.embeddings#.iloc[0].embeddings
query_centroid = query_embeddings.mean(axis=0)


In [6]:
# Define a dask pipeline to compute cosine similarity

# Function to apply to each row embedding
def compute_cos_sim(embedding):
    emb_array = np.array(embedding)
    return cosine_similarity(emb_array, query_centroid)

# Load Dask dataframe from Parquet
ddf = dd.read_parquet(root)

# Compute cosine similarity lazily
ddf_sim = ddf.assign(
            cos_sim=ddf["embeddings"].apply(compute_cos_sim, meta=("cos_sim", "f8"))
            ).drop(columns=["embeddings"])

In [7]:
# Run the computation and convert results to geodataframe

result_df = ddf_sim.compute()

result_df["geometry"] = result_df["geometry"].apply(shapely.wkb.loads)

# Reconstruct GeoDataFrame
result_gdf = gpd.GeoDataFrame(result_df, geometry="geometry", crs="EPSG:4326")

In [8]:
# Add cosine similiarity rank column
# and sort by it
result_gdf["rank"] = result_gdf["cos_sim"].rank(ascending=False)
result_gdf = result_gdf.sort_values("rank")

In [9]:
# save the output
output_path = os.path.join(root, "log_yards_cosinesim_ranked.gpkg")
os.makedirs(os.path.dirname(output_path), exist_ok=True)
result_gdf.to_file(output_path, driver="GPKG")