In [2]:
from pathlib import Path

import geopandas as gpd
import numpy as np
import pandas as pd
import requests
from lonboard import Map, PolygonLayer
from lonboard.colormap import apply_categorical_cmap
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

In [3]:
files = [
    "embeddings_ca_m_3712213_ne_10_060_20220518.gpq",
    "embeddings_ca_m_3712213_nw_10_060_20220518.gpq",
    "embeddings_ca_m_3712213_se_10_060_20220518.gpq",
    "embeddings_ca_m_3712213_sw_10_060_20220518.gpq",
    "embeddings_ca_m_3712214_sw_10_060_20220518.gpq",
    "embeddings_ca_m_3712221_ne_10_060_20220518.gpq",
    "embeddings_ca_m_3712221_nw_10_060_20220518.gpq",
    "embeddings_ca_m_3712221_sw_10_060_20220518.gpq",
    "embeddings_ca_m_3712222_sw_10_060_20220518.gpq",
    "embeddings_ca_m_3712229_ne_10_060_20220518.gpq",
    "embeddings_ca_m_3712230_nw_10_060_20220518.gpq",
    "embeddings_ca_m_3712212_ne_10_060_20220519.gpq",
    "embeddings_ca_m_3712212_nw_10_060_20220519.gpq",
    "embeddings_ca_m_3712212_se_10_060_20220519.gpq",
    "embeddings_ca_m_3712228_ne_10_060_20220519.gpq",
    "embeddings_ca_m_3712221_se_10_060_20220518.gpq",
    "embeddings_ca_m_3712222_nw_10_060_20220518.gpq",
    "embeddings_ca_m_3712220_ne_10_060_20220519.gpq",
    "embeddings_ca_m_3712229_nw_10_060_20220518.gpq",
    "embeddings_ca_m_3712214_nw_10_060_20220518.gpq",
    "marinas.geojson",
    "baseball.geojson",
]

url_template = "https://huggingface.co/datasets/made-with-clay/classify-embeddings-sf-baseball-marinas/resolve/main/{filename}"

for filename in files:
    dst = f"../data/classify-embeddings-sf-baseball-marinas/{filename}"
    print(dst)
    if Path(dst).exists():
        continue
    with requests.get(url_template.format(filename=filename)) as r:
        r.raise_for_status()
        with open(dst, "wb") as f:
            f.write(r.content)

../data/classify-embeddings-sf-baseball-marinas/embeddings_ca_m_3712213_ne_10_060_20220518.gpq
../data/classify-embeddings-sf-baseball-marinas/embeddings_ca_m_3712213_nw_10_060_20220518.gpq
../data/classify-embeddings-sf-baseball-marinas/embeddings_ca_m_3712213_se_10_060_20220518.gpq
../data/classify-embeddings-sf-baseball-marinas/embeddings_ca_m_3712213_sw_10_060_20220518.gpq
../data/classify-embeddings-sf-baseball-marinas/embeddings_ca_m_3712214_sw_10_060_20220518.gpq
../data/classify-embeddings-sf-baseball-marinas/embeddings_ca_m_3712221_ne_10_060_20220518.gpq
../data/classify-embeddings-sf-baseball-marinas/embeddings_ca_m_3712221_nw_10_060_20220518.gpq
../data/classify-embeddings-sf-baseball-marinas/embeddings_ca_m_3712221_sw_10_060_20220518.gpq
../data/classify-embeddings-sf-baseball-marinas/embeddings_ca_m_3712222_sw_10_060_20220518.gpq
../data/classify-embeddings-sf-baseball-marinas/embeddings_ca_m_3712229_ne_10_060_20220518.gpq
../data/classify-embeddings-sf-baseball-marinas/em

In [4]:
# Open embeddings DB
embeddings = []
for src in Path("../data/classify-embeddings-sf-baseball-marinas/").glob("*.gpq"):
    gdf = gpd.read_parquet(src)
    embeddings.append(gdf)
embeddings = pd.concat(embeddings)
embeddings

Unnamed: 0,item_id,embeddings,geometry
0,ca_m_3712221_se_10_060_20220518,"[0.17232932, -0.11894383, -0.050105773, 0.0222...","POLYGON ((-122.43800 37.68965, -122.43801 37.6..."
1,ca_m_3712221_se_10_060_20220518,"[0.14396428, -0.10565443, -0.044277802, -0.058...","POLYGON ((-122.43626 37.68964, -122.43627 37.6..."
2,ca_m_3712221_se_10_060_20220518,"[0.033249076, -0.033166107, -0.046707135, -0.1...","POLYGON ((-122.43452 37.68963, -122.43453 37.6..."
3,ca_m_3712221_se_10_060_20220518,"[0.063117094, -0.055101607, -0.008491949, -0.0...","POLYGON ((-122.43277 37.68962, -122.43278 37.6..."
4,ca_m_3712221_se_10_060_20220518,"[0.12641063, -0.008674846, -0.013385041, -0.05...","POLYGON ((-122.43103 37.68961, -122.43104 37.6..."
...,...,...,...
1819,ca_m_3712213_ne_10_060_20220518,"[0.040293757, 0.14174993, 0.010576039, 0.05624...","POLYGON ((-122.38092 37.81178, -122.38093 37.8..."
1820,ca_m_3712213_ne_10_060_20220518,"[0.06605764, 0.17537387, 0.012644318, 0.070903...","POLYGON ((-122.37918 37.81178, -122.37919 37.8..."
1821,ca_m_3712213_ne_10_060_20220518,"[0.14264803, 0.20087151, 0.019587237, 0.044009...","POLYGON ((-122.37743 37.81177, -122.37744 37.8..."
1822,ca_m_3712213_ne_10_060_20220518,"[0.12027763, 0.2004021, 0.093764625, 0.0726644...","POLYGON ((-122.37569 37.81176, -122.37570 37.8..."


In [5]:
layer = PolygonLayer.from_geopandas(
    embeddings,
    get_fill_color=[255, 0, 200, 80],
    get_line_color=[130, 65, 100, 80],
    get_line_width=10,
    line_width_max_pixels=3,
)
m = Map(layer)
m

Map(custom_attribution='', layers=(PolygonLayer(get_fill_color=[255, 0, 200, 80], get_line_color=[130, 65, 100…

In [None]:
embeddings["embeddings"][0]

0    [0.17232932, -0.11894383, -0.050105773, 0.0222...
0    [-0.06468353, 0.13653064, 0.045454692, -0.0666...
0    [-0.08439956, 0.06409763, 0.13156177, -0.00107...
0    [-0.036583245, 0.055622153, 0.098900124, -0.05...
0    [0.048047226, 0.11920324, -0.006197748, 0.2471...
0    [0.18016735, 0.19260001, 0.048831623, -0.04761...
0    [0.15443294, 0.20867755, 0.07934863, 0.1143028...
0    [-0.009225773, 0.008043288, 0.13796215, -0.046...
0    [0.08624789, -0.10204594, -0.05695264, 0.09048...
0    [0.064169854, -0.20678988, 0.09236072, 0.02509...
0    [0.007973772, 0.07687424, 0.12668172, 0.021993...
0    [-0.046016138, -0.04745782, -0.117088005, 0.05...
0    [0.08747518, 0.013187, 0.095478006, 0.00944589...
0    [0.2350052, 0.12738042, -0.13766141, 0.0198862...
0    [0.16514593, 0.17026933, -0.031763624, 0.00054...
0    [0.010442328, 0.14983672, -0.03280224, -0.1923...
0    [-0.13105132, 0.110206105, -0.099434026, 0.026...
0    [-0.04912903, 0.15913685, 0.1266249, -0.030492...
0    [-0.0

In [7]:
# Open marinas training data
# points = gpd.read_file(
#     "../data/classify-embeddings-sf-baseball-marinas/marinas.geojson"
# )

# Uncomment this to use the baseball training dataset.
points = gpd.read_file(
    "../data/classify-embeddings-sf-baseball-marinas/baseball.geojson"
)

# Spatial join of training data with embeddings
merged = embeddings.sjoin(points)
print(f"Found {len(merged)} embeddings to train on")
print(f"{sum(merged['class'])} marked locations")
print(f"{len(merged) - sum(merged['class'])} negative examples")

merged

Found 106 embeddings to train on
27 marked locations
79 negative examples


Unnamed: 0,item_id,embeddings,geometry,index_right,class
904,ca_m_3712221_se_10_060_20220518,"[0.07704136, 0.084164895, -0.058089547, -0.050...","POLYGON ((-122.38600 37.65755, -122.38601 37.6...",66,0
936,ca_m_3712221_se_10_060_20220518,"[0.14865625, 0.13005117, -0.06987675, -0.09711...","POLYGON ((-122.39646 37.65621, -122.39647 37.6...",67,0
944,ca_m_3712221_se_10_060_20220518,"[0.054749593, 0.10467231, -0.07063145, -0.1439...","POLYGON ((-122.38253 37.65614, -122.38254 37.6...",88,0
1126,ca_m_3712221_se_10_060_20220518,"[0.08150553, 0.13835192, -0.08615049, -0.15301...","POLYGON ((-122.39652 37.64929, -122.39653 37.6...",68,0
1566,ca_m_3712221_se_10_060_20220518,"[-0.020788662, 0.08532106, -0.08479196, -0.011...","POLYGON ((-122.42450 37.63282, -122.42451 37.6...",86,0
...,...,...,...,...,...
232,ca_m_3712222_nw_10_060_20220518,"[-0.2378247, 0.10699723, -0.01938209, -0.15397...","POLYGON ((-122.36864 37.74383, -122.36866 37.7...",29,0
538,ca_m_3712222_nw_10_060_20220518,"[-0.04248124, 0.18976109, 0.04242536, 0.070044...","POLYGON ((-122.36525 37.73274, -122.36526 37.7...",30,0
578,ca_m_3712222_nw_10_060_20220518,"[-0.11456938, 0.07650192, -0.027554782, 0.0461...","POLYGON ((-122.36178 37.73133, -122.36179 37.7...",31,0
948,ca_m_3712228_ne_10_060_20220519,"[0.1412763, 0.14356944, 0.087244324, 0.1193411...","POLYGON ((-122.50060 37.59359, -122.50061 37.5...",65,0


In [None]:
training_layer = PolygonLayer.from_geopandas(
    merged,
    get_fill_color=apply_categorical_cmap(
        merged["class"], {0: [0, 150, 255, 100], 1: [0, 255, 150, 150]}
    ),
    get_line_color=[0, 100, 100, 0],
)
m = Map(training_layer)
# add background map

m

Map(custom_attribution='', layers=(PolygonLayer(get_fill_color=arro3.core.ChunkedArray<FixedSizeList(Field { n…

In [12]:
# Extract X and y and split into test/train set
X = np.array([dat for dat in merged["embeddings"].values])
y = merged["class"].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit Random Forest classifier
model = RandomForestClassifier()
model = model.fit(X_train, y_train)

# Make test prediction and evaluate
pred = model.predict(X_test)
print(f"Accuracy is {accuracy_score(y_test, pred)}")
print(f"Precision is {precision_score(y_test, pred)}")
print(f"Recall is {recall_score(y_test, pred)}")

Accuracy is 0.75
Precision is 0.6666666666666666
Recall is 0.2222222222222222


In [13]:
%%time
# Make inference on entire embedding dataset
X = np.array([x for x in embeddings["embeddings"]])
predicted = model.predict(X)
print(f"Found {np.sum(predicted)} locations")

# Add inference to geopandas df and export
result = embeddings[predicted.astype("bool")]
result = result[["item_id", "geometry"]]

Found 40 locations
CPU times: user 152 ms, sys: 27.4 ms, total: 179 ms
Wall time: 177 ms


In [15]:
predicted_layer = PolygonLayer.from_geopandas(
    result,
    filled=False,
    get_line_color=[255, 0, 0, 100],
    get_line_width=50,
    line_width_max_pixels=5,
)
m = Map([training_layer, predicted_layer])
m

Map(custom_attribution='', layers=(PolygonLayer(get_fill_color=arro3.core.ChunkedArray<FixedSizeList(Field { n…