In [None]:
# ----------------------------------------- #
#                  MODULES                  #

# Standard Modules
import os
import warnings
from itertools import product
from joblib import Parallel, delayed
from datetime import date, timedelta

# Third-Party Modules
import geopandas as gpd
import h3
import numpy as np
import pandas as pd
import plotly.express as px
from shapely.geometry import box, Point, Polygon
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# NOTE(review): this suppresses every warning category for the whole kernel
# session — consider narrowing the filter to the specific warnings that are noisy.
warnings.filterwarnings("ignore")

# System Configuration
# Shared joblib Parallel runner capped at 8 jobs.
# NOTE(review): `parallel` is not referenced in the cells visible here — confirm it is still needed.
parallel = Parallel(n_jobs=8)

#                                           #
# ----------------------------------------- #

# ----------------------------------------- #
#                 FUNCTIONS                 #


# Data Opener for Sightings Data
def open_sightings(path):
    """Load a sightings table from a CSV or parquet file.

    The file type is decided from the file name's extension (case-insensitive),
    not from a substring match on the whole path, so a directory such as
    ``raw.csv_exports/`` can no longer cause a non-CSV file to be parsed as CSV.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the sightings file.

    Returns
    -------
    pandas.DataFrame or None
        The loaded table, or ``None`` (after printing a warning) when the path
        does not exist or the file type is not supported.
    """
    if not os.path.exists(path):
        print("WARNING: Path does not exist.")
        return None

    filename = os.path.basename(os.fspath(path)).lower()

    if filename.endswith(".parquet"):
        return pd.read_parquet(path)

    # pandas infers compression from the suffix, so compressed CSVs are accepted too.
    if filename.endswith((".csv", ".csv.gz", ".csv.bz2", ".csv.zip", ".csv.xz")):
        return pd.read_csv(path)

    # Previously this case fell through silently; make the reason visible.
    print("WARNING: Unsupported file type - supported types include parquet and csv")
    return None


# Geometry Point to H3 Grid
def point_to_h3(point, res):
    """Return the H3 cell index at resolution ``res`` for a shapely Point.

    The point's ``y`` is passed as latitude and its ``x`` as longitude.
    """
    latitude, longitude = point.y, point.x
    return h3.latlng_to_cell(latitude, longitude, res)


# H3 Grid to Polygon
def h3_to_polygon(h3_index):
    """Build a shapely Polygon from the boundary of the given H3 cell."""
    ring = []
    # Boundary vertices are unpacked as (lat, lon); the polygon takes (lon, lat).
    for lat, lon in h3.cell_to_boundary(h3_index):
        ring.append((lon, lat))
    return Polygon(ring)


# Open Water Region Polygon
def open_water_polygon_aoi(path, data_crs="EPSG:4326"):
    """Load a polygon layer from parquet, dissolve it, and reproject it.

    Parameters
    ----------
    path : str
        Parquet file readable by ``geopandas.read_parquet``.
    data_crs : str, default "EPSG:4326"
        CRS the dissolved geometry is converted to.

    Returns
    -------
    geopandas.GeoDataFrame or None
        The dissolved, reprojected layer, or ``None`` (after printing a
        warning) when ``path`` does not exist.
    """
    if os.path.exists(path):
        # read -> dissolve -> enforce projection
        return gpd.read_parquet(path).dissolve().to_crs(data_crs)

    print(f"WARNING: Path does not exist - {path}")
    return None


# Get H3 Grids Over Polygon Extent
def get_h3_grids_over_polygon(
    poly_area, target_resolution=6, parent_resolution=3, n_sample_points=10000
):
    """Return the centre points of H3 cells that fall on a polygon layer.

    The polygon is covered approximately: random points are sampled inside it,
    each point is mapped to a coarse "parent" H3 cell, every parent is expanded
    to its children at ``target_resolution``, and the children's centre points
    are clipped to a slightly buffered copy of the polygon.

    Parameters
    ----------
    poly_area : geopandas.GeoDataFrame
        Polygon layer to cover (its geometry is sampled and used for clipping).
    target_resolution : int, default 6
        H3 resolution of the returned cells.
    parent_resolution : int, default 3
        H3 resolution of the coarse cells used to seed the cover
        (previously hard-coded to 3).
    n_sample_points : int, default 10000
        Number of random points sampled per geometry to discover parent cells
        (previously hard-coded to 10000).

    Returns
    -------
    geopandas.GeoDataFrame
        Columns ``h3_index``, ``latlong`` (tuple from ``h3.cell_to_latlng``)
        and point ``geometry`` in EPSG:4326.

    Raises
    ------
    ValueError
        If ``target_resolution`` is coarser than ``parent_resolution``; a
        parent cell has no children at a coarser resolution.

    Notes
    -----
    Sampling is random, so a parent cell that only touches a sliver of the
    polygon can be missed and results may differ between runs.
    """
    if target_resolution < parent_resolution:
        raise ValueError(
            "target_resolution must be greater than or equal to parent_resolution"
        )

    # Scatter random points over the polygon and split the multipoints apart.
    sampled = poly_area.copy()
    sampled["geometry"] = sampled.sample_points(n_sample_points)
    sampled = sampled[["geometry"]].explode()

    # Coarse cells touched by at least one sampled point.
    parent_h3_cells = (
        sampled.geometry.apply(lambda p: point_to_h3(p, parent_resolution))
        .unique()
        .tolist()
    )

    # Expand each parent to the target resolution; the set removes duplicates.
    child_cells = set()
    for parent in parent_h3_cells:
        child_cells.update(h3.cell_to_children(parent, target_resolution))
    unique_children = sorted(child_cells)

    # Cell centres as (lat, lng) tuples.
    latlongs = [h3.cell_to_latlng(cell) for cell in unique_children]
    h3_index_gdf = pd.DataFrame({"h3_index": unique_children, "latlong": latlongs})

    # Build GeoDataFrame: Point takes (x, y) = (lng, lat).
    geometry = [Point(lng, lat) for lat, lng in latlongs]
    h3_index_gdf = gpd.GeoDataFrame(h3_index_gdf, geometry=geometry, crs="EPSG:4326")

    # Keep only centres inside the polygon, with a small buffer (in CRS units).
    h3_index_gdf = h3_index_gdf.clip(poly_area.buffer(0.01))

    return h3_index_gdf


# Build Polygons for H3 List
def build_polygons_for_h3_list(h3_index):
    """Turn a list of H3 cell indexes into a GeoDataFrame of cell polygons.

    Returns a GeoDataFrame (EPSG:4326) with one row per cell: the ``h3_index``
    and its boundary polygon as ``geometry``. The number of cells built is
    printed as a side effect.
    """
    # Outline of every cell, in input order
    cell_shapes = list(map(h3_to_polygon, h3_index))

    h3_gdf = gpd.GeoDataFrame(
        {"h3_index": h3_index, "geometry": cell_shapes},
        crs="EPSG:4326",
    )

    # Report how many cells were built
    print("Total Cells:", len(h3_gdf))

    return h3_gdf


def get_date_range(start_date, end_date):
    """
    Build the list of consecutive days from start_date through end_date.

    Both bounds are included. Each bound may be a datetime.date object or an
    ISO 'YYYY-MM-DD' string. Raises ValueError when start_date is later than
    end_date.
    """
    # Normalise string inputs to date objects
    first = date.fromisoformat(start_date) if isinstance(start_date, str) else start_date
    last = date.fromisoformat(end_date) if isinstance(end_date, str) else end_date

    if last < first:
        raise ValueError("start_date must be before or equal to end_date")

    span_days = (last - first).days
    one_day = timedelta(days=1)
    return [first + offset * one_day for offset in range(span_days + 1)]


#                                           #
# ----------------------------------------- #

# Orca Presence Model - Baseline Models

<b>POC:</b> Tyler Stevenson <br>
<b>LAST MODIFIED:</b> 2025-08-12 <br>

## Data + Preprocessing

In [None]:
## Data Paths
# Processed sightings table (parquet). The path is relative, so it resolves
# against the kernel's current working directory.
sightings_path = "../data/processed/ORCA_SIGHTINGS/ORCA_SIGHTINGS.parquet"

In [None]:
# Target Resolution
# H3 resolution passed to h3.latlng_to_cell when sightings are assigned to cells.
h3_resolution = 5

In [None]:
## CLIP TO JUST OREGON + PUGET SOUND
# NOTE(review): the comments in this cell name different regions ("Oregon + Puget
# Sound" above, "Puget Sound + Salish Sea" below) — confirm the intended AOI.

# Bounding box coordinates covering Puget Sound + Salish Sea
# Values are longitude / latitude in degrees (the box is tagged EPSG:4326 below).
# NOTE(review): min_lat = 44.0 looks well south of Port Angeles — confirm the southern limit.
min_lon, min_lat = -126.3, 44.0  # southwest corner (offshore west of Port Angeles)
max_lon, max_lat = -121.5, 51.3  # northeast corner (north of Vancouver)

# Create bounding box polygon
bbox = box(min_lon, min_lat, max_lon, max_lat)

# Make GeoDataFrame
# Single-row layer holding the box; it is the mask used to clip the sightings.
gdf = gpd.GeoDataFrame({"geometry": [bbox]}, crs="EPSG:4326")

In [None]:
# Open Sightings Data
sightings = open_sightings(path=sightings_path)
if sightings is None:
    # open_sightings only prints a warning and returns None on failure; stop
    # here with a clear message instead of an opaque TypeError further down.
    raise RuntimeError(f"open_sightings returned no data for: {sightings_path}")

# Preprocess Sightings - Remove Spurious Data
# Positive longitudes are treated as sign errors and mirrored to negative values.
sightings["LONGITUDE"] = -sightings["LONGITUDE"].abs()
# Keep only longitudes strictly between -160 and -115 degrees.
sightings = sightings[(sightings["LONGITUDE"] < -115) & (sightings["LONGITUDE"] > -160)]

# Build GeoDataFrame (vectorised point construction instead of a per-row Point list)
geometry = gpd.points_from_xy(sightings["LONGITUDE"], sightings["LATITUDE"])
sightings = gpd.GeoDataFrame(sightings, geometry=geometry, crs="EPSG:4326")

# Clip to AOI
sightings = sightings.clip(gdf)

# Add H3 Index
# A plain comprehension avoids row-wise DataFrame.apply, which is slow and does
# not yield an assignable column when the clipped frame is empty.
sightings["h3_index"] = [
    h3.latlng_to_cell(lat, lon, h3_resolution)
    for lat, lon in zip(sightings["LATITUDE"], sightings["LONGITUDE"])
]

# Drop Unnecessary Features (errors="ignore": columns missing from the file are skipped)
sightings = sightings.drop(
    columns=[
        "DATE",
        "DATETIME",
        "YEAR_MONTH",
        "STAT_WEEK_SUNDAY",
        "geometry",
        "YEAR_WEEK",
        "LATITUDE",
        "LONGITUDE",
        "SOURCE",
        "COUNT",
    ],
    errors="ignore",
)

In [None]:
# Preview the first rows only rather than rendering the whole frame.
sightings.head()

## Feature Engineering

In [None]:
# Filter for SRKW
# Keep rows whose POD_TYPE is "SRKW", reduced to the date / cell columns,
# with exact duplicate rows removed.
srkw_feature_cols = ["DOY", "WOY", "MONTH", "YEAR", "h3_index"]
is_srkw = sightings["POD_TYPE"] == "SRKW"
sightings = sightings.loc[is_srkw, srkw_feature_cols].drop_duplicates()

In [None]:
# Ensure All Dates Exist
# Build exactly one row per (h3 cell, year, day-of-year) and flag the
# combinations that have a sighting with COUNT = 1 (COUNT stays NaN elsewhere).
key_cols = ["h3_index", "YEAR", "DOY"]

all_cells = sightings["h3_index"].unique().tolist()
all_years = sightings["YEAR"].unique().tolist()
# NOTE(review): DOY 366 is not part of the grid, so a sighting recorded on day
# 366 of a leap year is dropped by the left join below — confirm this is intended.
all_doy = list(range(1, 366))

# Full cross product, ordered by cell, then year, then day.
all_sightings_merge = pd.MultiIndex.from_product(
    [all_cells, all_years, all_doy], names=key_cols
).to_frame(index=False)

# One observed row per key. Joining on the keys only means the grid cannot fan
# out: the previous DOY -> (WOY, MONTH) lookup held several rows per DOY
# (the same DOY maps to different weeks/months in different years), which
# duplicated grid rows and produced COUNT=0 copies of days that had a sighting.
observed = sightings.drop_duplicates(subset=key_cols).assign(COUNT=1)
sightings = all_sightings_merge.merge(observed, on=key_cols, how="left")

# Fill WOY / MONTH on unobserved rows from a one-row-per-DOY lookup. The value
# comes from the first observed occurrence of that DOY in any year, so it is
# approximate; DOYs never observed keep NaN.
wm = observed[["DOY", "WOY", "MONTH"]].drop_duplicates(subset="DOY").set_index("DOY")
for calendar_col in ["WOY", "MONTH"]:
    sightings[calendar_col] = sightings[calendar_col].fillna(
        sightings["DOY"].map(wm[calendar_col])
    )

In [None]:
# Rows without a matching sighting have a missing COUNT: treat them as zero.
# DOY is cast to a plain integer column.
sightings = sightings.assign(
    COUNT=sightings["COUNT"].fillna(0),
    DOY=sightings["DOY"].astype(int),
)

In [None]:
# Add Features

# Temporal features: DOY mapped onto a circle with a 365.25-day period so the
# end of one year sits next to the start of the next.
doy_angle = 2 * np.pi * sightings["DOY"] / 365.25
sightings["DOY_sin"] = np.sin(doy_angle)
sightings["DOY_cos"] = np.cos(doy_angle)

# Binary target: 1 where COUNT is non-zero, otherwise 0.
sightings["sighting"] = (sightings["COUNT"] != 0).astype(int)

# Independent copy used for modelling.
srkw_df = sightings.copy()

In [None]:
# Scale and Encode Features
# Numeric columns are standardised; the H3 cell id is one-hot encoded
# (cells unseen during fitting are ignored, output stays sparse).
numeric_features = ["DOY_sin", "DOY_cos", "YEAR"]
categorical_features = ["h3_index"]

numeric_step = ("num", StandardScaler(), numeric_features)
categorical_step = (
    "cat",
    OneHotEncoder(handle_unknown="ignore", sparse_output=True),
    categorical_features,
)

# Define preprocessor
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score

In [None]:
# Features and target
feature_cols = ["DOY_sin", "DOY_cos", "YEAR", "h3_index"]
X = srkw_df[feature_cols]
y = srkw_df["sighting"]

# Chronological train-test split: years before 2023 train, 2023 onward test
train_idx = srkw_df["YEAR"] < 2023
test_idx = srkw_df["YEAR"] >= 2023
X_train, y_train = X[train_idx], y[train_idx]
X_test, y_test = X[test_idx], y[test_idx]

# Pipeline: preprocessing followed by a class-balanced logistic regression
classifier = LogisticRegression(max_iter=1000, random_state=42, class_weight="balanced")
model = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", classifier),
    ]
)

# Fit model
model.fit(X_train, y_train)

# Evaluate on the held-out years using the positive-class probability
y_pred_proba = model.predict_proba(X_test)[:, 1]
print("SRKW AUC-ROC:", roc_auc_score(y_test, y_pred_proba))

# F1 uses a fixed 0.5 probability cut-off
y_pred_label = y_pred_proba > 0.5
print("SRKW F1-Score:", f1_score(y_test, y_pred_label))

In [None]:
# Distribution of predicted probabilities on the test set, with a title and
# axis label so the figure can be read on its own.
px.histogram(
    x=y_pred_proba,
    title="Predicted SRKW sighting probability (test set)",
    labels={"x": "Predicted probability"},
)

In [None]:
# print(srkw_df['sighting'].value_counts(normalize=True))

In [None]:
# Create prediction DataFrame
# One row per (year, day-of-year, H3 cell), built in a single cross product
# instead of three copy-pasted per-year frames plus a row-wise apply/explode.
prediction_years = [2023, 2024, 2025]
prediction_doys = list(range(1, 366))
h3_cells = sightings["h3_index"].unique().tolist()

# Row order: year, then day, then cell. The resulting index is a clean RangeIndex.
future_days = pd.MultiIndex.from_product(
    [prediction_years, prediction_doys, h3_cells],
    names=["YEAR", "DOY", "h3_index"],
).to_frame(index=False)

# Same cyclical DOY encoding (365.25-day period) as the training features.
doy_angle = 2 * np.pi * future_days["DOY"] / 365.25
future_days["DOY_sin"] = np.sin(doy_angle)
future_days["DOY_cos"] = np.cos(doy_angle)

# Keep the column layout the downstream cells were written against.
future_days = future_days[["DOY", "YEAR", "DOY_sin", "DOY_cos", "h3_index"]]

In [None]:
# Score every row of the prediction grid.
# Work on a copy: assigning a column through an alias would also modify
# future_days, so re-running cells would see a different frame each time.
pred_df = future_days.copy()

# Predict probabilities (positive-class column), passing only the columns the
# pipeline was fitted on.
model_features = ["DOY_sin", "DOY_cos", "YEAR", "h3_index"]
pred_df["prob_SRKW"] = model.predict_proba(pred_df[model_features])[:, 1]

# Optional: aggregate by H3 cell (mean probability over all predicted days)
# pred_df = pred_df.groupby("h3_index")["prob_SRKW"].mean().reset_index()

In [None]:
# Join predictions with the held-out observations on the grid keys only.
# Without explicit keys the merge would also join on the float columns
# DOY_sin / DOY_cos, where exact equality is fragile; they are dropped from the
# observed side so the result keeps a single copy of each.
join_keys = ["h3_index", "YEAR", "DOY"]
observed_test = sightings[test_idx].drop(columns=["DOY_sin", "DOY_cos"], errors="ignore")
tmp = pd.merge(pred_df, observed_test, on=join_keys, how="outer")

In [28]:
# import plotly.express as px

# fig = px.scatter(tmp, x="sighting", y="prob_SRKW")
# fig.show()