In [None]:
import pandas as pd
import geopandas as gpd
import glob
import h3
from typing import Literal
from datetime import datetime, timedelta
import pandas as pd
import plotly.express as px


## Compute Sunday from Statistical Week
def compute_sunday(row):
    """Return the Sunday that closes the statistical week described by *row*.

    Weeks are counted from the first Monday on or after January 1 of
    ``row["catch_year"]``; week 1 is the seven days starting on that Monday.

    Args:
        row: Mapping-like record (for example a DataFrame row) exposing
            ``"catch_year"`` and ``"stat_week"``; both are converted with
            ``int``.

    Returns:
        datetime: Midnight on the day six days after the Monday that starts
        week ``stat_week``.
    """
    year = int(row["catch_year"])
    week_number = int(row["stat_week"])

    new_years_day = datetime(year, 1, 1)
    # weekday() is 0 for Monday, so the offset is 0 when Jan 1 is a Monday.
    days_to_monday = -new_years_day.weekday() % 7
    week_one_monday = new_years_day + timedelta(days=days_to_monday)

    week_offset = timedelta(weeks=week_number - 1)
    return week_one_monday + week_offset + timedelta(days=6)


## Catch Data Preprocessing
def preprocess_rmpc_catch_data(catch_data):
    """Clean a raw catch table and derive location and week columns.

    The work is done on a copy, so the caller's frame is not modified. Column
    names are stripped and lower-cased first; after that the frame must
    contain ``catch_location_code``, ``period_type``, ``period``,
    ``catch_year`` and ``number_caught``.

    Columns added:
        * ``state_code``, ``water_type_code``, ``sector_code``: characters
          0, 1 and 2 of ``catch_location_code``.
        * ``region_code``, ``statistical_area``: characters 3-4 and 5-6.
        * ``stat_week``: copy of ``period``.
        * ``stat_week_sunday``: ``compute_sunday`` applied to each row.

    Only rows whose ``period_type`` equals the string ``"6"`` are kept, so a
    numeric ``period_type`` column matches nothing.

    Args:
        catch_data (pd.DataFrame): Raw catch records.

    Returns:
        pd.DataFrame: Filtered copy with the derived columns and
        ``number_caught`` cast to float. An input with no matching rows
        yields an empty frame that still has every derived column.
    """
    cs = catch_data.copy()

    cs.columns = cs.columns.str.strip().str.lower()

    # Collapse each double space to a single space before slicing by position;
    # regex=False keeps the match literal on every pandas version.
    cs["catch_location_code"] = cs["catch_location_code"].str.replace(
        "  ", " ", regex=False
    )
    location_code = cs["catch_location_code"]

    # Fixed-position components of the location code
    cs["state_code"] = location_code.str[0]
    cs["water_type_code"] = location_code.str[1]
    cs["sector_code"] = location_code.str[2]
    cs["region_code"] = location_code.str[3:5]
    cs["statistical_area"] = location_code.str[5:7]

    # Keep period type "6" (labelled statistical-week periods in this file).
    # .copy() makes the result an independent frame so the assignments below
    # do not trigger SettingWithCopyWarning.
    cs = cs[cs["period_type"] == "6"].copy()
    cs["stat_week"] = cs["period"]

    # Build Date. apply(axis=1) on an empty frame returns a DataFrame, which
    # cannot be assigned to a single column, so handle that case explicitly.
    if cs.empty:
        cs["stat_week_sunday"] = pd.Series(index=cs.index, dtype="datetime64[ns]")
    else:
        cs["stat_week_sunday"] = cs.apply(compute_sunday, axis=1)

    cs["number_caught"] = cs["number_caught"].astype(float)

    return cs


## Recovery Data Preprocessing
def preprocess_rmpc_recovery_data(recovery_data):
    """Clean a raw recovery table and derive location and week columns.

    The work is done on a copy, so the caller's frame is not modified. Column
    names are stripped and lower-cased first; after that the frame must
    contain ``recovery_location_code``, ``period_type``, ``period`` and
    ``run_year``.

    Columns added or changed:
        * ``state_code``, ``water_type_code``, ``sector_code``: characters
          0, 1 and 2 of ``recovery_location_code``.
        * ``region_code``, ``statistical_area``: characters 3-4 and 5-6.
        * ``stat_week``: copy of ``period``.
        * ``run_year`` is renamed to ``catch_year`` (the key that
          ``compute_sunday`` reads).
        * ``stat_week_sunday``: ``compute_sunday`` applied to each row.
        * ``number_caught``: set to the constant ``1`` on every row.

    Only rows whose ``period_type`` equals the string ``"6"`` are kept, so a
    numeric ``period_type`` column matches nothing.

    Args:
        recovery_data (pd.DataFrame): Raw recovery records.

    Returns:
        pd.DataFrame: Filtered copy with the derived columns. An input with no
        matching rows yields an empty frame that still has every derived
        column.
    """
    rs = recovery_data.copy()

    rs.columns = rs.columns.str.strip().str.lower()

    # Collapse each double space to a single space before slicing by position;
    # regex=False keeps the match literal on every pandas version.
    rs["recovery_location_code"] = rs["recovery_location_code"].str.replace(
        "  ", " ", regex=False
    )
    location_code = rs["recovery_location_code"]

    # Fixed-position components of the location code
    rs["state_code"] = location_code.str[0]
    rs["water_type_code"] = location_code.str[1]
    rs["sector_code"] = location_code.str[2]
    rs["region_code"] = location_code.str[3:5]
    rs["statistical_area"] = location_code.str[5:7]

    # Keep period type "6" (labelled statistical-week periods in this file).
    # .copy() makes the result an independent frame so the assignments below
    # do not trigger SettingWithCopyWarning.
    rs = rs[rs["period_type"] == "6"].copy()
    rs["stat_week"] = rs["period"]

    # Build Date: compute_sunday looks up "catch_year", so expose run_year
    # under that name first.
    rs = rs.rename(columns={"run_year": "catch_year"})
    # apply(axis=1) on an empty frame returns a DataFrame, which cannot be
    # assigned to a single column, so handle that case explicitly.
    if rs.empty:
        rs["stat_week_sunday"] = pd.Series(index=rs.index, dtype="datetime64[ns]")
    else:
        rs["stat_week_sunday"] = rs.apply(compute_sunday, axis=1)

    # Every row is given a count of one.
    rs["number_caught"] = 1  # rs["number_caught"].astype(float)

    return rs


# Load and Process Sightings
def load_and_process_sighting_data(
    directory: str,
    date_col: str,
    lat_col: str,
    lon_col: str,
    id_col: str,
    h3_resolution: int,
    start_date: str = None,
    source: Literal["TMW", "ACARTIA"] = "TMW",
) -> pd.DataFrame:
    """
    Load sighting CSV data, tag each row with an H3 cell and count sightings.

    All column names are upper-cased after loading, so ``date_col``,
    ``lat_col``, ``lon_col`` and ``id_col`` must be given in upper case.

    Args:
        directory (str): Either a path containing ".csv" (read as one file)
            or a directory whose ``*.csv`` files are concatenated.
        date_col (str): Column holding a date string; only its first 10
            characters are parsed.
        lat_col (str): Latitude column name.
        lon_col (str): Longitude column name.
        id_col (str): Column whose non-null values are counted per group.
        h3_resolution (int): H3 resolution passed to ``h3.latlng_to_cell``.
        start_date (str, optional): If given, keep only rows on or after it.
        source (str): "TMW" or "ACARTIA". Accepted for call-site labelling;
            the function body does not currently read it.

    Returns:
        pd.DataFrame: One row per (DATE, LATITUDE, LONGITUDE, H3 cell) with a
        ``SIGHTING_COUNT`` column. Dates or cells without sightings are not
        added; no gap filling happens here.

    Raises:
        ValueError: If ``directory`` is a directory with no ``*.csv`` files.
    """
    print(directory)
    # Read & concat all CSVs
    if ".csv" in directory:
        data = pd.read_csv(directory)
    else:
        csv_paths = glob.glob(f"{directory}/*.csv")
        if not csv_paths:
            # pd.concat([]) would raise an opaque "No objects to concatenate".
            raise ValueError(f"No CSV files found in directory: {directory}")
        data = pd.concat(
            [pd.read_csv(path) for path in csv_paths], ignore_index=True
        )
    data.columns = data.columns.str.upper()

    # Parse date and geo; non-numeric coordinates become NaN and are dropped.
    data["DATE"] = data[date_col].str[:10]
    data["LATITUDE"] = pd.to_numeric(data[lat_col], errors="coerce")
    data["LONGITUDE"] = pd.to_numeric(data[lon_col], errors="coerce")
    # .copy() so the column assignments below act on an independent frame.
    data = data.dropna(subset=["LATITUDE", "LONGITUDE"]).copy()

    # Calculate H3 grid. Iterating the two coordinate columns avoids building
    # a Series per row and also works when no rows are left.
    h3_col = f"H3_GRID_{h3_resolution}"
    data[h3_col] = [
        h3.latlng_to_cell(lat, lon, h3_resolution)
        for lat, lon in zip(data["LATITUDE"], data["LONGITUDE"])
    ]

    data["DATE"] = pd.to_datetime(data["DATE"])
    if start_date:
        data = data[data["DATE"] >= pd.to_datetime(start_date)]

    # Aggregate sightings: count non-null id_col values per date/location/cell
    data_agg = data.groupby(
        ["DATE", "LATITUDE", "LONGITUDE", h3_col], as_index=False
    ).agg(SIGHTING_COUNT=(id_col, "count"))

    return data_agg

In [None]:
## Set H3 Resolution
h3_resolution = 3

## TMW Data Path (directory of CSV files; machine-specific absolute path)
tmw_directory = "/Users/tylerstevenson/Documents/CODE/orcasalmon/data/twm"

## Acartia Data Path (single CSV export; machine-specific absolute path)
acartia_directory = "/Users/tylerstevenson/Documents/CODE/FindMyWhale/data/raw/sightings/acartia-export.csv"

## Open TMW
tmw_data_cleaned = load_and_process_sighting_data(
    directory=tmw_directory,
    date_col="SIGHTDATE",
    lat_col="LATITUDE",
    lon_col="LONGITUDE",
    id_col="DATE",  # or other proxy for sightings count
    h3_resolution=h3_resolution,
    source="TMW",
)

## Open Acartia (restricted to records from 2022-01-01 onward)
acartia_data_cleaned = load_and_process_sighting_data(
    directory=acartia_directory,
    date_col="CREATED",
    lat_col="LATITUDE",
    lon_col="LONGITUDE",
    id_col="ENTRY_ID",
    h3_resolution=h3_resolution,
    start_date="2022-01-01",
    source="ACARTIA",
)

# Combine Sightings Data
# ignore_index=True gives the stacked frame a fresh 0..n-1 index; without it
# the row labels of the two inputs repeat.
sightings_data_raw = pd.concat(
    [acartia_data_cleaned, tmw_data_cleaned], ignore_index=True
)

/Users/tylerstevenson/Documents/CODE/orcasalmon/data/twm
/Users/tylerstevenson/Documents/CODE/FindMyWhale/data/raw/sightings/acartia-export.csv


In [None]:
# Rough bounding box around the Columbia River mouth + offshore zone
lat_min, lat_max = 45.9, 46.4
lon_min, lon_max = -124.4, -123.7  # includes some ocean west of the entrance

# Keep sightings inside the box (bounds inclusive). The trailing .copy() makes
# df_filtered an independent frame, so adding MONTH below does not assign into
# a slice of sightings_data_raw (pandas SettingWithCopyWarning).
df_filtered = sightings_data_raw[
    (sightings_data_raw["LATITUDE"] >= lat_min)
    & (sightings_data_raw["LATITUDE"] <= lat_max)
    & (sightings_data_raw["LONGITUDE"] >= lon_min)
    & (sightings_data_raw["LONGITUDE"] <= lon_max)
].copy()

# Calendar month of each sighting, stored as a string
df_filtered["MONTH"] = df_filtered["DATE"].dt.month.astype(str)

In [None]:
# Upgrade pandas from inside the notebook. This line is a notebook command,
# not Python; it presumably relies on IPython automagic resolving `pip` to
# `%pip` (TODO confirm). A kernel restart is typically needed afterwards.
pip install --upgrade pandas

In [None]:
# Work on an explicit copy so the column assignments below do not write into
# a filtered slice of sightings_data_raw (pandas SettingWithCopyWarning).
df_filtered = df_filtered.copy()

# Make sure DATE is in datetime format
df_filtered["DATE"] = pd.to_datetime(df_filtered["DATE"])
df_filtered["MONTH"] = df_filtered["DATE"].dt.month

# This works even if pandas is old.
# weekday is 0 for Monday ... 6 for Sunday, so subtracting weekday + 1 days
# maps every date to the Sunday strictly before it (a Sunday maps to the
# Sunday one week earlier).
# NOTE(review): compute_sunday labels a Monday-Sunday week by the Sunday that
# ends it; confirm which labelling is wanted here.
df_filtered["stat_week_sunday"] = df_filtered["DATE"] - pd.to_timedelta(
    df_filtered["DATE"].dt.weekday + 1, unit="D"
)

# Count rows per week; MONTH only serves as a non-null column to count.
# NOTE(review): this counts rows, it does not sum SIGHTING_COUNT. Confirm that
# is intended.
df_filtered = df_filtered.groupby("stat_week_sunday", as_index=False)["MONTH"].count()
df_filtered.columns = ["DATE", "COUNT"]

# Month of the week-label date
df_filtered["MONTH"] = df_filtered["DATE"].dt.month

In [None]:
# Display the weekly counts totalled per calendar month
df_filtered.groupby("MONTH")["COUNT"].agg("sum")

In [None]:
# # Create a scatter mapbox
# fig = px.scatter_mapbox(
#     df_filtered,
#     lat="LATITUDE",
#     lon="LONGITUDE",
#     color="MONTH",
#     # size=[1]*len(df_filtered),
#     hover_name="MONTH",
#     zoom=8,
#     center={"lat": 46.2, "lon": -124.0},
#     color_discrete_sequence=px.colors.qualitative.Set3,
#     mapbox_style="carto-positron",  # or "open-street-map"

# )

# fig.update_layout(
#     margin={"r":0,"t":0,"l":0,"b":0},
#     title="Orca Sightings near Columbia River Mouth"
# )

# fig.show()

In [None]:
# Monthly totals as a two-column frame (MONTH, COUNT) for plotting
tmp = df_filtered.groupby("MONTH")["COUNT"].sum().reset_index()

In [None]:
# Bar chart of the monthly totals held in tmp
fig = px.bar(
    data_frame=tmp,
    x="MONTH",
    y="COUNT",
    title="Sighting Counts by Month",
)
fig.show()

In [None]:
# Keep weeks from 2020 onward. Copying after the filter makes df an
# independent frame, so the assignments below do not write into a slice.
df = df_filtered[df_filtered["DATE"] >= "2020-01-01"].copy()
# Make sure your date column is datetime
df["DATE"] = pd.to_datetime(df["DATE"])

# Set weekly range based on Sunday (statistical week)
min_week = df["DATE"].min()
max_week = df["DATE"].max()

# Create full weekly date range (Sundays)
full_weeks = pd.date_range(start=min_week, end=max_week, freq="W-SUN")

# Make a DataFrame from the full weekly index
weeks_df = pd.DataFrame({"WEEK_START": full_weeks})

# DATE already holds the week label, so reuse it as the merge key
df["WEEK_START"] = df["DATE"]

# Aggregate (if needed) or deduplicate (if multiple rows per week)
weekly_agg = df.groupby("WEEK_START").sum(numeric_only=True).reset_index()

# Left join with full set of weeks
filled_df = weeks_df.merge(weekly_agg, on="WEEK_START", how="left")

# Weeks without sightings get a count of zero. Only COUNT is zero-filled: a
# blanket fillna(0) would also set MONTH to 0 for those weeks, so MONTH is
# derived from WEEK_START instead.
filled_df["COUNT"] = filled_df["COUNT"].fillna(0).astype(int)
filled_df["MONTH"] = filled_df["WEEK_START"].dt.month

In [None]:
# Line chart of the gap-filled weekly counts
fig = px.line(
    data_frame=filled_df,
    x="WEEK_START",
    y="COUNT",
)
fig.show()

In [None]:
import pandas as pd

# Load the fish-count workbook (machine-specific absolute path); this rebinds
# the name df used by the earlier weekly cell.
df = pd.read_excel(
    io="/Users/tylerstevenson/Documents/CODE/SalmonSignal/data/raw/Fish count 09222024.xlsx",
)

In [None]:
# Display the frame currently bound to df (the notebook renders the value of
# the cell's last expression).
df

In [None]:
# Install openpyxl from inside the notebook. This line is a notebook command,
# not Python; it presumably relies on IPython automagic resolving `pip` to
# `%pip` (TODO confirm).
# NOTE(review): this cell sits after the read_excel cell; if openpyxl is
# required to read the .xlsx file, it has to run first.
pip install openpyxl