In [None]:
# ----------------------------------------------------------------- #
#                              MODULES                              #

# Standard Modules
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

# Third-Party Modules
import os
import re
import requests
from io import BytesIO
import plotly.express as px
import geopandas as gpd
from shapely.geometry import Point
from typing import Literal
import h3
import glob
import pandas as pd
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.seasonal import STL
from scipy.stats import pearsonr
from sklearn.feature_selection import mutual_info_regression
import dcor
import plotly.graph_objects as go

#                                                                   #
# ----------------------------------------------------------------- #

# ----------------------------------------------------------------- #
#                             FUNCTIONS                             #

##############
# COLLECTION


# RMPC - Web Page Download
def download_rmpc_data(
    base_url="https://www.rmpc.org/pub/data",
    output_dir="../data/raw/RMPC/LOOKUPS",
    filename="LC042_ALL_FULLSET.csv",
    overwrite=False,
):
    """
    Fetch a single RMPC lookup CSV and store it locally, reusing a cached copy.

    Parameters:
        base_url (str): Base URL of the RMPC file host.
        output_dir (str): Local directory that receives the file (created if missing).
        filename (str): Name of the file to fetch; also used as the local file name.
        overwrite (bool): When True, download again even if a local copy exists.

    Returns:
        str or None: Path of the local file, or None when the download failed.
    """
    os.makedirs(output_dir, exist_ok=True)
    target = os.path.join(output_dir, filename)

    # A local copy wins unless the caller explicitly asks for a refresh.
    already_cached = os.path.exists(target)
    if already_cached and not overwrite:
        print(f"⚠️  File already exists, skipping download: {target}")
        return target

    source = f"{base_url}/{filename}"
    saved = None
    try:
        print(f"⬇️  Downloading {filename} from RMPC...")
        reply = requests.get(source, timeout=30)
        reply.raise_for_status()

        with open(target, "wb") as sink:
            sink.write(reply.content)

        print(f"✅ File downloaded successfully: {target}")
        saved = target
    except requests.HTTPError as err:
        print(f"❌ HTTP error: {err}")
    except requests.RequestException as err:
        print(f"❌ Request failed: {err}")
    except Exception as err:
        # Anything else (e.g. a failed write) is reported, not raised.
        print(f"❌ Something went wrong: {err}")

    return saved


# Download WDFW Parquet
def update_wdfw_parquet(
    output_dir="../data/raw/RMPC/WDFW",
    base_url="https://www.rmpc.org/pub/data/",
    pattern="CS042_WDFW_.*\\.csv",
):
    """
    Incrementally mirror WDFW CSV files from RMPC into one combined Parquet file.

    The directory listing at ``base_url`` is scraped for links matching
    ``pattern``. Files whose name is not yet recorded in the Parquet's
    ``source_filename`` column are downloaded, read and appended.

    Parameters:
        output_dir (str): Root directory. Raw CSVs are written to
            ``<output_dir>/raw/RMPC/WDFW`` and the Parquet file to
            ``<output_dir>/processed``.
        base_url (str): URL of the HTML directory listing to scrape.
        pattern (str): Regular expression matched (with ``re.match``) against
            each link. Its first two characters also prefix the Parquet name.

    Returns:
        str: Path of the combined Parquet file (whether or not it was updated).

    Raises:
        ValueError: If an existing Parquet file lacks ``source_filename``.
        requests.RequestException: If the directory listing cannot be fetched.
    """
    raw_dir = os.path.join(output_dir, "raw", "RMPC", "WDFW")
    parquet_path = os.path.join(
        output_dir, "processed", f"{pattern[0:2]}042_WDFW_FULL.parquet"
    )
    os.makedirs(raw_dir, exist_ok=True)
    os.makedirs(os.path.dirname(parquet_path), exist_ok=True)

    # Load existing parquet to see what's already included
    existing_df = None
    existing_files = set()
    if os.path.exists(parquet_path):
        print(f"📦 Loading existing parquet file: {parquet_path}")
        existing_df = pd.read_parquet(parquet_path)
        if "source_filename" in existing_df.columns:
            existing_files = set(existing_df["source_filename"].unique())
        else:
            raise ValueError("Missing 'source_filename' column in existing parquet!")

    # Scrape available files from website. A timeout keeps a stalled server
    # from hanging the whole update, matching the per-file requests below.
    response = requests.get(base_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    links = [a["href"] for a in soup.find_all("a", href=True)]
    csv_files = [f for f in links if re.match(pattern, f)]
    new_files = [f for f in csv_files if f not in existing_files]

    print(f"🔎 Found {len(csv_files)} total files, {len(new_files)} new to download.")

    # Download and read new CSVs. Failures are reported per file and skipped
    # so one bad file does not block the rest; the file is retried next run
    # because it never reaches the Parquet.
    new_data = []
    for fname in new_files:
        file_url = f"{base_url.rstrip('/')}/{fname}"
        local_path = os.path.join(raw_dir, fname)
        try:
            r = requests.get(file_url, timeout=30)
            r.raise_for_status()
            with open(local_path, "wb") as f:
                f.write(r.content)

            df = pd.read_csv(local_path, low_memory=False)
            df["source_filename"] = fname
            new_data.append(df)

        except Exception as e:
            print(f"❌ Error downloading {fname}: {e}")

    if not new_data:
        print("📭 No new files to process. Parquet is up to date.")
        return parquet_path

    combined_new = pd.concat(new_data, ignore_index=True)

    # Use the frame loaded above rather than re-checking the filesystem, so
    # the append decision cannot disagree with what was actually read.
    if existing_df is not None:
        full_df = pd.concat([existing_df, combined_new], ignore_index=True)
    else:
        print("📁 Creating new Parquet...")
        full_df = combined_new

    # Everything is stored as text so differing column dtypes across files
    # cannot break the write.
    full_df = full_df.astype(str)
    full_df.to_parquet(parquet_path, engine="fastparquet", index=False)
    print(f"✅ Saved combined data to: {parquet_path}")

    return parquet_path


##############
# PROCESSING


## Compute Sunday from Statistical Week
def compute_sunday(row):
    """
    Return the Sunday that closes the statistical week described by ``row``.

    Week 1 is taken to begin on the first Monday on or after January 1 of
    ``row["catch_year"]``; the result is six days after the Monday that
    starts week ``row["stat_week"]``.
    """
    year = int(row["catch_year"])
    new_year = datetime(year, 1, 1)

    # Days from January 1 forward to the first Monday (0 if it is a Monday).
    days_to_monday = -new_year.weekday() % 7

    week_number = int(row["stat_week"])
    week_start = new_year + timedelta(days=days_to_monday + 7 * (week_number - 1))
    return week_start + timedelta(days=6)


## Catch Data Preprocessing
def preprocess_rmpc_catch_data(catch_data):
    """
    Clean raw RMPC catch rows and derive location and date columns.

    The input is copied, its column names are stripped and lower-cased, and
    ``catch_location_code`` is split positionally into ``state_code`` (char 0),
    ``water_type_code`` (char 1), ``sector_code`` (char 2), ``region_code``
    (chars 3-4) and ``statistical_area`` (chars 5-6). Only rows whose
    ``period_type`` equals the string ``"6"`` (the statistical-week period)
    are kept.

    Parameters:
        catch_data (pd.DataFrame): Raw catch table. After header
            normalisation it must contain ``catch_location_code``,
            ``period_type``, ``period``, ``catch_year`` and ``number_caught``.

    Returns:
        pd.DataFrame: Filtered copy with the derived columns, ``stat_week``,
        ``stat_week_sunday`` and ``number_caught`` cast to float.
    """
    cs = catch_data.copy()

    cs.columns = cs.columns.str.strip().str.lower()

    # Collapse double spaces so the positional slices below line up.
    cs["catch_location_code"] = cs["catch_location_code"].str.replace("  ", " ")
    location = cs["catch_location_code"]

    # Positional components of the location code
    cs["state_code"] = location.str[0]
    cs["water_type_code"] = location.str[1]
    cs["sector_code"] = location.str[2]
    cs["region_code"] = location.str[3:5]
    cs["statistical_area"] = location.str[5:7]

    # Filter to Statistical Week Period. The explicit copy makes the result an
    # independent frame, so the assignments below never target a slice view.
    cs = cs[cs.period_type == "6"].copy()
    cs["stat_week"] = cs["period"]

    # Build Date
    cs["stat_week_sunday"] = cs.apply(compute_sunday, axis=1)

    # Add Number Caught
    cs["number_caught"] = cs["number_caught"].astype(float)

    return cs


## Recovery Data Preprocessing
def preprocess_rmpc_recovery_data(recovery_data):
    """
    Clean raw RMPC recovery rows and derive location and date columns.

    Mirrors the catch preprocessing, keyed on ``recovery_location_code``: the
    code is split positionally into ``state_code`` (char 0),
    ``water_type_code`` (char 1), ``sector_code`` (char 2), ``region_code``
    (chars 3-4) and ``statistical_area`` (chars 5-6). Only rows whose
    ``period_type`` equals the string ``"6"`` (the statistical-week period)
    are kept.

    Parameters:
        recovery_data (pd.DataFrame): Raw recovery table. After header
            normalisation it must contain ``recovery_location_code``,
            ``period_type``, ``period`` and ``run_year``.

    Returns:
        pd.DataFrame: Filtered copy with the derived columns, ``run_year``
        renamed to ``catch_year``, ``stat_week``, ``stat_week_sunday`` and a
        constant ``number_caught`` of 1 per row.
    """
    rs = recovery_data.copy()

    rs.columns = rs.columns.str.strip().str.lower()

    # Collapse double spaces so the positional slices below line up.
    rs["recovery_location_code"] = rs["recovery_location_code"].str.replace("  ", " ")
    location = rs["recovery_location_code"]

    # Positional components of the location code
    rs["state_code"] = location.str[0]
    rs["water_type_code"] = location.str[1]
    rs["sector_code"] = location.str[2]
    rs["region_code"] = location.str[3:5]
    rs["statistical_area"] = location.str[5:7]

    # Filter to Statistical Week Period. The explicit copy makes the result an
    # independent frame, so the assignments below never target a slice view.
    rs = rs[rs.period_type == "6"].copy()
    rs["stat_week"] = rs["period"]

    # Build Date: compute_sunday reads "catch_year", so expose run_year under
    # that name first.
    rs = rs.rename(columns={"run_year": "catch_year"})
    rs["stat_week_sunday"] = rs.apply(compute_sunday, axis=1)

    # Each recovery row counts as a single fish.
    rs["number_caught"] = 1

    return rs


## Extract General Code
def extract_general_code(code):
    """
    Return the leading numeric part of a code, e.g. ``"12-3A"`` -> ``"12-3"``.

    The value is stringified, stripped and upper-cased, then the prefix made
    of digits (optionally followed by ``-`` and more digits) is returned.
    Missing values and codes that do not start with a digit yield None.
    """
    if pd.isna(code):
        return None

    cleaned = str(code).strip().upper()
    # Leading "<digits>" or "<digits>-<digits>"; anything after is ignored.
    prefix = re.match(r"^\d+(?:-\d+)?", cleaned)
    if prefix is None:
        return None
    return prefix.group(0)


# Normalize Code
def normalize_code(code):
    """Return ``code`` as an upper-case string with all whitespace removed."""
    upper = str(code).upper()
    pieces = upper.split()  # splitting on whitespace drops every run of it
    return "".join(pieces)


# Extract Statistical Area
def extract_stat_area(code):
    """
    Extract the statistical-area suffix from a location code.

    The code is stringified, upper-cased and stripped of spaces. The result is
    the trailing one or two digits (zero-padded to two) plus an optional
    single trailing letter, e.g. ``"3M10512A"`` -> ``"12A"``.

    Returns None for missing values, for codes starting with ``"3F"`` (skipped
    for now), and for codes that do not end in that digit/letter shape.

    NOTE: a later definition with the same name in this module replaces this
    one once the module is fully executed.
    """
    if pd.isna(code):
        return None
    code = str(code).upper().replace(" ", "")

    if code.startswith("3F"):
        return None  # skip for now

    # One anchored search covers both the "3M + 5 digits" codes and the short
    # fallback form: when two trailing digits exist they are both captured,
    # and zfill only pads the single-digit case.
    match = re.search(r"(\d{1,2})([A-Z]?)$", code)
    if match:
        return match.group(1).zfill(2) + match.group(2)

    return None


# Normalize Statistical Area
def normalize_stat_area(code):
    """
    Canonicalise a statistical-area label by dropping leading zeros.

    Spaces are removed and the text upper-cased; the result is the leading
    number without zero padding plus at most one following letter, e.g.
    ``"07"`` -> ``"7"`` and ``"0A"`` -> ``"0A"``. Empty or missing input, or
    text that does not start with a digit, yields None.
    """
    if not code or pd.isna(code):
        return None

    compact = code.upper().replace(" ", "")
    hit = re.match(r"0*(\d+)([A-Z]?)", compact)
    if hit is None:
        return None

    digits, suffix = hit.groups()
    return digits + suffix


# Extract stat_area_name from catch_location_code
def extract_stat_area(code):
    """
    Pull the statistical-area suffix off the end of a location code.

    The code is stringified, upper-cased and stripped of spaces. Codes that
    start with ``"3F"`` are skipped for now, and missing values yield None.
    Otherwise the trailing two digits plus an optional letter are returned;
    failing that, a trailing single digit is zero-padded to two.
    """
    if pd.isna(code):
        return None

    cleaned = str(code).upper().replace(" ", "")
    if cleaned.startswith("3F"):
        return None  # skip 3F for now

    # Patterns are tried in order; only the fallback needs zero-padding.
    attempts = (
        (r"(\d{2})([A-Z]?)$", False),
        (r"(\d{1,2})([A-Z]?)$", True),
    )
    for pattern, pad in attempts:
        found = re.search(pattern, cleaned)
        if found is None:
            continue
        digits, suffix = found.groups()
        return (digits.zfill(2) if pad else digits) + suffix

    return None


# Main Processing Pipeline - Catch
def main_preprocessing_catch(
    cs, lookup_state_code, lookup_water_type, lookup_sector_type, psc_stat_area_lookup
):
    """
    Build a per-location lookup table for the catch data.

    Distinct location rows are taken from ``cs``, upper-cased and stripped,
    decorated with state / water type / sector names, and joined to the
    statistical-area lookup on a normalised ``stat_area_name``.

    Parameters:
        cs (pd.DataFrame): Preprocessed catch data containing
            ``catch_location_code``, ``state_code``, ``water_type_code``,
            ``sector_code``, ``region_code`` and ``statistical_area``.
        lookup_state_code (pd.DataFrame): Columns ``state_code`` and ``state``.
        lookup_water_type (pd.DataFrame): Columns ``water_type_code`` and
            ``water_type``.
        lookup_sector_type (pd.DataFrame): Columns ``sector_code`` and
            ``sector``.
        psc_stat_area_lookup (dict): Maps a statistical-area name to a dict of
            attributes; a ``marine_area`` entry is required.

    Returns:
        pd.DataFrame: One row per distinct location with the lookup columns,
        ``stat_area_name``, ``norm_code`` and ``general_marine_area`` added.
    """
    tmp = cs[
        [
            "catch_location_code",
            "state_code",
            "water_type_code",
            "sector_code",
            "region_code",
            "statistical_area",
        ]
    ].drop_duplicates()

    # Normalize and uppercase
    tmp["cache_short_code"] = tmp["catch_location_code"].str[0:7]
    for col in tmp.columns:
        tmp[col] = tmp[col].astype(str).str.upper().str.strip()

    # Merge in lookup tables; codes without a match are labelled "Other".
    tmp = pd.merge(tmp, lookup_state_code, how="left", on="state_code")
    tmp["state"] = tmp["state"].fillna("Other")

    tmp = pd.merge(tmp, lookup_water_type, how="left", on="water_type_code")
    tmp["water_type"] = tmp["water_type"].fillna("Other")

    tmp = pd.merge(tmp, lookup_sector_type, how="left", on="sector_code")
    tmp["sector"] = tmp["sector"].fillna("Other")

    # Extract the stat area name once; it is normalised further down.
    tmp["stat_area_name"] = tmp["catch_location_code"].apply(extract_stat_area)

    # Final normalization for short code
    tmp["norm_code"] = tmp["catch_location_code"].apply(normalize_code)

    # Prepare lookup dataframe: dict keys become the stat_area_name column.
    lookup_df = pd.DataFrame.from_dict(
        psc_stat_area_lookup, orient="index"
    ).reset_index()
    lookup_df = lookup_df.rename(columns={"index": "stat_area_name"})

    # Normalize both sides the same way so the join keys agree.
    lookup_df["stat_area_name"] = lookup_df["stat_area_name"].apply(normalize_stat_area)
    tmp["stat_area_name"] = tmp["stat_area_name"].apply(normalize_stat_area)

    # Merge marine area info
    tmp = tmp.merge(lookup_df, how="left", on="stat_area_name")

    # General Marine Area: the text before the first " - " separator.
    tmp["general_marine_area"] = tmp["marine_area"].astype(str).str.split(" - ").str[0]

    # General Area 8 Adjustment
    tmp["stat_area_name"] = tmp["stat_area_name"].str.replace("8A", "8-1")
    tmp["stat_area_name"] = tmp["stat_area_name"].str.replace("8D", "8-2")

    return tmp


# Main Processing Pipeline - Recovery
def main_preprocessing_recover(
    rc, lookup_state_code, lookup_water_type, lookup_sector_type, psc_stat_area_lookup
):
    """
    Build a per-location lookup table for the recovery data.

    Distinct location rows are taken from ``rc``, upper-cased and stripped,
    decorated with state / water type / sector names, and joined to the
    statistical-area lookup on a normalised ``stat_area_name``.

    Parameters:
        rc (pd.DataFrame): Preprocessed recovery data containing
            ``recovery_location_code``, ``state_code``, ``water_type_code``,
            ``sector_code``, ``region_code`` and ``statistical_area``.
        lookup_state_code (pd.DataFrame): Columns ``state_code`` and ``state``.
        lookup_water_type (pd.DataFrame): Columns ``water_type_code`` and
            ``water_type``.
        lookup_sector_type (pd.DataFrame): Columns ``sector_code`` and
            ``sector``.
        psc_stat_area_lookup (dict): Maps a statistical-area name to a dict of
            attributes; a ``marine_area`` entry is required.

    Returns:
        pd.DataFrame: One row per distinct location with the lookup columns,
        ``stat_area_name``, ``norm_code`` and ``general_marine_area`` added.
    """
    tmp = rc[
        [
            "recovery_location_code",
            "state_code",
            "water_type_code",
            "sector_code",
            "region_code",
            "statistical_area",
        ]
    ].drop_duplicates()

    # Normalize and uppercase
    tmp["recovery_short_code"] = tmp["recovery_location_code"].str[0:7]
    for col in tmp.columns:
        tmp[col] = tmp[col].astype(str).str.upper().str.strip()

    # Merge in lookup tables; codes without a match are labelled "Other".
    tmp = pd.merge(tmp, lookup_state_code, how="left", on="state_code")
    tmp["state"] = tmp["state"].fillna("Other")

    tmp = pd.merge(tmp, lookup_water_type, how="left", on="water_type_code")
    tmp["water_type"] = tmp["water_type"].fillna("Other")

    tmp = pd.merge(tmp, lookup_sector_type, how="left", on="sector_code")
    tmp["sector"] = tmp["sector"].fillna("Other")

    # Extract the stat area name once; it is normalised further down.
    tmp["stat_area_name"] = tmp["recovery_location_code"].apply(extract_stat_area)

    # Final normalization for short code
    tmp["norm_code"] = tmp["recovery_location_code"].apply(normalize_code)

    # Prepare lookup dataframe: dict keys become the stat_area_name column.
    lookup_df = pd.DataFrame.from_dict(
        psc_stat_area_lookup, orient="index"
    ).reset_index()
    lookup_df = lookup_df.rename(columns={"index": "stat_area_name"})

    # Normalize both sides the same way so the join keys agree.
    lookup_df["stat_area_name"] = lookup_df["stat_area_name"].apply(normalize_stat_area)
    tmp["stat_area_name"] = tmp["stat_area_name"].apply(normalize_stat_area)

    # Merge marine area info
    tmp = tmp.merge(lookup_df, how="left", on="stat_area_name")

    # General Marine Area: the text before the first " - " separator.
    tmp["general_marine_area"] = tmp["marine_area"].astype(str).str.split(" - ").str[0]

    # General Area 8 Adjustment
    tmp["stat_area_name"] = tmp["stat_area_name"].str.replace("8A", "8-1")
    tmp["stat_area_name"] = tmp["stat_area_name"].str.replace("8D", "8-2")

    return tmp


# Get Filled Time Series
def get_time_series_filled_out(cs_mapped_sums, crs=None):
    """
    Expand weekly catch sums to a dense area x week x species grid.

    Every combination of marine area, Sunday-anchored week between the first
    and last observed week, and species gets a row; combinations with no
    observation receive ``number_caught`` = 0. ``species_name`` and
    ``geometry`` are re-attached from the input.

    Parameters:
        cs_mapped_sums (pd.DataFrame): Must contain ``MARINE_AREA_LARGE``,
            ``stat_week_sunday``, ``species``, ``number_caught``,
            ``species_name`` and ``geometry``.
        crs: Coordinate reference system for the result. When None, the CRS
            of the module-level ``wdfw_marine_areas`` is used as before; if
            that global is not defined, the input's own ``crs`` attribute
            (if any) is used instead.

    Returns:
        geopandas.GeoDataFrame: The filled grid with a ``geometry`` column.
    """
    # Step 1: Get complete list of weeks from min to max
    full_weeks = pd.date_range(
        start=cs_mapped_sums["stat_week_sunday"].min(),
        end=cs_mapped_sums["stat_week_sunday"].max(),
        freq="W-SUN",
    )

    # Step 2: Get all marine areas and species
    areas = cs_mapped_sums["MARINE_AREA_LARGE"].unique()
    species = cs_mapped_sums["species"].unique()

    # Step 3: Cartesian product of all combinations
    keys = ["MARINE_AREA_LARGE", "stat_week_sunday", "species"]
    full_index = pd.MultiIndex.from_product([areas, full_weeks, species], names=keys)
    full_df = full_index.to_frame(index=False)

    # Step 4: Merge with actual data
    merged = pd.merge(
        full_df,
        cs_mapped_sums[keys + ["number_caught"]],
        on=keys,
        how="left",
    )

    # Step 5: Fill missing catches with 0
    merged["number_caught"] = merged["number_caught"].fillna(0)

    # Step 6: Reattach optional fields like species_name + geometry
    merged = pd.merge(
        merged,
        cs_mapped_sums[["species", "species_name"]].drop_duplicates(),
        on="species",
        how="left",
    )

    merged = pd.merge(
        merged,
        cs_mapped_sums[["MARINE_AREA_LARGE", "geometry"]].drop_duplicates(),
        on="MARINE_AREA_LARGE",
        how="left",
    )

    # Step 7: Resolve the CRS without requiring the hidden global to exist.
    if crs is None:
        try:
            crs = wdfw_marine_areas.crs
        except NameError:
            crs = getattr(cs_mapped_sums, "crs", None)

    gdf_filled = gpd.GeoDataFrame(merged, geometry="geometry", crs=crs)

    return gdf_filled


# Load and Process Sightings
def load_and_process_sighting_data(
    directory: str,
    date_col: str,
    lat_col: str,
    lon_col: str,
    id_col: str,
    h3_resolution: int,
    start_date: str = None,
    source: Literal["TMW", "ACARTIA"] = "TMW",
) -> pd.DataFrame:
    """
    Load sighting CSVs, tag each record with an H3 cell and count sightings.

    Args:
        directory (str): Either a path containing ".csv" (read as one file)
            or a directory whose ``*.csv`` files are concatenated.
        date_col (str): Column holding the datetime string; its first 10
            characters are parsed as the date.
        lat_col (str): Latitude column name.
        lon_col (str): Longitude column name.
        id_col (str): Column whose non-null values are counted per group.
        h3_resolution (int): H3 resolution used for the cell column.
        start_date (str, optional): Keep only records on or after this date.
        source (str): "TMW" or "ACARTIA". Currently not used by the function.

    Column names are matched case-insensitively: the file headers and the
    four column arguments are all upper-cased before use.

    Returns:
        pd.DataFrame: One row per (DATE, LATITUDE, LONGITUDE, H3 cell) with a
        ``SIGHTING_COUNT`` column. Dates without sightings are not added.

    Raises:
        ValueError: If ``directory`` is a folder containing no CSV files.
    """
    print(directory)
    # Read & concat all CSVs
    if ".csv" in directory:
        data = pd.read_csv(directory)
    else:
        paths = glob.glob(f"{directory}/*.csv")
        if not paths:
            # pd.concat would also raise ValueError here, with a less
            # helpful message.
            raise ValueError(f"No CSV files found in {directory}")
        data = pd.concat([pd.read_csv(path) for path in paths])
    data.columns = data.columns.str.upper()

    # Headers were just upper-cased, so the requested names must be as well;
    # otherwise any lower- or mixed-case argument fails with a KeyError.
    date_col, lat_col, lon_col, id_col = (
        name.upper() for name in (date_col, lat_col, lon_col, id_col)
    )

    # Parse date and geo; rows without usable coordinates are dropped.
    data["DATE"] = data[date_col].str[:10]
    data["LATITUDE"] = pd.to_numeric(data[lat_col], errors="coerce")
    data["LONGITUDE"] = pd.to_numeric(data[lon_col], errors="coerce")
    data = data.dropna(subset=["LATITUDE", "LONGITUDE"])

    # Calculate H3 grid
    h3_col = f"H3_GRID_{h3_resolution}"
    data[h3_col] = data.apply(
        lambda x: h3.latlng_to_cell(x["LATITUDE"], x["LONGITUDE"], h3_resolution),
        axis=1,
    )

    data["DATE"] = pd.to_datetime(data["DATE"])
    if start_date:
        data = data[data["DATE"] >= pd.to_datetime(start_date)]

    # Aggregate sightings
    data_agg = data.groupby(
        ["DATE", "LATITUDE", "LONGITUDE", h3_col], as_index=False
    ).agg(SIGHTING_COUNT=(id_col, "count"))

    return data_agg


## ANALYSIS
def preprocess_and_plot(
    df,
    orca_col="SIGHTINGS_RATIO",
    salmon_col="STANDARIZED_CAUGHT",
    window_size=4,
    max_lag=16,
):
    """
    Smooth, standardise and detrend two weekly series, then plot lagged
    dependence metrics between them.

    Steps: centred rolling mean, z-score scaling, removal of the STL trend
    (period 52), dropping rows with |z| > 3 in either detrended series, then
    Pearson r, mutual information and distance correlation for every lag in
    ``[-max_lag, max_lag]``. The orca series is the one that is shifted.

    Parameters:
        df (pd.DataFrame): Contains ``orca_col`` and ``salmon_col``. It is not
            modified; all work happens on an internal copy.
        orca_col (str): Name of the orca sightings column.
        salmon_col (str): Name of the salmon catch column.
        window_size (int): Rolling-mean window length, in rows.
        max_lag (int): Largest absolute lag (in rows) to evaluate.

    Returns:
        pd.DataFrame: Columns ``lag_weeks``, ``pearson_r``, ``mutual_info``
        and ``distance_corr``; metrics are NaN where too few overlapping
        samples remain.

    Side effects:
        Displays a Plotly figure of the three metrics against lag.
    """
    # Work on a copy so the smoothing columns never leak into the caller's frame.
    df = df.copy()

    orca_smooth, salmon_smooth = f"{orca_col}_SMOOTH", f"{salmon_col}_SMOOTH"
    orca_scaled, salmon_scaled = f"{orca_col}_SCALED", f"{salmon_col}_SCALED"
    orca_detrend, salmon_detrend = f"{orca_col}_DETREND", f"{salmon_col}_DETREND"

    # 1. Smooth with rolling mean (centered)
    df[orca_smooth] = df[orca_col].rolling(window=window_size, center=True).mean()
    df[salmon_smooth] = df[salmon_col].rolling(window=window_size, center=True).mean()

    # Drop NaNs introduced by smoothing
    df = df.dropna(subset=[orca_smooth, salmon_smooth]).copy()

    # 2. Standardize both
    scaler = StandardScaler()
    df[[orca_scaled, salmon_scaled]] = scaler.fit_transform(
        df[[orca_smooth, salmon_smooth]]
    )

    # 3. STL Detrending (seasonal period 52 weeks)
    stl_orca = STL(df[orca_scaled], period=52).fit()
    stl_salmon = STL(df[salmon_scaled], period=52).fit()
    df[orca_detrend] = df[orca_scaled] - stl_orca.trend
    df[salmon_detrend] = df[salmon_scaled] - stl_salmon.trend

    # 4. Remove outliers using z-score cutoff (abs(z) > 3). The second pass
    #    computes its z-scores on the rows that survived the first.
    for col in [orca_detrend, salmon_detrend]:
        z_scores = (df[col] - df[col].mean()) / df[col].std()
        df = df[z_scores.abs() <= 3]

    # NOTE(review): threshold chosen so pearsonr (needs at least 2 points) and
    # the k-nearest-neighbour mutual-information estimator (default of 3
    # neighbours) have enough samples - confirm against the installed
    # scikit-learn version.
    min_samples = 4

    # 5. Define lagged correlation calculation function
    def lagged_correlation(frame, lag_limit, orca_name, salmon_name):
        results = []
        for lag in range(-lag_limit, lag_limit + 1):
            shifted_orca = frame[orca_name].shift(lag)
            valid = pd.concat([frame[salmon_name], shifted_orca], axis=1).dropna()
            if len(valid) >= min_samples:
                pearson_r, _ = pearsonr(valid[salmon_name], valid[orca_name])
                mi = mutual_info_regression(
                    valid[salmon_name].values.reshape(-1, 1),
                    valid[orca_name].values,
                    random_state=42,
                )[0]
                d_corr = dcor.distance_correlation(
                    valid[salmon_name].values, valid[orca_name].values
                )
                results.append((lag, pearson_r, mi, d_corr))
            else:
                # Too little overlap at this lag: record NaNs rather than
                # letting an estimator raise and abort the whole sweep.
                results.append((lag, float("nan"), float("nan"), float("nan")))
        return pd.DataFrame(
            results, columns=["lag_weeks", "pearson_r", "mutual_info", "distance_corr"]
        )

    # 6. Run lagged correlations on detrended data
    lagged_df = lagged_correlation(df, max_lag, orca_detrend, salmon_detrend)

    # 7. Plot results: one trace per metric
    fig = go.Figure()
    for column, label in (
        ("pearson_r", "Pearson r"),
        ("mutual_info", "Mutual Info"),
        ("distance_corr", "Distance Corr"),
    ):
        fig.add_trace(
            go.Scatter(
                x=lagged_df["lag_weeks"],
                y=lagged_df[column],
                mode="lines+markers",
                name=label,
            )
        )

    fig.update_layout(
        title="Lagged Correlation Metrics: Orca Sightings vs Chinook Catch (Preprocessed)",
        xaxis_title="Lag (weeks)",
        yaxis_title="Correlation / Mutual Info",
        legend_title="Metric",
        template="plotly_white",
    )
    fig.show()

    return lagged_df


def run_full_analysis(
    orca_df,
    salmon_df,
    orca_col="SIGHTINGS_RATIO",
    salmon_col="STANDARIZED_CAUGHT",
    window_size=4,
    max_lag=16,
):
    """
    Join the orca and salmon weekly series and run the lagged-correlation step.

    Both frames are reduced to ``stat_week_sunday`` plus their value column,
    inner-joined on the week, stripped of rows with missing values and handed
    to ``preprocess_and_plot``, whose result is returned unchanged.
    """
    week_key = "stat_week_sunday"

    orca_part = orca_df[[week_key, orca_col]]
    salmon_part = salmon_df[[week_key, salmon_col]]

    # Only weeks present in both series survive the join.
    combined = orca_part.merge(salmon_part, on=week_key, how="inner").dropna()

    return preprocess_and_plot(
        combined,
        orca_col=orca_col,
        salmon_col=salmon_col,
        window_size=window_size,
        max_lag=max_lag,
    )


#                                     #
# ----------------------------------- #

In [None]:
# Lookup Codes for RMPC Catch Data
## Lookup Species
# Each lookup is written as (code, label) pairs so a code and its label sit
# on the same line.
lookup_species = pd.DataFrame(
    [
        ("1", "Chinook"),
        ("2", "Coho"),
        ("3", "Steelhead"),
        ("4", "Sockeye"),
        ("5", "Chum"),
        ("6", "Pink"),
        ("7", "Masu"),
        ("8", "Cutthroat"),
        ("9", "Atlantic"),
    ],
    columns=["species", "species_name"],
)

## Lookup State Code
lookup_state_code = pd.DataFrame(
    [("3", "Washington")],
    columns=["state_code", "state"],
)

## Lookup Water Type
lookup_water_type = pd.DataFrame(
    [
        ("M", "Marine"),
        ("F", "Freshwater"),
    ],
    columns=["water_type_code", "water_type"],
)

## Lookup Sector Type
lookup_sector_type = pd.DataFrame(
    [
        ("1", "Puget Sound"),
        ("2", "Coastal Streams and Estuaries"),
        ("3", "Ocean"),
        ("4", "Columbia River and Tributaries"),
        ("*", "Outside Washington"),
        ("5", "Lakes"),
    ],
    columns=["sector_code", "sector"],
)

## Lookup Region
lookup_region = pd.DataFrame(
    [
        ("01", "Nooksack / Samish Terminal"),
        ("02", "Skagit Terminal"),
        ("03", "Stillaguamish / Snohomish Terminal"),
        ("04", "Hood Canal Terminal"),
        ("05", "South Puget Sound Terminal"),
        ("06", "Domestic Mixed Stock"),
        ("07", "International Mixed Stock"),
        ("08", "Strait of Juan De Fuca Terminal"),
        ("11", "International Mixed Stock (5, 6, 7)"),
        ("12", "Skagit, Stillaguamish, Snohomish Terminal"),
        ("13", "Domestic Mixed Stock"),
        ("14", "South Puget Sound Terminal"),
        ("15", "Hood Canal Terminal"),
        ("17", "North Coast Streams"),
        ("18", "Grays Harbor Estuary"),
        ("19", "Willapa Harbor Estuary"),
        ("20", "Columbia River"),
        ("21", "Marine Area 1"),
        ("22", "Marine Area 2"),
        ("23", "Marine Area 3"),
        ("24", "Marine Area 4"),
        ("25", "Marine Area 5"),
        ("26", "Marine Area 6"),
    ],
    columns=["region_code", "region"],
)

## Location Code Lookup
psc_stat_area_lookup = {
    # Columbia River Zone
    "0A": {"marine_area": "Columbia River", "region": "Lower Columbia", "notes": ""},
    "0B": {"marine_area": "Columbia River", "region": "Lower Columbia", "notes": ""},
    "0C": {"marine_area": "Columbia River", "region": "Lower Columbia", "notes": ""},
    "0D": {"marine_area": "Columbia River", "region": "Lower Columbia", "notes": ""},
    "0E": {"marine_area": "Columbia River", "region": "Lower Columbia", "notes": ""},
    "0F": {"marine_area": "Columbia River", "region": "Lower Columbia", "notes": ""},
    "0G": {"marine_area": "Columbia River", "region": "Lower Columbia", "notes": ""},
    "0R": {
        "marine_area": "Columbia River (Research)",
        "region": "Lower Columbia",
        "notes": "Research/restricted",
    },
    "0X": {
        "marine_area": "Columbia River (Closed)",
        "region": "Lower Columbia",
        "notes": "Closed or experimental",
    },
    # Coastal Westport / Grays Harbor
    "1": {"marine_area": "Grays Harbor", "region": "Coastal Zone", "notes": ""},
    "1A": {"marine_area": "Grays Harbor", "region": "Coastal Zone", "notes": ""},
    "1R": {
        "marine_area": "Grays Harbor (Research)",
        "region": "Coastal Zone",
        "notes": "Research/restricted",
    },
    "2": {"marine_area": "Westport", "region": "Coastal Zone", "notes": ""},
    "2A": {"marine_area": "Westport", "region": "Coastal Zone", "notes": ""},
    "2B": {"marine_area": "Westport", "region": "Coastal Zone", "notes": ""},
    "2C": {"marine_area": "Westport", "region": "Coastal Zone", "notes": ""},
    "2D": {"marine_area": "Westport", "region": "Coastal Zone", "notes": ""},
    "2H": {
        "marine_area": "Westport Coastal (Special/Restricted)",
        "region": "Coastal Zone",
        "notes": "Research or tribal subarea",
    },
    "2X": {
        "marine_area": "Westport",
        "region": "Coastal Zone",
        "notes": "Closed or experimental",
    },
    # La Push / Outer Coast
    "3": {"marine_area": "La Push", "region": "Coastal Zone", "notes": ""},
    "3A": {"marine_area": "La Push", "region": "Coastal Zone", "notes": ""},
    "3B": {"marine_area": "Quillayute / Queets", "region": "Coastal Zone", "notes": ""},
    "3C": {"marine_area": "La Push", "region": "Coastal Zone", "notes": ""},
    "3D": {"marine_area": "La Push", "region": "Coastal Zone", "notes": ""},
    "3E": {"marine_area": "La Push", "region": "Coastal Zone", "notes": ""},
    "3F": {"marine_area": "La Push", "region": "Coastal Zone", "notes": ""},
    "3G": {"marine_area": "La Push", "region": "Coastal Zone", "notes": ""},
    "3H": {"marine_area": "La Push", "region": "Coastal Zone", "notes": ""},
    "3I": {"marine_area": "La Push", "region": "Coastal Zone", "notes": ""},
    "3J": {"marine_area": "La Push", "region": "Coastal Zone", "notes": ""},
    "3K": {"marine_area": "La Push", "region": "Coastal Zone", "notes": ""},
    "3X": {
        "marine_area": "La Push",
        "region": "Coastal Zone",
        "notes": "Closed or experimental",
    },
    # Neah Bay coastal
    "4": {"marine_area": "Neah Bay", "region": "Coastal Zone", "notes": ""},
    "4A": {"marine_area": "Neah Bay Offshore", "region": "Coastal Zone", "notes": ""},
    "4B": {"marine_area": "Neah Bay Inshore", "region": "Coastal Zone", "notes": ""},
    "4R": {
        "marine_area": "Neah Bay (Research)",
        "region": "Coastal Zone",
        "notes": "Research/restricted",
    },
    "4X": {
        "marine_area": "Neah Bay (Closed)",
        "region": "Coastal Zone",
        "notes": "Closed or experimental",
    },
    # Sekiu / coastal north
    "5": {"marine_area": "Sekiu / Pillar Point", "region": "Coastal Zone", "notes": ""},
    "5R": {
        "marine_area": "Sekiu / Pillar Point (Research)",
        "region": "Coastal Zone",
        "notes": "Research/restricted",
    },
    "5X": {
        "marine_area": "Sekiu / Pillar Point (Closed)",
        "region": "Coastal Zone",
        "notes": "Closed or experimental",
    },
    # Strait of Juan de Fuca / Puget Sound transition
    "6": {
        "marine_area": "Strait of Juan de Fuca",
        "region": "Puget Sound",
        "notes": "",
    },
    "6A": {
        "marine_area": "Ediz Hook to Dungeness Spit",
        "region": "Puget Sound",
        "notes": "",
    },
    "6B": {"marine_area": "Port Angeles", "region": "Puget Sound", "notes": ""},
    "6C": {"marine_area": "Port Townsend", "region": "Puget Sound", "notes": ""},
    "6D": {
        "marine_area": "East Strait of Juan de Fuca",
        "region": "Puget Sound",
        "notes": "",
    },
    "6R": {
        "marine_area": "Strait of Juan de Fuca Research Zone",
        "region": "Puget Sound",
        "notes": "Research or restricted",
    },
    "6X": {
        "marine_area": "Strait of Juan de Fuca (Closed)",
        "region": "Puget Sound",
        "notes": "Closed or experimental",
    },
    # San Juan Islands / Northern Puget Sound
    "7": {"marine_area": "San Juan Islands", "region": "Puget Sound", "notes": ""},
    "7A": {
        "marine_area": "San Juan Islands - General",
        "region": "Puget Sound",
        "notes": "",
    },
    "7B": {"marine_area": "Bellingham Bay", "region": "Puget Sound", "notes": ""},
    "7C": {"marine_area": "Lummi Bay", "region": "Puget Sound", "notes": ""},
    "7D": {"marine_area": "Drayton Harbor", "region": "Puget Sound", "notes": ""},
    "7E": {
        "marine_area": "San Juan Islands (Special/Restricted)",
        "region": "Puget Sound",
        "notes": "Special management subzone",
    },
    "7R": {
        "marine_area": "San Juan Islands (Research)",
        "region": "Puget Sound",
        "notes": "Research or restricted",
    },
    "7X": {
        "marine_area": "San Juan Islands (Closed)",
        "region": "Puget Sound",
        "notes": "Closed or experimental",
    },
    # Possession Sound / Everett
    "8": {
        "marine_area": "Possession Sound / Everett",
        "region": "Puget Sound",
        "notes": "",
    },
    "8A": {
        "marine_area": "Saratoga Passage / Skagit Bay",
        "region": "Puget Sound",
        "notes": "",
    },
    "8D": {
        "marine_area": "Possession Sound / Everett",
        "region": "Puget Sound",
        "notes": "",
    },
    "8R": {
        "marine_area": "Possession Sound Research Zone",
        "region": "Puget Sound",
        "notes": "Research or restricted",
    },
    # Admiralty Inlet
    "9": {"marine_area": "Admiralty Inlet", "region": "Puget Sound", "notes": ""},
    "9A": {"marine_area": "Admiralty Inlet", "region": "Puget Sound", "notes": ""},
    # Fallback / undefined
    "0Z": {
        "marine_area": "Undefined Zone",
        "region": "Unknown",
        "notes": "Special or undefined subarea",
    },
    "8X": {
        "marine_area": "Possession Sound / Skagit Bay",
        "region": "Puget Sound",
        "notes": "Closed or experimental",
    },
    "1X": {
        "marine_area": "Grays Harbor",
        "region": "Coastal Zone",
        "notes": "Closed or experimental",
    },
    "0X": {
        "marine_area": "Columbia River",
        "region": "Lower Columbia",
        "notes": "Closed or experimental",
    },
    "13B": {
        "marine_area": "La Push - Subarea B",
        "region": "Coastal Zone",
        "notes": "Subzone of La Push area, verify exact boundaries",
    },
    "10A": {
        "marine_area": "Columbia River - Subarea A",
        "region": "Lower Columbia",
        "notes": "Subzone of Columbia River area",
    },
    "12C": {
        "marine_area": "Grays Harbor - Subarea C",
        "region": "Coastal Zone",
        "notes": "Subzone of Grays Harbor area",
    },
    "12A": {
        "marine_area": "Grays Harbor - Subarea A",
        "region": "Coastal Zone",
        "notes": "Subzone of Grays Harbor area",
    },
    "11": {
        "marine_area": "Coastal Zone - Area 11",
        "region": "Coastal Zone",
        "notes": "General coastal zone area 11",
    },
    "13A": {
        "marine_area": "La Push - Subarea A",
        "region": "Coastal Zone",
        "notes": "Subzone of La Push area",
    },
    "12": {
        "marine_area": "Grays Harbor - Area 12",
        "region": "Coastal Zone",
        "notes": "General Grays Harbor area",
    },
    "12B": {
        "marine_area": "Grays Harbor - Subarea B",
        "region": "Coastal Zone",
        "notes": "Subzone of Grays Harbor area",
    },
    "12D": {
        "marine_area": "Grays Harbor - Subarea D",
        "region": "Coastal Zone",
        "notes": "Subzone of Grays Harbor area",
    },
    "13": {
        "marine_area": "La Push - Area 13",
        "region": "Coastal Zone",
        "notes": "General La Push area",
    },
    "10": {
        "marine_area": "Columbia River - Area 10",
        "region": "Lower Columbia",
        "notes": "General Columbia River area",
    },
    "11A": {
        "marine_area": "Coastal Zone - Subarea A",
        "region": "Coastal Zone",
        "notes": "Subzone of area 11",
    },
    "10E": {
        "marine_area": "Columbia River - Subarea E",
        "region": "Lower Columbia",
        "notes": "Subzone of Columbia River area",
    },
    "10Z": {
        "marine_area": "Columbia River - Special Zone Z",
        "region": "Lower Columbia",
        "notes": "Special or undefined subarea",
    },
    "10B": {
        "marine_area": "Columbia River - Subarea B",
        "region": "Lower Columbia",
        "notes": "Subzone of Columbia River area",
    },
    "13C": {
        "marine_area": "La Push - Subarea C",
        "region": "Coastal Zone",
        "notes": "Subzone of La Push area",
    },
    "13D": {
        "marine_area": "La Push - Subarea D",
        "region": "Coastal Zone",
        "notes": "Subzone of La Push area",
    },
    "13G": {
        "marine_area": "La Push - Subarea G",
        "region": "Coastal Zone",
        "notes": "Subzone of La Push area",
    },
    "13E": {
        "marine_area": "La Push - Subarea E",
        "region": "Coastal Zone",
        "notes": "Subzone of La Push area",
    },
    "13F": {
        "marine_area": "La Push - Subarea F",
        "region": "Coastal Zone",
        "notes": "Subzone of La Push area",
    },
    "13H": {
        "marine_area": "La Push - Subarea H",
        "region": "Coastal Zone",
        "notes": "Subzone of La Push area",
    },
    "13I": {
        "marine_area": "La Push - Subarea I",
        "region": "Coastal Zone",
        "notes": "Subzone of La Push area",
    },
    "13K": {
        "marine_area": "La Push - Subarea K",
        "region": "Coastal Zone",
        "notes": "Subzone of La Push area",
    },
    "13J": {
        "marine_area": "La Push - Subarea J",
        "region": "Coastal Zone",
        "notes": "Subzone of La Push area",
    },
    "2M": {
        "marine_area": "Westport - Subarea M",
        "region": "Coastal Zone",
        "notes": "Subzone of Westport area",
    },
    "2G": {
        "marine_area": "Westport - Subarea G",
        "region": "Coastal Zone",
        "notes": "Subzone of Westport area",
    },
    "2J": {
        "marine_area": "Westport - Subarea J",
        "region": "Coastal Zone",
        "notes": "Subzone of Westport area",
    },
    "2K": {
        "marine_area": "Westport - Subarea K",
        "region": "Coastal Zone",
        "notes": "Subzone of Westport area",
    },
    "0": {
        "marine_area": "Columbia River - Area 0",
        "region": "Lower Columbia",
        "notes": "General Columbia River area",
    },
    "54": {
        "marine_area": "Unknown Area 54",
        "region": "Unknown",
        "notes": "Placeholder - verify info",
    },
    "1B": {
        "marine_area": "Grays Harbor - Subarea B",
        "region": "Coastal Zone",
        "notes": "Subzone of Grays Harbor area",
    },
    "12H": {
        "marine_area": "Grays Harbor - Subarea H",
        "region": "Coastal Zone",
        "notes": "Subzone of Grays Harbor area",
    },
    "2N": {
        "marine_area": "Westport - Subarea N",
        "region": "Coastal Zone",
        "notes": "Subzone of Westport area",
    },
    "2R": {
        "marine_area": "Westport - Subarea R",
        "region": "Coastal Zone",
        "notes": "Subzone of Westport area",
    },
    "2T": {
        "marine_area": "Westport - Subarea T",
        "region": "Coastal Zone",
        "notes": "Subzone of Westport area",
    },
    "2U": {
        "marine_area": "Westport - Subarea U",
        "region": "Coastal Zone",
        "notes": "Subzone of Westport area",
    },
}

***

## Get WDFW Catch Data from RMPC

### Query Marine Zone GeoLayer from WDFW

In [None]:
# Query Marine Areas from WDFW
#
# Downloads every marine-area polygon from the WDFW ArcGIS MapServer layer
# (one request per distinct `maNumber`) and caches the combined layer as a
# parquet file inside `marine_area_path_folder`.

marine_area_path_folder = (
    "/Users/tylerstevenson/Documents/CODE/SalmonSignal/data/processed/GIS/marine_zones"
)
url = "https://geodataservices.wdfw.wa.gov/arcgis/rest/services/ApplicationServices/Marine_Areas/MapServer/3/query"

# Without an explicit timeout `requests` can wait forever on a stalled
# connection, which would hang the whole cell.
request_timeout = 60  # seconds

# 1. List the distinct marine-area numbers (attributes only, no geometry)
params = {
    "where": "1=1",
    "outFields": "maNumber",
    "returnDistinctValues": "true",
    "returnGeometry": "false",
    "f": "json",
}

r = requests.get(url, params=params, timeout=request_timeout)
r.raise_for_status()

# ArcGIS REST services can report a failure inside the JSON body while still
# answering with HTTP 200, so surface that explicitly instead of failing later
# with a bare KeyError on "features".
payload = r.json()
if "error" in payload:
    raise RuntimeError(f"WDFW marine-area query failed: {payload['error']}")

features = payload["features"]
unique_numbers = [f["attributes"]["maNumber"] for f in features]

# 2. Fetch the polygons of each marine area as GeoJSON and read them into
#    GeoPandas
marine_area_frames = []
for ma_number in unique_numbers:
    params = {
        "where": f"maNumber = '{ma_number}'",
        "outFields": "*",
        "returnGeometry": "true",
        "f": "geojson",
    }

    r = requests.get(url, params=params, timeout=request_timeout)
    r.raise_for_status()

    marine_area_frames.append(gpd.read_file(BytesIO(r.content)))

# 3. Combine the per-area layers into a single frame
marine_area_gdf = pd.concat(marine_area_frames)

# 4. Organize for Output: project column names, string area codes, WGS84
marine_area_gdf = marine_area_gdf.rename(
    columns={"maName": "NAME", "maNumber": "MARINE_AREA"}
)
marine_area_gdf = marine_area_gdf[["WAC", "MARINE_AREA", "NAME", "geometry"]]
marine_area_gdf = marine_area_gdf.reset_index(drop=True)
marine_area_gdf["MARINE_AREA"] = marine_area_gdf["MARINE_AREA"].astype(str)
marine_area_gdf = marine_area_gdf.to_crs("EPSG:4326")

# `exist_ok=True` replaces the separate existence check (no race between the
# check and the creation).
os.makedirs(marine_area_path_folder, exist_ok=True)

marine_area_gdf.to_parquet(f"{marine_area_path_folder}/WDFW_MARINE_AREAS_LARGE.parquet")

In [None]:
# Load the cached WDFW marine-area polygons and rename the area-code column
# to MARINE_AREA_LARGE, the name used by the RMPC tables further down.
wdfw_marine_areas = gpd.read_parquet(
    f"{marine_area_path_folder}/WDFW_MARINE_AREAS_LARGE.parquet"
).rename(columns={"MARINE_AREA": "MARINE_AREA_LARGE"})

### Query RMPC Data - WDFW

In [None]:
# Location lookup file (downloaded with the function's default arguments)
lc_data_path = download_rmpc_data()

# The three WDFW data sets come from the same host and land in the same
# folder; only the file-name pattern differs.
rmpc_wdfw_source = {
    "output_dir": "../data/raw/RMPC/WDFW",
    "base_url": "https://www.rmpc.org/pub/data/",
}

# RMPC Catch Data (CS042 files)
rmpc_catch_data = update_wdfw_parquet(
    pattern="CS042_WDFW_.*\\.csv", **rmpc_wdfw_source
)

# RMPC Recovery Data (RC042 files)
rmpc_recovery_data = update_wdfw_parquet(
    pattern="RC042_WDFW_.*\\.csv", **rmpc_wdfw_source
)

# RMPC Release Data (RL042 files)
rmpc_release_data = update_wdfw_parquet(
    pattern="RL042_WDFW_.*\\.csv", **rmpc_wdfw_source
)

In [None]:
# Read the downloaded RMPC tables and apply the per-table preprocessing.

## Location Lookup - column headers are stripped of surrounding whitespace
lc = pd.read_csv(lc_data_path)
lc.columns = list(map(str.strip, lc.columns))

## Catch Data
cs = preprocess_rmpc_catch_data(pd.read_parquet(rmpc_catch_data))

## Recovery Data
rc = preprocess_rmpc_recovery_data(pd.read_parquet(rmpc_recovery_data))

## Release Data - loaded as-is; its preprocessing step is currently disabled
rl = pd.read_parquet(rmpc_release_data)
# rl = preprocess_rmpc_release_data(rl)

In [None]:
# Both data sets are processed against the same four lookup tables.
area_lookups = (
    lookup_state_code,
    lookup_water_type,
    lookup_sector_type,
    psc_stat_area_lookup,
)

#################################
# CATCH
# Collect the areas, attach them to the catch rows (left merge on the shared
# columns) and derive the large marine-area code from the stat-area name.
cs_processed = main_preprocessing_catch(cs, *area_lookups)
cs_mapped = cs.merge(cs_processed, how="left")
cs_mapped["MARINE_AREA_LARGE"] = cs_mapped["stat_area_name"].apply(extract_general_code)

#################################
# RECOVERY
# Same three steps for the recovery rows.
rc_processed = main_preprocessing_recover(rc, *area_lookups)
rc_mapped = rc.merge(rc_processed, how="left")
rc_mapped["MARINE_AREA_LARGE"] = rc_mapped["stat_area_name"].apply(extract_general_code)

In [None]:
# Weekly totals per large marine area and species, for catch and recovery.
# Each table is summed, given a string area code, joined to the marine-area
# polygons and to the species lookup (both joins use the shared columns), and
# finally passed through `get_time_series_filled_out`.

#################################
## CATCH
cs_mapped_sums = (
    cs_mapped[["stat_week_sunday", "MARINE_AREA_LARGE", "number_caught", "species"]]
    .groupby(["MARINE_AREA_LARGE", "stat_week_sunday", "species"], as_index=False)[
        "number_caught"
    ]
    .sum()
)
cs_mapped_sums["MARINE_AREA_LARGE"] = cs_mapped_sums["MARINE_AREA_LARGE"].astype(str)

# Geometry first, then species name
cs_mapped_sums = cs_mapped_sums.merge(wdfw_marine_areas)
cs_mapped_sums = cs_mapped_sums.merge(lookup_species)

cs_mapped_sums_filled = get_time_series_filled_out(cs_mapped_sums)

#################################
## RECOVERY
rc_mapped_sums = (
    rc_mapped[["stat_week_sunday", "MARINE_AREA_LARGE", "number_caught", "species"]]
    .groupby(["MARINE_AREA_LARGE", "stat_week_sunday", "species"], as_index=False)[
        "number_caught"
    ]
    .sum()
)
rc_mapped_sums["MARINE_AREA_LARGE"] = rc_mapped_sums["MARINE_AREA_LARGE"].astype(str)

# Geometry first, then species name
rc_mapped_sums = rc_mapped_sums.merge(wdfw_marine_areas)
rc_mapped_sums = rc_mapped_sums.merge(lookup_species)

rc_mapped_sums_filled = get_time_series_filled_out(rc_mapped_sums)

In [None]:
# Distinct marine-area codes present in the filled catch series
cs_mapped_sums_filled.loc[:, "MARINE_AREA_LARGE"].unique()

In [15]:
# Write the filled catch series for Marine Areas 1 and 2 (all columns).
# NOTE(review): a later cell writes a reduced column set to this same path,
# overwriting this output - confirm whether this export is still needed.
ma12_rows = cs_mapped_sums_filled["MARINE_AREA_LARGE"].isin(["1", "2"])
cs_mapped_sums_filled[ma12_rows].to_parquet(
    "/Users/tylerstevenson/Documents/CODE/SalmonSignal/data/raw/RMPC/WDFW/processed/MA12_RMPC_CATCH.parquet"
)

In [18]:
# Marine Areas 1 and 2 only, reduced to the four tabular columns
cs_mapped_sums_filled_ = cs_mapped_sums_filled.loc[
    cs_mapped_sums_filled["MARINE_AREA_LARGE"].isin(["1", "2"]),
    ["MARINE_AREA_LARGE", "stat_week_sunday", "species_name", "number_caught"],
]

In [19]:
# Save the reduced Marine Area 1/2 catch table
ma12_catch_path = "/Users/tylerstevenson/Documents/CODE/SalmonSignal/data/raw/RMPC/WDFW/processed/MA12_RMPC_CATCH.parquet"
cs_mapped_sums_filled_.to_parquet(ma12_catch_path)

### Time Series Investigation

In [None]:
# Keep only the Chinook rows of the filled weekly series

## Chinook Catch
cs_is_chinook = cs_mapped_sums_filled["species_name"] == "Chinook"
cs_chinook = cs_mapped_sums_filled[cs_is_chinook]

## Chinook Recovery
rc_is_chinook = rc_mapped_sums_filled["species_name"] == "Chinook"
rc_chinook = rc_mapped_sums_filled[rc_is_chinook]

In [None]:
# Weekly Chinook totals summed over every marine area

## Chinook Catch
cs_chinook_total = (
    cs_chinook.groupby(["stat_week_sunday"], as_index=False)["number_caught"].sum()
)

## Chinook Recovery
rc_chinook_total = (
    rc_chinook.groupby(["stat_week_sunday"], as_index=False)["number_caught"].sum()
)

In [None]:
# Weekly Chinook catch vs. recovery across all marine areas
fig = px.line(title="Chinook - All Marine Areas")

# One trace per data set: (weekly table, legend label, colour)
for weekly_table, trace_label, trace_colour in (
    (cs_chinook_total, "Catch Data", "#8EF9F3"),
    (rc_chinook_total, "Recovery Data", "#754043"),
):
    fig.add_scatter(
        x=weekly_table["stat_week_sunday"],
        y=weekly_table["number_caught"],
        name=trace_label,
        marker_color=trace_colour,
    )

# White canvas with light-grey gridlines on both axes
fig.update_layout(
    plot_bgcolor="white",
    paper_bgcolor="white",
    font=dict(family="Futura, Arial, sans-serif", size=14, color="black"),
    xaxis=dict(showgrid=True, gridcolor="lightgrey", zeroline=False),
    yaxis=dict(showgrid=True, gridcolor="lightgrey", zeroline=False),
)
fig.show()

In [None]:
# Marine-area selector for the cells below. It is passed to `str.contains`,
# so it matches every MARINE_AREA_LARGE code that contains this text (and is
# interpreted as a regular expression), not only an exact code.
marine_area_query = "7"

In [None]:
# Restrict both Chinook series to the marine areas whose code contains
# `marine_area_query`, then total them per statistical week.
cs_in_area = cs_chinook["MARINE_AREA_LARGE"].str.contains(marine_area_query)
cs_chinook_ma_query = (
    cs_chinook[cs_in_area]
    .groupby(["stat_week_sunday"], as_index=False)["number_caught"]
    .sum()
)

rc_in_area = rc_chinook["MARINE_AREA_LARGE"].str.contains(marine_area_query)
rc_chinook_ma_query = (
    rc_chinook[rc_in_area]
    .groupby(["stat_week_sunday"], as_index=False)["number_caught"]
    .sum()
)


# Weekly catch vs. recovery for the selected marine area(s)
fig = px.line(title=f"Chinook - Marine Area {marine_area_query}")

# One trace per data set: (weekly table, legend label, colour)
for weekly_table, trace_label, trace_colour in (
    (cs_chinook_ma_query, "Catch Data", "#8EF9F3"),
    (rc_chinook_ma_query, "Recovery Data", "#754043"),
):
    fig.add_scatter(
        x=weekly_table["stat_week_sunday"],
        y=weekly_table["number_caught"],
        name=trace_label,
        marker_color=trace_colour,
    )

# White canvas with light-grey gridlines on both axes
fig.update_layout(
    plot_bgcolor="white",
    paper_bgcolor="white",
    font=dict(family="Futura, Arial, sans-serif", size=14, color="black"),
    xaxis=dict(showgrid=True, gridcolor="lightgrey", zeroline=False),
    yaxis=dict(showgrid=True, gridcolor="lightgrey", zeroline=False),
)
fig.show()

In [None]:
# Z-score each weekly series: subtract its mean, divide by its (pandas
# default, sample) standard deviation.

## Catch
cs_weekly_caught = cs_chinook_ma_query["number_caught"]
cs_chinook_ma_query["STANDARIZED_CAUGHT"] = cs_weekly_caught.sub(
    cs_weekly_caught.mean()
).div(cs_weekly_caught.std())

#################################################

## Recovery
rc_weekly_caught = rc_chinook_ma_query["number_caught"]
rc_chinook_ma_query["STANDARIZED_CAUGHT"] = rc_weekly_caught.sub(
    rc_weekly_caught.mean()
).div(rc_weekly_caught.std())

In [None]:
# Standardized weekly Chinook catch vs. recovery for the selected marine
# area(s). The title is built from `marine_area_query` so it always names the
# area that was actually filtered (it used to be hard-coded to "8").
fig = px.line(title=f"Chinook - Marine Area {marine_area_query} (Standardized)")
fig.add_scatter(
    x=cs_chinook_ma_query["stat_week_sunday"],
    y=cs_chinook_ma_query["STANDARIZED_CAUGHT"],
    name="Catch Data",
    marker_color="#8EF9F3",
)
fig.add_scatter(
    x=rc_chinook_ma_query["stat_week_sunday"],
    y=rc_chinook_ma_query["STANDARIZED_CAUGHT"],
    name="Recovery Data",
    marker_color="#754043",
)
fig.update_layout(
    plot_bgcolor="white",  # White plot area
    paper_bgcolor="white",  # White outer area
    font=dict(
        family="Futura, Arial, sans-serif",
        size=14,
        color="black",
    ),
    xaxis=dict(
        showgrid=True,
        gridcolor="lightgrey",  # Light gridlines
        zeroline=False,
    ),
    yaxis=dict(showgrid=True, gridcolor="lightgrey", zeroline=False),
)
fig.show()

***

## Compare Against Orca Sightings

In [None]:
# Load the two orca-sighting sources with a common H3 resolution and stack
# them into one table.

## H3 resolution handed to the loader for both sources
h3_resolution = 3

## Input locations (the Acartia one points at a single CSV export)
tmw_directory = "/Users/tylerstevenson/Documents/CODE/orcasalmon/data/twm"
acartia_directory = "/Users/tylerstevenson/Documents/CODE/FindMyWhale/data/raw/sightings/acartia-export.csv"

## Coordinate columns are named identically in both sources
coordinate_columns = dict(lat_col="LATITUDE", lon_col="LONGITUDE")

## TMW sightings
tmw_data_cleaned = load_and_process_sighting_data(
    directory=tmw_directory,
    date_col="SIGHTDATE",
    id_col="DATE",  # or other proxy for sightings count
    h3_resolution=h3_resolution,
    source="TMW",
    **coordinate_columns,
)

## Acartia sightings (loader is given a 2022-01-01 start date)
acartia_data_cleaned = load_and_process_sighting_data(
    directory=acartia_directory,
    date_col="CREATED",
    id_col="ENTRY_ID",
    h3_resolution=h3_resolution,
    start_date="2022-01-01",
    source="ACARTIA",
    **coordinate_columns,
)

# Combine Sightings Data (Acartia rows first, then TMW)
sightings_data_raw = pd.concat([acartia_data_cleaned, tmw_data_cleaned])

In [None]:
# Turn the combined sightings into WGS84 point geometries
sighting_points = gpd.points_from_xy(
    sightings_data_raw.LONGITUDE, sightings_data_raw.LATITUDE
)
sightings_gdf = gpd.GeoDataFrame(
    sightings_data_raw, geometry=sighting_points, crs="EPSG:4326"
)

# DATE must be a datetime for the week arithmetic below
sightings_gdf["DATE"] = pd.to_datetime(sightings_gdf["DATE"])

# Spatially join each sighting to the marine-area polygons (same CRS)
wdfw_marine_areas = wdfw_marine_areas.to_crs("EPSG:4326")
orca_gdf = gpd.sjoin(sightings_gdf, wdfw_marine_areas)

# Sightings per marine area and date
orca_gdf = orca_gdf.groupby(["DATE", "MARINE_AREA_LARGE"], as_index=False)[
    "SIGHTING_COUNT"
].sum()

# Label every date with a Sunday: step back (weekday + 1) days, where Monday
# is weekday 0. Monday-Saturday land on the Sunday just before them.
# NOTE(review): a date that is itself a Sunday (weekday 6) is moved back seven
# days to the previous Sunday - confirm this matches the convention behind the
# catch data's `stat_week_sunday`.
days_back = pd.to_timedelta(orca_gdf["DATE"].dt.weekday + 1, unit="d")
orca_gdf["stat_week_sunday"] = orca_gdf["DATE"] - days_back

In [None]:
# Roll the daily counts up to one row per week label and marine area
weekly_area_keys = ["stat_week_sunday", "MARINE_AREA_LARGE"]
orca_gdf = orca_gdf.groupby(weekly_area_keys, as_index=False)["SIGHTING_COUNT"].sum()

In [None]:
# Sightings per marine area per week (stored as SIGHTINGS_COUNT)
region_week_counts = orca_gdf.groupby(
    ["stat_week_sunday", "MARINE_AREA_LARGE"], as_index=False
).agg(SIGHTINGS_COUNT=("SIGHTING_COUNT", "sum"))

# Sightings per week summed over every marine area
weekly_totals = region_week_counts.groupby("stat_week_sunday", as_index=False).agg(
    TOTAL_WEEKLY_SIGHTINGS=("SIGHTINGS_COUNT", "sum")
)

# Attach the weekly total to each area row and express the area's count as a
# share of that total
orca_share = pd.merge(
    region_week_counts, weekly_totals, on="stat_week_sunday", how="left"
)
orca_share["SIGHTINGS_RATIO"] = orca_share["SIGHTINGS_COUNT"].div(
    orca_share["TOTAL_WEEKLY_SIGHTINGS"]
)

In [None]:
# Sighting shares for the marine areas whose code contains `marine_area_query`
# (the variable name says "8" but the selection follows the query)
orca_in_area = orca_share["MARINE_AREA_LARGE"].str.contains(marine_area_query)
orca_area_8 = orca_share[orca_in_area]

In [None]:
# Collapse all marine areas matched by the query into one row per week.
orca_area_8 = orca_area_8.groupby(["stat_week_sunday"], as_index=False).agg(
    SIGHTINGS_COUNT=("SIGHTINGS_COUNT", "sum"),
    # TOTAL_WEEKLY_SIGHTINGS is the all-area total for the week and is repeated
    # on every row of that week, so it is taken once. Summing it would multiply
    # the denominator by the number of matched areas and shrink the ratio.
    TOTAL_WEEKLY_SIGHTINGS=("TOTAL_WEEKLY_SIGHTINGS", "first"),
)

# Share of the week's sightings that fell inside the matched area(s)
orca_area_8["SIGHTINGS_RATIO"] = (
    orca_area_8["SIGHTINGS_COUNT"] / orca_area_8["TOTAL_WEEKLY_SIGHTINGS"]
)

In [None]:
import plotly.express as px
import plotly.graph_objects as go

# Weekly standardized Chinook series (left axis) against weekly orca sighting
# counts (right axis) for the selected marine area(s).

# Base figure from px just to set up layout
fig = px.line(title=f"Chinook - Marine Area {marine_area_query}", height=500)

# Catch Data (Primary Y)
fig.add_scatter(
    x=cs_chinook_ma_query["stat_week_sunday"],
    y=cs_chinook_ma_query["STANDARIZED_CAUGHT"],
    name="Catch Data",
    marker_color="#8EF9F3",
    yaxis="y1",
)

# Recovery Data (Primary Y)
fig.add_scatter(
    x=rc_chinook_ma_query["stat_week_sunday"],
    y=rc_chinook_ma_query["STANDARIZED_CAUGHT"],
    name="Recovery Data",
    marker_color="#754043",
    yaxis="y1",
)

# Orca Sightings Count (Secondary Y)
# The plotted column is SIGHTINGS_COUNT, so the legend and axis are labelled as
# a count; they previously said "Ratio" although no ratio was drawn.
fig.add_scatter(
    x=orca_area_8["stat_week_sunday"],
    y=orca_area_8["SIGHTINGS_COUNT"],
    name="Orca Sightings (Weekly Count)",
    mode="lines",
    line=dict(color="#EF476F", dash="dot"),
    yaxis="y2",
)

# Update layout for dual Y-axes. This is the last expression of the cell, so
# the notebook renders the returned figure.
fig.update_layout(
    plot_bgcolor="white",
    paper_bgcolor="white",
    font=dict(
        family="Futura, Arial, sans-serif",
        size=14,
        color="black",
    ),
    xaxis=dict(
        title="Week",
        showgrid=True,
        gridcolor="lightgrey",
        zeroline=False,
    ),
    yaxis=dict(
        title="Chinook (Standardized)",
        showgrid=True,
        gridcolor="lightgrey",
        zeroline=False,
    ),
    yaxis2=dict(
        title="Orca Sightings (Weekly Count)",
        overlaying="y",
        side="right",
        showgrid=False,
    ),
)

In [None]:
import plotly.express as px
import plotly.graph_objects as go

# Monthly view: each weekly series is averaged within the calendar month of
# its week label, then plotted on two y-axes.

# Step 1: Monthly aggregation (month = first day of the week label's month)
cs_monthly = cs_chinook_ma_query.copy()
cs_monthly["month"] = cs_monthly["stat_week_sunday"].dt.to_period("M").dt.to_timestamp()
cs_monthly = cs_monthly.groupby("month")["STANDARIZED_CAUGHT"].mean().reset_index()

rc_monthly = rc_chinook_ma_query.copy()
rc_monthly["month"] = rc_monthly["stat_week_sunday"].dt.to_period("M").dt.to_timestamp()
rc_monthly = rc_monthly.groupby("month")["STANDARIZED_CAUGHT"].mean().reset_index()

orca_monthly = orca_area_8.copy()
orca_monthly["month"] = (
    orca_monthly["stat_week_sunday"].dt.to_period("M").dt.to_timestamp()
)
orca_monthly = orca_monthly.groupby("month")["SIGHTINGS_COUNT"].mean().reset_index()

# Step 2: Base figure
fig = px.line(title=f"Chinook - Marine Area {marine_area_query} (Monthly)", height=500)

# Step 3: Add Traces
# Catch Data
fig.add_scatter(
    x=cs_monthly["month"],
    y=cs_monthly["STANDARIZED_CAUGHT"],
    name="Catch Data",
    marker_color="#8EF9F3",
    yaxis="y1",
)

# Recovery Data
fig.add_scatter(
    x=rc_monthly["month"],
    y=rc_monthly["STANDARIZED_CAUGHT"],
    name="Recovery Data",
    marker_color="#754043",
    yaxis="y1",
)

# Orca Sightings (monthly mean of the weekly counts)
# The plotted column is SIGHTINGS_COUNT, so the legend and axis are labelled as
# a count; they previously said "Ratio" although no ratio was drawn.
fig.add_scatter(
    x=orca_monthly["month"],
    y=orca_monthly["SIGHTINGS_COUNT"],
    name="Orca Sightings (Mean Weekly Count)",
    mode="lines",
    line=dict(color="#EF476F", dash="dot"),
    yaxis="y2",
)

# Step 4: Update layout for dual y-axes
fig.update_layout(
    plot_bgcolor="white",
    paper_bgcolor="white",
    font=dict(
        family="Futura, Arial, sans-serif",
        size=14,
        color="black",
    ),
    xaxis=dict(
        title="Month",
        showgrid=True,
        gridcolor="lightgrey",
        zeroline=False,
        tickformat="%b\n%Y",  # e.g., Jan 2023
    ),
    yaxis=dict(
        title="Chinook (Standardized)",
        showgrid=True,
        gridcolor="lightgrey",
        zeroline=False,
    ),
    yaxis2=dict(
        title="Orca Sightings (Mean Weekly Count)",
        overlaying="y",
        side="right",
        showgrid=False,
    ),
)

fig.show()

In [None]:
import pandas as pd
import plotly.express as px

# Box plot of the three weekly series grouped by calendar month. Every source
# contributes a tidy (month_name, value, source) triple per week.

# --- Orca sightings: weekly counts, standardized with StandardScaler ---
scaler = StandardScaler()
orca_box = orca_area_8.copy()
orca_box["month_name"] = orca_box["stat_week_sunday"].dt.month_name()
orca_box["value"] = scaler.fit_transform(orca_box[["SIGHTINGS_COUNT"]])
orca_box["source"] = "Orca Sightings"

# --- Salmon catch: already standardized ---
catch_box = cs_chinook_ma_query.assign(
    month_name=cs_chinook_ma_query["stat_week_sunday"].dt.month_name(),
    value=cs_chinook_ma_query["STANDARIZED_CAUGHT"],
    source="Salmon Catch",
)

# --- Salmon recovery: already standardized (variable keeps its historical
# "release" name; the data is the recovery series) ---
release_box = rc_chinook_ma_query.assign(
    month_name=rc_chinook_ma_query["stat_week_sunday"].dt.month_name(),
    value=rc_chinook_ma_query["STANDARIZED_CAUGHT"],
    source="Salmon Recovery",
)

# Stack the three sources into one tidy dataframe
combined_box = pd.concat([orca_box, catch_box, release_box])

# Calendar order for the x-axis (an ordered categorical keeps January first)
month_order = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
]
combined_box["month_name"] = pd.Categorical(
    combined_box["month_name"], categories=month_order, ordered=True
)

# One box per source and month
source_colours = {
    "Orca Sightings": "#EF476F",
    "Salmon Catch": "#8EF9F3",
    "Salmon Recovery": "#754043",
}
fig = px.box(
    combined_box,
    x="month_name",
    y="value",
    color="source",
    title=f"Monthly Distributions of Orca Sightings, Salmon Catch & Recovery - Marine Area {marine_area_query}",
    labels={"month_name": "Month", "value": "Standardized Value"},
    color_discrete_map=source_colours,
)

fig.update_layout(
    plot_bgcolor="white",
    paper_bgcolor="white",
    font=dict(family="Futura, Arial", size=14, color="black"),
    xaxis_title=None,
    yaxis_title="Standardized or Ratio Value",
    boxmode="group",
)

fig.show()

In [None]:
# Centred rolling means of both weekly series, then an inner join on the week
# label keeping only weeks where both smoothed values exist.
# NOTE(review): the window counts rows, not calendar weeks - if a series has
# missing weeks the window spans more than `window_size` weeks; confirm.
window_size = 5  # rows per rolling window

# Stored under a "RATIO" name, but computed from SIGHTINGS_COUNT
orca_area_8["SIGHTINGS_RATIO_SMOOTH"] = (
    orca_area_8["SIGHTINGS_COUNT"].rolling(window_size, center=True).mean()
)
cs_chinook_ma_query["STANDARIZED_CAUGHT_SMOOTH"] = (
    cs_chinook_ma_query["STANDARIZED_CAUGHT"].rolling(window_size, center=True).mean()
)

df = (
    orca_area_8[["stat_week_sunday", "SIGHTINGS_RATIO_SMOOTH"]]
    .merge(
        cs_chinook_ma_query[["stat_week_sunday", "STANDARIZED_CAUGHT_SMOOTH"]],
        on="stat_week_sunday",
        how="inner",
    )
    .dropna()
)

In [None]:
from scipy.stats import pearsonr

# Pearson correlation between smoothed catch and smoothed orca sightings for
# every lag in [-max_lag, max_lag]. `shift(lag)` moves the orca series by
# `lag` rows, so with a positive lag each catch value is paired with the orca
# value found `lag` rows earlier.
# NOTE(review): the shift is by rows of `df`; it equals a shift in weeks only
# if `df` has one row for every consecutive week - confirm.
max_lag = 16  # weeks, adjust as you want
results = []

for lag in range(-max_lag, max_lag + 1):
    shifted = df["SIGHTINGS_RATIO_SMOOTH"].shift(lag)
    valid = pd.concat([df["STANDARIZED_CAUGHT_SMOOTH"], shifted], axis=1).dropna()

    # pearsonr needs at least two paired observations; with fewer it raises
    # instead of returning a value, so those lags are recorded as NaN.
    if len(valid) >= 2:
        r, _ = pearsonr(
            valid["STANDARIZED_CAUGHT_SMOOTH"], valid["SIGHTINGS_RATIO_SMOOTH"]
        )
    else:
        r = float("nan")
    results.append((lag, r))

lags_df = pd.DataFrame(results, columns=["lag_weeks", "correlation"])

In [None]:
from sklearn.feature_selection import mutual_info_regression
import numpy as np

# Mutual information between smoothed catch (feature) and smoothed orca
# sightings (target), on rows where both values are present.
smooth_columns = ["SIGHTINGS_RATIO_SMOOTH", "STANDARIZED_CAUGHT_SMOOTH"]
df_clean = df.dropna(subset=smooth_columns)

# The estimator expects a 2-D feature matrix with a single column here
X = df_clean["STANDARIZED_CAUGHT_SMOOTH"].to_numpy().reshape(-1, 1)
y = df_clean["SIGHTINGS_RATIO_SMOOTH"].to_numpy()

# Fixed random_state keeps the estimate reproducible between runs
mi = mutual_info_regression(X, y, random_state=42)
print(f"Mutual Information: {mi[0]:.4f}")

In [None]:
import dcor

# Distance correlation between the two smoothed series, taken from the
# NaN-free `df_clean` so both arrays are aligned row by row.
x = df_clean["STANDARIZED_CAUGHT_SMOOTH"].to_numpy()
y = df_clean["SIGHTINGS_RATIO_SMOOTH"].to_numpy()

dcorr = dcor.distance_correlation(x, y)
print(f"Distance Correlation: {dcorr:.4f}")

In [None]:
# Hand the unsmoothed weekly tables to the analysis helper. This call passes
# the SIGHTINGS_RATIO column and a window of 4, unlike the manual smoothing
# cell, which uses SIGHTINGS_COUNT and a window of 5.
orca_df, salmon_df = orca_area_8, cs_chinook_ma_query

run_full_analysis(
    orca_df,
    salmon_df,
    salmon_col="STANDARIZED_CAUGHT",
    orca_col="SIGHTINGS_RATIO",
    max_lag=16,
    window_size=4,
)

In [None]:
# Coded-wire-tag recoveries are read from a local CSV export. The API variant
# below is kept for reference but is disabled.

# # API endpoint
# url = "https://data.wa.gov/resource/auvb-4rvk.json"

# # Optional: params like $limit, $where, etc.
# params = {"$limit": 1000}  # Increase this or paginate if needed

# # Make the request
# response = requests.get(url, params=params)

# # Convert to DataFrame
# if response.status_code == 200:
#     data = response.json()
#     df_tag_recovery = pd.DataFrame(data)
# else:
#     print(f"Error: {response.status_code}")

tag_recovery_csv = "/Users/tylerstevenson/Documents/CODE/SalmonSignal/data/raw/RMPC/WDFW/WDFW-Coded_Wire_Tag_Fish_Recoveries_20250730.csv"
df_tag_recovery = pd.read_csv(tag_recovery_csv)

In [None]:
# Count tag recoveries per unique combination of the selected columns
columns_to_use = [
    "Species",
    "Recovery Date",
    "Location Name",
    "PSC Code",
    "Location Code",
]

# Keep only those columns; a missing location code becomes an empty string
# and every row counts as one recovery
df_tag_recovery = df_tag_recovery[columns_to_use].assign(
    **{
        "Location Code": lambda frame: frame["Location Code"].fillna(""),
        "Count": 1,
    }
)

# Sum the counts per combination, then drop the "Unknown" species rows
tag_recovered = (
    df_tag_recovery.groupby(columns_to_use, as_index=False)["Count"]
    .sum()
    .loc[lambda frame: frame["Species"] != "Unknown"]
)

In [None]:
# Parsed copy of the recovery date for time-based operations
df_tag_recovery["Date"] = df_tag_recovery["Recovery Date"].pipe(pd.to_datetime)

In [None]:
# Most recent recovery date in the export
df_tag_recovery["Date"].max()

In [None]:
# Total recoveries per recovery date, with a parsed Date column alongside
tag_sums = (
    tag_recovered.groupby(["Recovery Date"], as_index=False)["Count"]
    .sum()
    .assign(Date=lambda frame: pd.to_datetime(frame["Recovery Date"]))
)

In [None]:
# fig = px.scatter(x = tag_sums['Date'], y = tag_sums['Count'])
# fig.show()

In [None]:
# WDFW Salmonid Population Indicators (SPI) export, read straight into a frame
salmonid_indicators = pd.read_csv(
    "/Users/tylerstevenson/Documents/CODE/SalmonSignal/data/raw/WDFW_-_Salmonid_Population_Indicators_Database__SPI__Metrics_and_Indicators.csv"
)

In [None]:
# Preview the first five rows
salmonid_indicators.head(n=5)

In [None]:
# All column names as one comma-separated string
", ".join(salmonid_indicators.columns.tolist())

In [None]:
# Last 25 fields of the first record
salmonid_indicators.iloc[0].iloc[-25:]