***

## Open Sightings Data

1. Find Pod Designation (Place Sightings Into Pod - J, K, L, Transient, Other)
2. Find Pod Type Designation (Place Sightings Into Pod Type - SRKW, Transient, Other)
3. Aggregate Sightings by Latitude + Longitude + Date
4. Add Variables (DOY, WOY, MONTH, YEAR, SOURCE)

By opening this data, we can investigate whether there is any correlation between sightings at the Columbia River mouth and Salmon abundance.

In [None]:
# --------------------------------------- #
#                 MODULES                 #

# Standard Modules
import os
import re
import warnings

# Third-Party Modules
import glob
import numpy as np
import pandas as pd
import plotly.express as px

warnings.filterwarnings("ignore")

#                                         #
# --------------------------------------- #

# --------------------------------------- #
#                FUNCTIONS                #

####################
################ TWM


# TWM Data Preprocessing
def preprocess_twm_data(data):
    """Aggregate raw TWM sightings into per-location, per-day pod counts.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw TWM rows. The columns ``SightDate``, ``latitude``, ``longitude``,
        ``pod_tag`` and ``pod`` are read.

    Returns
    -------
    pandas.DataFrame
        Columns ``DATE``, ``LATITUDE``, ``LONGITUDE``, ``POD_TYPE``,
        ``POD_TAG`` and ``COUNT`` (number of exploded sighting rows per group).
    """
    # Work on a private copy of the columns we need so the caller's frame is
    # never modified (the previous version wrote ``pod_tag`` back into it).
    data = data[["SightDate", "latitude", "longitude", "pod_tag", "pod"]].copy()

    # Missing tag -> fall back to the ``pod`` column, then to a generic label.
    data["pod_tag"] = data["pod_tag"].fillna(data["pod"]).fillna("orcas")

    # Soft Fix for Unknown but Partially Identified - OverInflate Sightings?
    # A community-level prefix is expanded to every pod letter of that
    # community; the order matters and matches the original sequence.
    for prefix, label in (("nr", "AGR"), ("sr", "JKL"), ("t", "T")):
        data["pod_tag"] = np.where(
            data["pod_tag"].str.startswith(prefix), label, data["pod_tag"]
        )

    data = data.drop(columns="pod").rename(
        columns={
            "SightDate": "DATE",
            "latitude": "LATITUDE",
            "longitude": "LONGITUDE",
            "pod_tag": "POD_TAG",
        }
    )

    # Split multi-pod tags ("JK" -> ["J", "K"]) so each pod gets its own row;
    # the generic "orcas" label becomes the single tag "O".
    data["POD_TAG"] = np.where(
        data["POD_TAG"] != "orcas", data["POD_TAG"].apply(list), ["O"]
    )
    data = data.explode("POD_TAG")
    data["COUNT"] = 1

    # NOTE(review): pod_lookup only has upper-case keys; any tag character that
    # is not one of them maps to NaN and, with groupby's default dropna, is
    # presumably dropped from the result below - confirm this is intended.
    data["POD_TYPE"] = data["POD_TAG"].map(pod_lookup)

    data = data.groupby(
        ["DATE", "LATITUDE", "LONGITUDE", "POD_TYPE", "POD_TAG"], as_index=False
    )["COUNT"].sum()

    return data


####################
############ ACARTIA


# Keyword lookup for extract_species_from_notes: species label -> lower-case
# substrings that indicate it. Every label appears exactly once; the previous
# literal repeated "Common Dolphin" and "Right Whale", and because a dict keeps
# only the last value of a repeated key, most of their keywords were lost.
# NOTE(review): matching is by plain substring, so short keywords can hit
# inside other words (e.g. "l pod" inside "small pod") - confirm acceptable.
_SPECIES_KEYWORDS = {
    "Killer Whale": [
        "orca",
        "killer whale",
        "srkw",
        "southern resident",
        "transient",
        "bigg's",
        "j pod",
        "k pod",
        "l pod",
        "j-pod",
        "k-pod",
        "l-pod",
        "ballena asesina",
        "orque",
        "épaulard",
        "hunting a sea lion",
    ],
    "Humpback Whale": ["humpback", "ballena jorobada"],
    "Gray Whale": ["gray whale", "grey whale"],
    "Blue Whale": ["blue whale", "ballena azul"],
    "Fin Whale": ["fin whale", "finback whale"],
    "Minke Whale": ["minke", "petit rorqual"],
    "Sperm Whale": ["sperm whale"],
    "Right Whale": [
        "right whale",
        "black right whale",
        "northern right whale dolphin",
    ],
    "Sei Whale": ["sei whale"],
    "Baird's Beaked Whale": ["baird's beaked whale"],
    "Beluga Whale": ["beluga"],
    "Bryde's Whale": ["bryde"],
    "Pilot Whale": ["pilot whale"],
    "False Killer Whale": ["false killer whale"],
    "Beaked Whale": ["beaked whale"],
    "Narwhal": ["narwhal"],
    "Bowhead Whale": ["bowhead"],
    "Harbor Porpoise": ["harbor porpoise", "marsouin commun"],
    "Dall's Porpoise": ["dall's porpoise", "dalls"],
    "Porpoise": ["porpoise"],  # Generic
    "Bottlenose Dolphin": ["bottlenose dolphin", "bottlenose whale"],
    "Risso's Dolphin": ["risso's dolphin"],
    "Pacific White-sided Dolphin": [
        "pacific white-sided dolphin",
        "pacific white sided dolphin",
    ],
    "Common Dolphin": [
        "common dolphin",
        "long-beaked common dolphin",
        "long beaked common dolphin",
        "short-beaked common dolphin",
    ],
    "Striped Dolphin": ["striped dolphin"],
    "Spinner Dolphin": ["spinner dolphin"],
    "Spotted Dolphin": ["spotted dolphin"],
    "White-beaked Dolphin": ["white-beaked dolphin"],
    "Atlantic White-sided Dolphin": ["atlantic white-sided dolphin"],
    "Dolphin": ["dolphin"],  # Generic
    "Other": ["seal", "sea lion", "shark", "sunfish", "mola mola"],
}

# Individual IDs such as "j27", "t-65" or "k12" imply a killer whale.
_ORCA_ID_PATTERN = re.compile(r"\b[jklto]\-?\d+\b", re.IGNORECASE)


# Function to extract species from notes as a list
def extract_species_from_notes(notes):
    """Return the sorted list of species labels mentioned in free-text notes.

    Parameters
    ----------
    notes : str or missing value
        Free-text comment of a sighting. Missing values (``None``/NaN) are
        accepted.

    Returns
    -------
    list of str
        Sorted species labels found by keyword. ``["Unspecified Whale"]`` when
        no keyword matches but the text contains "whale", and
        ``["Unspecified"]`` for missing notes or no match at all.
    """
    if pd.isna(notes):
        return ["Unspecified"]

    text = str(notes).lower()

    extracted_species = {
        species
        for species, keywords in _SPECIES_KEYWORDS.items()
        if any(keyword in text for keyword in keywords)
    }

    # Special regex for orca individual IDs
    if _ORCA_ID_PATTERN.search(text):
        extracted_species.add("Killer Whale")

    # "false killer whale" contains "killer whale"; keep only the former.
    if "False Killer Whale" in extracted_species:
        extracted_species.discard("Killer Whale")

    if extracted_species:
        return sorted(extracted_species)
    if "whale" in text:
        return ["Unspecified Whale"]
    return ["Unspecified"]


# Preprocess Acartia Data to Identify Likely Orca Sightings
def collect_likely_orca_sightings(data):
    """Select the Acartia rows that are likely killer-whale sightings.

    A row qualifies when its ``type`` standardizes to "Killer Whale" through
    ``species_mapping``, or when its type is "Unspecified" and
    ``extract_species_from_notes`` finds a killer whale in the cleaned
    ``data_source_comments``.

    Parameters
    ----------
    data : pandas.DataFrame
        Reads the columns ``type``, ``created``, ``latitude``, ``longitude``
        and ``data_source_comments``. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``created``, ``LATITUDE``, ``LONGITUDE``,
        ``data_source_comments`` and ``DATE`` (first 10 characters of
        ``created``).
    """
    keep_cols = ["created", "latitude", "longitude", "data_source_comments"]

    # Private copy: callers pass a column slice, and the helper columns added
    # below should not leak into (or warn about) the caller's frame.
    data = data.copy()

    # Normalize the type label: lower-case, then strip apostrophes,
    # backslashes, colons and the " sighting" suffix. regex=False keeps every
    # pattern literal regardless of the installed pandas default.
    pre_type = data["type"].astype(str).str.lower()
    for token in ("'", "\\", ":", " sighting"):
        pre_type = pre_type.str.replace(token, "", regex=False)
    data["pre_type"] = pre_type

    ################################################################################
    # Standardize Type Column
    data["std_type"] = pre_type.map(species_mapping)

    # NaN std_type means the label is missing from species_mapping; report it.
    unmatched = data.loc[data["std_type"].isna(), "type"].unique()
    if len(unmatched) > 0:
        print("Check Keywords Lookup to ensure all are caught")
        print("Un caught")
        # print() instead of IPython's display() so this also runs as a script.
        print(list(unmatched))

    ################################################################################
    # Rows already typed as killer whale
    data_tagged_likely_orca = data.loc[data["std_type"] == "Killer Whale", keep_cols]

    # Rows with an unspecified type: look for a species in the comments
    data_untag = data[data["std_type"] == "Unspecified"].copy()

    # Remove newlines, source names and filler words before keyword matching,
    # so that e.g. "Orca Network" is not read as an orca sighting. The order is
    # the same as the original chain of replacements.
    comments = data_untag["data_source_comments"].fillna("Unspecified")
    for token in (
        "\n",
        "Orca Network",
        "Orcq Network",
        "http://www.orca",
        "Orcasound",
        "[",
        "]",
        " at",
        " the",
        " of",
        "viewed",
        "Not orca",
    ):
        comments = comments.str.replace(token, "", regex=False)
    data_untag["std_data_source_comments"] = comments

    # Extract from notes
    data_untag["extracted_from_notes"] = comments.apply(extract_species_from_notes)

    # Explode Multiple Species
    data_untag = data_untag.explode("extracted_from_notes")

    data_untag_likely_orca = data_untag.loc[
        data_untag["extracted_from_notes"] == "Killer Whale", keep_cols
    ]

    data_likely_orca = pd.concat([data_tagged_likely_orca, data_untag_likely_orca])
    data_likely_orca["DATE"] = data_likely_orca["created"].str[0:10]
    data_likely_orca = data_likely_orca.rename(
        columns={"latitude": "LATITUDE", "longitude": "LONGITUDE"}
    )

    return data_likely_orca


# build Pod Tagging
def build_pod_tag_list(j, k, l, t):
    """Translate four 0/1 pod flags into a list of pod letters.

    Returns the letters (in J, K, L, T order) whose flag equals 1, or
    ``[None]`` when no flag is set.
    """
    flagged = (("J", j), ("K", k), ("L", l), ("T", t))
    letters = [letter for letter, flag in flagged if flag == 1]
    return letters or [None]


def _keys_with_observed_ids(keys, letter, text):
    """Return the sorted unique ``keys`` plus every ``<letter><digits>`` ID found in ``text``."""
    ids = re.findall(rf"\b{letter}\s*\d+[A-Za-z]?\b", text, flags=re.IGNORECASE)
    return np.unique(keys + ids)


# Define Pod Keys
def define_keys(data):
    """Build the substring keys used to tag comments with pods.

    The hand-written keys of each pod are supplemented with every individual
    ID (e.g. "J27", "k 12", "T65A") that occurs in the comments themselves.

    Parameters
    ----------
    data : pandas.DataFrame
        Must have a ``data_source_comments`` column.

    Returns
    -------
    tuple
        ``(jpod_keys, kpod_keys, lpod_keys, biggs_keys, srkw_keys)``. The
        first four are sorted, de-duplicated numpy arrays of strings;
        ``srkw_keys`` is a list of the generic Southern Resident keys followed
        by the J, K and L keys.
    """
    long_string = ",".join(data.data_source_comments.astype(str))

    # Keys naming several pods at once are shared by every pod they mention.
    jk_keys = [
        "j+k", "k+j", "j & k", "k & j", "j and k", "k and j", "jk pods", "kj pods",
        "J/K", "j/k",
        "J+K", "K+J", "J & K", "K & J", "J and K", "K and J", "JK pods", "KJ pods",
    ]
    jl_keys = [
        "j+l", "l+j", "j & l", "l & j", "j and l", "l and j", "jl pods", "lj pods",
        "J+L", "L+J", "J & L", "L & J", "J and L", "L and J", "JL pods", "LJ pods",
    ]
    kl_keys = [
        "k+l", "l+k", "k & l", "l & k", "k and l", "l and k", "lk pods", "kl pods",
        "K+L", "L+K", "K & L", "L & K", "K and L", "L and K", "LK pods", "KL pods",
    ]
    jkl_keys = [
        "j, k, l pod", "j, k, and l pod", "jkl",
        "J, K, L pod", "J, K, and L pod", "JKL",
    ]

    jpod_keys = [
        "J pod", "Jpod", "J ppd", "J Pod", "j Pod", "J-pod", "Js",
        "j pod", "jpod", "j ppd", "j-pod",
        "j27", "j38", "j35", "j40", "J27", "J38", "J35", "J40",
        "J,", "j,",
    ] + jk_keys + jl_keys + jkl_keys
    jpod_keys = _keys_with_observed_ids(jpod_keys, "j", long_string)

    kpod_keys = [
        "K pod", "K Pod", "k pod", "Kpod", "K,", "k,", "K-pod", "Ks",
        "kpod", "k-pod",
        "k37", "K37",
    ] + jk_keys + kl_keys + jkl_keys
    kpod_keys = _keys_with_observed_ids(kpod_keys, "k", long_string)

    # NOTE(review): unlike J and K there are no lower-case "l pod"/"lpod" keys;
    # a lower-case "l pod" would also match "small pod", so the omission is
    # presumably deliberate - confirm.
    lpod_keys = [
        "L pod", "Lpod", "L-pod", "L,", "l,", "Ls",
        "l12", "l54", "l-12", "l82", "l85", "l87",
        "L12", "L54", "L-12", "L82", "L85", "L87", "L4",
    ] + jl_keys + kl_keys + jkl_keys
    lpod_keys = _keys_with_observed_ids(lpod_keys, "l", long_string)

    # "t046B"/"t137" and "T75"/"T68" used to be fused into the single keys
    # "t046Bt137" and "T75T68" by a missing comma; they are separate keys now.
    biggs_keys = [
        "Bigg", "bigg", "T Party", "sea lion", "Sea Lion", "seal",
        "Transient", "transient", "Ts",
        "t99", "65", "Hunting a seal close to shore at Alki", "65A", "65a",
        "T046B", "t046B", "t137", "t46", "hunting Harbor porpoises",
        "t10", "t2c", "t49",
        "T99", "T137", "T36", "T10", "T2C", "T49", "T65A", "T124", "T35",
        "T46", "T087", "T87", "T60", "T75", "T68", "T65", "T77", "T19",
        "T18", "T34", "T90", "T123", "T11", "T37",
    ]
    biggs_keys = _keys_with_observed_ids(biggs_keys, "t", long_string)

    srkw_keys = (
        [
            "SRKW",
            "srkw",
            "srs",
            "srk",
            "salmon",
            "Southern Residents",
            "southern resident",
            "Southern Resident Killer Whales",
            "Southern Resident",
            " resident",
            " Resident",
        ]
        + list(jpod_keys)
        + list(kpod_keys)
        + list(lpod_keys)
    )

    return jpod_keys, kpod_keys, lpod_keys, biggs_keys, srkw_keys


# Assign Pod-Type
def assign_pod_type_bool(data, jpod_keys, kpod_keys, lpod_keys, biggs_keys, srkw_keys):
    """Flag each comment with the pods it mentions and explode to one row per pod.

    Adds the 0/1 columns ``J``, ``K``, ``L``, ``T`` and ``SRKW`` to ``data``
    (1 when any key of that group is a substring of the comment), then a
    ``POD_TAG`` list column built by ``build_pod_tag_list``. Returns the frame
    exploded on ``POD_TAG``.
    """
    comments = data["data_source_comments"]

    def _flags(keys):
        # 1 when at least one key occurs in the stringified comment, else 0
        def _mentions_any(comment):
            text = str(comment)
            return 1 if any(key for key in keys if key in text) else 0

        return comments.apply(_mentions_any)

    key_groups = (
        ("J", jpod_keys),
        ("K", kpod_keys),
        ("L", lpod_keys),
        ("T", biggs_keys),
        ("SRKW", srkw_keys),
    )
    for column, keys in key_groups:
        data[column] = _flags(keys)

    def _tags_for(row):
        return build_pod_tag_list(j=row["J"], k=row["K"], l=row["L"], t=row["T"])

    data["POD_TAG"] = data.apply(_tags_for, axis=1)

    return data.explode("POD_TAG")


# Preprocess Acartia Data
def preprocess_acartia_data(data, min_date="2020-01-01"):
    """Turn a raw Acartia export into per-location, per-day pod counts.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw Acartia rows; after stripping whitespace from the headers the
        columns ``created``, ``latitude``, ``longitude``, ``type`` and
        ``data_source_comments`` must exist. The frame is not modified.
    min_date : str or None, default "2020-01-01"
        Rows whose ``DATE`` (an ISO ``YYYY-MM-DD`` string) sorts before this
        value are dropped. ``None`` disables the filter.

    Returns
    -------
    pandas.DataFrame
        Columns ``DATE``, ``LATITUDE``, ``LONGITUDE``, ``POD_TYPE``,
        ``POD_TAG`` and ``COUNT``.
    """
    wanted = ["created", "latitude", "longitude", "type", "data_source_comments"]

    # Strip padded headers on a renamed frame instead of assigning to
    # ``data.columns``, which would alter the caller's DataFrame.
    data = data.rename(columns=lambda c: c.strip() if isinstance(c, str) else c)
    data = data[wanted].copy()

    # Identify Likely Orcas
    data = collect_likely_orca_sightings(data)

    # Build Pod-Keys
    jpod_keys, kpod_keys, lpod_keys, biggs_keys, srkw_keys = define_keys(data)

    data_passed = assign_pod_type_bool(
        data, jpod_keys, kpod_keys, lpod_keys, biggs_keys, srkw_keys
    )

    # Sightings without any recognised pod are tagged "O" (other).
    data_passed["POD_TAG"] = data_passed["POD_TAG"].fillna("O")
    data_passed["POD_TYPE"] = data_passed["POD_TAG"].map(pod_lookup)

    # Any Southern Resident keyword in the comment forces POD_TYPE to "SRKW".
    # NOTE(review): this also overrides rows tagged "T" when the same comment
    # mentions residents - confirm that is intended.
    data_passed["POD_TYPE"] = np.where(
        data_passed["SRKW"] == 1, "SRKW", data_passed["POD_TYPE"]
    )

    data_passed["COUNT"] = 1

    data = data_passed.groupby(
        ["DATE", "LATITUDE", "LONGITUDE", "POD_TYPE", "POD_TAG"], as_index=False
    )["COUNT"].sum()

    if min_date is not None:
        data = data[data.DATE >= min_date]

    return data


# Preprocess Sightings Data - Killer Whales (TODO: need to account for northern resident in Acartia sightings.)
def preprocess_kw_sightings_data(directory, source):
    """Load sightings CSV data and run the source-specific preprocessing.

    Parameters
    ----------
    directory : str
        Path to a single CSV file, or to a folder whose ``*.csv`` files are
        read and concatenated.
    source : str
        ``"TWM"`` or ``"ACARTIA"`` selects the matching preprocessing and
        adds a ``SOURCE`` column. Any other value returns the loaded data
        unprocessed.

    Returns
    -------
    pandas.DataFrame or None
        The (pre)processed data, or ``None`` when ``directory`` does not exist.
    """
    if not os.path.exists(directory):
        # Same result as before (None), but no longer silent.
        print(f"Sightings path not found, skipping: {directory}")
        return None

    # Decide by what the path actually is, not by whether ".csv" occurs
    # somewhere in it (a folder such as "export.csv_parts" is still a folder).
    if os.path.isfile(directory):
        data = pd.read_csv(directory)
    else:
        # Sorted so the concatenation order does not depend on the filesystem.
        frames = [
            pd.read_csv(path) for path in sorted(glob.glob(f"{directory}/*.csv"))
        ]
        data = pd.concat(frames) if frames else pd.DataFrame()

    if source == "TWM":
        # Preprocess TWM Data
        data = preprocess_twm_data(data)
        data["SOURCE"] = "TWM"

    elif source == "ACARTIA":
        # Preprocess Acartia Data
        data = preprocess_acartia_data(data)
        data["SOURCE"] = "ACARTIA"

    return data


#                                         #
# --------------------------------------- #

# --------------------------------------- #
#              DATA LOOKUPS               #

# Pod-Community Lookup
# Maps a single-letter pod tag (the POD_TAG column) to its POD_TYPE label.
# "O" is the tag the preprocessing functions assign when no pod was identified.
pod_lookup = {
    "A": "NRKW",
    "G": "NRKW",
    "R": "NRKW",
    "J": "SRKW",
    "K": "SRKW",
    "L": "SRKW",
    "T": "TRANSIENT",
    "O": "OTHER",
}


# Define mapping for standardizing species names
# Keys are compared against the normalized `type` label produced in
# collect_likely_orca_sightings (lower-cased, with apostrophes, backslashes,
# colons and " sighting" removed); values are the standardized species names.
# Labels missing from this mapping are reported by that function.
# NOTE(review): because ":" and " sighting" are stripped before the lookup,
# the keys ending in " sighting:" look unreachable from that call site -
# confirm whether another caller still needs them.
species_mapping = {
    "humpback": "Humpback Whale",
    "ballena jorobada": "Humpback Whale",
    "humpback whale": "Humpback Whale",
    "humpback sighting:": "Humpback Whale",
    "orca": "Killer Whale",
    "killer whale": "Killer Whale",
    "southern resident orca": "Killer Whale",
    "southern resident killer whale": "Killer Whale",
    "killer whale (orca)": "Killer Whale",
    "orca (ballena asesina)": "Killer Whale",
    "killer whale sighting:": "Killer Whale",
    "orca sighting:": "Killer Whale",
    "southern resident killer whale sighting:": "Killer Whale",
    "killer whale (orca) sighting:": "Killer Whale",
    "gray": "Gray Whale",
    "grey": "Gray Whale",
    "gray whale": "Gray Whale",
    "grey whale": "Gray Whale",
    "gray whale sighting:": "Gray Whale",
    "blue whale": "Blue Whale",
    "ballena azul": "Blue Whale",
    "blue whale sighting:": "Blue Whale",
    "fin whale": "Fin Whale",
    "finback whale": "Fin Whale",
    "fin whale sighting:": "Fin Whale",
    "minke whale": "Minke Whale",
    "petit rorqual": "Minke Whale",
    "minke whale sighting:": "Minke Whale",
    "harbor porpoise": "Harbor Porpoise",
    "marsouin commun": "Harbor Porpoise",
    "dalls porpoise": "Dall's Porpoise",
    "pacific white-sided dolphin": "Pacific White-sided Dolphin",
    "pacific white-sided dolphin sighting:": "Pacific White-sided Dolphin",
    "common dolphin": "Common Dolphin",
    "common dolphin - unidentified": "Common Dolphin",
    "common dolphin sighting:": "Common Dolphin",
    "common long-beaked dolphin": "Common Dolphin",
    "long-beaked common dolphin": "Common Dolphin",
    "common short-beaked dolphin": "Common Dolphin",
    "right whale": "Right Whale",
    "right whale sighting:": "Right Whale",
    "black right whale": "Right Whale",
    "northern right whale dolphin": "Right Whale",
    "northern right whale dolphin sighting:": "Right Whale",
    "sperm whale": "Sperm Whale",
    "sperm whale sighting:": "Sperm Whale",
    "white-beaked dolphin": "White-beaked Dolphin",
    "striped dolphin": "Striped Dolphin",
    "rissos dolphin": "Risso's Dolphin",
    "rissos dolphin sighting:": "Risso's Dolphin",
    "sei whale": "Sei Whale",
    "steller sealion": "Steller Sea Lion",
    "short finned pilot whale": "Short-finned Pilot Whale",
    "short finned pilot whale sighting:": "Short-finned Pilot Whale",
    "bottlenose dolphin": "Bottlenose Dolphin",
    "bottlenose whale": "Bottlenose Dolphin",
    "beluga": "Beluga Whale",
    "beluga whale": "Beluga Whale",
    "beluga whale sighting:": "Beluga Whale",
    "sowerbys beaked whale": "Sowerby's Beaked Whale",
    "atlantic white-sided dolphin": "Atlantic White-sided Dolphin",
    "atlantic white-sided dolphin sighting:": "Atlantic White-sided Dolphin",
    "bairds beaked whale": "Baird's Beaked Whale",
    "mola mola / sunfish": "Mola Mola",
    "blue shark": "Blue Shark",
    # Everything below is treated as "type not given"; for these rows the
    # species is searched for in the free-text comments instead.
    "whale - unidentified": "Unspecified",
    "unspecified": "Unspecified",
    "unspecified sighting:": "Unspecified",
    "other": "Unspecified",
    "other (specify in comments)": "Unspecified",
    "other sighting:": "Unspecified",
    "other (specify in comments) sighting:": "Unspecified",
    "other species": "Unspecified",
    "non spécifié": "Unspecified",
    # Mis-decoded spelling of the key above, kept so such labels still match
    "non spã©cifiã©": "Unspecified",
    "autre": "Unspecified",
    "": "Unspecified",
    "nan": "Unspecified",
    np.nan: "Unspecified",
}

#                                         #
# --------------------------------------- #

## Preprocess Whale Sightings Dataset from TWM and Acartia

In [None]:
# Preprocess TWM Data
tmw_directory = "/Users/tylerstevenson/Documents/CODE/orcasalmon/data/twm"
twm_data = preprocess_kw_sightings_data(directory=tmw_directory, source="TWM")

# Preprocess Acartia Data
acartia_directory = "/Users/tylerstevenson/Documents/CODE/FindMyWhale/data/raw/sightings/acartia-export.csv"
acartia_data = preprocess_kw_sightings_data(
    directory=acartia_directory, source="ACARTIA"
)

# Sightings Data
# Stack both sources; each already carries its own SOURCE column.
sightings_data = pd.concat([twm_data, acartia_data])

# Post-Process Sightings Data to Make Ready for Analysis
# Remove Non-Located Sightings
# (rows whose coordinates hold the literal placeholder string "<Null>")
sightings_data = sightings_data[
    (sightings_data.LATITUDE != "<Null>") & (sightings_data.LONGITUDE != "<Null>")
]

# D-Type Handling
sightings_data["LATITUDE"] = sightings_data["LATITUDE"].astype(float)
sightings_data["LONGITUDE"] = sightings_data["LONGITUDE"].astype(float)
sightings_data["DATETIME"] = pd.to_datetime(sightings_data["DATE"])

# Make Date Type
sightings_data["DATE"] = sightings_data["DATETIME"].dt.date
sightings_data["DOY"] = sightings_data["DATETIME"].dt.day_of_year
# WOY is the ISO week number (weeks start on Monday) ...
sightings_data["WOY"] = sightings_data["DATETIME"].dt.isocalendar().week
sightings_data["MONTH"] = sightings_data["DATETIME"].dt.month
sightings_data["YEAR"] = sightings_data["DATETIME"].dt.year
# ... while YEAR_WEEK uses %U, a week number based on Sunday as the first day,
# so the two week fields can disagree for the same date.
sightings_data["YEAR_WEEK"] = sightings_data["DATETIME"].dt.strftime("%Y-%U")
# First day of the sighting's month, as a "YYYY-MM-01" string
sightings_data["YEAR_MONTH"] = sightings_data["DATETIME"].dt.strftime("%Y-%m-01")

# Calculate the weekday number (Monday=0, Sunday=6)
weekday_num = sightings_data["DATETIME"].dt.weekday

# Calculate the date of the most recent Sunday (the date itself when it is a
# Sunday) by subtracting (weekday_num + 1) % 7 days
sightings_data["STAT_WEEK_SUNDAY"] = sightings_data["DATETIME"] - pd.to_timedelta(
    (weekday_num + 1) % 7, unit="D"
)

In [None]:
# Export to File
sightings_preprocessed_data_dir = "../data/processed/ORCA_SIGHTINGS"
# exist_ok=True creates the folder only when it is missing, without a separate
# existence check that could race with another process creating it.
os.makedirs(sightings_preprocessed_data_dir, exist_ok=True)

# Output Path
output_path = f"{sightings_preprocessed_data_dir}/ORCA_SIGHTINGS.parquet"

# Export to Path
sightings_data.to_parquet(output_path)

In [None]:
# Show the processed sightings table as the cell output
sightings_data

In [None]:
# Total sighting count per calendar day and pod type
daily_counts = (
    sightings_data
    .groupby(by=["DATE", "POD_TYPE"], as_index=False)["COUNT"]
    .agg("sum")
)

In [None]:
# fig = px.line(daily_counts, x="DATE", y="COUNT", color="POD_TYPE")
# fig.show()

## Analysis of Sightings Data

In [None]:
# Area of interest: a circular buffer around one selected point.
# NOTE(review): presumably this point is the Columbia River mouth mentioned in
# the notebook introduction - confirm the coordinates.
selected_latitude = 46.2167
selected_longitude = -123.9333
selected_buffer_distance = 100  # kilometres; multiplied by 1000 below

import geopandas as gpd

# Step 1: Make GeoDataFrame in EPSG:4326
area_filter = gpd.GeoDataFrame(
    geometry=gpd.points_from_xy([selected_longitude], [selected_latitude]),
    crs="EPSG:4326",
)
# Step 2: Reproject to EPSG:32610 (UTM zone 10N, metre units) so the buffer
# radius can be given in meters
area_filter = area_filter.to_crs(epsg=32610)

# Step 3: Buffer in meters (buffer_km * 1000)
area_filter["geometry"] = area_filter.geometry.buffer(selected_buffer_distance * 1000)

# Step 4: Back to EPSG:4326 (lat/lon degrees)
area_filter = area_filter.to_crs(epsg=4326)

# Keep an independent copy of the buffered area under a second name
selected_area = area_filter.copy()
# plot_data = plot_data.sjoin(area_filter)

In [None]:
# Interactive map of the buffered area, shown as the cell output
area_filter.explore()

In [None]:
# Convert to Geospatial DataSet
# One point geometry per sighting, built from its LONGITUDE (x) / LATITUDE (y)
orca_sightings_df = gpd.GeoDataFrame(
    sightings_data,
    geometry=gpd.points_from_xy(sightings_data.LONGITUDE, sightings_data.LATITUDE),
    crs="EPSG:4326",  # WGS84 lat/lon
)

In [None]:
# Spatial join against the buffered area. With sjoin's defaults (inner join,
# "intersects" predicate) only sightings inside the area are kept.
orca_sightings_df_filtered = orca_sightings_df.sjoin(area_filter)

In [None]:
# Sightings inside the area of interest, summed per week-of-year and pod type
tot_columbia_sightings = (
    orca_sightings_df_filtered
    .groupby(["WOY", "POD_TYPE"], as_index=False)["COUNT"]
    .sum()
)

# tot_columbia_sightings = tot_columbia_sightings[
#     tot_columbia_sightings.POD_TYPE == "SRKW"
# ]

# Scaffold with one row per week 1-52 for each plotted pod type, so weeks
# without sightings still appear (as zero) after the outer merge below.
df_srkw = pd.DataFrame({"WOY": list(range(1, 53))}).assign(POD_TYPE="SRKW")
df_tran = pd.DataFrame({"WOY": list(range(1, 53))}).assign(POD_TYPE="TRANSIENT")
df_othe = pd.DataFrame({"WOY": list(range(1, 53))}).assign(POD_TYPE="OTHER")

df = pd.concat([df_srkw, df_tran, df_othe])

tot_columbia_sightings = tot_columbia_sightings.merge(df, how="outer")
tot_columbia_sightings["COUNT"] = tot_columbia_sightings["COUNT"].fillna(0)

In [None]:
# Line chart of weekly sighting counts, one line per pod type
fig = px.line(tot_columbia_sightings, x="WOY", y="COUNT", color="POD_TYPE")
fig.show()

In [None]:
import plotly.graph_objects as go
import numpy as np

In [None]:
# read in bonneville data
bon_raw = pd.read_parquet(
    "../data/processed/FPC_DAM_COUNTS/Bonneville_FPC_DAM_COUNTS.parquet"
)

# Negative counts are replaced by zero before averaging
bon_raw["Count"] = np.where(bon_raw["Count"] < 0, 0, bon_raw["Count"])

# Mean count per week-of-year and species, with the week column renamed to
# match the sightings tables
bon = (
    bon_raw
    .groupby(["WoY", "Species"], as_index=False)["Count"]
    .mean()
    .rename(columns={"WoY": "WOY"})
)

In [None]:
def plot_area_plot_stacked(
    bon_raw, metric, metric_lower, metric_upper, dam, species="ChinookAdult"
):
    """Plot one filled area per year of ``species`` counts against ``metric``.

    Despite the name, the areas are overlaid (each filled down to y=0), not
    stacked; years with the largest total are drawn first so they sit behind.

    Parameters
    ----------
    bon_raw : pandas.DataFrame
        Dam counts with the columns ``Species``, ``Year``, ``Count`` and the
        column named by ``metric``.
    metric : str
        Column used for the x axis (e.g. ``"WoY"``).
    metric_lower, metric_upper
        Inclusive bounds applied to ``metric``.
    dam : str
        Dam name, used in the title only.
    species : str, default "ChinookAdult"
        Value of ``Species`` to plot.

    Returns
    -------
    tuple
        ``(fig, table)`` - the plotly figure and the per-(metric, Year) count
        table it was drawn from.
    """
    in_window = (bon_raw[metric] >= metric_lower) & (bon_raw[metric] <= metric_upper)
    # .copy() so the Year cast below writes to our own frame rather than to a
    # filtered slice of the caller's data.
    bon_woy_chinook = bon_raw[in_window & (bon_raw.Species == species)].copy()
    bon_woy_chinook["Year"] = bon_woy_chinook["Year"].astype(int)
    bon_woy_chinook = bon_woy_chinook.sort_values(metric).reset_index(drop=True)

    bon_woy_chinook = bon_woy_chinook.groupby([metric, "Year"], as_index=False)[
        "Count"
    ].sum()

    # Colour gradient over the years: the earliest year is red (255, 0, 0) and
    # the latest is blue (0, 0, 255), interpolated linearly in between.
    years = sorted(bon_woy_chinook["Year"].unique())
    n_years = len(years)
    # With a single year there is nothing to interpolate; a span of 1 avoids
    # the division by zero the previous formula hit in that case.
    span = max(n_years - 1, 1)
    colors = [
        f"rgb({int(255 * (1 - i / span))}, 0, {int(255 * i / span)})"
        for i in range(n_years)
    ]
    year_color_map = dict(zip(years, colors))

    # Calculate total count per year and sort years by total count descending (largest first)
    total_counts = (
        bon_woy_chinook.groupby("Year")["Count"].sum().sort_values(ascending=False)
    )
    sorted_years = total_counts.index.tolist()

    # Create the filled area plot with largest areas in the back
    fig = go.Figure()

    for year in sorted_years:
        # Filter data for the current year
        df_year = bon_woy_chinook[bon_woy_chinook["Year"] == year].sort_values(metric)
        fig.add_trace(
            go.Scatter(
                x=df_year[metric],
                y=df_year["Count"],
                mode="lines",
                name=str(year),
                fill="tozeroy",  # Fill area to y=0 (non-stacked)
                line=dict(color=year_color_map[year], width=2),
                fillcolor=year_color_map[year],  # Fill with same color as line
                # hovertemplate=f"{metric}: %{x}<br>Count: %{y}<br>Year: %{text}",
                text=[str(year)] * len(df_year),
            )
        )

    # Update layout
    fig.update_layout(
        title=f"{species} Counts by {metric}, Colored by Year - {dam}",
        xaxis_title=metric,
        yaxis_title="Count",
        legend_title="Year",
        plot_bgcolor="black",  # Black background
        paper_bgcolor="black",  # Black surrounding area
        font=dict(color="white"),  # White text for labels
    )

    return fig, bon_woy_chinook


# Get Area Plot
# Adult Chinook counts at Bonneville for weeks 0-53; `bon_` receives the
# per-(WoY, Year) count table returned alongside the figure.
fig, bon_ = plot_area_plot_stacked(
    bon_raw, metric="WoY", metric_lower=0, metric_upper=53, dam="Bonneville", species = 'ChinookAdult'
)
fig.show()

In [None]:
# Week-of-year (rows) x Year (columns) table of counts.
# pivot_table aggregates with its default (mean); bon_ has one row per
# (WoY, Year), so each cell is simply that row's count.
box_way_pivot = pd.pivot_table(bon_, index="WoY", columns="Year", values="Count")

# Heatmap of the table
fig = px.imshow(box_way_pivot, color_continuous_scale="turbo")
fig.show()

In [None]:
# Overlay weekly sighting presence on the dam-count heatmap.
# Every week with at least one sighting gets a fixed marker value per pod type
# instead of its real count.
# NOTE(review): the values 250000 / 150000 / 50000 look chosen so the markers
# stand out on the colour scale of the dam counts - confirm.
tmp = tot_columbia_sightings.copy()
tmp["COUNT"] = np.where(
    (tmp["COUNT"] > 0) & (tmp["POD_TYPE"] == "OTHER"), 250000, tmp["COUNT"]
)
tmp["COUNT"] = np.where(
    (tmp["COUNT"] > 0) & (tmp["POD_TYPE"] == "SRKW"), 150000, tmp["COUNT"]
)
tmp["COUNT"] = np.where(
    (tmp["COUNT"] > 0) & (tmp["POD_TYPE"] == "TRANSIENT"), 50000, tmp["COUNT"]
)
# Pod types as rows, weeks as columns. Missing values are first filled with
# the placeholder "nn" so the row labelled "nn" can be removed afterwards.
tmp = pd.pivot_table(tmp.fillna("nn"), columns="WOY", values="COUNT", index="POD_TYPE")
tmp = tmp[tmp.index != "nn"].fillna(0)


# Transpose to one row per week and join onto the dam-count pivot; the merge
# uses the columns both frames share.
# OTHER and SRKW are renamed to 2026 and 2027, so they appear as extra "year"
# columns next to the real years.
# NOTE(review): TRANSIENT is not renamed and keeps its string label - confirm
# whether it should get a pseudo-year too.
tmp = pd.merge(
    box_way_pivot.reset_index(),
    tmp.T.reset_index().rename(columns={"WOY": "WoY", "OTHER": 2026, "SRKW": 2027}),
)

# Heatmap of the combined table (cell output)
px.imshow(tmp, color_continuous_scale="turbo")
#

## Tagging Data

In [None]:
import requests
import time
from shapely.geometry import Point


# https://www.fisheries.noaa.gov/inport/item/18090
def fetch_srkw_coastal_page(offset=0, rows=100, timeout=30):
    """Fetch a single page of SRKW coastal occurrence data.

    Parameters
    ----------
    offset : int, default 0
        Value sent as the ``offset`` query parameter.
    rows : int, default 100
        Value sent as the ``rows`` query parameter.
    timeout : float or tuple, default 30
        Seconds to wait for the server, passed to ``requests.get``. Without
        it a stalled connection would block the notebook indefinitely.

    Returns
    -------
    list
        The ``"items"`` entry of the JSON response, or ``[]`` when absent.

    Raises
    ------
    requests.HTTPError
        When the server answers with an error status.
    """
    url = "https://www.webapps.nwfsc.noaa.gov/apex/parr/srkw_occurrence_coastal/data/page/"
    params = {"offset": offset, "rows": rows}
    headers = {"User-Agent": "orca-fetcher-9000"}

    response = requests.get(url, params=params, headers=headers, timeout=timeout)
    response.raise_for_status()
    return response.json().get("items", [])


def fetch_all_srkw_coastal(
    max_pages=500, rows_per_page=100, sleep_sec=0.5, max_consecutive_errors=5
):
    """Fetch all available pages of SRKW data, auto-stop when empty.

    Parameters
    ----------
    max_pages : int, default 500
        Upper bound on the number of pages requested.
    rows_per_page : int, default 100
        Page size; page ``i`` is requested with offset ``i * rows_per_page``.
    sleep_sec : float, default 0.5
        Pause after every request, successful or not.
    max_consecutive_errors : int or None, default 5
        Stop after this many failed pages in a row, so an outage does not
        trigger a request for every remaining page. ``None`` keeps trying
        until ``max_pages`` is reached.

    Returns
    -------
    pandas.DataFrame
        One row per record collected from the pages that succeeded.
    """
    all_records = []
    consecutive_errors = 0

    for i in range(max_pages):
        offset = i * rows_per_page
        # print(f"Fetching page {i} (offset={offset})...")
        try:
            page = fetch_srkw_coastal_page(offset=offset, rows=rows_per_page)
        except Exception as e:
            # Best effort: report the failed page and move on to the next one.
            print(f"Page {i} errored: {e}")
            consecutive_errors += 1
            if (
                max_consecutive_errors is not None
                and consecutive_errors >= max_consecutive_errors
            ):
                print(f"{consecutive_errors} pages failed in a row - halting.")
                break
            time.sleep(sleep_sec)  # pause after failures too
            continue

        consecutive_errors = 0
        if not page:
            print("No more records — halting.")
            break
        all_records.extend(page)
        time.sleep(sleep_sec)  # be kind to their server

    return pd.DataFrame(all_records)


def to_geodataframe(df, lon_field="lon_p", lat_field="lat_p"):
    """Convert a DataFrame with lon/lat columns to a GeoDataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table; it is copied, not modified.
    lon_field, lat_field : str
        Names of the longitude (x) and latitude (y) columns.

    Returns
    -------
    geopandas.GeoDataFrame
        Copy of ``df`` with a point ``geometry`` column in EPSG:4326.
    """
    # points_from_xy builds all points in one call, the same way the sightings
    # GeoDataFrame is built elsewhere in this notebook, instead of creating
    # one Point per row through DataFrame.apply.
    return gpd.GeoDataFrame(
        df.copy(),
        geometry=gpd.points_from_xy(df[lon_field], df[lat_field]),
        crs="EPSG:4326",
    )


# 🐋 Run the full download
# Uses the function defaults for page limit, page size and pause length.
tagging_df = fetch_all_srkw_coastal()

In [None]:
# Point geometry per tag record, from the default lon_p / lat_p columns
tagging_gdf = to_geodataframe(tagging_df)

In [None]:
# Tag records that fall inside the buffered area of interest
tagging_gdf_in_aoi = tagging_gdf.sjoin(area_filter)
tagging_gdf_in_aoi["NEAR_MOUTH"] = "NEAR - MOUTH"

# Outer-merge the flagged subset back onto the full table (on the columns both
# frames share); records outside the area have no NEAR_MOUTH and become "AWAY".
# NOTE(review): the merge keys presumably include the geometry column, and the
# sjoin result carries an extra index column - confirm the merge neither
# duplicates rows nor fails on geometry keys.
tagging_gdf = pd.merge(tagging_gdf, tagging_gdf_in_aoi, how="outer")
tagging_gdf["NEAR_MOUTH"] = tagging_gdf["NEAR_MOUTH"].fillna("AWAY")

# Calendar fields derived from gmt_date
tagging_gdf["DATE"] = tagging_gdf["gmt_date"]
tagging_gdf["DATETIME"] = pd.to_datetime(tagging_gdf["DATE"])
tagging_gdf["YEAR"] = tagging_gdf["DATETIME"].dt.year
tagging_gdf["DOY"] = tagging_gdf["DATETIME"].dt.day_of_year
tagging_gdf["WOY"] = tagging_gdf["DATETIME"].dt.isocalendar().week

In [None]:
# One row per (week-of-year, near/away) combination. drop_duplicates keeps the
# first occurrence, so DOY and YEAR come from that first record only.
tags = tagging_gdf[["WOY", "DOY", "YEAR", "NEAR_MOUTH"]].drop_duplicates(
    subset=["WOY", "NEAR_MOUTH"]
)
# Keep only near-mouth weeks with a day-of-year strictly between 50 and 200
tags = tags[(tags.DOY > 50) & (tags.DOY < 200) & (tags.NEAR_MOUTH == "NEAR - MOUTH")]

## Dam Analysis vs. Sightings

In [None]:
import plotly.graph_objects as go
import pandas as pd
import numpy as np


def get_species_table(
    dam_data_raw, dam_name, date_metric, min_date_metric, max_date_metric
):
    """Run ``build_dam_peak_analysis`` for every species and stack the results.

    One analysis table is built per unique value of ``dam_data_raw.Species``
    (in order of first appearance); the tables are concatenated row-wise.
    """
    per_species_tables = [
        build_dam_peak_analysis(
            dam_data_raw,
            dam_name,
            species_name,
            date_metric,
            min_date_metric,
            max_date_metric,
        )
        for species_name in dam_data_raw.Species.unique()
    ]

    return pd.concat(per_species_tables)


def run_analysis_on_dams(dam_data, date_metric="WoY"):
    """Summarise, per year, when the counted fish passed the dam.

    Every counted fish contributes one observation of ``date_metric``: the
    date values are repeated ``Count`` times before the statistics are taken.

    Parameters
    ----------
    dam_data : pandas.DataFrame
        Needs the columns ``date_metric``, ``Year``, ``Decade`` and ``Count``.
        ``Count`` is used as a repeat count.
    date_metric : str
        Name of the date column (default ``"WoY"``).

    Returns
    -------
    tuple
        ``(table, long_term_median)`` where ``table`` has one row per year,
        sorted by ``YEAR``, with ``MEDIAN_<metric>``, ``MEAN_<metric>`` and
        ``STD_<metric>`` columns, and ``long_term_median`` is the median of
        the repeated date values over all years together.
    """
    # Collapse duplicate (date, year, decade) rows into a single total.
    totals = dam_data.groupby([date_metric, "Year", "Decade"], as_index=False)[
        "Count"
    ].sum()

    def _one_value_per_fish(frame):
        # Repeat each date value as many times as fish were counted on it.
        return np.repeat(frame[date_metric], frame["Count"])

    long_term_median = np.median(_one_value_per_fish(totals))

    # One (year, median, mean, std) record per year present in the data.
    records = []
    for year_ in totals["Year"].unique():
        observations = _one_value_per_fish(totals[totals["Year"] == year_].copy())
        records.append(
            (
                year_,
                np.median(observations),
                np.mean(observations),
                np.std(observations),
            )
        )

    summary = pd.DataFrame(
        records,
        columns=[
            "YEAR",
            f"MEDIAN_{date_metric}",
            f"MEAN_{date_metric}",
            f"STD_{date_metric}",
        ],
    )
    summary = summary.sort_values("YEAR").reset_index(drop=True)

    return summary, long_term_median


def plot_species_area_plot_at_dam(
    df,
    dam_name,
    orca_sightings_df_filtered,
    selected_buffer_distance,
    tags,
    date_metric,
    adjust_to_estimate_density_at_mouth,
    how_much_to_adjust,
):
    """Plot per-species mean ± std run timing by year with whale observations.

    For every species in ``df`` a shaded band (mean - std .. mean + std) and a
    mean line are drawn against ``YEAR``. Killer-whale sightings and satellite
    tag positions are overlaid as markers.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``Species``, ``YEAR``, ``MEAN_<date_metric>`` and
        ``STD_<date_metric>`` columns.
    dam_name : str
        Used in the figure title.
    orca_sightings_df_filtered : pandas.DataFrame
        Needs ``YEAR``, ``POD_TYPE`` and a ``DOY`` / ``WOY`` (or already
        ``date_metric``-named) column.
    selected_buffer_distance
        Only used in the legend labels ("<...KM").
    tags : pandas.DataFrame
        Needs ``YEAR``, ``NEAR_MOUTH`` and a ``DOY`` / ``WOY`` column.
    date_metric : str
        ``"DoY"`` or ``"WoY"``; selects the statistic columns and y-axis.
    adjust_to_estimate_density_at_mouth : bool
        When truthy, ``how_much_to_adjust`` is subtracted from the mean line
        and from both band edges.
    how_much_to_adjust
        Size of that shift, in units of ``date_metric``.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    mean_col = f"MEAN_{date_metric}"
    std_col = f"STD_{date_metric}"

    # Resolved once, before the species loop, so the title suffix exists even
    # when ``df`` contains no species at all.
    if adjust_to_estimate_density_at_mouth:
        offset = how_much_to_adjust
        adjusted = f"-Adjusted ({how_much_to_adjust} days)"
    else:
        offset = 0
        adjusted = ""

    # The sightings / tag tables use upper-case DOY / WOY column names while
    # the dam statistics use DoY / WoY; align them on ``date_metric``.
    metric_renames = {"DoY": {"DOY": "DoY"}, "WoY": {"WOY": "WoY"}}.get(
        date_metric, {}
    )

    # Get unique species dynamically
    species_list = sorted(df["Species"].unique())
    print(species_list)

    # One hue per species, spaced evenly around the colour wheel
    hues = np.linspace(0, 360, len(species_list), endpoint=False)

    fig = go.Figure()

    for hue, species in zip(hues, species_list):
        df_species = df[df["Species"] == species].sort_values("YEAR")

        mean_color = f"hsl({hue}, 100%, 40%)"  # Darker for MEAN line
        std_color = f"hsl({hue}, 100%, 70%)"  # Lighter for STD lines
        fill_color = (
            f"hsla({hue}, 100%, 70%, 0.3)"  # Even lighter with transparency for fill
        )

        lower_b = df_species[mean_col] - df_species[std_col] - offset
        upper_b = df_species[mean_col] + df_species[std_col] - offset
        val_p = df_species[mean_col] - offset

        # Lower bound trace (MEAN - STD)
        fig.add_trace(
            go.Scatter(
                x=df_species["YEAR"],
                y=lower_b,
                mode="lines",
                line=dict(color=std_color, width=1),
                name=f"{species} - STD",
                legendgroup=species,
                showlegend=False,
            )
        )

        # Upper bound trace; "tonexty" fills down to the trace added just before
        fig.add_trace(
            go.Scatter(
                x=df_species["YEAR"],
                y=upper_b,
                mode="lines",
                line=dict(color=std_color, width=1),
                fill="tonexty",
                fillcolor=fill_color,
                name=f"{species} + STD",
                legendgroup=species,
                showlegend=False,
            )
        )

        # MEAN line trace (the only one of the three shown in the legend)
        fig.add_trace(
            go.Scatter(
                x=df_species["YEAR"],
                y=val_p,
                mode="lines",
                line=dict(color=mean_color, width=2),
                name=species,
                legendgroup=species,
            )
        )

    # Decade marks for the x-axis grid; no ticks when there are no years.
    first_year, last_year = df["YEAR"].min(), df["YEAR"].max()
    if pd.isna(first_year):
        decades = []
    else:
        min_year = first_year // 10 * 10
        max_year = (last_year // 10 + 1) * 10
        decades = list(range(int(min_year), int(max_year) + 1, 10))

    fig.update_layout(
        title=f"Early Spring Run - Average Contributing Days at {dam_name} Dam by Species (Day 75 - 175) {adjusted}",
        xaxis_title="Year",
        yaxis_title=date_metric,
        showlegend=True,
        paper_bgcolor="white",
        plot_bgcolor="white",
        font=dict(color="black"),
        xaxis=dict(
            gridcolor="lightgrey",
            showgrid=True,
            tickmode="array",
            tickvals=decades,
            title_font=dict(color="black"),
            tickfont=dict(color="black"),
        ),
        yaxis=dict(
            gridcolor="lightgrey",
            showgrid=True,
            title_font=dict(color="black"),
            tickfont=dict(color="black"),
        ),
    )

    # Sightings: one marker per distinct (year, pod type, date value).
    sights = orca_sightings_df_filtered.copy().rename(columns=metric_renames)
    sights = sights[["YEAR", "POD_TYPE", date_metric]].drop_duplicates()
    # Fixed cut-off of 180 regardless of metric (a no-op for week numbers).
    sights = sights[sights[date_metric] < 180]
    pod_type_colors = (("SRKW", "blue"), ("OTHER", "green"), ("TRANSIENT", "orange"))
    for pod_type, marker_color in pod_type_colors:
        pod_sights = sights[sights.POD_TYPE == pod_type]
        fig.add_scatter(
            x=pod_sights["YEAR"],
            y=pod_sights[date_metric],
            name=f"Sightings (<{selected_buffer_distance}KM) - {pod_type}",
            mode="markers",
            marker_color=marker_color,
        )

    # Tag positions: one trace per NEAR_MOUTH value, all drawn as small black
    # crosses under a shared legend group.
    for near_mouth in tags["NEAR_MOUTH"].unique():
        tags_subset = tags[tags["NEAR_MOUTH"] == near_mouth].rename(
            columns=metric_renames
        )
        fig.add_trace(
            go.Scatter(
                x=tags_subset["YEAR"],
                y=tags_subset[date_metric],
                mode="markers",
                marker=dict(
                    color="black",
                    symbol="x",
                    size=4,
                ),
                name=f"Tagged (<{selected_buffer_distance}KM) - SRKW",
                legendgroup=f"Tagged (<{selected_buffer_distance}KM) - SRKW",
            )
        )

    return fig


def plot_analysis(
    test_,
    dam_name,
    species_name,
    date_metric,
    long_term_median,
    orca_sightings_df_filtered,
    date_upper_limit,
    selected_buffer_distance=115,
):
    """Plot one species' yearly mean ± std run timing with whale sightings.

    Parameters
    ----------
    test_ : pandas.DataFrame
        Yearly statistics with ``YEAR``, ``MEAN_<date_metric>`` and
        ``STD_<date_metric>`` columns.
    dam_name, species_name : str
        Used in the figure title.
    date_metric : str
        ``"DoY"`` or ``"WoY"``; selects the statistic columns and y-axis.
    long_term_median
        Drawn as a horizontal reference line.
    orca_sightings_df_filtered : pandas.DataFrame
        Needs ``YEAR``, ``POD_TYPE`` and a ``DOY`` / ``WOY`` (or already
        ``date_metric``-named) column.
    date_upper_limit
        Sightings with a date value at or above this limit are dropped.
    selected_buffer_distance
        Distance shown in the sightings legend labels (default 115, the value
        that used to be hard-coded).

    Returns
    -------
    plotly.graph_objects.Figure
    """
    mean_col = f"MEAN_{date_metric}"
    std_col = f"STD_{date_metric}"

    # This initial title is replaced by update_layout() further down.
    fig = px.line(title=f"Analysis of {date_metric} Count at {dam_name} - {species_name}")
    fig.add_scatter(
        y=test_[mean_col],
        x=test_["YEAR"],
        name=f"Mean Revist ({date_metric}) ",
        marker_color="#F4442E",
    )
    fig.add_scatter(
        y=test_[mean_col] + test_[std_col],
        x=test_["YEAR"],
        name="Upper Bounds",
        marker_color="#FFB7AE",
    )
    fig.add_scatter(
        y=test_[mean_col] - test_[std_col],
        x=test_["YEAR"],
        name="Lower Bounds",
        marker_color="#FFB7AE",
    )

    # The sightings table uses upper-case DOY / WOY; align it on date_metric.
    # (Both metrics are handled, matching plot_species_area_plot_at_dam.)
    metric_renames = {"DoY": {"DOY": "DoY"}, "WoY": {"WOY": "WoY"}}.get(
        date_metric, {}
    )
    sights = orca_sightings_df_filtered.copy().rename(columns=metric_renames)
    sights = sights[["YEAR", "POD_TYPE", date_metric]].drop_duplicates()
    sights = sights[sights[date_metric] < date_upper_limit]

    pod_type_colors = (("SRKW", "blue"), ("OTHER", "green"), ("TRANSIENT", "orange"))
    for pod_type, marker_color in pod_type_colors:
        pod_sights = sights[sights.POD_TYPE == pod_type]
        fig.add_scatter(
            x=pod_sights["YEAR"],
            y=pod_sights[date_metric],
            name=f"Sightings (<{selected_buffer_distance}KM) - {pod_type}",
            mode="markers",
            marker_color=marker_color,
        )

    fig.add_hline(y=long_term_median)

    fig.update_layout(
        title=f"{dam_name} Dam - {species_name} Counts by {date_metric} vs. Columbia River Mouth Killer Whale Sightings",
        xaxis_title="Year",
        yaxis_title=date_metric,
        legend_title="Year",
        plot_bgcolor="white",  # white plotting area
        paper_bgcolor="white",  # white surrounding area
        font=dict(color="black"),  # black text for labels
    )
    return fig


def build_dam_peak_analysis(
    dam_data_raw, dam_name, species_name, date_metric, min_date_metric, max_date_metric
):
    """Build the yearly run-timing table for one species at one dam.

    Parameters
    ----------
    dam_data_raw : pandas.DataFrame
        Raw counts with ``Species``, ``Year``, ``Count`` and ``date_metric``
        columns.
    dam_name : str
        Stored in the ``Dam`` column of the result.
    species_name : str
        Species to keep; stored in the ``Species`` column of the result.
    date_metric : str
        Name of the date column (e.g. ``"WoY"``).
    min_date_metric, max_date_metric
        Inclusive bounds on ``date_metric``.

    Returns
    -------
    pandas.DataFrame
        One row per year with the statistics from ``run_analysis_on_dams``
        plus ``Species``, ``Dam``, ``LongTermMedian``, ``DECADE``,
        ``DECADE_AVG`` (decade mean of the yearly means) and ``DECADE_STD``
        (decade mean of the yearly standard deviations).
    """
    in_window = (dam_data_raw[date_metric] >= min_date_metric) & (
        dam_data_raw[date_metric] <= max_date_metric
    )
    # Explicit copy: the columns below are assigned on our own frame rather
    # than on a slice of the caller's data.
    dam_data_preproc = dam_data_raw[
        in_window & (dam_data_raw.Species == species_name)
    ].copy()

    dam_data_preproc["Year"] = dam_data_preproc["Year"].astype(int)
    dam_data_preproc = dam_data_preproc.sort_values(date_metric).reset_index(drop=True)
    # Floor the year to its decade (e.g. 1987 -> 1980).
    dam_data_preproc["Decade"] = dam_data_preproc["Year"] // 10 * 10
    # Drop negative (and missing) counts.
    dam_data_preproc = dam_data_preproc[dam_data_preproc.Count >= 0]

    # Get Analysis Table
    woy_analysis_table, long_term_median = run_analysis_on_dams(
        dam_data_preproc, date_metric=date_metric
    )
    woy_analysis_table["Species"] = species_name
    woy_analysis_table["Dam"] = dam_name
    woy_analysis_table["LongTermMedian"] = long_term_median

    # Attach the decade of every year.
    add_decade = (
        dam_data_preproc[["Year", "Decade"]]
        .drop_duplicates()
        .rename(columns={"Year": "YEAR", "Decade": "DECADE"})
    )
    woy_analysis_table = pd.merge(woy_analysis_table, add_decade, on="YEAR")

    # Decade-level averages of the yearly mean and of the yearly std.
    decade_stats = woy_analysis_table.groupby("DECADE", as_index=False).agg(
        DECADE_AVG=(f"MEAN_{date_metric}", "mean"),
        DECADE_STD=(f"STD_{date_metric}", "mean"),
    )
    woy_analysis_table = pd.merge(woy_analysis_table, decade_stats, on="DECADE")

    return woy_analysis_table

In [None]:
# Read the processed FPC dam-count tables (paths are relative to the notebook).
# Bonneville:
bon_raw = pd.read_parquet(
    "../data/processed/FPC_DAM_COUNTS/Bonneville_FPC_DAM_COUNTS.parquet"
)

# Willamette:
wlm_raw = pd.read_parquet(
    "../data/processed/FPC_DAM_COUNTS/Willamette_FPC_DAM_COUNTS.parquet"
)

In [None]:
# Get Area Plot
# Stacked area plot for adult Chinook over the two dams combined, restricted to
# metric values 10-25 (weeks of year). `plot_area_plot_stacked` is defined
# outside this section; it returns the figure and a second object kept as `bon_`.
fig, bon_ = plot_area_plot_stacked(
    pd.concat([wlm_raw, bon_raw]),
    metric="WoY",
    metric_lower=10,
    metric_upper=25,
    dam="Willamette + Bonneville",
    species = 'ChinookAdult'
)
fig.show()

In [None]:
# Shared settings for the species tables built in the next cell.
date_metric = "WoY"  # date column to analyse (week of year)
min_date_metric = 0  # inclusive lower bound on that column
max_date_metric = 30  # inclusive upper bound on that column

In [None]:
# Get Bonneville Raw Species Analysis Data
# (one yearly run-timing table per species, stacked into a single frame)
bon_species_table = get_species_table(
    dam_data_raw=bon_raw,
    dam_name="Bonneville",
    date_metric=date_metric,
    min_date_metric=min_date_metric,
    max_date_metric=max_date_metric,
)

# Get Willamette Raw Species Analysis Data
# (same settings as above, applied to the Willamette counts)
wlm_raw_species_table = get_species_table(
    dam_data_raw=wlm_raw,
    dam_name="Willamette",
    date_metric=date_metric,
    min_date_metric=min_date_metric,
    max_date_metric=max_date_metric,
)

In [None]:
# Filter to Common Meals for SRKW
species_list = ["ChinookAdult", "ChinookJack", "Steelhead"]

# Suffix each species with its dam so both dams can share one figure/legend.
df_wlm = wlm_raw_species_table[wlm_raw_species_table.Species.isin(species_list)].copy()
df_wlm["Species"] = df_wlm["Species"].astype(str) + " - Willamette"

df_bon = bon_species_table[bon_species_table.Species.isin(species_list)].copy()
df_bon["Species"] = df_bon["Species"].astype(str) + " - Bonneville"

df = pd.concat([df_wlm, df_bon])

# `orca_sightings_df_filtered` and `selected_buffer_distance` come from earlier
# cells of the notebook. The dam name ends up in the figure title.
fig = plot_species_area_plot_at_dam(
    df,
    dam_name="Bonneville / Willamette",
    orca_sightings_df_filtered=orca_sightings_df_filtered,
    selected_buffer_distance=selected_buffer_distance,
    tags=tags,
    date_metric=date_metric,
    adjust_to_estimate_density_at_mouth=False,
    how_much_to_adjust=1,
)


fig.show()

In [None]:
# # Get Area Plot
# fig, bon_ = plot_area_plot_stacked(
#     wlm_raw,
#     metric="WoY",
#     metric_lower=0,
#     metric_upper=30,
#     dam="Willamette",
#     species="ChinookAdult",
# )
# fig.add_scatter(
#     x=orca_sightings_df_filtered[orca_sightings_df_filtered.POD_TYPE == "SRKW"]["WOY"],
#     y=[np.max(wlm_raw["Count"])]
#     * len(orca_sightings_df_filtered[orca_sightings_df_filtered.POD_TYPE == "SRKW"]),
#     mode="markers",
#     marker_color="yellow",
#     name="SRKW - Sightings",
# )

# fig.add_scatter(
#     x=orca_sightings_df_filtered[orca_sightings_df_filtered.POD_TYPE != "SRKW"]["WOY"],
#     y=[1.1 * np.max(wlm_raw["Count"])]
#     * len(orca_sightings_df_filtered[orca_sightings_df_filtered.POD_TYPE != "SRKW"]),
#     mode="markers",
#     marker_color="turquoise",
#     name="OTHER - Sightings",
# )
# fig.show()

# # Get Area Plot
# fig, bon_ = plot_area_plot_stacked(
#     wlm_raw,
#     metric="WoY",
#     metric_lower=0,
#     metric_upper=30,
#     dam="Willamette",
#     species="ChinookJack",
# )
# fig.add_scatter(
#     x=orca_sightings_df_filtered[orca_sightings_df_filtered.POD_TYPE == "SRKW"]["WOY"],
#     y=[np.max(wlm_raw["Count"])]
#     * len(orca_sightings_df_filtered[orca_sightings_df_filtered.POD_TYPE == "SRKW"]),
#     mode="markers",
#     marker_color="yellow",
#     name="SRKW - Sightings",
# )

# fig.add_scatter(
#     x=orca_sightings_df_filtered[orca_sightings_df_filtered.POD_TYPE != "SRKW"]["WOY"],
#     y=[1.1 * np.max(bon_["Count"])]
#     * len(orca_sightings_df_filtered[orca_sightings_df_filtered.POD_TYPE != "SRKW"]),
#     mode="markers",
#     marker_color="turquoise",
#     name="OTHER - Sightings",
# )
# fig.show()

# # Get Area Plot
# fig, bon_ = plot_area_plot_stacked(
#     wlm_raw,
#     metric="WoY",
#     metric_lower=0,
#     metric_upper=30,
#     dam="Willamette",
#     species="Steelhead",
# )
# fig.add_scatter(
#     x=orca_sightings_df_filtered[orca_sightings_df_filtered.POD_TYPE == "SRKW"]["WOY"],
#     y=[np.max(bon_["Count"])]
#     * len(orca_sightings_df_filtered[orca_sightings_df_filtered.POD_TYPE == "SRKW"]),
#     mode="markers",
#     marker_color="yellow",
#     name="SRKW - Sightings",
# )

# fig.add_scatter(
#     x=orca_sightings_df_filtered[orca_sightings_df_filtered.POD_TYPE != "SRKW"]["WOY"],
#     y=[1.1 * np.max(bon_["Count"])]
#     * len(orca_sightings_df_filtered[orca_sightings_df_filtered.POD_TYPE != "SRKW"]),
#     mode="markers",
#     marker_color="turquoise",
#     name="OTHER - Sightings",
# )
# fig.show()

In [None]:
# orca_sightings_df_filtered.explore(
#     "WOY", marker_kwds=dict(radius=10, fill=True), cmap="RdBu"
# )

In [None]:
# Inspection cell: display the full sightings table.
orca_sightings_df

In [None]:
import geopandas as gpd
from shapely.geometry import Polygon

# Build the coastal study area as the union of axis-aligned lon/lat boxes.
# NOTE(review): the geographic descriptions that used to annotate these bounds
# were copy-pasted between boxes and did not match the numbers; they have been
# replaced by neutral labels. Confirm the intended extents.

# Box 1 bounds (degrees): longitude -135.4 .. -123.5, latitude 40.5 .. 48.0
min_lon = -135.4  # Western boundary
max_lon = -123.5  # Eastern boundary
min_lat = 40.5  # Southern boundary
max_lat = 48.0  # Northern boundary

# Create a polygon for the bounding box
bbox = Polygon(
    [
        (min_lon, min_lat),  # SW corner
        (min_lon, max_lat),  # NW corner
        (max_lon, max_lat),  # NE corner
        (max_lon, min_lat),  # SE corner
        (min_lon, min_lat),  # Close the polygon
    ]
)

# Create a GeoDataFrame
gdf1 = gpd.GeoDataFrame(geometry=[bbox], crs="EPSG:4326")


import geopandas as gpd
from shapely.geometry import Polygon

# Box 2 bounds (degrees): longitude -135.4 .. -112.1, latitude 30.5 .. 44
min_lon = -135.4  # Western boundary
max_lon = -112.1  # Eastern boundary
min_lat = 30.5  # Southern boundary
max_lat = 44  # Northern boundary

# Create a polygon for the bounding box
bbox = Polygon(
    [
        (min_lon, min_lat),  # SW corner
        (min_lon, max_lat),  # NW corner
        (max_lon, max_lat),  # NE corner
        (max_lon, min_lat),  # SE corner
        (min_lon, min_lat),  # Close the polygon
    ]
)

# Create a GeoDataFrame
gdf2 = gpd.GeoDataFrame(geometry=[bbox], crs="EPSG:4326")

# Box 3 bounds (degrees): longitude -135.4 .. -124.5, latitude 47.5 .. 49.0
# (built, but not included in the dissolve below)
min_lon = -135.4  # Western boundary
max_lon = -124.5  # Eastern boundary
min_lat = 47.5  # Southern boundary
max_lat = 49.0  # Northern boundary

# Create a polygon for the bounding box
bbox = Polygon(
    [
        (min_lon, min_lat),  # SW corner
        (min_lon, max_lat),  # NW corner
        (max_lon, max_lat),  # NE corner
        (max_lon, min_lat),  # SE corner
        (min_lon, min_lat),  # Close the polygon
    ]
)

# Create a GeoDataFrame
gdf3 = gpd.GeoDataFrame(geometry=[bbox], crs="EPSG:4326")


# Merge boxes 2 and 1 into a single geometry; the variant that also includes
# box 3 is kept commented out.
# gdf = pd.concat([gdf2, gdf1, gdf3]).dissolve()
gdf = pd.concat([gdf2, gdf1]).dissolve()

In [None]:
# Keep only the sightings that spatially join to the dissolved coastal area.
coast_sightings = orca_sightings_df.sjoin(gdf)

In [None]:
# Latitude of each coastal sighting against week of year, coloured by pod type.
fig = px.scatter(coast_sightings, x="WOY", y="LATITUDE", color="POD_TYPE")
# Reference latitudes — presumably the Columbia River (46.1) and Sacramento
# River (38.33) mouths, going by the labelled lines in later cells; TODO confirm.
fig.add_hline(y=46.1)
fig.add_hline(y=38.33)
fig.show()

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt


# Cluster the coastal SRKW sightings on (latitude, week of year).
n_clusters = 5  # single source of truth for the model and the plot title

# Explicit copy so the Cluster column is added to our own frame, not to a
# slice of `coast_sightings`.
data = coast_sightings[coast_sightings.POD_TYPE == "SRKW"].copy()

# Normalize the features so both contribute on the same scale
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data[["LATITUDE", "WOY"]])

# Apply K-means clustering (fixed seed for reproducible labels)
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
data["Cluster"] = kmeans.fit_predict(scaled_data)

# Visualize the clusters
plt.scatter(data["LATITUDE"], data["WOY"], c=data["Cluster"], cmap="viridis")
plt.xlabel("Latitude")
plt.ylabel("Week of Year")
plt.title(f"K-means Clustering (k={n_clusters})")
plt.show()

# Summarize cluster centers, converted back to the original units
centers = scaler.inverse_transform(kmeans.cluster_centers_)
print("Cluster Centers (LATITUDE, WOY):")
for i, center in enumerate(centers):
    print(f"Cluster {i}: {center}")

In [None]:
# Cast the cluster labels to strings so plotly treats them as discrete
# categories instead of a continuous colour scale.
data["Cluster"] = data["Cluster"].astype(str)
fig = px.scatter(data, x="WOY", y="LATITUDE", color="Cluster")
# Same two reference latitudes as in the sightings scatter above.
fig.add_hline(y=46.1)
fig.add_hline(y=38.33)
fig.show()

In [None]:
# Latitude of the clustered sightings against YEAR_WEEK, coloured by week of year.
# NOTE(review): the YEAR_WEEK column is not created in this section — presumably
# it is added to the sightings table earlier in the notebook; verify.
fig = px.scatter(data, x="YEAR_WEEK", y="LATITUDE", color="WOY")
fig.show()

In [None]:
# Restrict the tag positions to the coastal area. The `index_right` column left
# over from the earlier AOI join is dropped first, before joining again.
coast_Tags = tagging_gdf.drop(columns="index_right").sjoin(gdf)
# Plain floats for month and week of year (used as plot colour / axis values).
coast_Tags["MONTH"] = coast_Tags["DATETIME"].dt.month.astype(float)
coast_Tags["WOY"] = coast_Tags["WOY"].astype(float)

In [None]:
# Tag latitude over time, coloured by week of year.
fig = px.scatter(coast_Tags, y="lat_p", x="DATE", color="WOY")
# Same two reference latitudes as in the sightings scatter.
fig.add_hline(y=46.1)
fig.add_hline(y=38.33)
fig.show()

In [None]:
# Inspection cell: display the `popid` column of the coastal tag positions.
coast_Tags.popid

In [None]:
# Inspection cell: list the columns available on the coastal tag positions.
coast_Tags.columns

In [None]:
# Tag latitude by day of year (coloured by popid), overlaid with the coastal
# SRKW sightings and labelled reference latitudes.
fig = px.scatter(coast_Tags, y="lat_p", x="DOY", color="popid")

# (latitude, label) of the reference lines, drawn in this order.
coastal_landmarks = [
    (46.2, "Columbia River"),
    (40.65, "Eel River"),
    (46.95, "Chehalis River"),
    (47.55, "Queets River"),
    (38.33, "Sacramento River"),
    (36.78, "Monterey Bay"),
    (44.64, "Newport"),
    (43.67, "Umpqua River"),
]
for landmark_lat, landmark_name in coastal_landmarks:
    fig.add_hline(
        y=landmark_lat,  # The y-coordinate of the horizontal line
        line_dash="dot",
        line_color="black",
        opacity=0.25,
        annotation_text=landmark_name,  # The text for the annotation
        annotation_position="top right",  # The position of the annotation relative to the line
    )

# Marker colour per pod letter; the first letter found in the tag wins, checked
# in this order. Tags containing none of the letters get the fallback colour
# instead of silently reusing the previous iteration's colour.
pod_colors = {"L": "#fb8500", "J": "#fb6f92", "K": "#31572c"}
fallback_color = "grey"

dd_ = coast_sightings[coast_sightings.POD_TYPE == "SRKW"]
for pdtag in dd_.POD_TAG.unique():
    tmpp = dd_[dd_.POD_TAG == pdtag]
    color_val = next(
        (color for letter, color in pod_colors.items() if letter in pdtag),
        fallback_color,
    )
    fig.add_scatter(
        x=tmpp["DOY"],
        y=tmpp["LATITUDE"],
        name=pdtag,
        mode="markers",
        marker_color=color_val,
    )

import calendar

# Month start/end DOY, using the month lengths of 2024 (a leap year)
month_bounds = []
current_doy = 1
for month in range(1, 13):
    days_in_month = calendar.monthrange(2024, month)[1]
    month_bounds.append(
        (current_doy, current_doy + days_in_month - 1, calendar.month_abbr[month])
    )
    current_doy += days_in_month

# Add one labelled month band per month (the band itself has opacity 0.0)
for start_doy, end_doy, month_name in month_bounds:
    fig.add_vrect(
        x0=start_doy,
        x1=end_doy,
        fillcolor="white",
        opacity=0.0,
        line_width=1,
        line_color="black",
        annotation_text=month_name,
        annotation_position="top left",
    )

fig.show()

In [None]:
# Tag latitude by day of year (coloured by popid), overlaid with ALL coastal
# sightings (every pod type) and labelled reference latitudes.
fig = px.scatter(coast_Tags, y="lat_p", x="DOY", color="popid")

# (latitude, label) of the reference lines, drawn in this order.
coastal_landmarks = [
    (46.2, "Columbia River"),
    (40.65, "Eel River"),
    (46.95, "Chehalis River"),
    (47.55, "Queets River"),
    (38.33, "Sacramento River"),
    (36.78, "Monterey Bay"),
    (44.64, "Newport"),
    (43.67, "Umpqua River"),
]
for landmark_lat, landmark_name in coastal_landmarks:
    fig.add_hline(
        y=landmark_lat,  # The y-coordinate of the horizontal line
        line_dash="dot",
        line_color="black",
        opacity=0.25,
        annotation_text=landmark_name,  # The text for the annotation
        annotation_position="top right",  # The position of the annotation relative to the line
    )

# Marker colour per pod letter; the first letter found in the tag wins, checked
# in this order. Tags containing none of the letters get the fallback colour
# instead of silently reusing the previous iteration's colour (or failing on
# the first iteration).
pod_colors = {
    "L": "#fb8500",
    "J": "#fb6f92",
    "K": "#31572c",
    "T": "#ffbe0b",
    "O": "#00f5d4",
}
fallback_color = "grey"

dd_ = coast_sightings.copy()  # every pod type, not just SRKW
for pdtag in dd_.POD_TAG.unique():
    tmpp = dd_[dd_.POD_TAG == pdtag]
    color_val = next(
        (color for letter, color in pod_colors.items() if letter in pdtag),
        fallback_color,
    )
    fig.add_scatter(
        x=tmpp["DOY"],
        y=tmpp["LATITUDE"],
        name=pdtag,
        mode="markers",
        marker_color=color_val,
    )

import calendar

# Month start/end DOY, using the month lengths of 2024 (a leap year)
month_bounds = []
current_doy = 1
for month in range(1, 13):
    days_in_month = calendar.monthrange(2024, month)[1]
    month_bounds.append(
        (current_doy, current_doy + days_in_month - 1, calendar.month_abbr[month])
    )
    current_doy += days_in_month

# Add one labelled month band per month (the band itself has opacity 0.0)
for start_doy, end_doy, month_name in month_bounds:
    fig.add_vrect(
        x0=start_doy,
        x1=end_doy,
        fillcolor="white",
        opacity=0.0,
        line_width=1,
        line_color="black",
        annotation_text=month_name,
        annotation_position="top left",
    )

fig.show()