In [124]:
# --------------------------------------- #
#                 MODULES                 #

# Standard Modules
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
import math
import os
import re
import requests
from requests.adapters import HTTPAdapter, Retry
import warnings

# Third-Party Modules
import pandas as pd
import numpy as np
import plotly.express as px

warnings.filterwarnings("ignore")

#                                         #
# --------------------------------------- #

# --------------------------------------- #
#                FUNCTIONS                #

######################
# TWM Data

############
# CONNECTION


# Query Session
def _requests_session(retries=3, backoff=0.5):
    """Build a requests session with retry logic."""
    session = requests.Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff,
        status_forcelist=(500, 502, 503, 504),
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session


# Column Normalization Across DataSets
def _normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize column names (lowercase, strip spaces)."""
    df.columns = df.columns.str.strip().str.lower()
    return df


# Set Cached Data Path
def _cache_path(local_cache, path):
    """Convert GitHub path to a safe local cache filename."""
    return os.path.join(local_cache, path.replace("/", "_"))


# Open Data
def open_github_data(
    owner: str,
    repo: str,
    directory: str,
    branch: str = "main",
    token: str = None,
    local_cache: str = "cache",
    transform_fn=None,
    file_ext: str = ".csv",
    parallel: bool = True,
) -> pd.DataFrame:
    """
    Ingests all CSVs (or other file_ext) from a GitHub repo folder into a combined DataFrame.

    Args:
        owner (str): GitHub owner/org.
        repo (str): Repository name.
        directory (str): Path to directory in repo.
        branch (str): Git branch (default "main").
        token (str): Optional GitHub token for higher rate limits.
        local_cache (str): Directory to cache raw CSVs locally.
        transform_fn (callable): Optional function applied to each DataFrame before concat.
        file_ext (str): File extension to include (default ".csv").
        parallel (bool): Download files in parallel.

    Returns:
        pd.DataFrame: Concatenated dataframe of all files with provenance metadata.
    """
    os.makedirs(local_cache, exist_ok=True)
    session = _requests_session()

    headers = {}
    if token:
        headers["Authorization"] = f"token {token}"

    # 1. Query GitHub API for directory contents
    api_url = (
        f"https://api.github.com/repos/{owner}/{repo}/contents/{directory}?ref={branch}"
    )
    resp = session.get(api_url, headers=headers)
    resp.raise_for_status()
    files = resp.json()

    # Filter by extension
    csv_files = [f for f in files if f["name"].lower().endswith(file_ext)]

    if not csv_files:
        return pd.DataFrame()

    # Grab commit SHA for provenance
    commit_url = f"https://api.github.com/repos/{owner}/{repo}/commits/{branch}"
    commit_sha = session.get(commit_url, headers=headers).json().get("sha", None)

    # 2. Loader function for one file
    def _load_one(f):
        path = f["path"]
        cache_fp = _cache_path(local_cache, path)
        if os.path.exists(cache_fp):
            df = pd.read_csv(cache_fp)
        else:
            raw_url = (
                f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{path}"
            )
            df = pd.read_csv(raw_url)
            df.to_csv(cache_fp, index=False)

        df["__source_file"] = path
        df["__ingest_time"] = datetime.utcnow().isoformat()
        df["__commit_sha"] = commit_sha
        df = _normalize_columns(df)
        if transform_fn:
            df = transform_fn(df)
        return df

    # 3. Parallel or serial load
    if parallel:
        with ThreadPoolExecutor() as ex:
            dfs = list(ex.map(_load_one, csv_files))
    else:
        dfs = [_load_one(f) for f in csv_files]

    return pd.concat(dfs, ignore_index=True)


############
# PREPROCESS


# Prep TWM Data for Pod Assignment
def prepare_twm_data(twm_data_raw):

    # Select Columns
    twm_data = twm_data_raw[
        [
            "sightdate",
            "latitude",
            "longitude",
            "pod",
            "pod_tag",
            "likelypod",
            "source",
            "__source_file",
        ]
    ]

    # Preformat Data
    twm_data.columns = [
        "DATE",
        "LATITUDE",
        "LONGITUDE",
        "POD",
        "POD_TAG",
        "LIKELY_POD",
        "SOURCE",
        "SOURCE_FILE",
    ]
    # Clean Source Column
    twm_data.SOURCE = twm_data.SOURCE.str.upper().str.strip()

    # Ensure Latitude + Longitude are Float Type
    twm_data["LATITUDE"] = pd.to_numeric(twm_data["LATITUDE"], errors="coerce")
    twm_data["LATITUDE"] = twm_data["LATITUDE"].astype(float)

    twm_data["LONGITUDE"] = pd.to_numeric(twm_data["LONGITUDE"], errors="coerce")
    twm_data["LONGITUDE"] = twm_data["LONGITUDE"].astype(float)

    return twm_data


# Ensure Multi-Column Pod Support is Preserved
def combine_cols(a, b):
    """Combine Tags from Multiple Columns"""
    parts = [x for x in [a, b] if pd.notna(x) and x is not None]
    return ", ".join(parts) if parts else None


# Extract Numerical Pod Assignment
def extract_ids(s):
    """Extract Individual Tags from Data"""
    if pd.isna(s):  # catches NaN, None, NA
        return None
    else:
        s = s.strip()
        pattern = r"[JKLT][\-\+\?]*\d+"
        # pattern = r"[JKL][\-\+\?]*\d+S?"

        matches = re.findall(pattern, s)
        # clean off extra symbols like - + ? but keep the number
        cleaned = [re.sub(r"[^\w]", "", m) for m in matches]
        cleaned = list(np.unique(cleaned))
        if not cleaned:
            return None
        if len(cleaned) == 1:
            return cleaned[0]
        if len(cleaned) > 1:
            return ",".join(cleaned)
        return cleaned  # return list if more than one


# Define POD Assignment Rules
def flag_pod_tags_types(twm_data):
    twm_data["POD"] = twm_data["POD"].str.upper()
    twm_data["POD_TAG"] = twm_data["POD_TAG"].str.upper()
    twm_data["LIKELY_POD"] = twm_data["LIKELY_POD"].str.upper()

    # Replace Any Unspecified with Nan
    twm_data["POD"] = twm_data["POD"].replace("ORCAS", np.nan)
    twm_data["LIKELY_POD"] = twm_data["LIKELY_POD"].replace(" ", np.nan)

    # Drop "P" from POD (Likely "P" = "POD")
    twm_data["POD"] = twm_data["POD"].str.replace("P", "")

    # Make K, J, L,  Column

    twm_data["POD_J"] = np.where(
        (twm_data.POD.str.contains("J", na=False))
        | (twm_data.POD_TAG.str.contains("J", na=False))
        | (twm_data.LIKELY_POD.str.contains("J", na=False)),
        1,
        0,
    )
    twm_data["POD_K"] = np.where(
        (twm_data.POD.str.contains("K", na=False))
        | (twm_data.POD_TAG.str.contains("K", na=False))
        | (twm_data.LIKELY_POD.str.contains("K", na=False)),
        1,
        0,
    )
    twm_data["POD_L"] = np.where(
        (twm_data.POD.str.contains("L", na=False))
        | (twm_data.POD_TAG.str.contains("L", na=False))
        | (twm_data.LIKELY_POD.str.contains("L", na=False)),
        1,
        0,
    )
    twm_data["POD_T"] = np.where(
        (twm_data.POD.str.contains("T", na=False))
        | (twm_data.POD_TAG.str.contains("T", na=False))
        | (twm_data.LIKELY_POD.str.contains("T", na=False)),
        1,
        0,
    )
    twm_data["TYPE_SRKW"] = np.where(
        (twm_data.POD.str.contains("SR", na=False))
        | (twm_data.LIKELY_POD.str.contains("SR", na=False)),
        1,
        0,
    )
    twm_data["TYPE_SRKW"] = np.where(
        twm_data["POD_J"] + twm_data["POD_K"] + twm_data["POD_L"] > 0,
        1,
        twm_data["TYPE_SRKW"],
    )
    twm_data["TYPE_NRKW"] = np.where(
        (twm_data.POD.str.contains("NR", na=False))
        | (twm_data.LIKELY_POD.str.contains("NR", na=False)),
        1,
        0,
    )
    twm_data["TYPE_TRANSIENT"] = np.where(
        (twm_data.POD.str.contains("T", na=False))
        | (twm_data.LIKELY_POD.str.contains("T", na=False)),
        1,
        0,
    )
    twm_data["TYPE_OTHER"] = np.where(
        (twm_data["TYPE_TRANSIENT"] == 0)
        & (twm_data["TYPE_SRKW"] == 0)
        & (twm_data["TYPE_NRKW"] == 0),
        1,
        0,
    )
    twm_data["TYPE_SRKW"] = twm_data["TYPE_SRKW"].map({0: "", 1: "SRKW"})
    twm_data["TYPE_NRKW"] = twm_data["TYPE_NRKW"].map({0: "", 1: "NRKW"})
    twm_data["TYPE_TRANSIENT"] = twm_data["TYPE_TRANSIENT"].map({0: "", 1: "TRANSIENT"})
    twm_data["TYPE_OTHER"] = twm_data["TYPE_OTHER"].map({0: "", 1: "OTHER"})

    # Combine into a list, ignoring empty strings
    twm_data["TYPE"] = twm_data.apply(
        lambda x: [
            t
            for t in [
                x["TYPE_SRKW"],
                x["TYPE_NRKW"],
                x["TYPE_TRANSIENT"],
                x["TYPE_OTHER"],
            ]
            if t
        ],
        axis=1,
    )
    twm_data = twm_data.drop(
        columns=["TYPE_SRKW", "TYPE_NRKW", "TYPE_TRANSIENT", "TYPE_OTHER"]
    )

    twm_data["POD_J"] = twm_data["POD_J"].map({0: "", 1: "J"})
    twm_data["POD_K"] = twm_data["POD_K"].map({0: "", 1: "K"})
    twm_data["POD_L"] = twm_data["POD_L"].map({0: "", 1: "L"})
    twm_data["POD_T"] = twm_data["POD_T"].map({0: "", 1: "T"})

    # Get Named Individuals
    twm_data["INDIVIDUALS_P"] = twm_data["POD"].apply(lambda x: extract_ids(x))
    twm_data["INDIVIDUALS_LP"] = twm_data["LIKELY_POD"].apply(lambda x: extract_ids(x))
    twm_data["NAMED_SUBPOD"] = twm_data.apply(
        lambda row: combine_cols(row["INDIVIDUALS_P"], row["INDIVIDUALS_LP"]), axis=1
    )

    twm_data["PODS"] = twm_data.apply(
        lambda x: [
            t
            for t in [
                x["POD_J"],
                x["POD_K"],
                x["POD_L"],
                x["POD_T"],
            ]
            if t
        ],
        axis=1,
    )

    twm_data = twm_data.drop(
        columns=["INDIVIDUALS_P", "INDIVIDUALS_LP", "POD_TAG", "LIKELY_POD", "POD"]
    )

    twm_data["ASSOC_PODS"] = twm_data["PODS"].apply(lambda x: ",".join(sorted(x)))
    twm_data["ASSOC_POD_COUNT"] = twm_data["PODS"].apply(lambda x: len(x))

    twm_data = twm_data.drop(columns=["POD_J", "POD_K", "POD_L", "POD_T", "PODS"])
    twm_data.ASSOC_PODS = twm_data.ASSOC_PODS.replace("", "O")

    return twm_data


# Explode POD Names
def twm_explode(twm_data):
    twm_data["NAMED_SUBPOD"] = twm_data["NAMED_SUBPOD"].str.split(",")
    twm_data = twm_data.explode("NAMED_SUBPOD")
    twm_data["NAMED_SUBPOD"] = twm_data["NAMED_SUBPOD"].str.strip()
    twm_data["NAMED_SUBPOD"] = twm_data["NAMED_SUBPOD"].fillna("OTHER")

    twm_data = twm_data.explode("TYPE")

    return twm_data


# Preprocess TWM Data
def preprocess_twm_data(twm_data):
    """Preprocesses + Cleans TWM Data"""
    # 1. Organize TWM Data
    twm_data = prepare_twm_data(twm_data)

    # 2. Flag Pod Tags + Types for TWM Data
    twm_data = flag_pod_tags_types(twm_data)

    # 3. Explode Data for Duplicate Sightings
    twm_data = twm_explode(twm_data)

    return twm_data


######################
# Acartia Data - Git


def get_acartia_associated_pods(j, k, l, b):
    assoc_pod = []
    if j == 1:
        assoc_pod.append("J")
    if k == 1:
        assoc_pod.append("K")
    if l == 1:
        assoc_pod.append("L")
    if b == 1:
        assoc_pod.append("T")

    if len(assoc_pod) > 1:
        assoc_pod_count = len(assoc_pod)
        assoc_pod = ",".join(assoc_pod)
    elif len(assoc_pod) > 0:
        assoc_pod = assoc_pod[0]
        assoc_pod_count = 1
    else:
        assoc_pod = "O"
        assoc_pod_count = 0
    return assoc_pod, assoc_pod_count


def get_pod_types(j, k, l, b):
    pod_types = []
    if j == 1:
        pod_types.append("SRKW")
    if k == 1:
        pod_types.append("SRKW")
    if l == 1:
        pod_types.append("SRKW")
    if b == 1:
        pod_types.append("TRANSIENT")

    if len(pod_types) == 0:
        pod_type = ["OTHER"]
    else:
        pod_type = list(np.unique(pod_types))

    return pod_type


# Explode POD Names
def acartia_explode(acartia_data):
    acartia_data["NAMED_SUBPOD"] = acartia_data["NAMED_SUBPOD"].str.split(",")
    acartia_data = acartia_data.explode("NAMED_SUBPOD")
    acartia_data["NAMED_SUBPOD"] = acartia_data["NAMED_SUBPOD"].str.strip()
    acartia_data["NAMED_SUBPOD"] = acartia_data["NAMED_SUBPOD"].fillna("OTHER")

    acartia_data = acartia_data.explode("TYPE")

    return acartia_data


def prep_acartia_data(acartia_data):
    acartia_data["DATE"] = acartia_data.apply(
        lambda x: f"{x.year}-{x.month}-{x.day}", axis=1
    )

    acartia_data = acartia_data[
        [
            "DATE",
            "latitude",
            "longitude",
            "data_source_comments",
            "j",
            "k",
            "l",
            "biggs",
            "__source_file",
        ]
    ]
    acartia_data.columns = [
        "DATE",
        "LATITUDE",
        "LONGITUDE",
        "NOTES",
        "J",
        "K",
        "L",
        "BIGGS",
        "SOURCE_FILE",
    ]

    acartia_data["SOURCE"] = "ACARTIA"

    return acartia_data


def clean_pod_mismatches(acartia_data):
    acartia_data["TYPE"] = np.where(
        (acartia_data["TYPE"] == "OTHER")
        & (acartia_data["NOTES"].str.contains("SRKW")),
        "SRKW",
        acartia_data["TYPE"],
    )

    acartia_data["ASSOC_PODS"] = np.where(
        (acartia_data.NAMED_SUBPOD != "OTHER") & (acartia_data.TYPE == "OTHER"),
        acartia_data.NAMED_SUBPOD.str[0],
        acartia_data["ASSOC_PODS"],
    )

    acartia_data.TYPE = np.where(
        (acartia_data.TYPE == "OTHER")
        & (
            acartia_data.ASSOC_PODS.str.contains("J")
            | acartia_data.ASSOC_PODS.str.contains("K")
            | acartia_data.ASSOC_PODS.str.contains("L")
        ),
        "SRKW",
        acartia_data.TYPE,
    )
    return acartia_data


def preprocess_git_acartia(acartia_data_raw):
    # 1. Prep Acartia Data for Formatting
    acartia_data = prep_acartia_data(acartia_data_raw)

    ###
    # Get Associated Pods
    acartia_data[["ASSOC_PODS", "ASSOC_POD_COUNT"]] = acartia_data.apply(
        lambda x: get_acartia_associated_pods(x.J, x.K, x.L, x.BIGGS),
        axis=1,
        result_type="expand",
    )

    # Get Associated Pod Types
    acartia_data["TYPE"] = acartia_data.apply(
        lambda x: get_pod_types(x.J, x.K, x.L, x.BIGGS), axis=1
    )
    acartia_data = acartia_data.drop(columns=["J", "K", "L", "BIGGS"])

    # Get Named Sub-Pod
    acartia_data["NAMED_SUBPOD"] = acartia_data["NOTES"].apply(lambda x: extract_ids(x))

    # Explode
    acartia_data = acartia_explode(acartia_data)

    # Clean Mismatches
    acartia_data = clean_pod_mismatches(acartia_data)

    # Filter to Final Columns
    acartia_data = acartia_data[
        [
            "DATE",
            "LATITUDE",
            "LONGITUDE",
            "SOURCE",
            "SOURCE_FILE",
            "TYPE",
            "NAMED_SUBPOD",
            "ASSOC_PODS",
            "ASSOC_POD_COUNT",
        ]
    ]

    acartia_data["DATE"] = pd.to_datetime(acartia_data["DATE"])

    return acartia_data


######################
# Acartia Data - Acartia Sourced


#                                         #
# --------------------------------------- #

### 1. TWM Data via Git

In [125]:
# Read TWM Data from Git
twm_data_raw = open_github_data(
    owner="liu-zoe",
    repo="orcasalmon",
    directory="data/twm",
    branch="main",
)

# Preprocess TWM Data
twm_data = preprocess_twm_data(twm_data=twm_data_raw)

### 2. Acartia Data via Git

In [126]:
# Read Acartia Data from Git
acartia_git_data_raw = open_github_data(
    owner="liu-zoe",
    repo="orcasalmon",
    directory="data/acartia",
    branch="main",
)

# Preprocess Acartia Data from Git
acartia_git_data = preprocess_git_acartia(acartia_git_data_raw)

### 3. Acartia Data via Sourcing from Acartia

In [217]:
# Define mapping for standardizing species names
species_mapping = {
    "humpback": "Humpback Whale",
    "ballena jorobada": "Humpback Whale",
    "humpback whale": "Humpback Whale",
    "humpback sighting:": "Humpback Whale",
    "orca": "Killer Whale",
    "killer whale": "Killer Whale",
    "southern resident orca": "Killer Whale",
    "southern resident killer whale": "Killer Whale",
    "killer whale (orca)": "Killer Whale",
    "orca (ballena asesina)": "Killer Whale",
    "killer whale sighting:": "Killer Whale",
    "orca sighting:": "Killer Whale",
    "southern resident killer whale sighting:": "Killer Whale",
    "killer whale (orca) sighting:": "Killer Whale",
    "gray": "Gray Whale",
    "grey": "Gray Whale",
    "gray whale": "Gray Whale",
    "grey whale": "Gray Whale",
    "gray whale sighting:": "Gray Whale",
    "blue whale": "Blue Whale",
    "ballena azul": "Blue Whale",
    "blue whale sighting:": "Blue Whale",
    "fin whale": "Fin Whale",
    "finback whale": "Fin Whale",
    "fin whale sighting:": "Fin Whale",
    "minke whale": "Minke Whale",
    "petit rorqual": "Minke Whale",
    "minke whale sighting:": "Minke Whale",
    "harbor porpoise": "Harbor Porpoise",
    "marsouin commun": "Harbor Porpoise",
    "dalls porpoise": "Dall's Porpoise",
    "pacific white-sided dolphin": "Pacific White-sided Dolphin",
    "pacific white-sided dolphin sighting:": "Pacific White-sided Dolphin",
    "common dolphin": "Common Dolphin",
    "common dolphin - unidentified": "Common Dolphin",
    "common dolphin sighting:": "Common Dolphin",
    "common long-beaked dolphin": "Common Dolphin",
    "long-beaked common dolphin": "Common Dolphin",
    "common short-beaked dolphin": "Common Dolphin",
    "right whale": "Right Whale",
    "right whale sighting:": "Right Whale",
    "black right whale": "Right Whale",
    "northern right whale dolphin": "Right Whale",
    "northern right whale dolphin sighting:": "Right Whale",
    "sperm whale": "Sperm Whale",
    "sperm whale sighting:": "Sperm Whale",
    "white-beaked dolphin": "White-beaked Dolphin",
    "striped dolphin": "Striped Dolphin",
    "rissos dolphin": "Risso's Dolphin",
    "rissos dolphin sighting:": "Risso's Dolphin",
    "sei whale": "Sei Whale",
    "steller sealion": "Steller Sea Lion",
    "short finned pilot whale": "Short-finned Pilot Whale",
    "short finned pilot whale sighting:": "Short-finned Pilot Whale",
    "bottlenose dolphin": "Bottlenose Dolphin",
    "bottlenose whale": "Bottlenose Dolphin",
    "beluga": "Beluga Whale",
    "beluga whale": "Beluga Whale",
    "beluga whale sighting:": "Beluga Whale",
    "sowerbys beaked whale": "Sowerby's Beaked Whale",
    "atlantic white-sided dolphin": "Atlantic White-sided Dolphin",
    "atlantic white-sided dolphin sighting:": "Atlantic White-sided Dolphin",
    "bairds beaked whale": "Baird's Beaked Whale",
    "mola mola / sunfish": "Mola Mola",
    "blue shark": "Blue Shark",
    "whale - unidentified": "Unspecified",
    "unspecified": "Unspecified",
    "unspecified sighting:": "Unspecified",
    "other": "Unspecified",
    "other (specify in comments)": "Unspecified",
    "other sighting:": "Unspecified",
    "other (specify in comments) sighting:": "Unspecified",
    "other species": "Unspecified",
    "non spécifié": "Unspecified",
    "non spã©cifiã©": "Unspecified",
    "autre": "Unspecified",
    "": "Unspecified",
    "nan": "Unspecified",
    np.nan: "Unspecified",
}


# Prep Acartia Local Data
def prep_acartia_local_data(acartia_data_raw):
    acartia_data = acartia_data_raw[
        [
            "created",
            "latitude",
            "longitude",
            "type",
            "data_source_comments",
        ]
    ]

    acartia_data.columns = ["DATE", "LATITUDE", "LONGITUDE", "TYPE", "NOTES"]
    acartia_data["SOURCE"] = "ACARTIA"
    acartia_data["SOURCE_FILE"] = "acartia-export.csv"

    # Normalize Date
    acartia_data["DATE"] = acartia_data["DATE"].str[0:10]

    # Ensure Latitude + Longitude are Float Type
    acartia_data["LATITUDE"] = pd.to_numeric(acartia_data["LATITUDE"], errors="coerce")
    acartia_data["LATITUDE"] = acartia_data["LATITUDE"].astype(float)

    acartia_data["LONGITUDE"] = pd.to_numeric(
        acartia_data["LONGITUDE"], errors="coerce"
    )
    acartia_data["LONGITUDE"] = acartia_data["LONGITUDE"].astype(float)

    # Standardize TYPE, NOTES columns
    acartia_data["TYPE"] = acartia_data["TYPE"].astype(str).str.lower().str.strip()
    acartia_data["NOTES"] = acartia_data["NOTES"].astype(str).str.lower().str.strip()

    acartia_data["NOTES"] = acartia_data["NOTES"].str.replace("orca network", "")
    acartia_data["NOTES"] = acartia_data["NOTES"].str.replace("orcasound", "")
    acartia_data["NOTES"] = acartia_data["NOTES"].str.replace("orcalab", "")

    return acartia_data


# Function to extract species from notes as a list
def extract_species_from_notes(notes):
    if pd.isna(notes):
        return ["Unspecified"]

    text = str(notes).lower()
    extracted_species = set()

    # Comprehensive keyword mapping based on common cetacean species
    species_keywords = {
        "Killer Whale": [
            "orca",
            "killer whale",
            "srkw",
            "southern resident",
            "transient",
            "bigg's",
            "j pod",
            "k pod",
            "l pod",
            "j-pod",
            "k-pod",
            "l-pod",
            "ballena asesina",
            "orque",
            "épaulard",
            "hunting a sea lion",
        ],
        "Humpback Whale": ["humpback", "ballena jorobada"],
        "Gray Whale": ["gray whale", "grey whale"],
        "Blue Whale": ["blue whale", "ballena azul"],
        "Fin Whale": ["fin whale", "finback whale"],
        "Minke Whale": ["minke", "petit rorqual"],
        "Sperm Whale": ["sperm whale"],
        "Right Whale": ["right whale", "black right whale"],
        "Sei Whale": ["sei whale"],
        "Baird's Beaked Whale": ["baird's beaked whale"],
        "Beluga Whale": ["beluga"],
        "Bryde's Whale": ["bryde"],
        "Pilot Whale": ["pilot whale"],
        "False Killer Whale": ["false killer whale"],
        "Beaked Whale": ["beaked whale"],
        "Narwhal": ["narwhal"],
        "Bowhead Whale": ["bowhead"],
        "Harbor Porpoise": ["harbor porpoise", "marsouin commun"],
        "Dall's Porpoise": ["dall's porpoise", "dalls"],
        "Porpoise": ["porpoise"],  # Generic
        "Bottlenose Dolphin": ["bottlenose dolphin", "bottlenose whale"],
        "Risso's Dolphin": ["risso's dolphin"],
        "Pacific White-sided Dolphin": [
            "pacific white-sided dolphin",
            "pacific white sided dolphin",
        ],
        "Common Dolphin": ["common dolphin"],
        "Common Dolphin": ["long-beaked common dolphin", "long beaked common dolphin"],
        "Common Dolphin": ["short-beaked common dolphin"],
        "Striped Dolphin": ["striped dolphin"],
        "Spinner Dolphin": ["spinner dolphin"],
        "Spotted Dolphin": ["spotted dolphin"],
        "White-beaked Dolphin": ["white-beaked dolphin"],
        "Atlantic White-sided Dolphin": ["atlantic white-sided dolphin"],
        "Right Whale": ["northern right whale dolphin"],
        "Dolphin": ["dolphin"],  # Generic
        "Other": ["seal", "sea lion", "shark", "sunfish", "mola mola"],
    }

    # Check for each species
    for sp, keywords in species_keywords.items():
        if any(k in text for k in keywords):
            extracted_species.add(sp)

    # Special regex for orca individual IDs
    if re.search(r"\b[jklto]\-?\d+\b", text, re.IGNORECASE):
        extracted_species.add("Killer Whale")

    # Return list of species; if none but 'whale' mentioned, return ['Unspecified Whale']
    if "False Killer Whale" in extracted_species:
        if "Killer Whale" in extracted_species:
            extracted_species.discard("Killer Whale")
    if extracted_species:
        return sorted(extracted_species)
    elif "whale" in text:
        return ["Unspecified Whale"]
    else:
        return ["Unspecified"]


# Define Pod Keys
def define_keys(data):
    long_string = ",".join(data.NOTES.astype(str))

    jpod_keys = [
        "J pod",
        "Jpod",
        "J ppd",
        "J Pod",
        "j Pod",
        "J-pod",
        "Js",
        "j pod",
        "jpod",
        "j ppd",
        "j-pod",
        "j+k",
        "k+j",
        "j & k",
        "k & j",
        "j and k",
        "k and j",
        "jk pods",
        "kj pods",
        "J/K",
        "j/k",
        "J+K",
        "K+J",
        "J & K",
        "K & J",
        "J and K",
        "K and J",
        "JK pods",
        "KJ pods",
        "j+l",
        "l+j",
        "j & l",
        "l & j",
        "j and l",
        "l and j",
        "jl pods",
        "lj pods",
        "J+L",
        "L+J",
        "J & L",
        "L & J",
        "J and L",
        "L and J",
        "JL pods",
        "LJ pods",
        "j, k, l pod",
        "j, k, and l pod",
        "jkl",
        "J, K, L pod",
        "J, K, and L pod",
        "JKL",
        "j27",
        "j38",
        "j35",
        "j40",
        "J27",
        "J38",
        "J35",
        "J40",
        "J,",
        "j,",
    ]

    # Supplement
    jpod_ids = matches = re.findall(
        r"\bj\s*\d+[A-Za-z]?\b", long_string, flags=re.IGNORECASE
    )
    jpod_ids = np.unique(jpod_ids)

    jpod_keys = np.unique(jpod_keys + list(jpod_ids))

    kpod_keys = [
        "K pod",
        "K Pod",
        "k pod",
        "Kpod",
        "K,",
        "k,",
        "K-pod",
        "Ks",
        "k pod",
        "kpod",
        "k-pod",
        "j+k",
        "k+j",
        "j & k",
        "J/K",
        "j/k",
        "k & j",
        "j and k",
        "k and j",
        "jk pods",
        "kj pods",
        "J+K",
        "K+J",
        "J & K",
        "K & J",
        "J and K",
        "K and J",
        "JK pods",
        "KJ pods",
        "k+l",
        "l+k",
        "k & l",
        "l & k",
        "k and l",
        "l and k",
        "lk pods",
        "kl pods",
        "K+L",
        "L+K",
        "K & L",
        "L & K",
        "K and L",
        "L and K",
        "LK pods",
        "KL pods",
        "j, k, l pod",
        "j, k, and l pod",
        "jkl",
        "J, K, L pod",
        "J, K, and L pod",
        "JKL",
        "k37",
        "K37",
    ]

    kpod_ids = matches = re.findall(
        r"\bk\s*\d+[A-Za-z]?\b", long_string, flags=re.IGNORECASE
    )
    kpod_ids = np.unique(kpod_ids)

    kpod_keys = np.unique(kpod_keys + list(kpod_ids))

    lpod_keys = [
        "L pod",
        "Lpod",
        "L-pod",
        "L,",
        "l,",
        "Ls",
        "j+l",
        "l+j",
        "j & l",
        "l & j",
        "j and l",
        "l and j",
        "jl pods",
        "lj pods",
        "J+L",
        "L+J",
        "J & L",
        "L & J",
        "J and L",
        "L and J",
        "JL pods",
        "LJ pods",
        "k+l",
        "l+k",
        "k & l",
        "l & k",
        "k and l",
        "l and k",
        "lk pods",
        "kl pods",
        "K+L",
        "L+K",
        "K & L",
        "L & K",
        "K and L",
        "L and K",
        "LK pods",
        "KL pods",
        "j, k, l pod",
        "j, k, and l pod",
        "jkl",
        "J, K, L pod",
        "J, K, and L pod",
        "JKL",
        "l12",
        "l54",
        "l-12",
        "l82",
        "l85",
        "l87",
        "L12",
        "L54",
        "L-12",
        "L82",
        "L85",
        "L87",
        "L4",
    ]

    lpod_ids = matches = re.findall(
        r"\bl\s*\d+[A-Za-z]?\b", long_string, flags=re.IGNORECASE
    )
    lpod_ids = np.unique(lpod_ids)

    lpod_keys = np.unique(lpod_keys + list(lpod_ids))

    biggs_keys = [
        "Bigg",
        "bigg",
        "T Party",
        "sea lion",
        "Sea Lion",
        "seal",
        "Transient",
        "transient",
        "Ts",
        "t99",
        "65",
        "Hunting a seal close to shore at Alki",
        "65A",
        "65a",
        "T046B",
        "t046B" "t137",
        "t46",
        "hunting Harbor porpoises",
        "t10",
        "t2c",
        "t49",
        "T99",
        "T137",
        "T36",
        "T10",
        "T2C",
        "T49",
        "T65A",
        "T124",
        "T35",
        "T46",
        "T087",
        "T87",
        "T60",
        "T75" "T68",
        "T65",
        "T77",
        "T19",
        "T18",
        "T34",
        "T90",
        "T123",
        "T68",
        "T11",
        "T37",
    ]

    biggs_ids = matches = re.findall(
        r"\bt\s*\d+[A-Za-z]?\b", long_string, flags=re.IGNORECASE
    )
    biggs_ids = np.unique(biggs_ids)

    biggs_keys = np.unique(biggs_keys + list(biggs_ids))

    srkw_keys = (
        [
            "SRKW",
            "srkw",
            "srs",
            "srk",
            "salmon",
            "Southern Residents",
            "southern resident",
            "Southern Resident Killer Whales",
            "Southern Resident",
            " resident",
            " Resident",
        ]
        + list(jpod_keys)
        + list(kpod_keys)
        + list(lpod_keys)
    )

    return jpod_keys, kpod_keys, lpod_keys, biggs_keys, srkw_keys

In [218]:
# Preprocess Acartia Data to Identify Likely Orca Sightings
def collect_likely_orca_sightings(data):
    ## Lower-Case
    data["pre_type"] = data["TYPE"].astype(str)
    data["pre_type"] = data["pre_type"].str.lower()

    ## Remove Apostrophes
    data["pre_type"] = data["pre_type"].str.replace("'", "")

    ## Remove Slashes
    data["pre_type"] = data["pre_type"].str.replace("\\", "")

    ## Remove Colon
    data["pre_type"] = data["pre_type"].str.replace(":", "")

    ## Remove "Sighting"
    data["pre_type"] = data["pre_type"].str.replace(" sighting", "")

    ################################################################################
    # Standardize Type Column
    data["std_type"] = data["pre_type"].astype(str).str.lower().map(species_mapping)

    # Check for Nan Std Type -> this means that the type was not in our lookup
    if len(data[data["std_type"].isna()]) > 0:
        print("Check Keywords Lookup to ensure all are caught")
        print("Un caught")
        display(list(data[data["std_type"].isna()]["type"].unique()))

    ################################################################################
    # Check Unspecified Group - Notes Section
    data_tagged = data[data["std_type"] != "Unspecified"]
    data_tagged_likely_orca = data_tagged[data_tagged["std_type"] == "Killer Whale"][
        ["DATE", "TYPE", "LATITUDE", "LONGITUDE", "NOTES", "SOURCE", "SOURCE_FILE"]
    ]

    # Standardize Source Comments Column
    data_untag = data[data["std_type"] == "Unspecified"].copy()

    ## Fill Null with Unspecified
    data_untag["std_data_source_comments"] = data_untag["NOTES"].fillna("Unspecified")

    data_untag["std_data_source_comments"] = data_untag[
        "std_data_source_comments"
    ].str.replace("\n", "")

    data_untag["std_data_source_comments"] = (
        data_untag["std_data_source_comments"]
        .str.replace("Orca Network", "")
        .str.replace("Orcq Network", "")
        .str.replace("http://www.orca", "")
        .str.replace("Orcasound", "")
    )

    data_untag["std_data_source_comments"] = data_untag[
        "std_data_source_comments"
    ].str.replace("[", "")

    data_untag["std_data_source_comments"] = data_untag[
        "std_data_source_comments"
    ].str.replace("]", "")

    data_untag["std_data_source_comments"] = (
        data_untag["std_data_source_comments"]
        .str.replace(" at", "")
        .str.replace(" the", "")
        .str.replace(" of", "")
        .str.replace("viewed", "")
    )

    data_untag["std_data_source_comments"] = data_untag[
        "std_data_source_comments"
    ].str.replace("Not orca", "")

    # Extract from notes
    data_untag["extracted_from_notes"] = data_untag["std_data_source_comments"].apply(
        extract_species_from_notes
    )

    # Explode Multiple Species
    data_untag = data_untag.explode("extracted_from_notes")

    data_untag_likely_orca = data_untag[
        data_untag["extracted_from_notes"] == "Killer Whale"
    ][["DATE", "TYPE", "LATITUDE", "LONGITUDE", "NOTES", "SOURCE", "SOURCE_FILE"]]

    data_likely_orca = pd.concat([data_tagged_likely_orca, data_untag_likely_orca])
    data_likely_orca["DATE"] = data_likely_orca["DATE"].str[0:10]

    return data_likely_orca


# Assign Pod-Type
def assign_pod_type_bool(data, jpod_keys, kpod_keys, lpod_keys, biggs_keys, srkw_keys):

    # J pod
    data["J"] = data["NOTES"].apply(
        lambda x: 1 if any([k for k in jpod_keys if k in str(x)]) else 0
    )
    # K pod
    data["K"] = data["NOTES"].apply(
        lambda x: 1 if any([k for k in kpod_keys if k in str(x)]) else 0
    )
    # L pod
    data["L"] = data["NOTES"].apply(
        lambda x: 1 if any([k for k in lpod_keys if k in str(x)]) else 0
    )

    data["T"] = data["NOTES"].apply(
        lambda x: 1 if any([k for k in biggs_keys if k in str(x)]) else 0
    )

    data["SRKW"] = data["NOTES"].apply(
        lambda x: 1 if any([k for k in srkw_keys if k in str(x)]) else 0
    )

    return data


def assign_pod_information_acartia_local(acartia_data):
    # Identify NAMED_PODS
    acartia_data["NAMED_SUBPOD"] = (
        acartia_data["NOTES"].str.upper().apply(lambda x: extract_ids(x))
    )
    acartia_data["T"] = np.where(
        acartia_data["NAMED_SUBPOD"].str.contains("T"), 1, acartia_data["T"]
    )
    acartia_data["J"] = np.where(
        acartia_data["NAMED_SUBPOD"].str.contains("J"), 1, acartia_data["J"]
    )
    acartia_data["K"] = np.where(
        acartia_data["NAMED_SUBPOD"].str.contains("K"), 1, acartia_data["K"]
    )
    acartia_data["L"] = np.where(
        acartia_data["NAMED_SUBPOD"].str.contains("L"), 1, acartia_data["L"]
    )

    acartia_data["TYPE_SRKW"] = np.where(
        (acartia_data["J"] + acartia_data["K"] + acartia_data["L"] > 0)
        | (acartia_data["TYPE"].str.contains("southern")),
        1,
        acartia_data["SRKW"],
    )
    acartia_data["TYPE_TRANSIENT"] = np.where(
        (acartia_data["T"] == 1),
        1,
        0,
    )
    acartia_data["TYPE_OTHER"] = np.where(
        (acartia_data["TYPE_TRANSIENT"] == 0) & (acartia_data["TYPE_SRKW"] == 0),
        1,
        0,
    )
    acartia_data["TYPE_SRKW"] = acartia_data["TYPE_SRKW"].map({0: "", 1: "SRKW"})
    acartia_data["TYPE_TRANSIENT"] = acartia_data["TYPE_TRANSIENT"].map(
        {0: "", 1: "TRANSIENT"}
    )
    acartia_data["TYPE_OTHER"] = acartia_data["TYPE_OTHER"].map({0: "", 1: "OTHER"})

    # Combine into a list, ignoring empty strings
    acartia_data["TYPE"] = acartia_data.apply(
        lambda x: [
            t
            for t in [
                x["TYPE_SRKW"],
                x["TYPE_TRANSIENT"],
                x["TYPE_OTHER"],
            ]
            if t
        ],
        axis=1,
    )
    acartia_data = acartia_data.drop(
        columns=["TYPE_SRKW", "TYPE_TRANSIENT", "TYPE_OTHER", "SRKW"]
    )

    acartia_data["J"] = acartia_data["J"].map({0: "", 1: "J"})
    acartia_data["K"] = acartia_data["K"].map({0: "", 1: "K"})
    acartia_data["L"] = acartia_data["L"].map({0: "", 1: "L"})
    acartia_data["T"] = acartia_data["T"].map({0: "", 1: "T"})

    acartia_data["PODS"] = acartia_data.apply(
        lambda x: [
            t
            for t in [
                x["J"],
                x["K"],
                x["L"],
                x["T"],
            ]
            if t
        ],
        axis=1,
    )

    acartia_data = acartia_data.drop(columns=["J", "K", "L", "T"])

    acartia_data["ASSOC_PODS"] = acartia_data["PODS"].apply(
        lambda x: ",".join(sorted(x))
    )
    acartia_data["ASSOC_POD_COUNT"] = acartia_data["PODS"].apply(lambda x: len(x))

    acartia_data = acartia_data.drop(columns=["PODS"])
    acartia_data.ASSOC_PODS = acartia_data.ASSOC_PODS.replace("", "O")

    return acartia_data


# Preprocess Local Acartia
def preprocess_local_acartia(acartia_data_raw):
    # Prepare Acartia Data
    acartia_data = prep_acartia_local_data(acartia_data_raw)

    # Identify Likely Orcas
    acartia_data = collect_likely_orca_sightings(acartia_data)
    acartia_data = acartia_data.drop_duplicates()

    # Build Pod-Keys
    jpod_keys, kpod_keys, lpod_keys, biggs_keys, srkw_keys = define_keys(acartia_data)
    acartia_data = assign_pod_type_bool(
        acartia_data, jpod_keys, kpod_keys, lpod_keys, biggs_keys, srkw_keys
    )

    acartia_data["orig_TYPE"] = acartia_data["TYPE"]

    # Assign Pod Information
    acartia_data = assign_pod_information_acartia_local(acartia_data)

    # Explode on TYPE and ASSOC PODS
    acartia_data = acartia_explode(acartia_data)

    return acartia_data


def acartia_explode(acartia_data):
    acartia_data["NAMED_SUBPOD"] = acartia_data["NAMED_SUBPOD"].str.split(",")
    acartia_data = acartia_data.explode("NAMED_SUBPOD")
    acartia_data["NAMED_SUBPOD"] = acartia_data["NAMED_SUBPOD"].str.strip()
    acartia_data["NAMED_SUBPOD"] = acartia_data["NAMED_SUBPOD"].fillna("OTHER")

    acartia_data = acartia_data.explode("TYPE")

    return acartia_data

In [219]:
local_file = "/Users/tylerstevenson/Documents/CODE/FindMyWhale/data/raw/sightings/acartia-export.csv"

# NOTE: Git Version Lacking All Observations (Bring In All Observations from Historical) - To Compare
# NOTE: In the future, bring in code to read via the API (Prior 7-days)

# Open Acartia Local File
acartia_data_raw = pd.read_csv(local_file)

# Preprocess Acartia Local
acartia_data_local = preprocess_local_acartia(acartia_data_raw)

In [None]:
# Output for Zoe - NOTE: need to send/cross-reference
acartia_data = acartia_data_local.copy()

unspec_sightings = acartia_data[
    (acartia_data["orig_TYPE"].isin(["nan", "autre"]))
    | (acartia_data["orig_TYPE"].str.contains("other"))
]
unspec_sightings = unspec_sightings.drop_duplicates()
unspec_sightings = unspec_sightings.reset_index(drop=True)

In [248]:
acartia_data_local = acartia_data_local[['DATE', 'LATITUDE', 'LONGITUDE', 'SOURCE', 'SOURCE_FILE', 'TYPE', 'NAMED_SUBPOD', 'ASSOC_PODS', 'ASSOC_POD_COUNT']]
acartia_data_local = acartia_data_local.drop_duplicates()
acartia_data_local['DATE'] = pd.to_datetime(acartia_data_local['DATE'])

***

## Investigation Sightings Data - Quick Analysis

In [256]:
acartia_data = pd.concat([acartia_data_local, acartia_git_data]).drop(columns = 'SOURCE_FILE').drop_duplicates()
acartia_data = acartia_data[acartia_data['DATE'] > '1970-01-01']

In [262]:
# Combine Data
sightings_data = pd.concat([twm_data, acartia_data])
print('Total Sightings:', len(sightings_data))
print('  -Total SRKW:', len(sightings_data[sightings_data['TYPE'] == 'SRKW']))
print('  -Total NRKW:', len(sightings_data[sightings_data['TYPE'] == 'NRKW']))
print('  -Total TRANSIENT:', len(sightings_data[sightings_data['TYPE'] == 'TRANSIENT']))
print('  -Total OTHER:', len(sightings_data[sightings_data['TYPE'] == 'OTHER']))

sightings_data["DATE"] = pd.to_datetime(sightings_data["DATE"])
sightings_data = sightings_data.sort_values("DATE")
sightings_data = sightings_data.reset_index(drop=True)

sightings_data["MONTH"] = sightings_data["DATE"].dt.month
sightings_data["YEAR"] = sightings_data["DATE"].dt.year
sightings_data["DOY"] = sightings_data["DATE"].dt.day_of_year
sightings_data["WOY"] = sightings_data["DATE"].dt.isocalendar().week
sightings_data["WOY"] = sightings_data["WOY"].astype(int)
sightings_data["DOY"] = sightings_data["DOY"].astype(int)

# Safe to Assume Longitudes are Flipped if they are positive?
sightings_data["LONGITUDE"] = np.where(
    sightings_data["LONGITUDE"] > 0,
    sightings_data["LONGITUDE"] * -1,
    sightings_data["LONGITUDE"],
)
sightings_data = sightings_data[sightings_data.LONGITUDE < 0]

Total Sightings: 130798
  -Total SRKW: 97678
  -Total NRKW: 10
  -Total TRANSIENT: 12918
  -Total OTHER: 20192


In [263]:
# Get Total Sightings Over Time
sightings_counts = sightings_data.groupby("DATE", as_index=False).agg(
    COUNT=("SOURCE", "count")
)

sightings_counts["MONTH"] = sightings_counts["DATE"].dt.month

In [259]:
fig = px.line(
    sightings_counts, x="DATE", y="COUNT", title="Count of Sightings Over Time"
)
fig.show()

fig = px.box(
    sightings_counts, x="MONTH", y="COUNT", title="Count of Sightings Over Time (Month)"
)
fig.show()

### Track a Pod

In [268]:
pod_only = ["L"]

pod_obs = sightings_data[sightings_data["ASSOC_PODS"].isin(pod_only)]
all_dates = sightings_data[["DATE"]].drop_duplicates()

# Get Counts Over Time
pod_obs_counts = pod_obs.groupby(["DATE", "ASSOC_PODS"], as_index=False).agg(
    COUNT=("ASSOC_POD_COUNT", "count")
)
pod_obs_counts = pd.merge(pod_obs_counts, all_dates, how="outer")
pod_obs_counts["COUNT"] = pod_obs_counts["COUNT"].fillna(0)

pod_obs_counts["WOY"] = pod_obs_counts["DATE"].dt.isocalendar().week

In [269]:
fig = px.box(pod_obs_counts, x="WOY", y="COUNT", color="ASSOC_PODS", title="POD Obs")
fig.show()

In [272]:
fig = px.scatter_mapbox(
    pod_obs,
    lat="LATITUDE",
    lon="LONGITUDE",
    color="WOY",  # color points by month
    color_continuous_scale=px.colors.cyclical.IceFire,  # or any color scale
    size_max=10,
    height=800,
    range_color=[1, 53],
    zoom=5,  # adjust to your region
    mapbox_style="carto-positron",  # free, clean basemap
    hover_data=["LATITUDE", "LONGITUDE", "YEAR", "MONTH"],  # info on hover
)
fig.update_layout(
    dragmode="zoom",  # default is "pan", change to "zoom" if you want
    mapbox=dict(center=dict(lat=47.6, lon=-122.3), zoom=6),
)
config = {"scrollZoom": True}
fig.show(config=config)

#### Track a Specific "Named" Pod

In [273]:
sightings_data.NAMED_SUBPOD.value_counts().head()

NAMED_SUBPOD
OTHER    116494
L12        2489
T65        1730
T99        1000
T137        971
Name: count, dtype: int64

In [275]:
pods = {
    "J11s": ["J11", "J27", "J31", "J39", "J56"],
    "J14s": ["J14", "J37", "J40", "J45", "J49", "J59", "J63"],
    "J16s": ["J16", "J26", "J36", "J42"],
    "J17s": ["J17", "J35", "J44", "J46", "J47", "J53", "J57"],
    "J19s": ["J19", "J41", "J51", "J58", "J62"],
    "J22s": ["J22", "J38"],
    "L12s": ["L12", "L25", "L87", "L119", "L126", "L94", "L113", "L121", "L127"],
}

In [276]:
# Get SubGroup:
named_pods = pods["L12s"]

In [277]:
named_ = sightings_data[sightings_data.NAMED_SUBPOD.isin(named_pods)]
all_dates = sightings_data[["DATE"]].drop_duplicates()

# Get Counts Over Time
named_counts = named_.groupby(["DATE"], as_index=False).agg(
    COUNT=("ASSOC_POD_COUNT", "count")
)
named_counts = pd.merge(named_counts, all_dates, how="outer")
named_counts["COUNT"] = named_counts["COUNT"].fillna(0)

In [278]:
fig = px.line(named_counts, x="DATE", y="COUNT", title=f"L12 - Sub-Pod Observations")
fig.show()

In [279]:
fig = px.scatter_mapbox(
    named_,
    lat="LATITUDE",
    lon="LONGITUDE",
    color="DOY",  # color points by month
    color_continuous_scale=px.colors.cyclical.IceFire,  # or any color scale
    size_max=10,
    height=800,
    range_color=[1, 366],
    zoom=5,  # adjust to your region
    mapbox_style="carto-positron",  # free, clean basemap
    hover_data=["LATITUDE", "LONGITUDE", "YEAR", "MONTH"],  # info on hover
)
fig.update_layout(
    dragmode="zoom",  # default is "pan", change to "zoom" if you want
    mapbox=dict(center=dict(lat=47.6, lon=-122.3), zoom=6),
)
config = {"scrollZoom": True}
fig.show(config=config)

In [280]:
# When do we have duplicat eobservations on the same day?
named_["DATE"].value_counts().head(15)

DATE
2011-07-18    81
2009-07-20    77
2009-07-21    62
2011-07-23    58
2009-07-23    55
2009-08-11    47
2013-06-21    44
2011-08-01    43
2011-07-31    42
2011-07-17    42
2011-07-22    41
2011-08-27    39
2009-07-22    37
2005-07-19    33
2008-08-11    31
Name: count, dtype: int64

In [281]:
named_date = named_[named_["DATE"] == "2011-07-18"]

In [282]:
fig = px.scatter_mapbox(
    named_date,
    lat="LATITUDE",
    lon="LONGITUDE",
    color="NAMED_SUBPOD",  # color points by month
    # color_continuous_scale="Reds",  # or any color scale
    size_max=10,
    height=800,
    range_color=[1, 366],
    zoom=5,  # adjust to your region
    mapbox_style="carto-positron",  # free, clean basemap
    hover_data=["LATITUDE", "LONGITUDE", "YEAR", "MONTH"],  # info on hover
)
fig.update_layout(
    dragmode="zoom",  # default is "pan", change to "zoom" if you want
    mapbox=dict(center=dict(lat=47.6, lon=-122.3), zoom=6),
)
config = {"scrollZoom": True}
fig.show(config=config)

In [283]:
# Mostly the sound, let's check their lats

fig = px.box(named_, x="MONTH", y="LATITUDE")
fig.show()

fig = px.box(named_, x="MONTH", y="LONGITUDE")
fig.show()