## test whale data

In [None]:
import requests
import pandas as pd

# Query settings shared by every OBIS request made in this notebook.
species = "Megaptera novaeangliae"

# Search area as a closed WKT ring; assuming WKT's x=lon, y=lat order this is
# longitudes -175..-120 and latitudes 15..65.
bbox_wkt = "POLYGON ((-175 15, -175 65, -120 65, -120 15, -175 15))"

# Date window for the query and number of records requested per page.
start, end = "2023-01-01", "2023-12-31"
size = 500

base_url = "https://api.obis.org/v3/occurrence"

def fetch_page(offset, timeout=30):
    """Fetch one page of occurrence records for the configured query.

    The request is built from the module-level ``species``, ``start``,
    ``end``, ``bbox_wkt`` and ``size`` settings and sent to ``base_url``.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # NOTE(review): confirm that the OBIS v3 endpoint honours `offset`; if it
    # only paginates through a cursor parameter, every call returns page one.
    params = {
        "scientificname": species,
        "startdate": start,
        "enddate": end,
        "geometry": bbox_wkt,
        "size": size,
        "offset": offset,
    }
    headers = {"accept": "*/*"}

    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection and the notebook cell never finishes.
    r = requests.get(base_url, params=params, headers=headers, timeout=timeout)
    r.raise_for_status()
    return r.json()

# Pull the first page and preview it as a table.
data = fetch_page(0)

# A response without a "results" key is treated as an empty page.
rows = data["results"] if "results" in data else []
print("rows on first page:", len(rows))

df = pd.DataFrame(data=rows)
df.head(5)


In [None]:
# Show the "total" field of the first-page response (presumably the overall
# number of matching records — confirm against the API documentation).
data["total"]

In [None]:
# Inspect the first record of the first page to see which fields are returned.
data["results"][0]

## full whale data

Source: https://ipt.env.duke.edu/resource?r=zd_1765

In [None]:
import pandas as pd

# Load the tab-separated occurrence table and preview the first rows.
occurrence_path = "occurrence.txt"
df = pd.read_csv(occurrence_path, delimiter="\t")
df.head(5)


In [None]:
# Number of rows in the occurrence table.
df.shape[0]

In [None]:
# List the column names available in the occurrence table.
df.columns

In [None]:
# Keep humpback whale records only. The explicit .copy() makes the subset an
# independent frame, so the column assignment below does not operate on a
# slice of the original table (which raises SettingWithCopyWarning).
df = df[df["scientificName"] == "Megaptera novaeangliae"].copy()

# Parse event dates; values that cannot be parsed become NaT instead of raising.
df["eventDate"] = pd.to_datetime(df["eventDate"], errors="coerce")


In [None]:
# Number of distinct organismID values (a missing ID counts as one value).
df["organismID"].unique().size

In [None]:
# Rows per whale, then the 20 whales with the most rows.
sightings_per_whale = df.groupby("organismID").size()
counts = sightings_per_whale.reset_index(name="n_obs")
counts.sort_values(by="n_obs", ascending=False).head(20)


In [None]:
import pandas as pd

df = df.copy()

# Collapse the raw sex labels onto "male"/"female"; any other value
# (including missing ones) ends up as NaN.
sex_labels = {"m": "male", "male": "male", "f": "female", "female": "female"}
cleaned_sex = df["sex"].astype(str).str.lower().str.strip()
df["sex"] = cleaned_sex.map(sex_labels)

# Number of dated sightings per whale (rows without an eventDate do not count).
counts = df.groupby("organismID")["eventDate"].count().reset_index(name="n_obs")

# The 100 whales with the most sightings.
top100 = counts.sort_values(by="n_obs", ascending=False).head(100)


def _modal_sex(labels):
    """Return the most frequent known sex label, or None when none is known."""
    known = labels.dropna()
    return known.mode()[0] if len(known) else None


# One sex label per whale, taken as the mode over its observations.
sex_per_id = df.groupby("organismID")["sex"].agg(_modal_sex).reset_index()

# Attach the label to the ranking and discard whales of unknown sex.
top100 = top100.merge(sex_per_id, on="organismID", how="left")
top100 = top100.dropna(subset=["sex"]).copy()

# Up to 10 females and up to 10 males, in ranking order.
is_female = top100["sex"] == "female"
is_male = top100["sex"] == "male"
fem_ids = top100.loc[is_female, "organismID"].head(10)
male_ids = top100.loc[is_male, "organismID"].head(10)

# Combine both lists; unique() guards against duplicate IDs.
combined_ids = pd.concat([fem_ids, male_ids], ignore_index=True)
best_ids = list(combined_ids.unique())

print("N selected:", len(best_ids))
print(best_ids)


In [None]:
# Independent copy of the rows belonging to the selected whales.
is_selected = df["organismID"].isin(best_ids)
df_sel = df.loc[is_selected].copy()


In [None]:
# Re-parse the dates, drop rows without a usable date, and order each whale's
# sightings chronologically.
df_sel["eventDate"] = pd.to_datetime(df_sel["eventDate"], errors="coerce")
df_sel = (
    df_sel
    .dropna(subset=["eventDate"])
    .sort_values(by=["organismID", "eventDate"])
)


In [None]:
# Pair every sighting with the following row of the same whale; this assumes
# df_sel is already in chronological order within each organismID.
next_sighting = df_sel.groupby("organismID")[
    ["decimalLatitude", "decimalLongitude", "eventDate"]
].shift(-1)
df_sel["lat_next"] = next_sighting["decimalLatitude"]
df_sel["lon_next"] = next_sighting["decimalLongitude"]
df_sel["t_next"] = next_sighting["eventDate"]

# A segment needs both end coordinates; each whale's last row has none.
has_next_position = df_sel[["lat_next", "lon_next"]].notna().all(axis=1)
df_seg = df_sel[has_next_position].copy()


In [None]:
# Sanity checks: number of whales that have segments, and the whales with the
# most rows in the selection.
n_whales_with_segments = df_seg["organismID"].nunique()
print(n_whales_with_segments)   # should be 20

rows_per_whale = df_sel.groupby("organismID").size()
print(rows_per_whale.sort_values(ascending=False).head())


In [None]:
# Bin each whale's sightings by ISO calendar week. The ISO year is part of the
# key: a week number on its own repeats every year, so sightings from the same
# week of different years would otherwise be merged into one position.
# "iso_year" is used as the column name to avoid clobbering any existing
# "year" column in the occurrence table.
iso_calendar = df_sel["eventDate"].dt.isocalendar()
df_sel["iso_year"] = iso_calendar["year"].astype(int)
df_sel["week"] = iso_calendar["week"].astype(int)

# NOTE(review): "sex" is a grouping key and groupby drops rows whose key is
# missing, so sightings without a per-row sex label are left out of `weekly`.
# Confirm that is intended.
weekly = (
    df_sel
    .groupby(["organismID", "sex", "iso_year", "week"])
    .agg(
        # One representative position per whale-week, stamped with the time of
        # the first sighting in that week.
        lat=("decimalLatitude", "median"),
        lon=("decimalLongitude", "median"),
        t=("eventDate", "min"),
    )
    .reset_index()
)


In [None]:
# Order each whale's weekly positions by time and renumber the rows from 0.
weekly = weekly.sort_values(by=["organismID", "t"], ignore_index=True)

In [None]:
# Pair every weekly position with the same whale's following row; this assumes
# `weekly` is already ordered by time within each organismID.
next_position = weekly.groupby("organismID")[["lat", "lon", "t"]].shift(-1)
weekly["lat_next"] = next_position["lat"]
weekly["lon_next"] = next_position["lon"]
weekly["t_next"] = next_position["t"]

# Keep only the rows that have a following position to connect to.
has_next_position = weekly[["lat_next", "lon_next"]].notna().all(axis=1)
seg = weekly[has_next_position].copy()


In [None]:
# Write the weekly segments to disk without the row index.
segments_csv = "whale_segments_weekly.csv"
seg.to_csv(segments_csv, index=False)
