## Apply Phase 1 cluster review annotations to all detected songs

Based on the Phase 1 cluster review, apply the updated cluster labels to all clips:
- some clusters were merged into another cluster
- some clusters were impure and are removed from further analysis

This script was used to create the `per_point_clusters_reviewed.csv` file in the publicly available 4-year passive acoustic monitoring dataset of Ovenbird songs. 


In [None]:
import pandas as pd
import numpy as np
import ast


def parse_list_of_floats(string):
    """Evaluate the text of a Python literal, e.g. ``"[0.1, 0.2, 0.3]"``.

    The text is handed to ``ast.literal_eval``; despite the name, the element
    types are not checked, so any valid literal is returned as-is.

    Args:
        string: text to evaluate.

    Returns:
        The evaluated literal, or ``None`` when the text cannot be evaluated
        (``SyntaxError`` or ``ValueError`` from ``ast.literal_eval``).
    """
    parsed = None
    try:
        parsed = ast.literal_eval(string)
    except (SyntaxError, ValueError):
        # Unparseable text is reported as missing instead of aborting the caller.
        pass
    return parsed

In [None]:
# Root folder of the PAM dataset; every file path in this notebook is built from it.
pam_dataset_path = "../../../pam_dataset_v4/"

# Read the per-point cluster sample table. The two timestamp columns are parsed
# as dates and the stringified `features3d` values are evaluated while reading.
# Three derived columns are then appended, in this order:
#   file       - path of the clip's audio file inside the dataset
#   start_time - constant 0 for every clip
#   year       - calendar year taken from `date`
cluster_samples = pd.read_csv(
    f"{pam_dataset_path}/per_point_clusters_sample10.csv",
    converters={"features3d": parse_list_of_floats},
    parse_dates=["datetime", "date"],
).assign(
    file=lambda samples: samples["clip_name"].apply(
        lambda clip: f"{pam_dataset_path}/audio/{clip}"
    ),
    start_time=0,
    year=lambda samples: samples["date"].apply(lambda day: day.year),
)

In [None]:
# Table of Phase 1 review decisions. Later cells read its `cluster_30`,
# `annotation`, `reason` and `clips_to_exclude` columns.
reviewed_clusters_csv = f"{pam_dataset_path}/cluster_summary_reviewed.csv"
cluster_annotations = pd.read_csv(filepath_or_buffer=reviewed_clusters_csv)

Number of clusters that were merged into another cluster:

In [80]:
def _mentions_rk(note):
    """Return True for a non-missing annotation whose text contains "RK"."""
    # NaN is the only value that differs from itself, so empty annotations are
    # rejected before the substring test runs.
    return note == note and "RK" in note


# Count the annotations that contain "RK". Presumably such an annotation names
# the cluster this row was merged into -- confirm against the review table.
cluster_annotations["annotation"].apply(_mentions_rk).sum()

65

Reasons recorded for the rejected clusters:

In [81]:
# Tally how often each rejection reason was recorded. Rows without a reason are
# left out because `value_counts` skips missing values by default.
cluster_annotations.loc[:, "reason"].value_counts()

reason
distant         30
not_ovenbird    29
mixed           21
Name: count, dtype: int64

In [83]:
def _resolve_reviewed_label(cluster, note):
    """Map one review row to the label its clips carry after the review.

    Args:
        cluster: the row's `cluster_30` value.
        note: the row's `annotation` value.

    Returns:
        `cluster` when the annotation is "y", NaN when the resulting label is
        "n", otherwise the annotation itself (a missing annotation stays NaN).
    """
    label = cluster if note == "y" else note
    return np.nan if label == "n" else label


# One pass over the two columns instead of a row-wise DataFrame.apply followed
# by a second apply; the scalar logic is the same.
cluster_annotations["cluster_reviewed"] = [
    _resolve_reviewed_label(cluster, note)
    for cluster, note in zip(
        cluster_annotations["cluster_30"], cluster_annotations["annotation"]
    )
]

# `c == c` is False only for NaN, i.e. it drops the rejected / unlabelled rows.
clusters = [c for c in cluster_annotations["cluster_reviewed"].unique() if c == c]

# Sanity check: every surviving label must be an existing Phase 1 cluster that
# was itself accepted ("y"). The lookups are built once, outside the loop,
# rather than re-indexing the whole table for every cluster.
known_clusters = cluster_annotations["cluster_30"].values
annotation_by_cluster = cluster_annotations.set_index("cluster_30")["annotation"]
for c in clusters:
    assert c in known_clusters, c
    assert annotation_by_cluster.at[c] == "y", c
print(f"n clusters after review: {len(clusters)}")

n clusters after review: 405


In [84]:
# Give every clip an integer position inside its Phase 1 cluster (0, 1, 2, ...,
# restarting with each cluster). The following cell compares this `idx` with
# the `clips_to_exclude` entries of the cluster review.

# The sample table is read again here, so this cell starts from a freshly
# loaded frame with the same derived columns as the first load.
cluster_samples = pd.read_csv(
    f"{pam_dataset_path}/per_point_clusters_sample10.csv",
    parse_dates=["datetime", "date"],
    converters={"features3d": parse_list_of_floats},
)
cluster_samples["file"] = cluster_samples["clip_name"].apply(
    lambda clip: f"{pam_dataset_path}/audio/{clip}"
)
cluster_samples["start_time"] = 0
cluster_samples["year"] = cluster_samples["date"].apply(lambda x: x.year)
annotated_cluster_samples = cluster_samples.copy()

# cumcount() numbers the rows of each group from 0 in their existing row order,
# which is what the former iterrows loop computed one cell assignment at a time.
# Rows with a missing `cluster_30` belong to no group and get no position.
annotated_cluster_samples["idx"] = annotated_cluster_samples.groupby(
    "cluster_30"
).cumcount()

# Index by Phase 1 cluster so the review table can be joined on it.
annotated_cluster_samples = annotated_cluster_samples.set_index("cluster_30")

In [85]:
def _parse_clip_indices(raw):
    """Convert one `clips_to_exclude` cell into a list of integer clip positions.

    Args:
        raw: cell value. NaN means nothing is excluded; otherwise a
            comma-separated list of whole numbers. Values such as ``3.0`` are
            accepted too, because pandas reads an all-numeric column as float.

    Returns:
        list of int (empty for a missing cell).

    Raises:
        ValueError: if an entry is not a whole number.
    """
    if raw != raw:  # NaN is the only value that is not equal to itself
        return []
    indices = []
    for token in str(raw).split(","):
        token = token.strip()
        if not token:
            continue  # tolerate a trailing or doubled comma
        try:
            indices.append(int(token))
        except ValueError:
            number = float(token)  # still raises ValueError for non-numeric text
            if not number.is_integer():
                raise ValueError(
                    f"clips_to_exclude entry is not a whole number: {token!r}"
                )
            indices.append(int(number))
    return indices


# NOTE: this cell rebinds `annotated_cluster_samples`; to run it again, re-run
# the previous cell first so the join starts from the un-joined frame.

# Bring the reviewed label and the per-cluster exclusion list onto every sample
# clip (both frames are keyed by `cluster_30`), then drop clips whose cluster
# has no reviewed label.
review_columns = cluster_annotations.set_index("cluster_30")[
    ["cluster_reviewed", "clips_to_exclude"]
]
annotated_cluster_samples = annotated_cluster_samples.join(review_columns).dropna(
    subset=["cluster_reviewed"]
)
annotated_cluster_samples = annotated_cluster_samples.assign(
    clips_to_exclude=annotated_cluster_samples["clips_to_exclude"].map(
        _parse_clip_indices
    )
)

# A clip is excluded when its within-cluster position is listed for its cluster.
# The mask is computed once as a positional boolean array, so it neither depends
# on the (non-unique) cluster index nor breaks on an empty frame.
is_excluded = np.array(
    [
        clip_idx in excluded
        for clip_idx, excluded in zip(
            annotated_cluster_samples["idx"],
            annotated_cluster_samples["clips_to_exclude"],
        )
    ],
    dtype=bool,
)
excluded_cluster_samples = annotated_cluster_samples[is_excluded]
annotated_cluster_samples = annotated_cluster_samples[~is_excluded].drop(
    columns="clips_to_exclude"
)

# Share of clips in the retained clusters that were not individually excluded.
n_pure = len(annotated_cluster_samples)
n_impure = len(excluded_cluster_samples)
n_reviewed = n_pure + n_impure
purity = n_pure / n_reviewed if n_reviewed else float("nan")
n_pure, n_impure, f"purity: {purity:0.2f}"

(4237, 432, 'purity: 0.91')

In [None]:
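# Export of the reviewed sample clips. The call is left commented out in the
# saved notebook; uncomment it to (re)write the file. `cluster_30` is the index
# of the frame at this point, so `index=False` leaves it out of the CSV.
# annotated_cluster_samples.to_csv(
#     f"{pam_dataset_path}/per_point_clusters_reviewed.csv",
#     index=False,
# )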

In [None]:
# Attach the reviewed cluster label to every detected clip and save the result.
# Steps, in order:
#   1. read the full clip table, using its first column as the index;
#   2. name that index `clip_index` and turn it back into a regular column;
#   3. re-index by `cluster_30` and join the reviewed label on it;
#   4. drop clips whose cluster has no reviewed label;
#   5. write the table (with `cluster_30` as the index column) to the dataset folder.
all_cluster_samples = (
    pd.read_csv(
        f"{pam_dataset_path}/per_point_clusters.csv",
        index_col=0,
    )
    .rename_axis("clip_index")
    .reset_index()
    .set_index("cluster_30")
    .join(cluster_annotations.set_index("cluster_30")[["cluster_reviewed"]])
    .dropna(subset=["cluster_reviewed"])
)
all_cluster_samples.to_csv(f"{pam_dataset_path}/all_clips_with_cleaned_clusters.csv")

In [88]:
# Number of distinct Phase 1 (`cluster_30`) labels, i.e. index values, that
# remain after dropping clips without a reviewed label.
all_cluster_samples.index.nunique(dropna=False)

470

In [89]:
# Number of distinct reviewed cluster labels among the retained clips.
all_cluster_samples["cluster_reviewed"].nunique(dropna=False)

405