# 2021

Loads the 2021 scoring spreadsheet to filter out videos containing multiple and/or unknown individuals.

In this dataset, each row corresponds to one bird observation. When a single video contains multiple birds, it has multiple rows (one per bird) that share the same Video_ID. This script drops every row whose Video_ID appears more than once, as well as rows where Bird == "UNKNOWN" (case-insensitive), so that only videos with a single, identified individual are kept.

In [None]:
from pathlib import Path
import pandas as pd

# Input: the raw 2021 scoring sheet. Output: the filtered copy written at the
# end of this cell. Both files are read/written as UTF-16.
original_scoring = Path("original_video_categorisation_2021.csv")
filtered_scoring = Path("filtered_video_categorisation_2021.csv")

# NOTE: despite its name, this frame holds the rows exactly as read from disk;
# the filtered result is Videos_to_keep.
filtered_scoring_df = pd.read_csv(original_scoring, encoding="utf-16")
total_rows = len(filtered_scoring_df)

# keep Video_IDs that appear only once
# (value_counts() skips missing values by default, so rows without a Video_ID
# never end up in the "appearing only once" set and are dropped as well)
Video_ID_counts = filtered_scoring_df["Video_ID"].value_counts()
Video_IDs_appearing_only_once = Video_ID_counts[Video_ID_counts == 1].index
keep_single_bird = filtered_scoring_df["Video_ID"].isin(Video_IDs_appearing_only_once)

# drop rows with Bird == "UNKNOWN", ignoring case and surrounding whitespace
# so that hand-typed variants such as "Unknown " are caught too
# NOTE(review): a missing Bird value compares as not-equal to "UNKNOWN", so
# such rows are kept -- confirm this is intended.
keep_known_bird = (
    filtered_scoring_df["Bird"].str.strip().str.upper().ne("UNKNOWN")
)

# a row survives only if it passes both filters
Videos_to_keep = filtered_scoring_df[keep_single_bird & keep_known_bird]

Videos_to_keep.to_csv(filtered_scoring, index=False, encoding="utf-16")

# report how much was filtered out so the result can be sanity-checked
print(f"Done: kept {len(Videos_to_keep)} of {total_rows} rows")