In [21]:
import pandas as pd
import os

In [22]:
# filter for:
# - files with a rating above X
# - files with no secondary labels (VERY BIG MUST)

In [23]:
data_2020 = pd.read_csv("../datasets/birdsongs-combined/metadata/train_metadata_2020.csv")
data_2020 = data_2020[["ebird_code", "filename", "rating", "secondary_labels"]]
data_2020["primary_label"] = data_2020["ebird_code"]
data_2020 = data_2020.drop("ebird_code", axis=1)

In [24]:
data_2021 = pd.read_csv("../datasets/birdsongs-combined/metadata/train_metadata_2021.csv")
data_2021 = data_2021[["primary_label", "filename", "rating", "secondary_labels"]]

In [25]:
data_2022 = pd.read_csv("../datasets/birdsongs-combined/metadata/train_metadata_2022.csv")
data_2022 = data_2022[["primary_label", "filename", "rating", "secondary_labels"]]

In [26]:
data_2023 = pd.read_csv("../datasets/birdsongs-combined/metadata/train_metadata_2023.csv")
data_2023 = data_2023[["primary_label", "filename", "rating", "secondary_labels"]]

In [27]:
data_2024 = pd.read_csv("../datasets/birdsongs-combined/metadata/train_metadata_2024.csv")
data_2024 = data_2024[["primary_label", "filename", "rating", "secondary_labels"]]

In [28]:
total_data = pd.concat([
    data_2020,
    data_2021,
    data_2022,
    data_2023,
    data_2024,
], axis=0).reset_index(drop=True)

total_data["xc_id"] = [i.split("/")[-1][:-4][2:] for i in total_data["filename"]]

# remove files with any secondary labels
# total_data is now ONLY single label XC ids
#total_data = total_data[total_data["secondary_labels"] == "[]"].drop("secondary_labels", axis=1).reset_index(drop=True)

# this also ensures the files are kept entirely up to date (as they were concatenated in order, dropping the last ensures the latest file is kept for each XC ID)
total_data = total_data.drop_duplicates(subset="xc_id", keep="last").reset_index(drop=True)

files_before_preprocessing = len(total_data)
print(files_before_preprocessing, "total files in the dataset before any preprocessing")

112861 total files in the dataset before any preprocessing


In [29]:
# drop files with secondary labels

total_data = total_data[total_data["secondary_labels"] == "[]"]
files_after_secondary_filtering = len(total_data)

print(files_before_preprocessing - files_after_secondary_filtering, "files were multi-label files.")
print(files_after_secondary_filtering, "files exist after removing multi-label files.")

27195 files were multi-label files.
85666 files exist after removing multi-label files.


In [30]:
# cumulative distribution plot of all ratings at X or above.
cumsum = pd.concat([total_data["rating"].value_counts().sort_index(), total_data["rating"].value_counts().sort_index().cumsum()], axis=1)
cumsum["sum"] = cumsum.iloc[:, 1]
cumsum = cumsum.iloc[:, [0, 2]]
cumsum

Unnamed: 0_level_0,count,sum
rating,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,3907,3907
0.5,88,3995
1.0,664,4659
1.5,419,5078
2.0,2756,7834
2.5,2066,9900
3.0,11391,21291
3.5,5048,26339
4.0,28782,55121
4.5,5028,60149


In [31]:
updated_filenames = []

for i in range(len(total_data.index)):
    row = total_data.iloc[i]

    updated_filenames.append(f"{row["primary_label"]}/{row["filename"].split("/")[-1]}")

total_data["filename"] = updated_filenames

In [32]:
for rating in [i / 2.0 for i in range(11)]:
    f = total_data[total_data["rating"] >= rating]
    min_10_class_species = [i for i, v in (f["primary_label"].value_counts() >= 10).items() if v]
    f = f[f["primary_label"].isin(min_10_class_species)]
    
    path = f"rating_thresholds_at_least_10_classes/min_rating_{int(rating * 2)}.txt"
    
    print(rating, f["filename"].__len__())
    
    if not os.path.exists("rating_thresholds_at_least_10_classes/"):
        os.mkdir("rating_thresholds_at_least_10_classes")

    with open(path, mode="w") as file:
        file.write('\n'.join(f["filename"].values))
        file.close()

0.0 85081
0.5 81164
1.0 81076
1.5 80411
2.0 79979
2.5 77214
3.0 75147
3.5 63650
4.0 58589
4.5 29239
5.0 24163


In [33]:
for rating in [i / 2.0 for i in range(8, 11)]:
    f = total_data[total_data["rating"] >= rating]

    print(">=", rating)

    print("Total classes:", len(f["primary_label"].unique()))
    print(">= 10 files per class:", (f["primary_label"].value_counts() >= 10).sum())
    print("Mean files per class:", f["primary_label"].value_counts().mean())
    print("STD:", f["primary_label"].value_counts().std())

    print()

>= 4.0
Total classes: 976
>= 10 files per class: 833
Mean files per class: 60.78586065573771
STD: 75.33573568810999

>= 4.5
Total classes: 951
>= 10 files per class: 659
Mean files per class: 32.11882229232387
STD: 43.57649744482584

>= 5.0
Total classes: 945
>= 10 files per class: 638
Mean files per class: 27.002116402116403
STD: 34.09166351089114

