# What this notebook does:

* Comparing the species in the BirdCLEF 2022 train dataset with those in Pyle's checklist [1]

In [None]:
!pip install nb-black > /dev/null

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib_venn import venn2

# Apply the "ggplot" matplotlib style to the figures drawn below.
plt.style.use("ggplot")
# Load the `lab_black` IPython extension (provided by the nb-black package installed above).
%load_ext lab_black

In [None]:
# --- Load the raw tables ---
taxonomy = pd.read_csv("../input/birdclef-2022/eBird_Taxonomy_v2021.csv")
primary_list = pd.read_csv(
    "../input/the-birds-of-the-hawaiian-islands/primary_checklist.csv"
)
train = pd.read_csv(
    "../input/birdclef-2022-train-metadata-with-audio-metadata/train_ext.csv"
)

# Checklist taxonomy with extinct species removed: rows whose
# `HAWAIIAN ISLANDS` value is "X" or "x" are dropped and the index is renumbered.
extinct = ["X", "x"]
primary_tax = (
    pd.read_csv(
        "../input/the-birds-of-the-hawaiian-islands/primary_checklist_taxonomy.csv"
    )
    .query("`HAWAIIAN ISLANDS` not in @extinct")
    .reset_index(drop=True)
)

# --- Unique species codes per source ---
train_species = train["primary_label"].unique()
primary_species = primary_tax["SPECIES_CODE"].unique()
scored_species = train.query("is_scored == True")["primary_label"].unique()

# --- eBird taxonomy rows for the species in the train dataset ---
# Inner join on the species code; the helper `primary_label` column is
# dropped afterwards because it duplicates `SPECIES_CODE`.
train_tax = (
    pd.DataFrame({"primary_label": train_species})
    .merge(taxonomy, left_on="primary_label", right_on="SPECIES_CODE")
    .drop(columns="primary_label")
)
# Written out before the helper columns below are added.
train_tax.to_csv("train_tax.csv", index=False)

# --- Tag each table with its origin and an eBird species URL ---
primary_tax = primary_tax.assign(
    dataset="Pyle's checklist",
    EBIRD_URL="https://ebird.org/species/" + primary_tax["SPECIES_CODE"],
)
train_tax = train_tax.assign(
    dataset="BirdCLEF 2022 train dataset",
    EBIRD_URL="https://ebird.org/species/" + train_tax["SPECIES_CODE"],
)

# --- Species treated as endemic ---
# Codes of the checklist rows selected by the 'R' status filter.
# NOTE(review): pandas presumably evaluates `'R' in <column>` as an exact
# membership test rather than a substring match -- confirm this is intended.
endemic = primary_tax.query("'R' in `HAWAIIAN ISLANDS`")["SPECIES_CODE"].to_numpy()

In [None]:
# Persist the endemic species codes, then read the file straight back so the
# stored array is shown as the cell output.
# `allow_pickle=True` is passed on load; presumably the codes are stored as an
# object-dtype array of strings, which NumPy serialises via pickle.
endemic_file = "endemic.npy"
np.save(endemic_file, endemic)
np.load(endemic_file, allow_pickle=True)

# Duplication of Species

In [None]:
# Remap matplotlib's one-letter colour codes to the seaborn "pastel" palette
# before drawing.
sns.set_color_codes("pastel")

fig, ax = plt.subplots(figsize=(6, 6))

# Two-set Venn diagram: train-dataset species vs. checklist species.
venn2(
    subsets=(set(train_species), set(primary_species)),
    set_labels=("BirdCLEF '22", "Pyle's checklist"),
    ax=ax,
)
ax.set_title("Duplication of Species")

* About 95% of the species in the BirdCLEF 2022 train dataset are included in Pyle's checklist.

In [None]:
# Species codes that occur in the train dataset but not in the checklist.
not_in_list = set(train_species).difference(primary_species)

# Taxonomy columns hidden from the displayed table.
hidden_columns = ["TAXON_ORDER", "CATEGORY", "SPECIES_GROUP", "REPORT_AS"]

print("Species not in the Pyle's checklist:")
train_tax.query("SPECIES_CODE in @not_in_list").drop(columns=hidden_columns)

* According to the [eBird website](https://ebird.org), `Burrowing Parakeet`, `Golden Pheasant`, `Inca Tern`, `Northern Harrier`, and `Rosy-faced Lovebird` have been observed in Hawaii. Since Pyle's checklist was edited in 2017, species that have been observed in the Hawaiian Islands more recently are not in the list.

In [None]:
# Species codes that occur in the checklist but not in the train dataset.
not_in_list = set(primary_species).difference(train_species)

# Taxonomy columns left out of the exported/displayed table.
hidden_columns = ["TAXON_ORDER", "CATEGORY", "SPECIES_GROUP", "REPORT_AS"]

print("Species not in the BirdCLEF 2022 train data:")
ext_species = primary_tax.query("SPECIES_CODE in @not_in_list").drop(
    columns=hidden_columns
)
# Save the table for reuse, then display it as the cell output.
ext_species.to_csv("ext_species.csv", index=False)
ext_species

* According to the eBird website, at least some of these birds are observed in Hawaii. -> Why are they omitted from the BirdCLEF 2022 dataset?

# Taxonomy

In [None]:
def plot_taxonomy(primary_tax, train_tax, key, ax):
    """Plot species counts per taxonomic group for both datasets on one axis.

    The two tables are stacked, species are counted per (`key`, "dataset")
    pair, and the counts are drawn as overlaid (non-dodged) horizontal bars.

    Parameters
    ----------
    primary_tax : pandas.DataFrame
        Checklist taxonomy; must contain the columns `key`, "dataset" and
        "SPECIES_CODE".
    train_tax : pandas.DataFrame
        Train-dataset taxonomy with the same columns.
    key : str
        Column to group by (e.g. "ORDER1" or "FAMILY").
    ax : matplotlib.axes.Axes
        Axes to draw on.

    Notes
    -----
    Groups are ordered by their frequency in `primary_tax`. Groups that occur
    only in `train_tax` are appended afterwards (ordered by their frequency in
    `train_tax`) so that they are not silently dropped from the plot.
    The "dataset" values are expected to match the hard-coded `hue_order`
    labels below.
    """
    united_tax = pd.concat([primary_tax, train_tax])
    counts = (
        united_tax.groupby([key, "dataset"])["SPECIES_CODE"].count().reset_index()
    )

    # Checklist groups first (most frequent at the top) ...
    checklist_order = primary_tax[key].value_counts().index
    # ... followed by any group that exists only in the train dataset; without
    # this, seaborn would omit those bars because they are absent from `order`.
    train_only = train_tax.loc[~train_tax[key].isin(checklist_order), key]
    order = list(checklist_order) + list(train_only.value_counts().index)

    sns.barplot(
        y=key,
        x="SPECIES_CODE",
        data=counts,
        ax=ax,
        order=order,
        hue_order=["Pyle's checklist", "BirdCLEF 2022 train dataset"],
        hue="dataset",
        dodge=False,  # overlay the two datasets instead of placing them side by side
        alpha=0.8,
    )
    ax.legend(ncol=2, loc="lower right", frameon=True)


sns.set_color_codes("pastel")

# Side-by-side panels: bird orders on the left, bird families on the right.
fig, axes = plt.subplots(1, 2, figsize=(18, 12))
fig.suptitle("Order & Family", fontsize=18)

# (taxonomy column, panel title) for each panel, left to right.
panels = [("ORDER1", "Bird Order"), ("FAMILY", "Bird Family")]
for ax, (key, title) in zip(axes, panels):
    ax.set_title(title)
    plot_taxonomy(primary_tax, train_tax, key, ax)

fig.tight_layout()
plt.show()

* There is no bias toward any particular orders or families.

# Discussion

* I assume the species in the BirdCLEF 2022 dataset are a subset of those in Pyle's checklist (except for recently observed species).
* At least at this time, no rules of thumb could be found for the species selected and excluded from the BirdCLEF 2022 dataset.

# Reference

[1] Robert L. Pyle and Peter Pyle, "The Birds of the Hawaiian Islands: Occurrence, History, Distribution, and Status Version 2 - 1 January 2017", http://hbs.bishopmuseum.org/birds/rlp-monograph/PrimaryChecklist.htm