# What the notebook does:

* in-depth EDA for the scored bird classes with two additional datasets:
    - 2021 IUCN Red List category
    - The birds of the Hawaiian islands (Pyle's checklist)
* visualize a few samples of all scored classes with mel-scaled spectrograms

In [None]:
!pip install nb-black > /dev/null
!pip install adjustText > /dev/null

In [None]:
import ast
import os
import warnings
from collections import defaultdict, Counter

warnings.filterwarnings("ignore")

import geopandas as gpd
import pandas as pd
import numpy as np
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import seaborn as sns

import librosa
import librosa.display
import IPython.display as ipd

import plotly.express as px
import plotly.offline as py
import plotly.graph_objects as go

from adjustText import adjust_text
from plotly.offline import init_notebook_mode, iplot
from wordcloud import WordCloud

init_notebook_mode(connected=True)

plt.style.use("ggplot")

%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [None]:
# load data
train = pd.read_csv(
    "../input/birdclef-2022-train-metadata-with-audio-metadata/train_ext.csv"
)
taxonomy = pd.read_csv("../input/birdclef-2022/eBird_Taxonomy_v2021.csv")
primary_tax = pd.read_csv(
    "../input/the-birds-of-the-hawaiian-islands/primary_checklist_taxonomy.csv"
)
birdlife = pd.read_csv(
    "../input/hbw-and-birdlife-taxonomic-checklist/HBW-BirdLife_List_of_Birds_v6.csv"
)

# additional information
# `type` and `secondary_labels` are read as strings and are expected to hold Python
# list literals. ast.literal_eval parses literals only, so (unlike eval) nothing
# stored in the CSV can be executed as code.
train["type"] = train["type"].apply(ast.literal_eval)
train["secondary_labels"] = train["secondary_labels"].apply(ast.literal_eval)
# untouched snapshot, used later to show the labels before de-duplication
train_org = train.copy()

# observed in Hawaii? (bounding box: longitude [-161, -153), latitude [18, 24))
train["in_hawaii"] = (
    (train["longitude"] >= -161)
    & (train["longitude"] < -153)
    & (train["latitude"] >= 18)
    & (train["latitude"] < 24)
)

# is_endemic?
# NOTE(review): presumably selects checklist rows whose `HAWAIIAN ISLANDS` status
# is 'R' -- confirm against the checklist's column documentation.
endemic = primary_tax.query("'R' in `HAWAIIAN ISLANDS`")["SPECIES_CODE"].to_numpy()
train["is_endemic"] = train["primary_label"].apply(lambda x: x in endemic)

# extract the scored species
scored_idx = train["is_scored"] == True
scored = train[scored_idx].reset_index(drop=True)
scored_org = scored.copy()

# IUCN Red List category, encoded as an ordinal code
category_map = {
    "DD": 0,
    "LC": 1,
    "NT": 2,
    "VU": 3,
    "EN": 4,
    "CR": 5,
    "CR (PE)": 6,
    "EW": 7,
    "EX": 8,
}
# A category missing from category_map raises KeyError here rather than passing silently.
birdlife["2021_IUCN_Red_List_category"] = birdlife["2021 IUCN Red List category"].apply(
    lambda x: category_map[x]
)

In [None]:
scored.head(1).T

In [None]:
train["is_scored"].value_counts()

## Endemic species by Pyle's checklist

The definition of endemic species follows Pyle's checklist [1].

* [1] https://www.kaggle.com/datasets/tatamikenn/the-birds-of-the-hawaiian-islands

In [None]:
primary_tax.query("SPECIES_CODE in @endemic")["PRIMARY_COM_NAME"].to_numpy()

## Primary concerned 10 species
- https://www.kaggle.com/code/amandanavine/hawaiian-bird-species/notebook

In [None]:
# Species codes of the 10 species of primary concern; later cells compare them
# against the taxonomy's SPECIES_CODE and the recordings' primary_label.
primary_concern = [
    "hawgoo",
    "iiwi",
    "crehon",
    "maupar",
    "akiapo",
    "hawcre",
    "hawama",
    "puaioh",
    "hawpet1",
    "barpet",
]

# Pre-process: remove duplicated secondary labels

In [None]:
def check_duplication(df, pkey="primary_label", skey="secondary_labels", n_show=5):
    """Report rows whose primary label also appears among their secondary labels.

    Args:
        df: DataFrame with a ``filename`` column plus the ``pkey`` and ``skey`` columns.
        pkey: Name of the column holding the primary label.
        skey: Name of the column holding an iterable of secondary labels.
        n_show: How many rows to print from the head and from the tail of the
            list of offending rows when it is too long to print in full.

    Returns:
        None. Prints "no duplication" when nothing is found, otherwise a header
        followed by the offending rows (all of them when there are at most
        ``2 * n_show``; else the first and last ``n_show`` separated by "...").
    """

    def print_header():
        print(f"Duplicated count: {len(duplicated)}")
        print("")
        print("filename | primary_label | secondary_labels")
        print("-" * 40)

    def print_duplication(filename, primary_label, secondary_labels):
        print(f"{filename} | {primary_label} | {secondary_labels}")

    duplicated = []
    for item in df.itertuples():
        primary_label = getattr(item, pkey)
        secondary_labels = getattr(item, skey)
        if primary_label in set(secondary_labels):
            duplicated.append((item.filename, primary_label, secondary_labels))

    if len(duplicated) == 0:
        print("no duplication")
        return

    print_header()
    if len(duplicated) <= 2 * n_show:
        # Short enough to show everything; avoids printing a row in both head and tail.
        for row in duplicated:
            print_duplication(*row)
        return

    for row in duplicated[:n_show]:
        print_duplication(*row)
    print("...")
    # Slice from an explicit start index so n_show == 0 yields an empty tail
    # (duplicated[-0:] would be the whole list).
    for row in duplicated[len(duplicated) - n_show :]:
        print_duplication(*row)

In [None]:
check_duplication(scored_org)

In [None]:
check_duplication(train_org)

In [None]:
def _without_primary_label(df):
    """Return the secondary labels of each row of `df` with its primary label removed.

    The labels pass through a set, so repeated secondary labels collapse as well.
    The result is an object Series of lists aligned with `df.index`.
    """
    return pd.Series(
        [
            list(set(secondary).difference({primary}))
            for primary, secondary in zip(df["primary_label"], df["secondary_labels"])
        ],
        index=df.index,
        dtype=object,
    )


# One column assignment per frame instead of a row-by-row `.at` write.
train["secondary_labels"] = _without_primary_label(train)
scored["secondary_labels"] = _without_primary_label(scored)

In [None]:
check_duplication(scored)

In [None]:
check_duplication(train)

# Taxonomy

Build and save the taxonomy of the species in the training data (scored species are flagged). I also added the URL of each species' [eBird](https://ebird.org) page, where pictures and sound recordings of these classes can be found.

In [None]:
# One row per species present in the training data, flagged with whether it is scored.
# The column is selected *before* aggregating, so max() is not computed over every
# other column (including the list-valued `type` / `secondary_labels`).
train_birds = (
    train.groupby("primary_label")["is_scored"]
    .max()
    .reset_index()
    .rename({"primary_label": "label"}, axis=1)
)

# Taxonomy of every training species, minus the columns not used in this notebook.
train_tax = pd.merge(
    train_birds, taxonomy, left_on="label", right_on="SPECIES_CODE"
).drop(["label", "TAXON_ORDER", "CATEGORY", "SPECIES_GROUP", "REPORT_AS"], axis=1)
train_tax["URL"] = train_tax["SPECIES_CODE"].apply(
    lambda x: f"https://ebird.org/species/{x}"
)
train_tax["is_endemic"] = train_tax["SPECIES_CODE"].apply(lambda x: x in endemic)
train_tax["is_primary_concerned"] = train_tax["SPECIES_CODE"].apply(
    lambda x: x in primary_concern
)
train_tax.to_csv("train_taxonomy.csv", index=False)

# Scored subset of the taxonomy.
scored_tax = (
    train_tax.query("is_scored == True")
    .drop("is_scored", axis=1)
    .reset_index(drop=True)
)

# join IUCN Red List category
# Merging on the shared columns brings back the taxonomy columns dropped above.
scored_tax = pd.merge(scored_tax, taxonomy)
scored_tax = pd.merge(
    scored_tax, birdlife, left_on="SCI_NAME", right_on="Scientific name", how="left"
)
# "hawcre" is filled explicitly from the BirdLife row named 'Hawaii Creeper'
# (presumably the scientific names differ, so the join above finds no match -- verify).
scored_tax.loc[
    scored_tax["SPECIES_CODE"] == "hawcre", birdlife.columns
] = birdlife.query("`Common name` == 'Hawaii Creeper'").to_numpy()

scored_tax["is_endemic"] = scored_tax["SPECIES_CODE"].apply(lambda x: x in endemic)
scored_tax["is_primary_concerned"] = scored_tax["SPECIES_CODE"].apply(
    lambda x: x in primary_concern
)
scored_tax

## Number of endemic species

In [None]:
print("- {} endemic species in train data.".format(train_tax["is_endemic"].sum()))
print("- {} endemic species are scored.".format(scored_tax["is_endemic"].sum()))

## Note: Order & Family

*Order* is a larger category than *family*[1].
All bird orders and families are listed in [2].

* [1] https://en.wikipedia.org/wiki/Taxonomic_rank
* [2] https://en.wikipedia.org/wiki/List_of_birds

In [None]:
def taxonomy_count_plot(df1, df2, title, log_scale=False):
    """Draw side-by-side count plots of bird order and family.

    On each panel the counts of `df1` (gray, "all species") are drawn first and the
    counts of `df2` ("scored") are drawn over them; bars are ordered by `df1`'s counts.

    Args:
        df1: Frame with `ORDER1` and `FAMILY` columns (background counts).
        df2: Frame with the same columns (overlay counts).
        title: Figure-level title.
        log_scale: When true, the x axis of both panels is set to log scale.
    """
    _, (order_ax, family_ax) = plt.subplots(1, 2, figsize=(18, 8))

    plt.suptitle(title, fontsize=18)
    order_ax.set_title("Order")
    family_ax.set_title("Family")

    # (column to count, target axis, colour of the df2 overlay)
    panels = (
        ("ORDER1", order_ax, "orange"),
        ("FAMILY", family_ax, "green"),
    )
    drawn = []
    for column, target_ax, overlay_color in panels:
        drawn.append(
            sns.countplot(
                y=column,
                data=df1,
                ax=target_ax,
                alpha=0.5,
                color="gray",
                order=df1[column].value_counts().index,
                label="all species",
            )
        )
        drawn.append(
            sns.countplot(
                y=column,
                data=df2,
                ax=target_ax,
                color=overlay_color,
                order=df1[column].value_counts().index,
                label="scored",
            )
        )

    if log_scale:
        for plotted_ax in drawn:
            plotted_ax.set_xscale("log")
    order_ax.legend()
    family_ax.legend()
    plt.tight_layout()
    plt.show()

In [None]:
taxonomy_count_plot(train_tax, scored_tax, "Distribution of Bird Order and Family")

In [None]:
# Attach the taxonomy columns to every recording (not every species), so the
# plot below counts samples per order/family.
df1 = pd.merge(
    train, taxonomy, left_on="primary_label", right_on="SPECIES_CODE", how="left"
)
df2 = pd.merge(
    scored, taxonomy, left_on="primary_label", right_on="SPECIES_CODE", how="left"
)

taxonomy_count_plot(df1, df2, "Sample Counts of Order and Family", log_scale=True)

## Findings

* About 3/4 of the scored classes belong to the order `Passeriformes`[3].
* About 1/2 of the scored classes belong to the family `Fringillidae (Finches, Euphonias, and Allies)`[4].



* [3] https://en.wikipedia.org/wiki/Passerine
* [4] https://en.wikipedia.org/wiki/Finch

# 2021 IUCN Red List category

- https://www.kaggle.com/datasets/tatamikenn/hbw-and-birdlife-taxonomic-checklist

In [None]:
def plot_IUCN_heatap(dfs, axes):
    """Draw one annotated heatmap per (frame, axis) pair.

    Every heatmap uses the "Reds" colormap on a fixed 0-8 colour range, so
    colours are comparable across panels. Axis labels are removed.

    Args:
        dfs: Iterable of DataFrames to draw.
        axes: Iterable of matplotlib axes, paired with `dfs` by position.
    """
    shared_style = dict(annot=True, fmt="g", cmap="Reds", vmin=0, vmax=8)
    for frame, target_ax in zip(dfs, axes):
        heatmap_ax = sns.heatmap(frame.copy(), ax=target_ax, **shared_style)
        heatmap_ax.set(xlabel=None)
        heatmap_ax.set(ylabel=None)


# Five heatmaps of the encoded IUCN category, one per subset of the scored species.
_, axes = plt.subplots(1, 5, figsize=(25, 6))
(ax1, ax2, ax3, ax4, ax5) = axes
plt.suptitle("2021 IUCN Red List category in scored species", fontsize=16)
ax1.set_title("all")
ax2.set_title("endemic")
ax3.set_title("non-endemic")
ax4.set_title("primary concerned")
ax5.set_title("non-primary concerned")

# The frames are listed in the same left-to-right order as the titles above.
plot_IUCN_heatap(
    [
        scored_tax.set_index("PRIMARY_COM_NAME")[["2021_IUCN_Red_List_category"]],
        scored_tax.query("SPECIES_CODE in @endemic").set_index("PRIMARY_COM_NAME")[
            ["2021_IUCN_Red_List_category"]
        ],
        scored_tax.query("SPECIES_CODE not in @endemic").set_index("PRIMARY_COM_NAME")[
            ["2021_IUCN_Red_List_category"]
        ],
        scored_tax.query("SPECIES_CODE in @primary_concern").set_index(
            "PRIMARY_COM_NAME"
        )[["2021_IUCN_Red_List_category"]],
        scored_tax.query("SPECIES_CODE not in @primary_concern").set_index(
            "PRIMARY_COM_NAME"
        )[["2021_IUCN_Red_List_category"]],
    ],
    axes,
)

plt.tight_layout()
# Legend for the numeric codes shown in the heatmaps (displayed as the cell output).
pd.DataFrame({"Code": category_map.keys(), "Value": category_map.values()})

* Most of the scored endemic species are considered as "threatened" (>= 3) according to IUCN Red List.
* *Band-rumped Storm-Petrel* and *Hawaii Amakihi* are not endangered species, but they are in the primary-concerned list.

Note: according to the host's notebook[1], the common species *Band-rumped Storm-Petrel* is **not** endangered, but its Hawaiian breeding *'Ake'ake* is endangered.

* [1] https://www.kaggle.com/code/amandanavine/hawaiian-bird-species/notebook#'Ake'ake-(Oceanodroma-castro)

# Geo distribution

In [None]:
# Map of the recording locations of all scored samples, coloured by species.
fig = px.scatter_geo(
    scored,
    lat="latitude",
    lon="longitude",
    color="common_name",
    title="Geo Distribution",
)
fig.update_geos(lataxis_showgrid=True, lonaxis_showgrid=True)
fig.update_layout(width=960, height=400, margin={"r": 0, "t": 30, "l": 0, "b": 0})
fig.show()

In [None]:
# Same map restricted to the recordings flagged `in_hawaii`.
fig = px.scatter_geo(
    scored.query("in_hawaii == True"),
    lat="latitude",
    lon="longitude",
    color="common_name",
    title="Geo Distribution in Hawaii",
)
fig.update_geos(lataxis_showgrid=True, lonaxis_showgrid=True)
fig.update_layout(width=960, height=400, margin={"r": 0, "t": 30, "l": 0, "b": 0})
# Zoom in and centre the view on lat 20.5 / lon -157.5.
fig.update_layout(
    geo=dict(
        projection_scale=45,
        center=dict(lat=20.5, lon=-157.5),
    )
)
fig.show()

# Sample Count

In [None]:
# Number of scored recordings per species.
df = scored.value_counts("common_name")

fig, (ax1, ax2) = plt.subplots(
    1, 2, figsize=(12, 5), gridspec_kw={"width_ratios": [2, 3]}
)
# Left: recordings per species on a log axis, in the order of `df`, coloured by endemic flag.
sns.countplot(
    y="common_name",
    data=scored,
    order=df.index,
    ax=ax1,
    hue="is_endemic",
    dodge=False,
)
ax1.set(
    xlabel="Count",
    ylabel="Common Name",
    title="Sample Counts",
    xscale="log",
    xlim=(1, 1000),
)

# Right: empirical CDF of the per-species counts; the x axis is cut at 100 samples.
sns.ecdfplot(df, ax=ax2)
ax2.set(
    xlim=(0, 100),
    xlabel="Sample count",
    title="Empirical cumulative distribution of sample count",
)
plt.show()

* Good number of species are suitable for typical settings of few-shot learning:
    * **1/3** of species have sample size <= 10
    * **more than 1/2** of species have sample size <= 20

Thus, **10-shot or 20-shot** situation might fit the requirements of the competition.

In [None]:
_, axes = plt.subplots(1, 5, figsize=(25, 6))
(ax1, ax2, ax3, ax4, ax5) = axes
plt.suptitle("Sample count of scored species", fontsize=16)

# (panel title, recordings drawn in that panel), left to right
panels = [
    ("all", scored),
    ("endemic", scored.query("primary_label in @endemic")),
    ("non-endemic", scored.query("primary_label not in @endemic")),
    ("primary concerned", scored.query("primary_label in @primary_concern")),
    ("non-primary concerned", scored.query("primary_label not in @primary_concern")),
]
# Each panel counts recordings per species, split by whether they were made in Hawaii.
for ax, (panel_title, subset) in zip(axes, panels):
    ax.set_title(panel_title)
    sns.countplot(y="common_name", data=subset, hue="in_hawaii", ax=ax)
    ax.set_ylabel(None)

plt.tight_layout()
plt.show()

* Most of endemic species have less than 50 samples, some have only a few samples.
* Almost all of the endemic species are recorded in Hawaii islands. On the other hand, non-endemic species are mostly recorded outside the Hawaii islands.

# Author

In [None]:
_, ax = plt.subplots(figsize=(8, 5))
# Share (in %) of all scored recordings contributed by each author; top 20 only.
df = scored.value_counts("author") / len(scored) * 100
df = df.head(20)
sns.barplot(x=df, y=df.index, ax=ax)
# The y axis lists authors (it was previously mislabelled "sample count").
ax.set(
    ylabel="author",
    title="Number of recording per author (Top 20)",
    xlabel="Number of recordings (%)",
)
plt.show()

In [None]:
top_n = 5  # number of authors whose combined share is reported in each panel title
top_n_author = 10  # number of authors drawn per panel
top_n_species = 8  # number of species (those with the most recordings) shown

n_cols = 4
n_rows = (top_n_species - 1) // n_cols + 1  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols * 4, n_rows * 3), sharex=True)
axes = axes.ravel()
common_names = scored.value_counts("common_name").head(top_n_species).index.array

for ax, common_name in zip(axes, common_names):
    # Per-author share (in %) of this species' recordings.
    df_ = scored.query("common_name == @common_name").value_counts("author")
    df_ = df_ / sum(df_) * 100
    df = df_.head(top_n_author)
    sns.barplot(x=df, y=df.index, ax=ax)
    share = df_.head(top_n).sum()
    ax.set(
        title=f"{common_name} (top {top_n}: {share:.0f}%)",
        xlabel="recordings (%)",
        xlim=(0, 40),
    )

plt.suptitle(
    f"Number of recordings per species (top {top_n_species}), per authors (top {top_n_author})",
    fontsize=16,
)
plt.tight_layout()

* The top 3 endemic species (Apapane, Iiwi, Omao) are mostly recorded by only a few authors, most of whom also appear in the graphs of other species. (Maybe endemic species are recorded only by a small number of specialists.)

# Ratings

In [None]:
# Number of scored recordings for each rating value.
_, ax = plt.subplots()
sns.countplot(x="rating", data=scored, ax=ax, color="blue")
ax.set_title("Count of Ratings in Scored Classes")
ax.set_xlabel("rating(0-5)")
ax.set_ylabel("count")
plt.show()

In [None]:
# Percentage of scored recordings rated 3.0 or higher.
print(
    "{:.1f}% of scored recordings are rating >= 3.0.".format(
        len(scored[scored["rating"] >= 3.0]) / len(scored) * 100
    )
)

In [None]:
# Rating distribution per species, coloured by endemic flag.
_, ax = plt.subplots(figsize=(4, 8))
# No `ax=` is passed; seaborn draws on the current axes, which is the one just created.
sns.boxplot(y="common_name", x="rating", data=scored, hue="is_endemic", dodge=False)
ax.set_xlabel("rating (0-5)")
ax.set_title("Rating distribution per scored classes")
# Legend is placed outside the plot area, to the right.
plt.legend(bbox_to_anchor=(1.02, 1), loc=2, borderaxespad=0.0, title="is_endemic")
plt.show()

# Tags

## Note: Songs & Calls

According to Wikipedia[1]

> The distinction between songs and calls is based upon complexity, length, and context. Songs are longer and more complex and are associated with territory and courtship and mating, while calls tend to serve such functions as alarms or keeping members of a flock in contact.

[1] https://en.wikipedia.org/wiki/Bird_vocalization

In [None]:
def plot_type_charts(input_df, title):
    """Summarise the `type` tags of `input_df` as a pie chart and a word cloud.

    Each recording is assigned one broad category from its lower-cased tags:
    a tag containing "call" marks the recording as a call; otherwise a tag
    containing "song" or "sing" marks it as a song. A recording is counted as
    "both", "song", "call" or "others" accordingly.

    Args:
        input_df: DataFrame whose `type` column holds a list of tag strings per row.
        title: Figure-level title.

    Prints the number of distinct raw tags and shows the figure; returns None.
    """
    # Distinct raw (case-sensitive) tags. A set comprehension replaces
    # `input_df["type"].sum()`, which concatenated the lists one by one.
    types = {tag for tags in input_df["type"] for tag in tags}
    type_count = defaultdict(int)  # lower-cased tag -> occurrences
    norm_type_count = defaultdict(int)  # broad category -> number of recordings

    print(f"num types: {len(types)}")
    for tags in input_df["type"]:
        call, song = False, False
        for t in tags:
            t = t.lower()
            type_count[t] += 1
            # "call" takes precedence within a single tag: a tag containing both
            # "call" and "song" only sets the call flag.
            if "call" in t:
                call = True
            elif "song" in t or "sing" in t:
                song = True
        if call and song:
            norm_type_count["both"] += 1
        elif song:
            norm_type_count["song"] += 1
        elif call:
            norm_type_count["call"] += 1
        else:
            norm_type_count["others"] += 1

    # Broad categories, largest first, for the pie chart.
    df_ = pd.DataFrame(
        {"type": norm_type_count.keys(), "count": norm_type_count.values()}
    )
    df_.sort_values("count", ascending=False, inplace=True)

    _, (ax1, ax2) = plt.subplots(1, 2, figsize=(24, 8))

    ax1.set_title("Broad category of tags")
    ax1.pie(
        df_["count"], labels=df_["type"], autopct="%.1f%%", textprops={"fontsize": 16}
    )

    ax2.set_title("Tags in WordCloud")
    wordcloud = WordCloud(
        width=960, height=600, background_color="white"
    ).generate_from_frequencies(type_count)
    ax2.imshow(wordcloud)
    ax2.axis("off")

    plt.suptitle(title, fontsize=20)
    plt.tight_layout()
    plt.show()


plot_type_charts(scored, "Tags in scored species")

In [None]:
plot_type_charts(train, "Tags in All Train data")

In the charts above, *broad category* is calculated as follows:
1. when keyword `call` is included in a tag, count it as *call*
2. when keyword `sing` or `song` is included in a tag, count it as *song*
3. in other case, count it as *others*

## Findings

* almost all the broad category are *call* or *song* (or both)
* Compared to the entire train data, the data of only scored species accounts for a larger percentage of *song*.

# Secondary labels

In [None]:
# How many scored recordings carry 0, 1, 2, ... secondary labels.
fig, ax = plt.subplots()
sns.countplot(x="num_secondary_labels", data=scored, ax=ax, color="blue")
ax.set_title("Countplot of num_secondary_labels")
ax.set_xlabel("number of secondary labels")
ax.set_ylabel("count")
plt.show()

In [None]:
# Percentage of scored recordings whose num_secondary_labels is 0.
print(
    ("{:.1f}% of scored recordings have no secondary labels.").format(
        100 * len(scored.query("num_secondary_labels == 0")) / len(scored)
    )
)

In [None]:
# Distribution of the number of secondary labels per species, coloured by endemic flag.
_, ax = plt.subplots(figsize=(4, 8))
# No `ax=` is passed; seaborn draws on the current axes, which is the one just created.
sns.boxplot(
    y="common_name",
    x="num_secondary_labels",
    data=scored,
    hue="is_endemic",
    dodge=False,
)
ax.set_xlabel("number of secondary labels")
ax.set_title("Number of secondary labels per scored classes")
# Legend is placed outside the plot area, to the right.
plt.legend(bbox_to_anchor=(1.02, 1), loc=2, borderaxespad=0.0, title="is_endemic")
plt.show()

* The majority of secondary labels are assigned to the endemic species. On the other hand, few are assigned to non-endemic species.

## Are secondary labels also scored?

In [None]:
scored_labels = set(scored["primary_label"].unique())

sl_count = defaultdict(int)  # "scored" / "not_scored" -> number of secondary-label occurrences
non_scored = defaultdict(int)  # non-scored species code -> occurrences as a secondary label

for item in scored.itertuples():
    for label in item.secondary_labels:
        if label in scored_labels:
            sl_count["scored"] += 1
        else:
            sl_count["not_scored"] += 1
            non_scored[label] += 1


df = pd.DataFrame({"label": sl_count.keys(), "count": sl_count.values()})
df_ = pd.DataFrame(
    {"label": non_scored.keys(), "count": non_scored.values()}
).sort_values("count", ascending=False)
# Swap the species code for the common name. The merge is an inner join (pandas
# default), so codes that are absent from train_tax are dropped from the bar chart.
df_ = (
    pd.merge(
        df_,
        train_tax[["SPECIES_CODE", "PRIMARY_COM_NAME", "is_endemic"]],
        left_on="label",
        right_on="SPECIES_CODE",
    )
    .drop(["SPECIES_CODE", "label"], axis=1)
    .rename({"PRIMARY_COM_NAME": "label"}, axis=1)
)

_, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 6))

ax1.set_title("Are Secondary Labels Scored?")
ax1.pie(df["count"], labels=df["label"], autopct="%.1f%%", textprops={"fontsize": 16})

ax2.set_title("counts of non-scored labels")
sns.barplot(y="label", x="count", data=df_, hue="is_endemic", dodge=False, ax=ax2)

plt.tight_layout()
plt.show()

* 2/3 of secondary labels tagged to scored species are also scored species. This is an understandable result considering that most of the secondary labels are added to endemic species.

## Co-occurrence of secondary labels

In [None]:
# Long-format (primary, secondary) pairs: one row per secondary label of a scored recording.
data = {
    "primary_label": [],
    "secondary_label": [],
}
scored_species = scored_tax["SPECIES_CODE"].unique()
endemic_species = scored_tax.query("is_endemic == True")["SPECIES_CODE"].unique()
code2name = pd.Series(
    scored_tax.PRIMARY_COM_NAME.values, index=scored_tax.SPECIES_CODE
).to_dict()
others_label = "*non-scored*"

for item in scored.itertuples():
    p_label = item.primary_label
    for s_label in item.secondary_labels:
        data["primary_label"].append(p_label)
        # Secondary labels outside the scored set share a single bucket.
        if s_label in scored_species:
            data["secondary_label"].append(s_label)
        else:
            data["secondary_label"].append(others_label)

# Co-occurrence counts; the columns are forced to "all scored species + others"
# so species that never appear as a secondary label still get a (zero) column.
columns = scored_species.tolist() + [others_label]
df = pd.DataFrame(data)
cross = pd.crosstab(df.primary_label, df.secondary_label)
cross = cross.reindex(columns, axis=1).fillna(0).astype(int)
# Restrict both rows and columns to the endemic species (plus the others bucket).
cross_endemic = cross.reset_index().query("primary_label in @endemic_species")
cross_endemic = cross_endemic.set_index("primary_label")
cross_endemic = cross_endemic[endemic_species.tolist() + [others_label]]

# Display common names instead of species codes.
cross = cross.rename(code2name).rename(code2name, axis=1)
cross_endemic = cross_endemic.rename(code2name).rename(code2name, axis=1)

# plot
# (No plt.tight_layout() before the figure exists: with no current figure it would
# create an extra, empty one. The layout is handled by gs0.tight_layout(fig) below.)
fig = plt.figure(figsize=(16, 13))
gs0 = gridspec.GridSpec(2, 1, figure=fig)
gs00 = gridspec.GridSpecFromSubplotSpec(1, 2, subplot_spec=gs0[0])
ax1 = fig.add_subplot(gs00[0])
ax2 = fig.add_subplot(gs00[1])
gs01 = gridspec.GridSpecFromSubplotSpec(1, 2, subplot_spec=gs0[1])
ax3 = fig.add_subplot(gs01[0])
ax4 = fig.add_subplot(gs01[1])

# ax1: log1p keeps zero counts at zero while compressing the large ones
sns.heatmap(cross.apply(np.log1p), square=True, ax=ax1)
ax1.set(title="all scored species")

# ax2
sns.heatmap(cross_endemic.apply(np.log1p), square=True, ax=ax2)
ax2.set(title="only endemic species")

# ax3: column sums = how often each species appears as a secondary label
df = cross.sum()
sns.barplot(y=df.index, x=df, ax=ax3)
ax3.set(title="all scored species")

# ax4
df = cross_endemic.sum()
sns.barplot(y=df.index, x=df, ax=ax4)
ax4.set(title="only endemic species")

plt.suptitle(
    "Up: co-occurence matrix (logarithmic scale) / Bottom: distribution of secondary labels",
    fontsize=16,
)
gs0.tight_layout(fig)
plt.show()

* There is a bias toward certain species among the secondary labels: Apapane, Hawaii Amakihi and Iiwi appear in the background more frequently than other scored species.

## Foreground v.s. background occurrence

Below is the scatter plot of primary vs. secondary label counts. From the graph, we can see that the frequency of a species as a secondary label is roughly proportional to its frequency as a primary label. If the frequency of the primary labels is proportional to the population size of the species, **species that are frequently recorded in the background are simply those with a large population, or those with a larger habitat area**.

In [None]:
# Primary-label counts of the scored endemic species.
# The index and the values are named explicitly with rename_axis()/rename(), so the
# result does not depend on how value_counts() names them (changed in pandas 2.0).
endemic_count = (
    scored["primary_label"]
    .value_counts()
    .reindex(columns)
    .drop(others_label)
    .rename_axis("primary_label")
    .rename("count_of_primary_labels")
    .reset_index()
    .query("primary_label in @endemic_species")
    .assign(common_name=lambda d: d["primary_label"].map(code2name))
)

# How often each endemic species appears as a secondary label (column sums of the
# endemic co-occurrence matrix, which is indexed by common name).
secondary_count = (
    cross_endemic.sum()
    .rename_axis("secondary_label")
    .rename("count_of_secondary_labels")
    .reset_index()
)

df = pd.merge(
    endemic_count,
    secondary_count,
    left_on="common_name",
    right_on="secondary_label",
    how="left",
)
_, ax = plt.subplots(figsize=(13, 8))
sns.scatterplot(
    x="count_of_primary_labels", y="count_of_secondary_labels", data=df, ax=ax
)
# Label every point with its common name; columns are accessed by name rather than
# by position so the labels stay correct if the column order changes.
texts = [
    ax.text(
        row.count_of_primary_labels,
        row.count_of_secondary_labels,
        row.common_name,
        ha="center",
        va="center",
    )
    for row in df.itertuples()
]
adjust_text(texts, arrowprops=dict(arrowstyle="->"), color="black")

ax.set(
    title="Correlation between #secondary labels and #primary labels",
    xlabel="count of primary labels",
    ylabel="count of secondary labels",
)
plt.show()

# Audio Statistics

* used extended metadata:
https://www.kaggle.com/tatamikenn/birdclef-2022-train-metadata-with-audio-metadata

In [None]:
# Histogram of recording length for the scored samples, on a log-scaled axis.
ax = sns.histplot(x=scored["length"], log_scale=True)
ax.set_xlabel("length [sec]")
ax.set_title("Histogram of sequece length in scored classes")
plt.show()

In [None]:
# Percentage of scored recordings whose length exceeds 10.
print(
    ("{:.1f}% of recordings are length > 10 sec.").format(
        100 * len(scored[scored["length"] > 10]) / len(scored)
    )
)

In [None]:
# Recording-length distribution per species (log x axis), coloured by endemic flag.
_, ax = plt.subplots(figsize=(5, 8))
# No `ax=` is passed; seaborn draws on (and returns) the current axes created above.
ax = sns.boxplot(
    y="common_name", x="length", data=scored, hue="is_endemic", dodge=False
)
ax.set_xlabel("length [sec]")
ax.set_title("Distribution of Sequence Length per Scored Bird Class")
ax.set_xscale("log")
# Legend is placed outside the plot area, to the right.
plt.legend(bbox_to_anchor=(1.02, 1), loc=2, borderaxespad=0.0, title="is_endemic")
plt.show()

In [None]:
_, ax = plt.subplots(figsize=(5, 6))

# Total recording length per species, together with the species' endemic flag.
df = (
    scored.groupby("common_name")
    .agg(length=("length", "sum"), is_endemic=("is_endemic", "max"))
    .reset_index()
)

# Per-species recording counts; only used to order the bars.
df_ = scored.value_counts("common_name")

# Divide by 60 so the bars read in minutes, matching the axis label below.
df["length"] /= 60
sns.barplot(
    y="common_name",
    x="length",
    order=df_.index,
    data=df,
    ax=ax,
    hue="is_endemic",
    dodge=False,
    alpha=1.0,
)
ax.set_xlabel("length [min]")
ax.set_title("Total Sequence Length of Scored Bird Class")
ax.set_xscale("log")
ax.set_xlim((1e-1, 1e3))
plt.show()
# `length` was already divided by 60 above; dividing the sum by 60 again gives the printed hours.
print(f"Total length of all scored recordings: {df['length'].sum() / 60:.2f} hours")

* Puaiohi has less than 1 minute of recordings in total.

In [None]:
# Histogram of the `channels` column of the scored samples, using two bins.
ax = sns.histplot(x="channels", data=scored, bins=2)
ax.set_title("Distribution of audio channels for the scored samples")

* About 2/3 of recordings are stereo recordings.

# Visualize audio samples

## Findings

* Some birds can shift their frequency by \~4kHz in a short period of time (\~100ms), so correctly tuning the time/frequency resolution is important.
* There exists different "mode" of call even if it is in the same bird class.
* Calls or songs of multiple species are in the same recording even if there are no secondary labels. Separating these calls/songs of different bird species is important especially for the bird class of few samples.

In [None]:
def get_full_path(path):
    """Return `path` prefixed with the competition's train_audio directory."""
    return "../input/birdclef-2022/train_audio/{}".format(path)


def play_song(filename, bird_class):
    """Display an audio player widget for `filename`, resolved through get_full_path.

    `bird_class` is accepted but not used by this function.
    """
    display(ipd.Audio(get_full_path(filename)))


def load_audio(filename, sr):
    """Load `filename` (resolved through get_full_path) as a mono signal at rate `sr`.

    Asserts that the resolved file exists; the assertion message carries the
    resolved path and the original file name. Returns only the audio data that
    librosa loads, discarding the sampling rate it reports.
    """
    audio_path = get_full_path(filename)
    assert os.path.isfile(audio_path), (audio_path, filename)
    return librosa.core.load(audio_path, sr=sr, mono=True)[0]


def create_spectrogram(
    filename,
    bird_class,
    audio_params,
):
    """Compute a mel-scaled magnitude spectrogram, in dB, for one recording.

    Args:
        filename: Recording path, passed to load_audio.
        bird_class: Accepted but not used by this function.
        audio_params: Mapping with the keys "sr", "n_fft", "hop_length",
            "n_mels", "fmin" and "fmax" (a missing key raises KeyError).

    Returns:
        The mel spectrogram converted with librosa.amplitude_to_db, using the
        spectrogram's own maximum as the reference.
    """
    sr, n_fft, hop_length, n_mels, fmin, fmax = [
        audio_params[key]
        for key in ["sr", "n_fft", "hop_length", "n_mels", "fmin", "fmax"]
    ]
    audio = load_audio(filename, sr)
    # The signal is passed as `y=`: librosa >= 0.10 accepts keyword arguments only
    # here, and the keyword form also works on older releases.
    melspec = librosa.feature.melspectrogram(
        y=audio,
        sr=sr,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
        power=1.0,  # magnitude rather than power, hence amplitude_to_db below
        fmin=fmin,
        fmax=fmax,
    )
    S_db = librosa.amplitude_to_db(melspec, ref=np.max)
    return S_db


def show_spectrogram(S_db, audio_params, title, fig, ax):
    """Draw `S_db` on `ax` with a time x axis and a mel y axis, and set the title.

    Args:
        S_db: Spectrogram to draw.
        audio_params: Mapping providing "hop_length", "sr", "fmin" and "fmax".
        title: Title of the axes.
        fig: Accepted but not used by this function.
        ax: Matplotlib axes to draw on.

    Returns:
        The object returned by librosa.display.specshow.
    """
    display_params = {
        key: audio_params[key] for key in ["hop_length", "sr", "fmin", "fmax"]
    }
    mesh = librosa.display.specshow(
        S_db,
        x_axis="time",
        y_axis="mel",
        ax=ax,
        **display_params,
    )
    ax.set_title(title, fontsize=15)
    return mesh


def view_spectrogram(bird_class, random_state=123, drop_low_freq=False, num_samples=3):
    """Plot mel spectrograms of, and play, the shortest recordings of one bird class.

    Recordings are taken from `scored`, sorted by ascending `length`; at most
    `num_samples` of them are shown, one spectrogram row each, with a shared colorbar.

    Args:
        bird_class: Species code matched against `scored["primary_label"]`.
        random_state: Accepted but currently unused.
        drop_low_freq: Accepted but currently unused.
        num_samples: Maximum number of recordings to show (at least 1).

    Raises:
        ValueError: If `num_samples` is below 1, or if `bird_class` has no
            recordings in `scored`.
    """
    if num_samples < 1:
        raise ValueError(f"num_samples must be at least 1, got {num_samples}")

    audio_params = dict(
        sr=32_000,
        n_mels=128,
        n_fft=800,  # 25 ms
        hop_length=320,  # 10 ms
        fmin=0,
        fmax=16_000,
    )

    selected = scored.query("primary_label == @bird_class").sort_values(
        "length", ascending=True
    )
    if selected.empty:
        # Fail with a clear message instead of an IndexError from the lookup below.
        raise ValueError(f"no scored recordings found for bird class {bird_class!r}")

    common_name = scored_tax.query("SPECIES_CODE == @bird_class")[
        "PRIMARY_COM_NAME"
    ].array[0]
    samples = selected.head(num_samples)
    print("=" * 40)
    print(f"Common Name: {common_name}")
    print(f"URL: https://ebird.org/species/{bird_class}")
    print(f"params: {audio_params}")
    print("=" * 40)

    n_fig = len(samples)
    # squeeze=False always returns a 2-D array of axes, so the single-panel case
    # needs no special handling.
    fig, axes = plt.subplots(n_fig, 1, figsize=(24, 3 * n_fig), squeeze=False)
    axes = axes.ravel()
    for ax, item in zip(axes, samples.itertuples()):
        params = {
            "rating": item.rating,
            "filename": item.filename,
            "type": item.type,
            "secondary labels": item.secondary_labels,
        }
        print(params)

        title = f"[Bird: {bird_class}] Mel-scaled spectrogram of audio: {item.filename.split('/')[-1]}"
        S_db = create_spectrogram(item.filename, bird_class, audio_params)
        colormesh = show_spectrogram(S_db, audio_params, title, fig, ax)
        play_song(item.filename, bird_class)

    plt.tight_layout()
    # One colorbar for all rows, built from the last spectrogram drawn.
    fig.colorbar(colormesh, ax=axes, format="%+2.0f dB", location="right")

In [None]:
view_spectrogram("akiapo")

In [None]:
view_spectrogram("aniani")

In [None]:
view_spectrogram("apapan")

In [None]:
view_spectrogram("barpet")

In [None]:
view_spectrogram("crehon")

In [None]:
view_spectrogram("elepai")

In [None]:
view_spectrogram("ercfra")

In [None]:
view_spectrogram("hawama")

In [None]:
view_spectrogram("hawcre")

In [None]:
view_spectrogram("hawgoo")

In [None]:
view_spectrogram("hawhaw")

In [None]:
view_spectrogram("hawpet1")

In [None]:
view_spectrogram("houfin")

In [None]:
view_spectrogram("iiwi")

In [None]:
view_spectrogram("jabwar")

In [None]:
view_spectrogram("maupar")

In [None]:
view_spectrogram("omao")

In [None]:
view_spectrogram("puaioh")

In [None]:
view_spectrogram("skylar")

In [None]:
view_spectrogram("warwhe1")

In [None]:
view_spectrogram("yefcan")