# What this notebook does:

* Compares the BirdCLEF 2021 train data with the BirdCLEF 2022 data.

In [None]:
!pip install nb-black > /dev/null

In [None]:
import ast

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from matplotlib_venn import venn2

plt.style.use("ggplot")
%load_ext lab_black

In [None]:
# --- load train data ---------------------------------------------------------
# "date" is dropped from the 2021 metadata up front; the reindex below aligns
# the remaining columns with the 2022 schema.
train_2021 = pd.read_csv("../input/birdclef-2021/train_metadata.csv").drop(
    columns="date"
)
train_2022 = pd.read_csv("../input/birdclef-2022/train_metadata.csv")
scored = pd.read_json("../input/birdclef-2022/scored_birds.json")

# --- normalize columns of 2021 -----------------------------------------------
# Same columns, same order as 2022; filenames get the "<primary_label>/" prefix.
train_2021 = train_2021.reindex(columns=train_2022.columns)
train_2021["filename"] = train_2021["primary_label"] + "/" + train_2021["filename"]
assert (train_2021.columns == train_2022.columns).all()

# --- add year columns --------------------------------------------------------
train_2021["year"] = 2021
train_2022["year"] = 2022

# --- append audio metadata ---------------------------------------------------
audio_2021 = pd.read_csv(
    "../input/birdclef-2022-train-metadata-with-audio-metadata/audio_metadata_2021.csv"
)
audio_2022 = pd.read_csv(
    "../input/birdclef-2022-train-metadata-with-audio-metadata/audio_metadata_2022.csv"
)
# NOTE(review): a column-wise concat aligns on the row index, so this assumes
# each audio-metadata file lists the recordings in exactly the same order as
# the matching train_metadata.csv -- confirm against the dataset.
train_2021 = pd.concat([train_2021, audio_2021], axis=1)
train_2022 = pd.concat([train_2022, audio_2022], axis=1)

# --- concat 2021 and 2022 ----------------------------------------------------
# 2021 rows come first; a fresh RangeIndex keeps row labels unique.
train = pd.concat([train_2021, train_2022], ignore_index=True)
assert len(train) == len(train_2021) + len(train_2022)

# --- add auxiliary columns ---------------------------------------------------
# `scored` is read from a JSON list, so its single column is named 0.
scored = scored.rename(columns={0: "label"}).assign(is_scored=True)

# Flag recordings whose primary label is one of the scored species.
# Only this column is derived: a frame-wide `fillna(False)` would also replace
# genuinely missing values in every other column (e.g. coordinates) with False.
train["is_scored"] = train["primary_label"].isin(scored["label"])

# num_secondary_labels: `secondary_labels` holds the string form of a Python
# list; parse it with `ast.literal_eval`, which accepts literals only and never
# executes code (unlike `eval`).
train["num_secondary_labels"] = train["secondary_labels"].apply(
    lambda labels: len(ast.literal_eval(labels))
)

# --- re-split into 2021 and 2022 ---------------------------------------------
# Row order is unchanged since the concat, so the first block is 2021 and the
# last block is 2022.
train_2021, train_2022 = train.head(len(train_2021)), train.tail(len(train_2022))

# --- distinct dataset --------------------------------------------------------
# Keeps the first occurrence of each filename, i.e. a file present in both
# years is kept as its 2021 row.
train_distinct = train.drop_duplicates(subset=["filename"])

# Duplication Check

In [None]:
# Scored species that already have recordings in the 2021 data.
scored_in_2021 = train_2021.loc[
    train_2021["is_scored"].eq(True), "primary_label"
].unique()
f"scored classes in 2021 data: {scored_in_2021}"

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
fig.suptitle("Duplication between 2021 vs 2022", fontsize=16)
fig.tight_layout()

# Distinct species codes and distinct file names per year.
l21 = set(train_2021["primary_label"].unique())
l22 = set(train_2022["primary_label"].unique())
f21 = set(train_2021["filename"].unique())
f22 = set(train_2022["filename"].unique())

# One Venn diagram per compared field: left circle = 2021, right circle = 2022.
year_names = ("train_2021", "train_2022")
comparisons = (
    (ax1, (l21, l22), "primary_labels"),
    (ax2, (f21, f22), "filename"),
)
for axis, pair, heading in comparisons:
    venn2(subsets=pair, set_labels=year_names, ax=axis)
    axis.set_title(heading)

plt.show()

* 40 bird species and 5901 files appear in both the 2021 and 2022 data.

# Geo Distribution

In [None]:
fig, ax = plt.subplots(figsize=(13, 8))

# Light-grey world map as the backdrop.
# NOTE(review): `gpd.datasets` is deprecated in GeoPandas 0.14 and removed in
# 1.0; this cell needs an older GeoPandas or another source for the basemap.
countries = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))
countries.plot(ax=ax, color="lightgrey")

# One "+" marker per recording, coloured by the year of the data set.
marker_style = dict(hue="year", palette="Set1", alpha=0.5, marker="+")
sns.scatterplot(data=train, x="longitude", y="latitude", ax=ax, **marker_style)

ax.set_title("Geo Distribution")
plt.show()

# Taxonomy

In [None]:
def create_tax_df(taxonomy, labels):
    """Look up the taxonomy rows for a collection of species codes.

    Args:
        taxonomy: Taxonomy table. It must contain ``SPECIES_CODE`` as well as
            the ``TAXON_ORDER``, ``CATEGORY``, ``SPECIES_GROUP`` and
            ``REPORT_AS`` columns, which are removed from the result.
        labels: Iterable of species codes (for example a set of primary
            labels).

    Returns:
        A DataFrame holding the matching taxonomy rows, ordered by species
        code, with an extra ``URL`` column pointing at the eBird species page.
        Labels without a match in ``taxonomy`` are left out (inner join).
    """
    # Callers pass sets, whose iteration order is not stable between
    # interpreter runs. Sorting makes the row order -- and anything derived
    # from it, such as tie-breaking in `value_counts` -- reproducible.
    birds = pd.DataFrame({"label": sorted(labels)})

    tax = pd.merge(birds, taxonomy, left_on="label", right_on="SPECIES_CODE").drop(
        columns=["label", "TAXON_ORDER", "CATEGORY", "SPECIES_GROUP", "REPORT_AS"]
    )
    tax["URL"] = "https://ebird.org/species/" + tax["SPECIES_CODE"]
    return tax


taxonomy = pd.read_csv("../input/birdclef-2022/eBird_Taxonomy_v2021.csv")

# Species found only in 2021 are tagged 2021; every 2022 species (including
# those shared with 2021) is tagged 2022.
tax_2021 = create_tax_df(taxonomy, l21 - l22).assign(year=2021)
tax_2022 = create_tax_df(taxonomy, l22).assign(year=2022)
tax_merged = pd.concat([tax_2021, tax_2022])

In [None]:
def taxonomy_count_plot(
    df1, df2, title, label1="2021 + 2022", label2="2022", log_scale=False
):
    """Draw side-by-side count plots of bird order and family.

    For each of the ``ORDER1`` and ``FAMILY`` columns, the counts of ``df1``
    are drawn as semi-transparent grey bars and the counts of ``df2`` are drawn
    on top of them on the same axes.

    Args:
        df1: Background frame; it also fixes the category order (most frequent
            category first).
        df2: Foreground frame drawn over ``df1``.
        title: Figure-level title.
        label1: Legend entry for ``df1``.
        label2: Legend entry for ``df2``.
        log_scale: If true, put the count axis of both panels on a log scale.
    """
    _, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 14))
    plt.suptitle(title, fontsize=18)

    # (column, axes, panel title, colour of the foreground bars)
    panels = (
        ("ORDER1", ax1, "Order", "orange"),
        ("FAMILY", ax2, "Family", "green"),
    )
    for column, axis, heading, foreground in panels:
        axis.set_title(heading)
        # Both layers share the category order derived from df1.
        ranking = df1[column].value_counts().index
        sns.countplot(
            y=column,
            data=df1,
            ax=axis,
            alpha=0.5,
            color="gray",
            order=ranking,
            label=label1,
        )
        sns.countplot(
            y=column,
            data=df2,
            ax=axis,
            color=foreground,
            order=ranking,
            label=label2,
        )
        if log_scale:
            axis.set_xscale("log")
        axis.legend()

    plt.tight_layout()
    plt.show()

In [None]:
# Species-level view: every species counts once.
taxonomy_count_plot(
    df1=tax_merged,
    df2=tax_2022,
    title="Distribution of Bird Order and Family",
)

In [None]:
# Sample-level view: attach the taxonomy to every recording, then count
# recordings (not species) per order and family.
taxonomy_join = dict(
    right=taxonomy, left_on="primary_label", right_on="SPECIES_CODE", how="left"
)
samples_all = train_distinct.merge(**taxonomy_join)
samples_2022 = train_2022.merge(**taxonomy_join)

taxonomy_count_plot(
    samples_all, samples_2022, "Sample Counts of Order and Family", log_scale=True
)

# Sample Counts

In [None]:
fig, ax1 = plt.subplots(1, 1, figsize=(8, 3))

# (legend label, frame, bar colour) for each year
yearly = (
    ("2021", train_2021, "blue"),
    ("2022", train_2022, "red"),
)

# value_counts sorts descending, so each year draws as a falling curve of
# per-species sample counts; the two years are overlaid on the same axes.
for year, frame, colour in yearly:
    per_species = frame["primary_label"].value_counts()
    ax1.bar(
        x=list(range(len(per_species))),
        height=per_species.values,
        color=colour,
        width=1,
        alpha=0.8,
        label=year,
    )

ax1.set(
    title="distribution of sample counts per species",
    xticks=[],
    xlabel="Species",
    ylabel="Count",
)
ax1.legend()
plt.tight_layout()
plt.show()

print("total sample counts:")
for year, frame, _colour in yearly:
    print(f"  - {year}: {len(frame)}")
print("unique species:")
for year, frame, _colour in yearly:
    print(f"  - {year}: {frame['primary_label'].nunique()}")

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 5), sharex=True)
panels = ((2021, ax1, "blue"), (2022, ax2, "red"))

# Number of de-duplicated recordings per species, one table per year.
species_counts = {
    2021: train_distinct[train_distinct["year"] == 2021]
    .groupby("primary_label")
    .agg(count=("year", "count")),
    2022: train_distinct[train_distinct["year"] == 2022]
    .groupby("primary_label")
    .agg(count=("year", "count")),
}

for year, axis, colour in panels:
    sns.histplot(
        x="count", data=species_counts[year], log_scale=True, ax=axis, color=colour
    )
for year, axis, _colour in panels:
    axis.set(xlabel="Sample count", title=str(year), xlim=(1e-1, 1e3))
plt.suptitle("Distribution of sample counts per species", fontsize=16)
plt.tight_layout()
plt.show()

print("* mean sample counts per species:")
for year, table in species_counts.items():
    print(f"  - {year}: {table['count'].mean():.1f}")

print("* number of species with sample counts >= 20:")
for year, table in species_counts.items():
    well_covered = int((table["count"] >= 20).sum())
    print(f"  - {year}: {well_covered}/{len(table)}")

# Ratings

In [None]:
# Compare how recordings are rated in each year, using the de-duplicated
# table so that a file present in both years is counted only once.
_, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 5))
ratings_2021 = train_distinct.query("year == 2021")
ratings_2022 = train_distinct.query("year == 2022")

sns.countplot(
    x="rating",
    data=ratings_2021,
    color="blue",
    label="2021",
    ax=ax1,
)

sns.countplot(
    x="rating",
    data=ratings_2022,
    color="red",
    label="2022",
    ax=ax2,
)
ax1.set(xlabel="Rating", title="2021")
ax2.set(xlabel="Rating", title="2022")
plt.suptitle("Distribution of rating", fontsize=16)
plt.tight_layout()
plt.show()

# Ratings are unitless scores, so the summary carries no " sec." suffix.
print("mean rating:")
print(
    f"  - 2021: {ratings_2021['rating'].mean():.1f} (std: {ratings_2021['rating'].std():.1f})"
)
print(
    f"  - 2022: {ratings_2022['rating'].mean():.1f} (std: {ratings_2022['rating'].std():.1f})"
)

# Secondary Labels

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 5))

# De-duplicated recordings of each year, keyed by the label shown in the plot.
by_year = {
    "2021": train_distinct[train_distinct["year"] == 2021],
    "2022": train_distinct[train_distinct["year"] == 2022],
}

for (year, subset), axis, colour in zip(
    by_year.items(), (ax1, ax2), ("blue", "red")
):
    sns.countplot(
        x="num_secondary_labels", data=subset, color=colour, label=year, ax=axis
    )
for year, axis in zip(by_year, (ax1, ax2)):
    axis.set(xlabel="#secondary labels", title=year)
plt.suptitle("Distribution of #secondary labels", fontsize=16)
plt.tight_layout()
plt.show()

print("mean #secondary labels:")
for year, subset in by_year.items():
    n_secondary = subset["num_secondary_labels"]
    print(f"  - {year}: {n_secondary.mean():.1f} (std: {n_secondary.std():.1f})")

# Audio Length

## Audio length per sample

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 5), sharex=True)

# Clip length of every de-duplicated recording, one series per year.
clip_length = {
    2021: train_distinct.loc[train_distinct["year"] == 2021, ["length"]],
    2022: train_distinct.loc[train_distinct["year"] == 2022, ["length"]],
}
panels = ((2021, ax1, "blue"), (2022, ax2, "red"))

for year, axis, colour in panels:
    sns.histplot(
        x="length", data=clip_length[year], log_scale=True, ax=axis, color=colour
    )
for year, axis, _colour in panels:
    axis.set(xlabel="length [sec]", title=str(year))
plt.suptitle("Distribution of audio length per sample", fontsize=16)
plt.tight_layout()
plt.show()

print("mean length of one clip:")
for year, lengths in clip_length.items():
    print(f"  - {year}: {lengths['length'].mean():.1f} sec.")
print("total length:")
for year, lengths in clip_length.items():
    print(f"  - {year}: {lengths['length'].sum() / 3600:.0f} hours")

* The 2021 data is clipped with a **lower bound of 6 seconds**.
* The total length of audio samples in the 2022 data is about **1/10** of that in 2021.

## Total audio length per species

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 5), sharex=True)
panels = ((2021, ax1, "blue"), (2022, ax2, "red"))

# Total recorded minutes per species (length is summed, then divided by 60).
minutes_per_species = {}
for year, _axis, _colour in panels:
    recordings = train_distinct[train_distinct["year"] == year]
    minutes_per_species[year] = (
        recordings.groupby("primary_label")[["length"]].sum() / 60
    )

for year, axis, colour in panels:
    sns.histplot(
        x="length",
        data=minutes_per_species[year],
        log_scale=True,
        ax=axis,
        color=colour,
    )
for year, axis, _colour in panels:
    axis.set(xlabel="length [min]", title=str(year), xlim=(1e-1, 1e3))
plt.suptitle("Distribution of total audio length per species", fontsize=16)
plt.tight_layout()
plt.show()

print("mean total length per species:")
for year, minutes in minutes_per_species.items():
    print(f"  - {year}: {minutes['length'].mean():.1f} min.")

* The mean total audio length per species in 2022 is about **28%** of that in 2021.

# Audio Channels

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 5), sharex=True)

# De-duplicated recordings of each year, keyed by the label shown in the plot.
by_year = {
    "2021": train_distinct[train_distinct["year"] == 2021],
    "2022": train_distinct[train_distinct["year"] == 2022],
}

for (year, subset), axis, colour in zip(
    by_year.items(), (ax1, ax2), ("blue", "red")
):
    sns.countplot(x="channels", data=subset, color=colour, label=year, ax=axis)
for year, axis in zip(by_year, (ax1, ax2)):
    axis.set(title=year)
plt.suptitle("Distribution of channels", fontsize=16)
plt.tight_layout()
plt.show()

print("mean channels:")
for year, subset in by_year.items():
    channels = subset["channels"]
    print(f"  - {year}: {channels.mean():.1f} (std: {channels.std():.1f})")

# Sample Rate

In [None]:
# Frequency table of sample rates for each year, shown as a (2021, 2022) pair.
rates_2021 = train_distinct.loc[train_distinct["year"] == 2021, "sample_rate"]
rates_2022 = train_distinct.loc[train_distinct["year"] == 2022, "sample_rate"]

(rates_2021.value_counts(), rates_2022.value_counts())

* Both the 2021 and 2022 data are sampled at 32 kHz.