In [None]:
!pip install nb-black > /dev/null

In [None]:
import ast

import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

plt.style.use("ggplot")

%load_ext lab_black

In [None]:
from types import SimpleNamespace

# Notebook-wide settings: the subclip length (used in the output file name),
# the number of folds, and the seed handed to the fold splitter.
cfg = SimpleNamespace(
    subclip_len_sec=60,
    n_splits=3,
    random_seed=135,
)

In [None]:
# Load the competition metadata and the per-subclip metadata. The file-name
# column is renamed so it can serve as the "original_filename" merge key.
train_meta = pd.read_csv("../input/birdclef-2022/train_metadata.csv").rename(
    columns={"filename": "original_filename"}
)
subclip_meta = pd.read_csv("../input/birdclef-2022-subclip-60-sec/subclip_meta.csv")
scored_birds = pd.read_json("../input/birdclef-2022/scored_birds.json")[0].tolist()

# Flag rows whose primary label is one of the scored species.
train_meta["is_scored"] = train_meta["primary_label"].isin(scored_birds)

# Flag rows located inside a fixed longitude/latitude box
# (lower bounds inclusive, upper bounds exclusive).
train_meta["in_hawaii"] = (
    train_meta["longitude"].ge(-161)
    & train_meta["longitude"].lt(-153)
    & train_meta["latitude"].ge(18)
    & train_meta["latitude"].lt(24)
)

In [None]:
# Row counts of train_meta split by the in_hawaii flag.
fig, ax = plt.subplots()
sns.countplot(data=train_meta, x="in_hawaii", ax=ax)
ax.set_title("Sample Counts")
plt.show()

In [None]:
# Row counts of train_meta split by the is_scored flag.
fig, ax = plt.subplots()
sns.countplot(data=train_meta, x="is_scored", ax=ax)
ax.set_title("Sample Counts")
plt.show()

In [None]:
# Right join: every subclip row is kept and picks up the metadata of the
# recording it names in "original_filename".
meta_merged = train_meta.merge(subclip_meta, how="right", on="original_filename")

In [None]:
# First rows of the merged table, transposed so each column reads as a row.
meta_merged.head().transpose()

# Define Label order

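* Based on the original metadata (`train_metadata.csv`):
    * the 21 scored species come first, ordered by descending `primary_label` sample count
    * the 131 unscored species follow, ordered by descending `primary_label` sample count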

In [None]:
# Split the species present in the merged metadata into scored / unscored sets
# and show the three set sizes.
scored_set = set(scored_birds)
all_set = set(meta_merged["primary_label"].unique())
unscored_set = all_set - scored_set
len(all_set), len(scored_set), len(unscored_set)

In [None]:
# Per-species row counts in the merged metadata, flagged by scored status.
label_counts = meta_merged.value_counts("primary_label")
label_counts = pd.DataFrame({"num_samples": label_counts}).reset_index()
label_counts["is_scored"] = label_counts["primary_label"].isin(scored_birds)
# sort_values returns a new frame, so the result has to be assigned; otherwise
# the table displayed below stays unsorted. Descending on both keys puts the
# scored species first and, within each part, the largest counts on top.
label_counts = label_counts.sort_values(
    ["is_scored", "num_samples"], ascending=False, ignore_index=True
)
label_counts

In [None]:
def create_scored_org():
    """Build the per-species table of scored birds from the original metadata.

    Re-reads ``train_metadata.csv``, keeps the rows whose ``primary_label`` is
    in the module-level ``scored_birds`` list and counts files per species.

    Returns:
        DataFrame with one row per scored species, ordered by descending file
        count, with columns ``primary_label``, ``num_samples_org``, ``group``
        and ``group_id``.

    Note:
        The group names are assigned by position and add up to 21 entries, so
        the assignment raises if a different number of scored species is found.
    """
    metadata = pd.read_csv("../input/birdclef-2022/train_metadata.csv")
    scored_rows = metadata[metadata["primary_label"].isin(scored_birds)]
    table = (
        scored_rows.groupby("primary_label")
        .agg(num_samples=("filename", "count"))
        .sort_values("num_samples", ascending=False)
    )

    # Positional tiers over the count-ordered species: 5 + 5 + 5 + 6 rows.
    tier_names = []
    for tier, size in (("top5", 5), ("mid_top5", 5), ("mid_low5", 5), ("low6", 6)):
        tier_names.extend([tier] * size)
    table["group"] = tier_names

    # Number the tiers in order of first appearance.
    tier_ids = {tier: idx for idx, tier in enumerate(table["group"].unique())}
    table["group_id"] = table["group"].map(tier_ids)

    return table.reset_index().rename(columns={"num_samples": "num_samples_org"})

In [None]:
def create_unscored_org():
    """Build the per-species table of unscored birds from the original metadata.

    Re-reads ``train_metadata.csv``, keeps the rows whose ``primary_label`` is
    NOT in the module-level ``scored_birds`` list and counts files per species.

    Returns:
        DataFrame with one row per unscored species, ordered by descending file
        count, with columns ``primary_label`` and ``num_samples_org``.
    """
    metadata = pd.read_csv("../input/birdclef-2022/train_metadata.csv")
    unscored_rows = metadata[~metadata["primary_label"].isin(scored_birds)]
    table = (
        unscored_rows.groupby("primary_label")
        .agg(num_samples=("filename", "count"))
        .sort_values("num_samples", ascending=False)
    )
    return table.reset_index().rename(columns={"num_samples": "num_samples_org"})

In [None]:
# Display the scored-species table; the call re-reads train_metadata.csv.
create_scored_org()

In [None]:
# Display the unscored-species table; the call re-reads train_metadata.csv.
create_unscored_org()

In [None]:
# Final label order: the scored species table first, then the unscored one,
# each already ordered by descending file count.
ordered_species = np.concatenate(
    (
        create_scored_org()["primary_label"].to_list(),
        create_unscored_org()["primary_label"].to_list(),
    )
)
assert len(ordered_species) == 152
ordered_species

In [None]:
# Map every species code to its position in ordered_species.
label2id = {label: idx for idx, label in enumerate(ordered_species)}
meta_merged["target"] = meta_merged["primary_label"].apply(
    lambda label: label2id[label]
)
# secondary_labels is parsed from its string form and iterated (presumably a
# list of species codes). ast.literal_eval accepts only Python literals, so
# unlike eval it cannot execute arbitrary expressions read from the CSV.
# The resulting ids are stored as one space-separated string per row.
meta_merged["target2"] = meta_merged["secondary_labels"].apply(
    lambda labels: " ".join(
        str(label2id[label]) for label in ast.literal_eval(labels)
    )
)

In [None]:
# Preview rows that have at least one secondary label (non-empty target2).
meta_merged.loc[meta_merged["target2"].str.len() > 0].head().T

# Split into folds

In [None]:
from sklearn.model_selection import StratifiedGroupKFold

In [None]:
def split_group_by(df, group_key, cfg):
    """Assign a stratified, group-aware fold number to every row.

    Args:
        df: DataFrame with a ``primary_label`` column (used as the
            stratification target) and a ``group_key`` column.
        group_key: name of the column passed to the splitter as the groups.
        cfg: object exposing ``n_splits`` and ``random_seed``.

    Returns:
        A copy of ``df`` with an integer ``fold`` column holding, for each row,
        the number of the split in which that row is in the validation part.
        The input frame is not modified.
    """
    df = df.copy()
    splitter = StratifiedGroupKFold(
        n_splits=cfg.n_splits, shuffle=True, random_state=cfg.random_seed
    )

    # split() yields positional row indices. Collect the folds in a positional
    # array instead of writing through .loc, which addresses rows by index
    # label and is only correct when df carries a default RangeIndex.
    folds = np.full(len(df), -1, dtype=np.int64)
    for fold_id, (_, val_index) in enumerate(
        splitter.split(df, df["primary_label"], df[group_key])
    ):
        folds[val_index] = fold_id

    df["fold"] = folds
    return df

In [None]:
def create_metadata(df, group_key, cfg):
    """Attach fold numbers to the metadata, save it as CSV and return it.

    Rows with ``is_scored == True`` receive their folds from
    ``split_group_by``; rows with ``is_scored == False`` receive the constant
    fold ``-2``. The scored part is placed first, the index is renumbered, and
    the result is written to a CSV in the current directory whose name encodes
    the subclip length, group key, fold count and seed.

    Args:
        df: DataFrame with an ``is_scored`` column plus the columns needed by
            ``split_group_by``.
        group_key: name of the grouping column forwarded to ``split_group_by``.
        cfg: object exposing ``subclip_len_sec``, ``n_splits`` and
            ``random_seed``.

    Returns:
        The combined DataFrame that was written to disk.
    """
    scored_mask = df["is_scored"].eq(True)
    unscored_mask = df["is_scored"].eq(False)

    scored_part = split_group_by(
        df.loc[scored_mask].reset_index(drop=True), group_key, cfg
    )
    unscored_part = df.loc[unscored_mask].reset_index(drop=True).assign(fold=-2)

    combined = pd.concat([scored_part, unscored_part], ignore_index=True)
    out_name = f"birdclef2022_metadata_v2_subclip_{cfg.subclip_len_sec}_sec_group_key_{group_key}_{cfg.n_splits}_fold_seed_{cfg.random_seed}.csv"
    combined.to_csv(out_name, index=False)
    return combined

In [None]:
# Build and save the fold assignment grouped by source recording, then keep
# only the rows that received a real fold (unscored rows carry fold == -2).
df = create_metadata(meta_merged, "original_filename", cfg).query("fold >= 0")

fig, ax = plt.subplots(figsize=(8, 8))
sns.countplot(
    data=df,
    y="primary_label",
    hue="fold",
    order=ordered_species[:21],  # scored species come first in ordered_species
    ax=ax,
)
ax.set_xscale("log")
ax.set_title("sample count per fold")
plt.show()

In [None]:
# Build and save the fold assignment grouped by author, then keep only the
# rows that received a real fold (unscored rows carry fold == -2).
df = create_metadata(meta_merged, "author", cfg).query("fold >= 0")

fig, ax = plt.subplots(figsize=(8, 8))
sns.countplot(
    data=df,
    y="primary_label",
    hue="fold",
    order=ordered_species[:21],  # scored species come first in ordered_species
    ax=ax,
)
ax.set_xscale("log")
ax.set_title("sample count per fold")
plt.show()