In [None]:
from pathlib import Path
import librosa
import pandas as pd

# Working directory at the time this cell runs; used as the root when
# building absolute paths to the audio files.
base = Path.cwd()

# Load the training metadata, count how often each primary label occurs,
# and keep the labels of the ten largest counts.
df = pd.read_csv("birdclef-2023/train_metadata.csv")
label_counts = df['primary_label'].value_counts()
top_10_species = label_counts.nlargest(10).index

In [10]:
# Keep only the rows whose primary label is one of the top-10 species,
# then renumber the index from zero.
is_top_species = df['primary_label'].isin(top_10_species)
df = df.loc[is_top_species].reset_index(drop=True)

# Persist the filtered metadata. NOTE: this is the same path the metadata
# was read from, so the unfiltered file on disk is replaced.
df.to_csv("birdclef-2023/train_metadata.csv", index=False)

In [None]:
# Add duration column to metadata
def get_duration(file_path):
    """Return the duration of an audio file, in seconds.

    The file is decoded with ``librosa.load`` at its native sampling rate
    (``sr=None`` disables resampling) and the duration is computed from the
    decoded samples.

    Args:
        file_path: Location of the audio file; passed unchanged to
            ``librosa.load``.

    Returns:
        The value ``librosa.get_duration`` reports for the decoded signal.
    """
    samples, native_rate = librosa.load(file_path, sr=None)
    return librosa.get_duration(y=samples, sr=native_rate)

# Make sure a 'duration' column exists. It is initialised with NaN rather
# than None so the column is float64; None would create an object-dtype
# column and the computed floats would be stored as Python objects.
if 'duration' not in df.columns:
    df['duration'] = float('nan')

# Boolean mask of the rows that still have no duration.
missing_duration = df['duration'].isna()
print(f"Calculating duration for {missing_duration.sum()} samples without duration...")

# Decode only the files whose duration is missing. The .loc assignment
# aligns on the index, so rows that already have a value are left untouched.
# NOTE(review): if `df` was not loaded from a file that already carries a
# 'duration' column, every row is missing and every file gets decoded.
audio_dir = base / "birdclef-2023" / "train_audio"
df.loc[missing_duration, 'duration'] = df.loc[missing_duration, 'filename'].apply(
    lambda name: get_duration(audio_dir / name)
)

# Save the metadata with durations to a separate file.
df.to_csv("birdclef-2023/train_metadata_with_duration.csv", index=False)
print(f"Updated CSV saved with {len(df)} total records")

In [None]:
# Remove duplicates based on duration, author, primary_label
df_with_duration = pd.read_csv("birdclef-2023/train_metadata_with_duration.csv")
df = df_with_duration

# Columns whose combined values are compared when looking for duplicates.
# Shared by the preview and the actual drop so the two cannot drift apart.
dedupe_keys = ['duration', 'author', 'primary_label']

# Preview the duplicate groups. keep=False flags EVERY member of a group,
# including the row that drop_duplicates will keep, so this count is larger
# than the number of rows actually removed below.
removed = df[df.duplicated(subset=dedupe_keys, keep=False)]
print(f"Found {len(removed)} duplicate records based on duration, author, and primary_label")
print(removed)

# drop_duplicates defaults to keep='first': the first row of each group
# survives and only the later ones are dropped.
df = df.drop_duplicates(subset=dedupe_keys)
print(f"Dropped {len(df_with_duration) - len(df)} records, keeping the first of each duplicate group")

df.to_csv("birdclef-2023/train_metadata_deduped.csv", index=False)
print(f"Deduplicated CSV saved with {len(df)} total records")

Found 66 duplicate records based on duration, author, and primary_label
     primary_label secondary_labels                               type  \
308         barswa               []  ['call', 'song', 'various calls']   
309         barswa               []  ['call', 'song', 'various calls']   
394         barswa               []                  ['adult', 'song']   
395         barswa               []                  ['adult', 'song']   
420         barswa               []                           ['call']   
...            ...              ...                                ...   
4020        wlwwar               []                           ['call']   
4022        wlwwar               []                   ['male', 'song']   
4040        wlwwar               []               ['call', 'juvenile']   
4580        woosan               []                           ['song']   
4606        woosan               []                           ['song']   

      latitude  longitude         scien