In [1]:
from pathlib import Path
import librosa
import pandas as pd

base = Path.cwd()

df = pd.read_csv("birdclef-2023/train_metadata.csv")
top_10_species = df['primary_label'].value_counts().nlargest(10).index

In [2]:
# Remove metadata records that aren't in the top 10 species by primary label counts
df = df[df['primary_label'].isin(top_10_species)]
print(df.shape)
df.reset_index(drop=True, inplace=True)

# Overwrite the original CSV with top-10 filtered data
df.to_csv("birdclef-2023/train_metadata.csv", index=False)

(4761, 13)


In [3]:
# Add duration column to metadata
def get_duration(file_path):
    y, sr = librosa.load(file_path, sr=None)
    return librosa.get_duration(y=y, sr=sr)

# Only calculate duration for samples that don't have it set
if 'duration' not in df.columns:
    df['duration'] = None

# Filter to only samples without duration
missing_duration = df['duration'].isna()
print(f"Calculating duration for {missing_duration.sum()} samples without duration...")

# Apply duration calculation only to samples missing duration
df.loc[missing_duration, 'duration'] = df.loc[missing_duration, 'filename'].apply(
    lambda x: get_duration(base / "birdclef-2023" / "train_audio" / x)
)

df.to_csv("birdclef-2023/train_metadata_with_duration.csv", index=False)
print(f"Updated CSV saved with {len(df)} total records")

Calculating duration for 4761 samples without duration...
Updated CSV saved with 4761 total records


In [4]:
# Remove duplicates based on duration, author, primary_label
df_with_duration = pd.read_csv("birdclef-2023/train_metadata_with_duration.csv")
df = df_with_duration

# See what would be removed
removed = df[df.duplicated(subset=['duration', 'author', 'primary_label'], keep=False)]
print(f"Found {len(removed)} duplicate records based on duration, author, and primary_label")
print(removed)

df = df.drop_duplicates(subset=['duration', 'author', 'primary_label'])
df.to_csv("birdclef-2023/train_metadata_deduped.csv", index=False)
print(f"Deduplicated CSV saved with {len(df)} total records")

Found 66 duplicate records based on duration, author, and primary_label
     primary_label secondary_labels                               type  \
308         barswa               []  ['call', 'song', 'various calls']   
309         barswa               []  ['call', 'song', 'various calls']   
394         barswa               []                  ['adult', 'song']   
395         barswa               []                  ['adult', 'song']   
420         barswa               []                           ['call']   
...            ...              ...                                ...   
4020        wlwwar               []                           ['call']   
4022        wlwwar               []                   ['male', 'song']   
4040        wlwwar               []               ['call', 'juvenile']   
4580        woosan               []                           ['song']   
4606        woosan               []                           ['song']   

      latitude  longitude         scien

# Data Cleaning

In [5]:
# Load the deduplicated dataset
df = pd.read_csv("birdclef-2023/train_metadata_deduped.csv")
print(f"Initial shape: {df.shape}")
print(f"Initial columns: {df.columns.tolist()}")
print("Unique values in 'type' column:")
unique_type_values = df['type'].unique()
print(unique_type_values)

Initial shape: (4727, 13)
Initial columns: ['primary_label', 'secondary_labels', 'type', 'latitude', 'longitude', 'scientific_name', 'common_name', 'author', 'license', 'rating', 'url', 'filename', 'duration']
Unique values in 'type' column:
["['call', 'flight call']" "['song']" "['call', 'female', 'male', 'song']"
 "['call', 'song']" "['flight call']" "['alarm call']"
 "['begging call', 'call', 'song']" "['call', 'juvenile']" "['call']"
 "['Begging calls']" "['flight call', 'male']" "['call', 'male', 'song']"
 "['flight call', 'male', 'song']" "['male', 'song']" "['call', 'male']"
 "['call', 'female', 'male']" "['call', 'flight call', 'song']"
 "['song in flight']" "['alarm call', 'flight call']"
 "['juvenile', 'song']"
 "['alarm call', 'begging call', 'call', 'juvenile']"
 "['alarm call', 'call', 'male']"
 "['call', 'juvenile', 'voice of fledglings']"
 "['begging call', 'feeding sounds']" "['call,flight call,begging call']"
 "['begging call', 'juvenile']" "['call', 'flight call', 'hi

In [6]:
# Step 1: Inspect NaN values
print("=== Step 1: Inspecting NaN values ===")
print("NaN counts per column:")
print(df.isna().sum())
print(f"\nRows with NaN latitude: {df['latitude'].isna().sum()}")
print(f"Rows with NaN longitude: {df['longitude'].isna().sum()}")
print(f"Rows with both NaN: {df[['latitude', 'longitude']].isna().all(axis=1).sum()}")

# Option: Keep rows with NaN coordinates but flag them
# Alternatively, we could drop them with: df = df.dropna(subset=['latitude', 'longitude'])
# For now, we'll keep them and handle them in geocoding step
df['has_coordinates'] = df[['latitude', 'longitude']].notna().all(axis=1)
print(f"Rows with valid coordinates: {df['has_coordinates'].sum()}")

=== Step 1: Inspecting NaN values ===
NaN counts per column:
primary_label        0
secondary_labels     0
type                 0
latitude            63
longitude           63
scientific_name      0
common_name          0
author               0
license              0
rating               0
url                  0
filename             0
duration             0
dtype: int64

Rows with NaN latitude: 63
Rows with NaN longitude: 63
Rows with both NaN: 63
Rows with valid coordinates: 4664


In [7]:
# Step 2: Drop unused columns
print("=== Step 2: Dropping unused columns ===")
print(f"Columns before dropping: {df.columns.tolist()}")

# Drop columns not useful for analysis
columns_to_drop = ['secondary_labels', 'scientific_name', 'common_name', 'author', 'license', 'url']

# Only drop columns that actually exist in the dataframe
existing_columns_to_drop = [col for col in columns_to_drop if col in df.columns]
missing_columns = [col for col in columns_to_drop if col not in df.columns]

if missing_columns:
    print(f"Note: These columns were not found (may have been dropped already): {missing_columns}")

if existing_columns_to_drop:
    df = df.drop(columns=existing_columns_to_drop)
    print(f"Dropped columns: {existing_columns_to_drop}")
else:
    print("No columns to drop (all specified columns are already missing)")

print(f"Columns after dropping: {df.columns.tolist()}")
print(f"Shape after dropping: {df.shape}")

=== Step 2: Dropping unused columns ===
Columns before dropping: ['primary_label', 'secondary_labels', 'type', 'latitude', 'longitude', 'scientific_name', 'common_name', 'author', 'license', 'rating', 'url', 'filename', 'duration', 'has_coordinates']
Dropped columns: ['secondary_labels', 'scientific_name', 'common_name', 'author', 'license', 'url']
Columns after dropping: ['primary_label', 'type', 'latitude', 'longitude', 'rating', 'filename', 'duration', 'has_coordinates']
Shape after dropping: (4727, 8)


In [8]:
# Step 3: Clean up the 'type' column
print("=== Step 3: Cleaning up 'type' column ===")
import ast

def simplify_type(type_str):
    """Summarize type to 'call', 'song', 'blank', or 'both'."""
    try:
        type_list = ast.literal_eval(type_str) if isinstance(type_str, str) else type_str
        if not isinstance(type_list, list):
            type_list = [type_list]
        
        # Filter out empty strings and non-call/song related terms
        # Remove lifestage and sex information
        lifestage_sex_terms = ['adult', 'juvenile', 'hatchling or nestling', 'life stage uncertain',
                              'male', 'female', 'sex uncertain', '']
        
        # Keep only call/song related terms
        relevant_terms = [t.lower().strip() for t in type_list 
                         if t and str(t).strip() and t.lower().strip() not in lifestage_sex_terms]
        
        if not relevant_terms:
            return 'blank'
        
        # Check for 'call' and 'song' in the terms
        has_call = any('call' in term for term in relevant_terms)
        has_song = any('song' in term for term in relevant_terms)
        
        if has_call and has_song:
            return 'both'
        elif has_call:
            return 'call'
        elif has_song:
            return 'song'
        else:
            # If neither call nor song, but has other terms, default to 'call'
            return 'call'
    except:
        return 'blank'

# Apply simplification
df['type'] = df['type'].apply(simplify_type)

print("Type column distribution after cleaning:")
print(df['type'].value_counts())
print(f"\nType column now has {df['type'].nunique()} unique values: {df['type'].unique()}")

=== Step 3: Cleaning up 'type' column ===
Type column distribution after cleaning:
type
call     2673
song     1198
blank     587
both      269
Name: count, dtype: int64

Type column now has 4 unique values: ['call' 'song' 'both' 'blank']


In [9]:
# Step 4: Extract country and continent from lat/long using geopy
print("=== Step 4: Extracting country and continent from coordinates ===")
try:
    from geopy.geocoders import Nominatim
    from geopy.exc import GeocoderTimedOut, GeocoderServiceError
    import time
    
    # Initialize geocoder (using Nominatim, which is free but has rate limits)
    geolocator = Nominatim(user_agent="birdclef_geocoding")
    
    def get_country_continent(lat, lon, max_retries=3):
        """Get country and continent from coordinates using geopy."""
        if pd.isna(lat) or pd.isna(lon):
            return None, None
        
        for attempt in range(max_retries):
            try:
                # Reverse geocode
                location = geolocator.reverse((float(lat), float(lon)), timeout=10)
                
                if location and location.raw:
                    address = location.raw.get('address', {})
                    country = address.get('country', None)
                    
                    # Simple continent mapping (can be improved)
                    continent = None
                    if country:
                        # Basic continent mapping for common countries
                        # This is simplified - for production, use a proper mapping
                        country_lower = country.lower()
                        if any(c in country_lower for c in ['united states', 'canada', 'mexico', 'brazil', 'argentina']):
                            continent = 'Americas'
                        elif any(c in country_lower for c in ['united kingdom', 'france', 'germany', 'spain', 'italy', 'sweden', 'norway', 'finland', 'poland', 'russia']):
                            continent = 'Europe'
                        elif any(c in country_lower for c in ['china', 'japan', 'india', 'thailand', 'indonesia', 'australia']):
                            continent = 'Asia' if 'australia' not in country_lower else 'Oceania'
                        elif any(c in country_lower for c in ['south africa', 'kenya', 'ethiopia', 'egypt']):
                            continent = 'Africa'
                    
                    return country, continent
                return None, None
            except (GeocoderTimedOut, GeocoderServiceError) as e:
                if attempt < max_retries - 1:
                    time.sleep(1)  # Wait before retry
                    continue
                return None, None
            except Exception as e:
                return None, None
        
        return None, None
    
    print("Geocoding coordinates (this may take a while due to rate limits)...")
    print("Note: Nominatim has rate limits, so this may be slow for large datasets.")
    
    # Apply geocoding only to rows with valid coordinates
    valid_mask = df[['latitude', 'longitude']].notna().all(axis=1)
    valid_indices = df[valid_mask].index
    
    # Initialize columns
    df['country'] = None
    df['continent'] = None
    
    # Process in batches with delays to respect rate limits
    batch_size = 100
    for i in range(0, len(valid_indices), batch_size):
        batch_indices = valid_indices[i:i+batch_size]
        for idx in batch_indices:
            country, continent = get_country_continent(df.loc[idx, 'latitude'], df.loc[idx, 'longitude'])
            df.at[idx, 'country'] = country
            df.at[idx, 'continent'] = continent
        
        if i + batch_size < len(valid_indices):
            time.sleep(1)  # Rate limiting delay
        
        if (i // batch_size + 1) % 10 == 0:
            print(f"Processed {min(i + batch_size, len(valid_indices))} / {len(valid_indices)} records...")
    
    print(f"\nSuccessfully geocoded {df['country'].notna().sum()} records")
    print(f"\nCountry distribution (top 10):")
    print(df['country'].value_counts().head(10))
    print(f"\nContinent distribution:")
    print(df['continent'].value_counts())
    
except ImportError:
    print("=== Geopy not available ===")
    print("To extract country/continent, install: pip install geopy")
    print("\nSkipping geocoding for now...")
    df['country'] = None
    df['continent'] = None

=== Step 4: Extracting country and continent from coordinates ===
Geocoding coordinates (this may take a while due to rate limits)...
Note: Nominatim has rate limits, so this may be slow for large datasets.
Processed 1000 / 4664 records...
Processed 2000 / 4664 records...
Processed 3000 / 4664 records...
Processed 4000 / 4664 records...

Successfully geocoded 4662 records

Country distribution (top 10):
country
France             585
España             541
Polska             397
Россия             369
Deutschland        335
Sverige            312
United Kingdom     292
Portugal           195
Nederland          153
Suomi / Finland    131
Name: count, dtype: int64

Continent distribution:
continent
Europe      1008
Africa        82
Americas      46
Asia          29
Name: count, dtype: int64


In [11]:
# Step 5: Final summary and save cleaned dataset
print("=== Step 5: Final summary ===")
print(f"Final shape: {df.shape}")
print(f"Final columns: {df.columns.tolist()}")
print(f"\nNaN counts in final dataset:")
print(df.isna().sum())
print(f"\nReduced dimensionality by dropping unused columns (secondary_labels, scientific_name, common_name, author, license, url)")
print(f"This helps reduce the curse of dimensionality!")

# Save cleaned dataset
output_file = "birdclef-2023/train_metadata_cleaned.csv"
df.to_csv(output_file, index=False)
print(f"\nCleaned dataset saved to: {output_file}")
print(f"Total records: {len(df)}")

=== Step 5: Final summary ===
Final shape: (4727, 10)
Final columns: ['primary_label', 'type', 'latitude', 'longitude', 'rating', 'filename', 'duration', 'has_coordinates', 'country', 'continent']

NaN counts in final dataset:
primary_label         0
type                  0
latitude             63
longitude            63
rating                0
filename              0
duration              0
has_coordinates       0
country              65
continent          3562
dtype: int64

Reduced dimensionality by dropping unused columns (secondary_labels, scientific_name, common_name, author, license, url)
This helps reduce the curse of dimensionality!

Cleaned dataset saved to: birdclef-2023/train_metadata_cleaned.csv
Total records: 4727
