In [1]:
# Load environment variables from .env file
from dotenv import load_dotenv
load_dotenv()

# Import services and libraries
import sys
sys.path.append('..')
import pandas as pd
import json
from pathlib import Path
from typing import Dict, List, Optional
import tempfile

from services.s3_manager import S3Manager
from services.spectrogram_manager import SpectrogramManager

# Set up the shared service objects used by the cells below
s3_manager = S3Manager(create_bucket_if_not_exists=False)
spectrogram_manager = SpectrogramManager()

# Confirm both managers are ready, then echo the default spectrogram settings
print(f"✅ S3 Manager initialized for bucket: {s3_manager.bucket_name}")
print("✅ Spectrogram Manager initialized")
print("\nDefault spectrogram parameters:")
params = spectrogram_manager.get_default_params()
for key, value in params.items():
    print(f"  {key}: {value}")



Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
Matplotlib is building the font cache; this may take a moment.


✅ S3 Manager initialized for bucket: bird-classification-data
✅ Spectrogram Manager initialized

Default spectrogram parameters:
  n_fft: 2048
  hop_length: 512
  n_mels: 128
  fmin: 0
  fmax: 8000
  sr: 22050
  dpi: 100
  figsize: (10, 4)


In [2]:
# Load recordings metadata from S3

# Stage the metadata CSV in a local scratch directory before parsing it
metadata_s3_key = "metadata/recordings_metadata.csv"
temp_dir = Path("/tmp/bird_metadata")
temp_dir.mkdir(exist_ok=True)
local_metadata_path = temp_dir / "recordings_metadata.csv"

print(f"Downloading metadata from S3...")
if not s3_manager.download_file(metadata_s3_key, str(local_metadata_path)):
    # Fall back to an empty frame so downstream cells can test len(df_recordings)
    print("❌ Failed to download metadata from S3")
    df_recordings = pd.DataFrame()
else:
    print(f"✅ Downloaded metadata to {local_metadata_path}")

    # Parse the CSV and report how many recordings each species has
    df_recordings = pd.read_csv(local_metadata_path)
    print(f"\n✅ Loaded {len(df_recordings)} recordings from metadata")
    print(f"\nRecordings per species:")
    species_counts = df_recordings['species_common_name'].value_counts()
    for species, count in species_counts.items():
        print(f"  {species}: {count}")



Downloading metadata from S3...
✅ Downloaded metadata to /tmp/bird_metadata/recordings_metadata.csv

✅ Loaded 692 recordings from metadata

Recordings per species:
  Northern Cardinal: 100
  Blue Jay: 100
  American Robin: 100
  Tufted Titmouse: 100
  Carolina Wren: 99
  Carolina Chickadee: 78
  Eastern Bluebird: 61
  Mourning Dove: 54


In [3]:
# Helper function to process a single recording

def process_recording_for_spectrogram(
    recording_row: pd.Series,
    skip_existing: bool = True,
    temp_dir: str = "/tmp/spectrogram_processing"
) -> Optional[Dict]:
    """
    Process a single recording: download audio, generate spectrogram, upload to S3.

    Uses the notebook-level ``s3_manager`` and ``spectrogram_manager`` objects.
    Temporary audio/spectrogram files are always removed before returning,
    whether processing succeeded, failed, or raised.

    Args:
        recording_row: Pandas Series with recording metadata. The fields read
            are 'recording_id', 'species_common_name',
            'species_scientific_name' and 'audio_s3_uri'.
        skip_existing: If True, skip if spectrogram already exists
        temp_dir: Temporary directory for processing (created if missing)

    Returns:
        Dictionary with spectrogram metadata, or None if the recording was
        skipped (already exists), had an invalid URI, or processing failed.
        Exceptions raised during download/generation/upload are caught,
        reported via print, and turned into a None return.
    """
    recording_id = str(recording_row['recording_id'])
    species_common_name = recording_row['species_common_name']
    audio_s3_uri = recording_row['audio_s3_uri']

    # Check if spectrogram already exists
    if skip_existing and s3_manager.spectrogram_exists(species_common_name, recording_id):
        return None  # Already processed

    # An empty CSV cell is read by pandas as float NaN, so verify the type
    # before calling string methods; otherwise one bad row aborts the batch.
    if not isinstance(audio_s3_uri, str) or not audio_s3_uri.startswith('s3://'):
        print(f"  ⚠️ Invalid S3 URI for {recording_id}: {audio_s3_uri}")
        return None

    # Extract S3 key from URI (format: s3://bucket/key). Only a leading
    # bucket prefix is stripped, so the same text inside the key is preserved.
    bucket_prefix = f"s3://{s3_manager.bucket_name}/"
    if audio_s3_uri.startswith(bucket_prefix):
        s3_key = audio_s3_uri[len(bucket_prefix):]
    else:
        s3_key = audio_s3_uri

    # Create temp directories
    temp_path = Path(temp_dir)
    temp_path.mkdir(parents=True, exist_ok=True)
    audio_temp = temp_path / f"{recording_id}_audio.mp3"
    spectrogram_temp = temp_path / f"{recording_id}_spectrogram.png"

    try:
        # Download audio from S3
        if not s3_manager.download_file(s3_key, str(audio_temp)):
            print(f"  ❌ Failed to download audio for {recording_id}")
            return None

        # Generate spectrogram
        spectrogram_metadata = spectrogram_manager.generate_mel_spectrogram(
            str(audio_temp),
            str(spectrogram_temp)
        )

        # Upload spectrogram to S3
        spectrogram_s3_uri = s3_manager.upload_spectrogram_file(
            str(spectrogram_temp),
            species_common_name,
            recording_id
        )

        if not spectrogram_s3_uri:
            print(f"  ❌ Failed to upload spectrogram for {recording_id}")
            return None

        # Build metadata dictionary
        return {
            'recording_id': recording_id,
            'spectrogram_s3_uri': spectrogram_s3_uri,
            'audio_s3_uri': audio_s3_uri,
            'spectrogram_params': spectrogram_metadata['spectrogram_params'],
            'image_width': spectrogram_metadata['image_width'],
            'image_height': spectrogram_metadata['image_height'],
            'sample_rate': spectrogram_metadata['sample_rate'],
            'duration_seconds': spectrogram_metadata['duration_seconds'],
            'species_common_name': species_common_name,
            'species_scientific_name': recording_row['species_scientific_name']
        }

    except Exception as e:
        print(f"  ❌ Error processing {recording_id}: {e}")
        return None

    finally:
        # Runs on every exit path (success, early return, exception) so that
        # failed downloads/uploads do not leave files behind in temp_dir.
        audio_temp.unlink(missing_ok=True)
        spectrogram_temp.unlink(missing_ok=True)

print("✅ Helper function defined")



✅ Helper function defined


In [4]:
# Process all recordings to generate spectrograms

# Define the accumulators before branching so the metadata cell that follows
# can always read them, even when there are no recordings to process.
all_spectrogram_metadata = []  # one metadata dict per newly generated spectrogram
errors = []                    # "<recording_id> (<species>)" labels for failures
skipped = 0                    # recordings whose spectrogram already existed

if len(df_recordings) == 0:
    print("⚠️ No recordings to process")
else:
    total_recordings = len(df_recordings)

    print(f"Starting spectrogram generation pipeline...")
    print(f"Total recordings: {total_recordings}")
    print(f"Skip existing: True\n")

    for idx, (_, row) in enumerate(df_recordings.iterrows(), 1):
        recording_id = str(row['recording_id'])
        species = row['species_common_name']

        print(f"[{idx}/{total_recordings}] Processing {recording_id} ({species})...", end=' ')

        # Check existence here (rather than inside the helper) so that a skip
        # can be told apart from a failure; the helper returns None for both.
        # NOTE(review): skipped recordings are not added to
        # all_spectrogram_metadata, so after a resumed run the list only
        # covers spectrograms generated in this run.
        if s3_manager.spectrogram_exists(species, recording_id):
            print("⏭️ Already exists, skipping")
            skipped += 1
            continue

        # Existence was just checked above, so tell the helper not to repeat it
        result = process_recording_for_spectrogram(row, skip_existing=False)

        if result:
            all_spectrogram_metadata.append(result)
            print("✅ Generated and uploaded")
        else:
            errors.append(f"{recording_id} ({species})")
            print("❌ Failed")

    print(f"\n{'='*60}")
    print(f"Spectrogram Generation Complete!")
    print(f"{'='*60}")
    print(f"✅ Successfully processed: {len(all_spectrogram_metadata)}")
    print(f"⏭️ Skipped (already exists): {skipped}")
    print(f"❌ Failed: {len(errors)}")

    if errors:
        print(f"\nFailed recordings:")
        for error in errors[:10]:  # Show first 10
            print(f"  - {error}")
        if len(errors) > 10:
            print(f"  ... and {len(errors) - 10} more")



Starting spectrogram generation pipeline...
Total recordings: 692
Skip existing: True

[1/692] Processing 1070756 (Northern Cardinal)... ✅ Generated and uploaded
[2/692] Processing 1070366 (Northern Cardinal)... ✅ Generated and uploaded
[3/692] Processing 1027842 (Northern Cardinal)... ✅ Generated and uploaded
[4/692] Processing 1012918 (Northern Cardinal)... ✅ Generated and uploaded
[5/692] Processing 1012917 (Northern Cardinal)... ✅ Generated and uploaded
[6/692] Processing 1012916 (Northern Cardinal)... ✅ Generated and uploaded
[7/692] Processing 1012495 (Northern Cardinal)... ✅ Generated and uploaded
[8/692] Processing 1012494 (Northern Cardinal)... ✅ Generated and uploaded
[9/692] Processing 1012493 (Northern Cardinal)... ✅ Generated and uploaded
[10/692] Processing 1012492 (Northern Cardinal)... ✅ Generated and uploaded
[11/692] Processing 1012491 (Northern Cardinal)... ✅ Generated and uploaded
[12/692] Processing 1012142 (Northern Cardinal)... ✅ Generated and uploaded
[13/692] P

Note: Illegal Audio-MPEG-Header 0x00000000 at offset 361589.
Note: Trying to resync...
Note: Hit end of (available) data during resync.


✅ Generated and uploaded
[64/692] Processing 740407 (Northern Cardinal)... ✅ Generated and uploaded
[65/692] Processing 738697 (Northern Cardinal)... ✅ Generated and uploaded
[66/692] Processing 738212 (Northern Cardinal)... ✅ Generated and uploaded
[67/692] Processing 730976 (Northern Cardinal)... ✅ Generated and uploaded
[68/692] Processing 727761 (Northern Cardinal)... ✅ Generated and uploaded
[69/692] Processing 725630 (Northern Cardinal)... ✅ Generated and uploaded
[70/692] Processing 725506 (Northern Cardinal)... ✅ Generated and uploaded
[71/692] Processing 725505 (Northern Cardinal)... ✅ Generated and uploaded
[72/692] Processing 725504 (Northern Cardinal)... ✅ Generated and uploaded
[73/692] Processing 725495 (Northern Cardinal)... ✅ Generated and uploaded
[74/692] Processing 721721 (Northern Cardinal)... ✅ Generated and uploaded
[75/692] Processing 720913 (Northern Cardinal)... ✅ Generated and uploaded
[76/692] Processing 718207 (Northern Cardinal)... ✅ Generated and uploaded




✅ Generated and uploaded
[296/692] Processing 549775 (Blue Jay)... ✅ Generated and uploaded
[297/692] Processing 530848 (Blue Jay)... ✅ Generated and uploaded
[298/692] Processing 521284 (Blue Jay)... ✅ Generated and uploaded
[299/692] Processing 511960 (Blue Jay)... ✅ Generated and uploaded
[300/692] Processing 1068477 (American Robin)... ✅ Generated and uploaded
[301/692] Processing 1049493 (American Robin)... ✅ Generated and uploaded
[302/692] Processing 1047670 (American Robin)... ✅ Generated and uploaded
[303/692] Processing 1046358 (American Robin)... ✅ Generated and uploaded
[304/692] Processing 1045448 (American Robin)... ✅ Generated and uploaded
[305/692] Processing 1023056 (American Robin)... ✅ Generated and uploaded
[306/692] Processing 1023050 (American Robin)... ✅ Generated and uploaded
[307/692] Processing 1013854 (American Robin)... ✅ Generated and uploaded
[308/692] Processing 1012677 (American Robin)... ✅ Generated and uploaded
[309/692] Processing 1012675 (American Ro



✅ Generated and uploaded
[503/692] Processing 569728 (Tufted Titmouse)... ✅ Generated and uploaded
[504/692] Processing 568906 (Tufted Titmouse)... ✅ Generated and uploaded
[505/692] Processing 562016 (Tufted Titmouse)... ✅ Generated and uploaded
[506/692] Processing 556166 (Tufted Titmouse)... ✅ Generated and uploaded
[507/692] Processing 544303 (Tufted Titmouse)... ✅ Generated and uploaded
[508/692] Processing 529792 (Tufted Titmouse)... ✅ Generated and uploaded
[509/692] Processing 522855 (Tufted Titmouse)... ✅ Generated and uploaded
[510/692] Processing 501232 (Tufted Titmouse)... ✅ Generated and uploaded
[511/692] Processing 501231 (Tufted Titmouse)... ✅ Generated and uploaded
[512/692] Processing 499563 (Tufted Titmouse)... ✅ Generated and uploaded
[513/692] Processing 479486 (Tufted Titmouse)... ✅ Generated and uploaded
[514/692] Processing 469655 (Tufted Titmouse)... ✅ Generated and uploaded
[515/692] Processing 469582 (Tufted Titmouse)... ✅ Generated and uploaded
[516/692] Pro



✅ Generated and uploaded
[520/692] Processing 446662 (Tufted Titmouse)... ✅ Generated and uploaded
[521/692] Processing 428278 (Tufted Titmouse)... ✅ Generated and uploaded
[522/692] Processing 422042 (Tufted Titmouse)... ✅ Generated and uploaded
[523/692] Processing 385825 (Tufted Titmouse)... ✅ Generated and uploaded
[524/692] Processing 375008 (Tufted Titmouse)... ✅ Generated and uploaded
[525/692] Processing 363065 (Tufted Titmouse)... 

Note: Illegal Audio-MPEG-Header 0x454e5245 at offset 621840.
Note: Trying to resync...
Note: Hit end of (available) data during resync.


✅ Generated and uploaded
[526/692] Processing 348110 (Tufted Titmouse)... ✅ Generated and uploaded
[527/692] Processing 333285 (Tufted Titmouse)... ✅ Generated and uploaded
[528/692] Processing 322769 (Tufted Titmouse)... ✅ Generated and uploaded
[529/692] Processing 318897 (Tufted Titmouse)... ✅ Generated and uploaded
[530/692] Processing 316285 (Tufted Titmouse)... ✅ Generated and uploaded
[531/692] Processing 313159 (Tufted Titmouse)... ✅ Generated and uploaded
[532/692] Processing 311712 (Tufted Titmouse)... ✅ Generated and uploaded
[533/692] Processing 310835 (Tufted Titmouse)... ✅ Generated and uploaded
[534/692] Processing 309396 (Tufted Titmouse)... ✅ Generated and uploaded
[535/692] Processing 308401 (Tufted Titmouse)... ✅ Generated and uploaded
[536/692] Processing 306851 (Tufted Titmouse)... ✅ Generated and uploaded
[537/692] Processing 305136 (Tufted Titmouse)... ✅ Generated and uploaded
[538/692] Processing 305015 (Tufted Titmouse)... ✅ Generated and uploaded
[539/692] Pro

Note: Illegal Audio-MPEG-Header 0x6e736973 at offset 186880.
Note: Trying to resync...
Note: Hit end of (available) data during resync.


✅ Generated and uploaded
[590/692] Processing 466694 (Carolina Chickadee)... ✅ Generated and uploaded
[591/692] Processing 452616 (Carolina Chickadee)... ✅ Generated and uploaded
[592/692] Processing 391442 (Carolina Chickadee)... ✅ Generated and uploaded
[593/692] Processing 391432 (Carolina Chickadee)... ✅ Generated and uploaded
[594/692] Processing 391431 (Carolina Chickadee)... ✅ Generated and uploaded
[595/692] Processing 366600 (Carolina Chickadee)... ✅ Generated and uploaded
[596/692] Processing 364473 (Carolina Chickadee)... ✅ Generated and uploaded
[597/692] Processing 362250 (Carolina Chickadee)... ✅ Generated and uploaded
[598/692] Processing 360934 (Carolina Chickadee)... ✅ Generated and uploaded
[599/692] Processing 339932 (Carolina Chickadee)... ✅ Generated and uploaded
[600/692] Processing 310196 (Carolina Chickadee)... ✅ Generated and uploaded
[601/692] Processing 309927 (Carolina Chickadee)... ✅ Generated and uploaded
[602/692] Processing 309925 (Carolina Chickadee)...

In [None]:
# Generate and upload spectrogram metadata

if all_spectrogram_metadata:
    # Create DataFrame (one row per spectrogram metadata dict)
    df_spectrograms = pd.DataFrame(all_spectrogram_metadata)

    # Save to temporary files (temp_dir is the scratch directory created when
    # the recordings metadata was downloaded)
    csv_path = temp_dir / "spectrograms_metadata.csv"
    json_path = temp_dir / "spectrograms_metadata.json"

    # Save CSV
    df_spectrograms.to_csv(csv_path, index=False)
    print(f"✅ Created CSV metadata: {csv_path}")

    # Save JSON
    df_spectrograms.to_json(json_path, orient='records', indent=2)
    print(f"✅ Created JSON metadata: {json_path}")

    # Upload to S3
    csv_s3_uri = s3_manager.upload_metadata_file(str(csv_path), "spectrograms_metadata")
    json_s3_uri = s3_manager.upload_metadata_file(str(json_path), "spectrograms_metadata")

    # Report both outcomes; a falsy return is treated as a failed upload so it
    # is not silently lost.
    if csv_s3_uri:
        print(f"✅ Uploaded CSV to S3: {csv_s3_uri}")
    else:
        print("❌ Failed to upload CSV metadata to S3")
    if json_s3_uri:
        print(f"✅ Uploaded JSON to S3: {json_s3_uri}")
    else:
        print("❌ Failed to upload JSON metadata to S3")

    # Display summary statistics
    print(f"\n{'='*60}")
    print(f"Spectrogram Dataset Summary")
    print(f"{'='*60}")
    print(f"Total spectrograms: {len(df_spectrograms)}")
    print(f"\nSpectrograms per species:")
    species_counts = df_spectrograms['species_common_name'].value_counts()
    for species, count in species_counts.items():
        print(f"  {species}: {count}")

    print(f"\nImage dimensions:")
    print(f"  Average width: {df_spectrograms['image_width'].mean():.0f} pixels")
    print(f"  Average height: {df_spectrograms['image_height'].mean():.0f} pixels")
    print(f"  Width range: {df_spectrograms['image_width'].min()}-{df_spectrograms['image_width'].max()}")
    print(f"  Height range: {df_spectrograms['image_height'].min()}-{df_spectrograms['image_height'].max()}")

    # Only claim a single rate for "all recordings" when that is actually true
    sample_rates = df_spectrograms['sample_rate'].dropna().unique().tolist()
    if len(sample_rates) == 1:
        print(f"\nSample rate: {sample_rates[0]} Hz (all recordings)")
    else:
        print(f"\nSample rates: {', '.join(str(rate) for rate in sample_rates)} Hz")
    print(f"Total audio duration: {df_spectrograms['duration_seconds'].sum() / 60:.1f} minutes")

    # Display first few rows
    print(f"\nSample spectrogram metadata:")
    print(df_spectrograms.head().to_string())
else:
    print("⚠️ No spectrogram metadata to process")

