In [1]:
import csv
import os
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Set

import ipywidgets as widgets
import librosa
import pandas as pd
from IPython.display import Audio, display
from tqdm.auto import tqdm

In [2]:
@dataclass
class AudioSampleMetadata:
    """Technical properties of one audio file."""

    duration: float     # length as reported by librosa.get_duration
    sample_rate: int    # native sample rate of the file
    num_channels: int   # number of audio channels


@dataclass
class AudioSample:
    """A labeled audio file together with its technical metadata."""

    path: str
    label: str
    metadata: AudioSampleMetadata

    def toDict(self):
        """Flatten the sample into a single dict suitable for one CSV row.

        The duration is rendered as a string with four decimal places; the
        remaining metadata values are passed through unchanged.
        """
        meta = self.metadata
        formatted_duration = f"{meta.duration:.4f}"
        row = {
            "path": self.path,
            "label": self.label,
            "duration": formatted_duration,
            "sample_rate": meta.sample_rate,
            "num_channels": meta.num_channels,
        }
        return row
  


In [3]:
# Column names of the label CSV, in the same order as the keys returned by
# AudioSample.toDict().
FIELD_NAMES = [
    "path",
    "label",
    "duration",
    "sample_rate",
    "num_channels",
]

# Lower-case file suffixes that count as audio when scanning a folder.
AUDIO_EXTENSIONS = {".wav", ".mp3", ".flac", ".ogg"}

# Root of the sample library and the folder the label CSVs are written to.
SAMPLES_DIR = Path("/mnt/z/Music/Samples")
OUTPUT_DIR = Path("/mnt/c/Users/rishi/wsl/sample-manager/lib/data")

In [4]:
def find_audio_files(root_path: Path) -> List[Path]:
    """Recursively collect every audio file below ``root_path``.

    A path qualifies when its lower-cased suffix is in ``AUDIO_EXTENSIONS``
    and it is a regular file. The result is returned in sorted order.
    """
    matches = []
    for candidate in root_path.rglob("*"):
        # Cheap suffix test first; only touch the filesystem for likely hits.
        if candidate.suffix.lower() not in AUDIO_EXTENSIONS:
            continue
        if candidate.is_file():
            matches.append(candidate)
    matches.sort()
    return matches

In [5]:
def load_labeled_paths(csv_path: Path) -> Set[str]:
    """Return the set of file paths that already have a row in the label CSV.

    Args:
        csv_path: Location of the label CSV. It must contain a ``path``
            column when it has any content.

    Returns:
        The non-missing values of the ``path`` column as strings. An empty
        set is returned when the file does not exist (a message is printed)
        or when it exists but has no content at all.

    Raises:
        KeyError: If the CSV has content but no ``path`` column.
    """
    if not csv_path.exists():
        print(f"File {csv_path} does not exist")
        return set()
    try:
        df = pd.read_csv(csv_path)
    except pd.errors.EmptyDataError:
        # A zero-byte file (e.g. created by `touch`) holds no labels yet;
        # treat it like a missing file instead of aborting the resume.
        return set()
    return set(df["path"].dropna().astype(str))

In [6]:
def save_chunk_to_csv(entries: List[dict], csv_path: Path, is_first_chunk: bool):
    """Write a batch of label rows to the CSV file.

    Args:
        entries: One dict per row; the dict keys become the columns.
        csv_path: Destination CSV file.
        is_first_chunk: When true the file is (re)created and a header row is
            written; otherwise the rows are appended without a header.

    An empty ``entries`` list is a no-op. Without this guard an empty first
    chunk would open the file in ``'w'`` mode and wipe any existing content
    while writing no usable rows.
    """
    if not entries:
        return

    df = pd.DataFrame(entries)
    df.to_csv(
        csv_path,
        mode='w' if is_first_chunk else 'a',
        header=is_first_chunk,
        index=False,
        # Quote every non-numeric field so paths/labels containing commas
        # stay in a single column.
        quoting=csv.QUOTE_NONNUMERIC
    )

In [7]:
def extract_metadata(file_path: str) -> Optional[AudioSampleMetadata]:
    """
    Extracts audio metadata (duration, sample rate and channel count) using Librosa.

    The file is decoded at its native sample rate (``sr=None``) without
    down-mixing (``mono=False``).

    Args:
        file_path: Path of the audio file to inspect.

    Returns:
        The extracted metadata, or ``None`` when the file could not be loaded
        (the error is printed rather than raised).
    """
    try:
        y, sr = librosa.load(file_path, sr=None, mono=False)
        duration = librosa.get_duration(y=y, sr=sr)
        # With mono=False a single-channel file still comes back as a 1-D
        # array of samples, so shape[0] would be the sample count there.
        # Only arrays with a leading channel axis carry the count in shape[0].
        num_channels = 1 if y.ndim == 1 else y.shape[0]
        return AudioSampleMetadata(duration=duration, sample_rate=sr, num_channels=num_channels)
    except Exception as e:
        # Deliberately best-effort: report the failure and let the caller decide.
        print(f"Error extracting metadata from {file_path}: {e}")
        return None

In [8]:
def subtract_path(path: str, root: str) -> str:
    """Return ``path`` expressed relative to ``root``.

    When ``path`` is not located under ``root`` it is returned as-is
    (after normalisation through ``Path``).
    """
    target = Path(path)
    try:
        remainder = target.relative_to(Path(root))
    except ValueError:
        # Not below root: fall back to the full path.
        return str(target)
    return str(remainder)

In [9]:
def label_audio_file(file_path: str, root_folder: str) -> AudioSample:
    """
    Extracts audio metadata and labels the audio file.

    The file is played back inline and the user is prompted for a label; the
    prompt text is the file's path relative to ``root_folder``. The entered
    label is stripped and lower-cased.

    Args:
        file_path: Audio file to label.
        root_folder: Folder used only to shorten the path shown in the prompt.

    Returns:
        An ``AudioSample`` whose ``path`` is ``file_path`` unchanged (not the
        shortened prompt path).

    Raises:
        ValueError: If the metadata could not be extracted. This is raised
            before playback and prompting so the user is not asked to label a
            file whose row could not be written anyway.
    """
    metadata = extract_metadata(file_path)
    if metadata is None:
        # Without metadata AudioSample.toDict() would fail later, discarding
        # the label the user just typed. Fail fast instead.
        raise ValueError(f"Could not extract metadata from {file_path}")

    display(Audio(file_path))

    path = subtract_path(file_path, root_folder)

    # Manual inputs for labeling
    label = input(f"{path}").strip().lower()
    # ... add further data here

    return AudioSample(path=file_path, label=label, metadata=metadata)

In [24]:
def label_audio_files(root_path: Path, output_path: Path, chunk_size: int = 500):
    """Interactively label every not-yet-labeled audio file below ``root_path``.

    Files whose string path already appears in the ``path`` column of
    ``output_path`` are skipped, so an earlier session can be resumed. Labels
    are buffered and written to the CSV every ``chunk_size`` samples.

    Args:
        root_path: Folder that is scanned recursively for audio files.
        output_path: CSV file the labels are written/appended to. Its parent
            directory is created if necessary.
        chunk_size: Number of labeled samples to buffer between writes.

    Buffered labels are also flushed when the loop is left early (for example
    by interrupting the kernel), so manual work is not lost.
    """
    print(f"Scanning for audio files in: {root_path}")
    all_files = find_audio_files(root_path)
    labeled_paths = load_labeled_paths(output_path)

    # Filter out already labeled files
    files_to_label = [
        p for p in all_files
        if str(p) not in labeled_paths
    ]

    print(f"Found {len(all_files)} total files.")
    print(f"Resuming with {len(files_to_label)} files left to label.\n")

    chunk: List[dict] = []
    # Only create the file (with a header row) when nothing was labeled before;
    # otherwise append to the existing rows.
    is_first_chunk = len(labeled_paths) == 0

    if len(files_to_label) > 0:
        # Make sure the destination exists before the user invests time in
        # labeling, rather than failing at the first write.
        output_path.parent.mkdir(parents=True, exist_ok=True)

        paths = tqdm(files_to_label, desc="Labeling samples")

        # A dedicated output area so each sample's player replaces the previous one.
        audio_display = widgets.Output()
        display(audio_display)

        try:
            for file_path in paths:
                try:
                    with audio_display:
                        audio_display.clear_output(wait=True)
                        sample = label_audio_file(file_path, root_path)
                    chunk.append(sample.toDict())
                except Exception as e:
                    print(f"⚠️ Skipped {file_path.name} due to error: {e}")
                    continue

                if len(chunk) >= chunk_size:
                    save_chunk_to_csv(chunk, output_path, is_first_chunk)
                    is_first_chunk = False
                    chunk.clear()
        finally:
            # Write any remaining entries. This also runs when the loop is
            # aborted (e.g. KeyboardInterrupt during input).
            if chunk:
                save_chunk_to_csv(chunk, output_path, is_first_chunk)

    print(f"✅ Done. Labels saved to: {output_path}")

In [13]:
# Hand-picked locations inside one sample pack, kept around for quick manual
# experiments with single files.
arsenal_path = "/mnt/z/Music/Samples/Mo Falk/Jonth & Mo Falk - The Arsenal"
drums_path = arsenal_path + "/Drums"
snares_path = drums_path + "/Snares"
perc_loops_path = drums_path + "/Perc Loops"

# One example file of each kind.
test_snare = snares_path + "/RSNL Snare - 001.wav"
test_perc_loop = perc_loops_path + "/RSNL Perc Loop - 001.wav"

In [25]:
# Label the "Sample Flips" folder, writing results to mo_falk.csv in OUTPUT_DIR
# and flushing to disk every 100 labeled samples.
sample_flips_path = Path("/mnt/z/Music/Samples/Mo Falk/Sample Flips")

label_audio_files(
    root_path=sample_flips_path,
    output_path=OUTPUT_DIR / "mo_falk.csv",
    chunk_size=100,
)

Scanning for audio files in: /mnt/z/Music/Samples/Mo Falk/Sample Flips
Found 10 total files.
Resuming with 0 files left to label.

✅ Done. Labels saved to: /mnt/c/Users/rishi/wsl/sample-manager/lib/data/mo_falk.csv
