1. Offline augmentation (przed treningiem)
   - zbalansowanie klas
   - augmentacja waveformów
   - augmentacja spektrogramów
   - wygenerowanie przykładów dla klasy silence
   - wygenerowanie spektrogramów

In [None]:
import os
import torch
import torchaudio.transforms as T
import soundfile as sf
from tqdm import tqdm

def preprocess_and_save_spectrograms(
    audio_dir,
    output_dir,
    sample_rate=16000,
    n_mels=64,
    n_fft=400,
    hop_length=200
):
    """Convert every ``.wav`` file under *audio_dir* into a dB-scaled mel spectrogram.

    The directory tree is walked recursively. Each file is loaded with
    soundfile, down-mixed to mono, resampled to *sample_rate* when its native
    rate differs, passed through ``MelSpectrogram`` followed by
    ``AmplitudeToDB``, and stored with ``torch.save`` under *output_dir* at the
    same relative path with the extension replaced by ``.pt``.

    Args:
        audio_dir: Root directory searched recursively for ``.wav`` files.
        output_dir: Root directory for the ``.pt`` files; created if missing,
            mirroring the sub-directory layout of *audio_dir*.
        sample_rate: Target sampling rate in Hz used for resampling and for
            the mel filter bank.
        n_mels: Number of mel bins.
        n_fft: FFT size passed to ``MelSpectrogram``.
        hop_length: Hop between STFT frames, in samples.
    """
    os.makedirs(output_dir, exist_ok=True)

    mel_spectrogram = T.MelSpectrogram(
        sample_rate=sample_rate,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
    )
    db_transform = T.AmplitudeToDB()

    print(f"Loading audio files from {audio_dir}...")
    audio_files = []
    for root, _, files in os.walk(audio_dir):
        for file in files:
            if file.endswith('.wav'):
                # Keep paths relative so the layout can be mirrored in output_dir.
                audio_files.append(os.path.relpath(os.path.join(root, file), audio_dir))

    # One resampler per distinct source rate; building the resampling kernel
    # for every single file is wasted work when most files share a rate.
    resamplers = {}

    print(f"Processing {len(audio_files)} audio files...")
    for audio_file in tqdm(audio_files, desc="Processing audio files"):
        # Load audio
        audio_path = os.path.join(audio_dir, audio_file)
        data, orig_sr = sf.read(audio_path)
        waveform = torch.tensor(data, dtype=torch.float32)

        # soundfile yields (frames,) for mono and (frames, channels) for
        # multi-channel audio, so the channel axis is the LAST one here.
        # Average it away first, then add the leading channel dimension the
        # torchaudio transforms expect: the result is always (1, frames).
        if waveform.ndim > 1:
            waveform = waveform.mean(dim=1)
        waveform = waveform.unsqueeze(0)

        # Resample if necessary
        if orig_sr != sample_rate:
            resampler = resamplers.get(orig_sr)
            if resampler is None:
                resampler = T.Resample(orig_freq=orig_sr, new_freq=sample_rate)
                resamplers[orig_sr] = resampler
            waveform = resampler(waveform)

        spec = mel_spectrogram(waveform)
        spec = db_transform(spec)

        output_path = os.path.join(output_dir, os.path.splitext(audio_file)[0] + '.pt')
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        torch.save(spec, output_path)
    print(f"Spectrograms saved to {output_dir}")

# Pre-compute spectrograms for the training set using the default settings.
preprocess_and_save_spectrograms(
    audio_dir="data/train/audio",
    output_dir="processed_data/train/audio",
)
