In [1]:
"""
DDSP Transformation Engine - Differentiable Audio Synthesis
Applies archetype-conditioned transformations to audio using DDSP principles
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import librosa
from scipy import signal
from typing import Dict, Tuple, Optional

In [2]:
class DDSPTransformationEngine(nn.Module):
    """
    Differentiable Digital Signal Processing transformation engine

    Transforms input audio based on predicted archetype mixture weights.
    The input is split with librosa's harmonic-percussive separation, each
    archetype reshapes one of the resulting components (low-pass filtering,
    soft clipping, high-frequency emphasis or added noise), and the results
    are blended with the per-example weights and peak-normalized.

    The separation and the Butterworth fallback filters run in NumPy/SciPy on
    detached copies of the data, so no gradient reaches the input audio
    through them; gradients still reach the learnable filter parameters.
    """

    # Column order expected in ``archetype_weights``; also the filter-bank keys.
    ARCHETYPE_NAMES = ('sine', 'square', 'sawtooth', 'triangle', 'noise')

    def __init__(
        self,
        sample_rate=44100,
        frame_size=64,
        learnable_filters=True,
        device='cpu'
    ):
        """
        Args:
            sample_rate: Audio sample rate in Hz, forwarded to the harmonic synthesizer
            frame_size: Stored on the instance; not used by the methods of this class
            learnable_filters: Build one LearnableFilter per archetype when True,
                otherwise fall back to fixed SciPy Butterworth filters
            device: Device the module and its sub-modules are moved to
        """
        super(DDSPTransformationEngine, self).__init__()

        self.sample_rate = sample_rate
        self.frame_size = frame_size
        self.device = device

        # Learnable filter banks for each archetype
        if learnable_filters:
            self.archetype_filters = nn.ModuleDict({
                name: LearnableFilter(num_bands=128, device=device)
                for name in self.ARCHETYPE_NAMES
            })
        else:
            self.archetype_filters = None

        # Harmonic synthesizer and noise generator are built here but are not
        # called by the transformation path below.
        self.harmonic_synth = HarmonicSynthesizer(sample_rate, device=device)
        self.noise_gen = NoiseGenerator(device=device)

        self.to(device)

    def decompose_audio(self, audio: torch.Tensor) -> Dict[str, torch.Tensor]:
        """
        Decompose audio into harmonic, noise, and residual components

        Args:
            audio: Input audio (batch, samples)

        Returns:
            Dict with 'harmonic', 'noise', 'residual' components; 'noise' holds
            the percussive part of the separation
        """
        # librosa works on NumPy arrays; detach first so tensors that require
        # grad can be converted instead of raising.
        audio_np = audio.detach().cpu().numpy()

        harmonics = []
        noises = []
        for clip in audio_np:
            # Harmonic-percussive separation
            y_harmonic, y_percussive = librosa.effects.hpss(clip)
            harmonics.append(y_harmonic)
            noises.append(y_percussive)

        # Put the results next to the input so the subtraction below never
        # mixes devices.
        harmonic = torch.from_numpy(np.stack(harmonics)).float().to(audio.device)
        noise = torch.from_numpy(np.stack(noises)).float().to(audio.device)

        # Residual is what's left
        residual = audio - harmonic - noise

        return {
            'harmonic': harmonic,
            'noise': noise,
            'residual': residual
        }

    def apply_archetype_transformation(
        self,
        audio: torch.Tensor,
        archetype_weights: torch.Tensor
    ) -> torch.Tensor:
        """
        Transform audio based on archetype mixture

        Args:
            audio: Input audio (batch, samples)
            archetype_weights: Archetype weights (batch, 5), columns ordered as
                ARCHETYPE_NAMES

        Returns:
            Transformed audio (batch, samples), scaled so each row peaks at 0.95
        """
        # Decompose into components
        components = self.decompose_audio(audio)

        # Each archetype maps to (transform, name of the component it reshapes).
        transforms = {
            'sine': (self._apply_sine_transform, 'harmonic'),          # fundamental-focused
            'square': (self._apply_square_transform, 'harmonic'),      # odd harmonics
            'sawtooth': (self._apply_sawtooth_transform, 'harmonic'),  # brighter
            'triangle': (self._apply_triangle_transform, 'harmonic'),  # mellow
            'noise': (self._apply_noise_transform, 'noise'),           # noisy texture
        }

        transformed = torch.zeros_like(audio)
        for i, archetype in enumerate(self.ARCHETYPE_NAMES):
            weight = archetype_weights[:, i].unsqueeze(1)  # (batch, 1)
            transform, source = transforms[archetype]

            # Weight and accumulate
            transformed += weight * transform(components[source])

        # Normalize to prevent clipping
        max_val = torch.abs(transformed).max(dim=1, keepdim=True)[0]
        transformed = transformed / (max_val + 1e-8) * 0.95

        return transformed

    def _apply_sine_transform(self, audio: torch.Tensor) -> torch.Tensor:
        """Make audio more sine-like: smooth, fundamental-focused"""
        # Apply strong low-pass filter
        return self._lowpass_filter(audio, cutoff=0.3)

    def _apply_square_transform(self, audio: torch.Tensor) -> torch.Tensor:
        """Make audio more square-like: add odd harmonics"""
        # Apply soft clipping to generate harmonics
        return torch.tanh(audio * 2.0) * 0.7

    def _apply_sawtooth_transform(self, audio: torch.Tensor) -> torch.Tensor:
        """Make audio more sawtooth-like: bright, all harmonics"""
        # Emphasize high frequencies by mixing the high-passed signal back in
        filtered = self._highpass_filter(audio, cutoff=0.4)
        return audio + filtered * 0.5

    def _apply_triangle_transform(self, audio: torch.Tensor) -> torch.Tensor:
        """Make audio more triangle-like: mellow, reduced harmonics"""
        # Gentle filtering through the triangle archetype's own learnable
        # filter, so it does not share parameters with the sine archetype.
        return self._lowpass_filter(audio, cutoff=0.4, archetype='triangle')

    def _apply_noise_transform(self, audio: torch.Tensor) -> torch.Tensor:
        """Add noise texture"""
        # Add low-level white Gaussian noise
        noise = torch.randn_like(audio) * 0.1
        return audio + noise

    def _lowpass_filter(
        self,
        audio: torch.Tensor,
        cutoff: float,
        archetype: str = 'sine'
    ) -> torch.Tensor:
        """
        Apply a low-pass filter

        Uses the learnable filter registered under ``archetype`` when the
        filter bank exists, otherwise the fixed Butterworth fallback.
        """
        if self.archetype_filters is not None:
            return self.archetype_filters[archetype](audio, filter_type='lowpass', cutoff=cutoff)
        # Fallback to simple filter
        return self._simple_lowpass(audio, cutoff)

    def _highpass_filter(
        self,
        audio: torch.Tensor,
        cutoff: float,
        archetype: str = 'sawtooth'
    ) -> torch.Tensor:
        """
        Apply a high-pass filter

        Uses the learnable filter registered under ``archetype`` when the
        filter bank exists, otherwise the fixed Butterworth fallback.
        """
        if self.archetype_filters is not None:
            return self.archetype_filters[archetype](audio, filter_type='highpass', cutoff=cutoff)
        return self._simple_highpass(audio, cutoff)

    def _butterworth(
        self,
        audio: torch.Tensor,
        order: int,
        cutoff: float,
        btype: str
    ) -> torch.Tensor:
        """Run a forward-backward Butterworth filter over every row with SciPy."""
        # Detach so inputs that require grad can still be converted to NumPy.
        audio_np = audio.detach().cpu().numpy()

        b, a = signal.butter(order, cutoff, btype=btype)
        filtered = [signal.filtfilt(b, a, row) for row in audio_np]

        # np.stack copies, which also gives torch a contiguous array to wrap.
        return torch.from_numpy(np.stack(filtered)).float().to(audio.device)

    def _simple_lowpass(self, audio: torch.Tensor, cutoff: float) -> torch.Tensor:
        """Simple IIR low-pass filter (4th-order Butterworth, not differentiable)"""
        return self._butterworth(audio, 4, cutoff, 'low')

    def _simple_highpass(self, audio: torch.Tensor, cutoff: float) -> torch.Tensor:
        """Simple IIR high-pass filter (2nd-order Butterworth, not differentiable)"""
        return self._butterworth(audio, 2, cutoff, 'high')

    def forward(
        self,
        audio: torch.Tensor,
        archetype_weights: torch.Tensor
    ) -> torch.Tensor:
        """
        Forward pass: transform audio with archetype weights
        """
        return self.apply_archetype_transformation(audio, archetype_weights)

In [3]:
class LearnableFilter(nn.Module):
    """
    Learnable frequency-domain filter
    Implements differentiable filtering in spectral domain

    A vector of ``num_bands`` learnable values is stretched over the FFT bins
    of the input, squashed with a sigmoid to a gain in (0, 1), combined with a
    fixed low-pass or high-pass mask, and multiplied onto the spectrum.
    """

    # Gain applied to the bins on the attenuated side of the cutoff.
    STOPBAND_GAIN = 0.1

    def __init__(self, num_bands=128, device='cpu'):
        """
        Args:
            num_bands: Number of learnable control points of the response
            device: Device the parameter is moved to
        """
        super(LearnableFilter, self).__init__()

        self.num_bands = num_bands

        # Learnable filter response (pre-sigmoid), initialised to all ones
        self.filter_response = nn.Parameter(torch.ones(num_bands))

        self.to(device)

    def forward(
        self,
        audio: torch.Tensor,
        filter_type: str = 'lowpass',
        cutoff: float = 0.5
    ) -> torch.Tensor:
        """
        Apply learnable filter to audio

        Args:
            audio: Input audio (batch, samples)
            filter_type: 'lowpass' or 'highpass'; any other value applies only
                the learned response without a cutoff mask
            cutoff: Cutoff frequency (0-1, normalized); values outside that
                range are clamped to it

        Returns:
            Filtered audio with the same number of samples as the input
        """
        # FFT
        audio_fft = torch.fft.rfft(audio)

        # Stretch the learnable response over the available frequency bins.
        # Index the two dummy dimensions away explicitly: a blanket squeeze()
        # would also drop the bin axis when there is only a single bin.
        freq_bins = audio_fft.shape[-1]
        filter_curve = F.interpolate(
            self.filter_response.unsqueeze(0).unsqueeze(0),
            size=freq_bins,
            mode='linear',
            align_corners=False
        )[0, 0]

        # Apply sigmoid for smooth response
        filter_curve = torch.sigmoid(filter_curve)

        # Clamp so a negative cutoff cannot wrap around as a negative index.
        cutoff_bin = int(min(max(cutoff, 0.0), 1.0) * freq_bins)

        # Modify based on filter type
        if filter_type in ('lowpass', 'highpass'):
            mask = torch.ones_like(filter_curve)
            if filter_type == 'lowpass':
                mask[cutoff_bin:] *= self.STOPBAND_GAIN
            else:
                mask[:cutoff_bin] *= self.STOPBAND_GAIN
            filter_curve = filter_curve * mask

        # Apply filter in frequency domain
        filtered_fft = audio_fft * filter_curve

        # Inverse FFT back to the original length
        filtered_audio = torch.fft.irfft(filtered_fft, n=audio.shape[-1])

        return filtered_audio

In [4]:
class HarmonicSynthesizer(nn.Module):
    """
    Synthesizes harmonic content based on fundamental frequency

    Builds a bank of sinusoids at integer multiples of a (possibly
    time-varying) fundamental and sums them with per-harmonic amplitudes.
    """

    def __init__(self, sample_rate=44100, max_harmonics=64, device='cpu'):
        """
        Args:
            sample_rate: Output sample rate in Hz
            max_harmonics: Upper bound on the number of harmonics that are summed
            device: Stored on the instance; synthesis follows the device of ``f0``
        """
        super(HarmonicSynthesizer, self).__init__()

        self.sample_rate = sample_rate
        self.max_harmonics = max_harmonics
        self.device = device

    def synthesize(
        self,
        f0: torch.Tensor,
        amplitudes: torch.Tensor,
        n_samples: int
    ) -> torch.Tensor:
        """
        Synthesize harmonic audio from fundamental frequency and amplitudes

        Args:
            f0: Fundamental frequency in Hz (batch, frames)
            amplitudes: Harmonic amplitudes (batch, frames, harmonics); columns
                beyond ``max_harmonics`` are ignored
            n_samples: Number of samples to generate

        Returns:
            Synthesized audio (batch, n_samples)
        """
        batch_size = f0.shape[0]

        # Upsample f0 once; it is the same for every harmonic.
        f0_interp = F.interpolate(
            f0.unsqueeze(1),
            size=n_samples,
            mode='linear',
            align_corners=False
        ).squeeze(1)

        # Phase is the running sum of the instantaneous frequency, which stays
        # correct when f0 changes over time (2*pi*f0(t)*t does not). Subtracting
        # the increment makes the sum exclusive so the phase starts at zero.
        cycles_per_sample = f0_interp / self.sample_rate
        cycles = torch.cumsum(cycles_per_sample, dim=-1) - cycles_per_sample
        # Only the fractional part matters for integer harmonics; wrapping keeps
        # the values small before they are multiplied by the harmonic number.
        cycles = torch.remainder(cycles, 1.0)

        nyquist = self.sample_rate / 2.0
        n_harmonics = min(self.max_harmonics, amplitudes.shape[-1])

        audio = torch.zeros(batch_size, n_samples, device=f0.device)
        for h in range(n_harmonics):
            harmonic_number = h + 1

            # Interpolate this harmonic's amplitude envelope to sample rate
            amp_interp = F.interpolate(
                amplitudes[:, :, h].unsqueeze(1),
                size=n_samples,
                mode='linear',
                align_corners=False
            ).squeeze(1)

            # Mute the harmonic wherever it would reach Nyquist and alias.
            audible = (harmonic_number * f0_interp < nyquist).to(amp_interp.dtype)

            phase = 2 * np.pi * harmonic_number * cycles
            audio += amp_interp * audible * torch.sin(phase)

        return audio

In [5]:
class NoiseGenerator(nn.Module):
    """
    Generates shaped noise for texture

    White Gaussian noise is drawn first; pink and brown variants are obtained
    by tilting its spectrum and rescaling each row back to its original level.
    """

    def __init__(self, device='cpu'):
        """
        Args:
            device: Device the generated noise is moved to
        """
        super(NoiseGenerator, self).__init__()
        self.device = device

    def generate(
        self,
        n_samples: int,
        batch_size: int = 1,
        color: str = 'white'
    ) -> torch.Tensor:
        """
        Generate colored noise

        Args:
            n_samples: Number of samples
            batch_size: Batch size
            color: 'white', 'pink', or 'brown'; any other value yields white noise

        Returns:
            Noise tensor (batch_size, n_samples)
        """
        # Generate white noise
        noise = torch.randn(batch_size, n_samples).to(self.device)

        if color == 'pink':
            # Apply 1/f filtering
            noise = self._pink_filter(noise)
        elif color == 'brown':
            # Apply 1/f^2 filtering
            noise = self._brown_filter(noise)

        return noise

    def _tilt_spectrum(self, noise: torch.Tensor, amplitude_exponent: float) -> torch.Tensor:
        """
        Scale the spectrum by bin_index ** -amplitude_exponent and restore the level

        Args:
            noise: Input noise, samples on the last axis
            amplitude_exponent: Exponent of the amplitude roll-off

        Returns:
            Filtered noise whose standard deviation along the last axis equals
            that of the corresponding input row
        """
        noise_fft = torch.fft.rfft(noise)

        # Bin indices stand in for frequency; the DC bin is set to 1 to avoid
        # dividing by zero, which leaves its gain at 1.
        freqs = torch.arange(noise_fft.shape[-1], device=noise.device).float()
        freqs[0] = 1
        filter_curve = freqs.pow(-amplitude_exponent)

        filtered = torch.fft.irfft(noise_fft * filter_curve, n=noise.shape[-1])

        # Match levels row by row, so one loud row in a batch cannot change
        # the scaling of the others.
        target_std = noise.std(dim=-1, keepdim=True)
        filtered_std = filtered.std(dim=-1, keepdim=True)
        return filtered / filtered_std * target_std

    def _pink_filter(self, noise: torch.Tensor) -> torch.Tensor:
        """Apply pink noise filtering (1/f power spectrum, i.e. 1/sqrt(f) amplitude)"""
        return self._tilt_spectrum(noise, amplitude_exponent=0.5)

    def _brown_filter(self, noise: torch.Tensor) -> torch.Tensor:
        """Apply brown noise filtering (1/f^2 power spectrum, i.e. 1/f amplitude)"""
        return self._tilt_spectrum(noise, amplitude_exponent=1.0)

In [6]:
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Build the transformation engine with its learnable per-archetype filters
ddsp_engine = DDSPTransformationEngine(
    device=device,
    learnable_filters=True,
    sample_rate=44100,
)

In [7]:
# Synthetic test batch: two clips of Gaussian noise, two seconds each at 44.1 kHz
batch_size = 2
n_samples = 2 * 44100

dummy_audio = torch.randn(batch_size, n_samples).to(device)

# One row of mixture weights per clip
# (columns: sine, square, sawtooth, triangle, noise)
archetype_weights = torch.tensor([
    [0.1, 0.1, 0.6, 0.1, 0.1],  # Bright sawtooth
    [0.5, 0.1, 0.1, 0.2, 0.1]   # Smooth sine
]).to(device)

print("=== Testing DDSP Transformation ===")
print(f"Input audio shape: {dummy_audio.shape}")
print(f"Archetype weights:\n{archetype_weights}")

# Run the batch through the engine
transformed = ddsp_engine(dummy_audio, archetype_weights)

# Report the output shape and the overall level before and after
print(f"\nTransformed audio shape: {transformed.shape}")
print(f"Input RMS: {torch.sqrt(torch.mean(dummy_audio ** 2)).item():.4f}")
print(f"Output RMS: {torch.sqrt(torch.mean(transformed ** 2)).item():.4f}")

=== Testing DDSP Transformation ===
Input audio shape: torch.Size([2, 88200])
Archetype weights:
tensor([[0.1000, 0.1000, 0.6000, 0.1000, 0.1000],
        [0.5000, 0.1000, 0.1000, 0.2000, 0.1000]])

Transformed audio shape: torch.Size([2, 88200])
Input RMS: 1.0008
Output RMS: 0.2363


In [8]:
import IPython.display as ipd
from IPython.display import display, HTML

# ipd.Audio reads a 2-D array as (channels, samples). Passing the whole
# (batch, samples) tensor would therefore play the two batch items as the
# left and right channel of a single stereo clip, so each clip gets its own
# mono player instead. Audio also rescales to full range by default, so
# loudness differences between clips are not audible in these players.

# Display original audio
print("\n▶️  ORIGINAL AUDIO:")
for clip in dummy_audio.detach().cpu().numpy():
    display(ipd.Audio(clip, rate=44100, autoplay=False))

# Display transformed audio
print("\n▶️  TRANSFORMED AUDIO:")
for clip in transformed.detach().cpu().numpy():
    display(ipd.Audio(clip, rate=44100, autoplay=False))



▶️  ORIGINAL AUDIO:



▶️  TRANSFORMED AUDIO:


In [9]:
# Generate proper audio waveforms (not random noise)
sample_rate = 44100
duration = 2.0
# endpoint=False keeps the spacing at exactly 1 / sample_rate. With the
# endpoint included the N samples span N - 1 intervals, which stretches the
# time axis and makes the tone play slightly above its nominal frequency.
t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)

# Create a clean sine wave as input (440 Hz = A4 note)
original_audio = np.sin(2 * np.pi * 440 * t) * 0.5

# Convert to a (1, samples) float32 tensor on the working device
dummy_audio = torch.from_numpy(original_audio).unsqueeze(0).float().to(device)

# Define archetype weights (columns: sine, square, sawtooth, triangle, noise)
# Example 1: Mostly sawtooth (bright)
archetype_weights_bright = torch.tensor([
    [0.1, 0.1, 0.6, 0.1, 0.1]  # High sawtooth
]).to(device)

# Example 2: Mostly sine (smooth)
archetype_weights_smooth = torch.tensor([
    [0.6, 0.1, 0.1, 0.2, 0.0]  # High sine
]).to(device)

In [10]:
print("=== Testing DDSP Transformation with Real Audio ===")
print("Input: Clean sine wave at 440 Hz")

# First pass: weights dominated by the sawtooth archetype
print("\n--- Transformation 1: Make it BRIGHT (sawtooth) ---")
transformed_bright = ddsp_engine(dummy_audio, archetype_weights_bright)

# Players for the untouched input and the bright result
print("\n▶️  ORIGINAL AUDIO (Sine Wave):")
display(ipd.Audio(dummy_audio.detach().cpu().numpy(), autoplay=False, rate=44100))

print("\n▶️  TRANSFORMED AUDIO (Bright Sawtooth):")
display(ipd.Audio(transformed_bright.detach().cpu().numpy(), autoplay=False, rate=44100))

# Second pass: weights dominated by the sine archetype
print("\n--- Transformation 2: Make it SMOOTH (more sine) ---")
transformed_smooth = ddsp_engine(dummy_audio, archetype_weights_smooth)

print("\n▶️  TRANSFORMED AUDIO (Smooth Sine):")
display(ipd.Audio(transformed_smooth.detach().cpu().numpy(), autoplay=False, rate=44100))

# Overall level of the input and of both results
print(f"\nOriginal RMS: {torch.sqrt(torch.mean(dummy_audio ** 2)).item():.4f}")
print(f"Transformed (bright) RMS: {torch.sqrt(torch.mean(transformed_bright ** 2)).item():.4f}")
print(f"Transformed (smooth) RMS: {torch.sqrt(torch.mean(transformed_smooth ** 2)).item():.4f}")

=== Testing DDSP Transformation with Real Audio ===
Input: Clean sine wave at 440 Hz

--- Transformation 1: Make it BRIGHT (sawtooth) ---

▶️  ORIGINAL AUDIO (Sine Wave):



▶️  TRANSFORMED AUDIO (Bright Sawtooth):



--- Transformation 2: Make it SMOOTH (more sine) ---

▶️  TRANSFORMED AUDIO (Smooth Sine):



Original RMS: 0.3536
Transformed (bright) RMS: 0.6258
Transformed (smooth) RMS: 0.6528


In [11]:
# Exercise the standalone noise generator with each supported colour
print("\n=== Testing Noise Generator ===")
noise_gen = NoiseGenerator(device=device)

# One second of each colour at 44.1 kHz, drawn in the order white, pink, brown
white_noise = noise_gen.generate(n_samples=44100, batch_size=1, color='white')
pink_noise = noise_gen.generate(n_samples=44100, batch_size=1, color='pink')
brown_noise = noise_gen.generate(n_samples=44100, batch_size=1, color='brown')

# The RMS values should be close to each other
print(f"White noise RMS: {torch.sqrt(torch.mean(white_noise ** 2)).item():.4f}")
print(f"Pink noise RMS: {torch.sqrt(torch.mean(pink_noise ** 2)).item():.4f}")
print(f"Brown noise RMS: {torch.sqrt(torch.mean(brown_noise ** 2)).item():.4f}")


=== Testing Noise Generator ===
White noise RMS: 0.9996
Pink noise RMS: 1.0046
Brown noise RMS: 1.0093
