## Install Dependencies

In [None]:
# NOTE(review): per the pip log below, ddsp 3.7.0 declares tensorflow<=2.11 and
# numpy<1.24, so the two explicit pins that follow override ddsp's own
# constraints -- TODO confirm this combination is actually supported.
# NOTE(review): `hmmlearn` is imported later in this notebook but is not
# installed here; presumably it is already present in the runtime -- verify.
!pip install crepe
!pip install --upgrade ddsp
!pip install --upgrade tensorflow==2.17
!pip install --force-reinstall -v "numpy==1.24.0"

Collecting ddsp
  Using cached ddsp-3.7.0-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting librosa<=0.10 (from ddsp)
  Using cached librosa-0.10.0-py3-none-any.whl.metadata (8.3 kB)
Collecting numpy<1.24 (from ddsp)
  Using cached numpy-1.23.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Collecting scipy<=1.10.1 (from ddsp)
  Using cached scipy-1.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)
Collecting tensorflow<=2.11 (from ddsp)
  Using cached tensorflow-2.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.1 kB)
Collecting tensorflow-probability<=0.19 (from ddsp)
  Using cached tensorflow_probability-0.19.0-py2.py3-none-any.whl.metadata (13 kB)
INFO: pip is looking at multiple versions of ddsp to determine which version is compatible with other requirements. This could take a while.
Using cached numpy-1.23.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
Installing colle

Collecting tensorflow==2.17
  Using cached tensorflow-2.17.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3 (from tensorflow==2.17)
  Downloading protobuf-4.25.5-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting tensorboard<2.18,>=2.17 (from tensorflow==2.17)
  Downloading tensorboard-2.17.1-py3-none-any.whl.metadata (1.6 kB)
Collecting keras>=3.2.0 (from tensorflow==2.17)
  Downloading keras-3.6.0-py3-none-any.whl.metadata (5.8 kB)
Collecting tensorboard-data-server<0.8.0,>=0.7.0 (from tensorboard<2.18,>=2.17->tensorflow==2.17)
  Downloading tensorboard_data_server-0.7.2-py3-none-manylinux_2_31_x86_64.whl.metadata (1.1 kB)
Downloading tensorflow-2.17.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (601.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m601.3/601.3 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hDow

Using pip 24.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)
Collecting numpy==1.24.0
  Obtaining dependency information for numpy==1.24.0 from https://files.pythonhosted.org/packages/f8/af/d6a4f957a15287faa4f5d47c8f4290fd5fac24649ed8df0e4a6634bc493a/numpy-1.24.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Using cached numpy-1.24.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Using cached numpy-1.24.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.23.5
    Uninstalling numpy-1.23.5:
      Removing file or directory /usr/local/bin/f2py
      Removing file or directory /usr/local/bin/f2py3
      Removing file or directory /usr/local/bin/f2py3.10
      Removing file or directory /usr/local/lib/python3.10/dist-packages/numpy-1.23.5.dist-info/
      Removing file or directory /usr/local/

In [None]:
import os
from IPython.display import Audio
import librosa
import librosa.display
import librosa.filters
import numpy as np
import soundfile
import crepe
import tensorflow as tf
from hmmlearn import hmm
import torch

# From DDSP-Core

These functions are taken and adapted from DDSP (https://github.com/magenta/ddsp/tree/main/ddsp) and MIDI-DDSP (https://github.com/magenta/midi-ddsp/tree/main/midi_ddsp) for our purposes.

## Math functions
(adapted from https://github.com/magenta/ddsp/blob/main/ddsp/core.py)

In [None]:
# Normalization constants used to scale the pitch and loudness features.
# Divisor applied to hz_to_midi() output (presumably the MIDI note range).
F0_RANGE = 127.0
# Default dynamic range, in dB, for power_to_db() and compute_loudness().
DB_RANGE = 80.0

def safe_divide(numerator, denominator, eps=1e-7):
  """Divide, substituting `eps` wherever the denominator is exactly zero."""
  is_zero = denominator == 0.0
  return numerator / tf.where(is_zero, eps, denominator)

def safe_log(x, eps=1e-5):
  """Natural log, with non-positive inputs replaced by `eps` beforehand."""
  non_positive = x <= 0.0
  clipped = tf.where(non_positive, eps, x)
  return tf.math.log(clipped)

def logb(x, base=2.0, eps=1e-5):
  """Logarithm of `x` in an arbitrary `base` (change-of-base identity)."""
  log_x = safe_log(x, eps)
  log_base = safe_log(base, eps)
  return safe_divide(log_x, log_base, eps)

def log10(x, eps=1e-5):
  """Base-10 logarithm; thin wrapper around `logb`."""
  return logb(x, 10, eps)

def power_to_db(power, ref_db=0.0, range_db=DB_RANGE, use_tf=True):
  """Convert linear power to decibels, limited to a dynamic range.

  Args:
    power: Power values on a linear scale.
    ref_db: Reference level subtracted from the result, in dB.
    range_db: Dynamic range in dB; the output is never below -range_db.
    use_tf: Use the TensorFlow helpers if True, NumPy otherwise.

  Returns:
    The power expressed in decibels.
  """
  # Pick the TensorFlow or NumPy implementation of each primitive.
  if use_tf:
    maximum, log_base10 = tf.maximum, log10
  else:
    maximum, log_base10 = np.maximum, np.log10

  # Floor the power at the bottom of the dynamic range before taking the log.
  floor = 10**-(range_db / 10.0)
  db = 10.0 * log_base10(maximum(floor, power))

  # Shift by the reference level, then clip to the dynamic range.
  db = db - ref_db
  return maximum(db, -range_db)

In [None]:
from typing import TypeVar

# Type variable used to annotate the unit-conversion helpers (e.g. hz_to_midi).
Number = TypeVar('Number', int, float, np.ndarray, tf.Tensor)

def tf_float32(x):
  """Return `x` as a float32 tf.Tensor, casting or converting as needed."""
  if not isinstance(x, tf.Tensor):
    return tf.convert_to_tensor(x, tf.float32)
  return tf.cast(x, dtype=tf.float32)  # No-op when x is already float32.

def hz_to_midi(frequencies: Number) -> Number:
  """Convert frequencies in Hz to (fractional) MIDI note numbers with TF ops.

  Frequencies less than or equal to 0 Hz are mapped to MIDI 0.
  """
  hz = tf_float32(frequencies)
  semitones_from_a4 = 12.0 * (logb(hz, 2.0) - logb(440.0, 2.0))
  midi = semitones_from_a4 + 69.0
  # Replace the result for non-positive frequencies with 0.
  return tf.where(tf.less_equal(hz, 0.0), 0.0, midi)

## MIDI-DDSP Audio I/O

For loading, reformatting, and preparing audio data for preprocessing/feature extraction

(adapted from https://github.com/magenta/midi-ddsp/blob/main/midi_ddsp/utils/audio_io.py)

In [None]:
def load_audio(file_path, sample_rate, mono=True, window_size=0,
               from_numpy=False):
  """
  Load an audio waveform from disk.
  :param file_path: Audio file path.
  :param sample_rate: Sample rate to load, will resample to the sample_rate
  if file has a different sample rate. Ignored when from_numpy is True.
  :param mono: Whether to load a mono audio file.
  :param window_size: If larger than 0, then the waveform will be cut off to
  have a length that is multiple of window_size,
  :param from_numpy: If the file_path provided is pointing to a npy file.
  :return: The waveform as a numpy array.
  """
  if from_numpy:
    y = np.load(file_path)
  else:
    # `sr` must be passed by keyword: it is keyword-only in librosa >= 0.10,
    # so the positional form raises a TypeError there.
    y, _ = librosa.load(file_path, sr=sample_rate, mono=mono,
                        dtype=np.float64)
  if window_size > 0:
    # NOTE(review): truncation is along the first axis; with mono=False
    # librosa returns (channels, samples) -- confirm this is intended.
    output_length = len(y) // window_size * window_size
    y = y[:output_length]
  return y

def predict_voicing(confidence):
  """
  Decode voiced/unvoiced frames with a fixed two-state Gaussian HMM.

  Adapted from https://github.com/marl/crepe/pull/26

  Parameters
  ----------
  confidence : np.ndarray [shape=(N,)]
      voicing confidence array, i.e. the confidence in the presence of
      a pitch
  Returns
  -------
  voicing_states : np.ndarray [shape=(N,)]
      HMM predictions for each frames state, 0 if unvoiced, 1 if
      voiced
  """
  # The parameters are set by hand; the model is never fitted.
  model = hmm.GaussianHMM(n_components=2)
  # Uniform prior over the two states.
  model.startprob_ = np.array([0.5, 0.5])
  # Per-state variance of the confidence observations.
  model.covars_ = np.array([[0.25], [0.25]])
  # Strong self-transitions so the decoded state changes rarely.
  model.transmat_ = np.array([[0.99, 0.01], [0.01, 0.99]])
  # State 0 is centred on confidence 0, state 1 on confidence 1.
  model.means_ = np.array([[0.0], [1.0]])
  model.n_features = 1

  # Decode the whole confidence track as a single sequence.
  observations = confidence.reshape(-1, 1)
  states = model.predict(observations, [len(confidence)])

  return np.array(states)


def get_framed_lengths(input_length, frame_size, hop_size, padding='center'):
  """Give a strided framing, such as tf.signal.frame, gives output lengths.

  Args:
    input_length: Original length along the dimension to be framed.
    frame_size: Size of frames for striding.
    hop_size: Striding, space between frames.
    padding: Type of padding to apply, ['valid', 'same', 'center']. 'valid' is
      a no-op. 'same' applies padding to the end such that
      n_frames = n_t / hop_size. 'center' applies padding to both ends such that
      each frame timestamp is centered and n_frames = n_t / hop_size + 1.

  Returns:
    n_frames: Number of frames left after striding.
    padded_length: Length of the padded signal before striding.

  Raises:
    ValueError: If `padding` is not one of 'valid', 'same' or 'center'.
  """
  def get_n_frames(length):
    # Floor division already yields a whole number of hops.
    return int((length - frame_size) // hop_size) + 1

  if padding == 'valid':
    padded_length = input_length
    n_frames = get_n_frames(input_length)

  elif padding == 'center':
    padded_length = input_length + frame_size
    n_frames = get_n_frames(padded_length)

  elif padding == 'same':
    n_frames = int(np.ceil(input_length / hop_size))
    padded_length = (n_frames - 1) * hop_size + frame_size

  else:
    # Without this branch an unknown mode reached the return statement with
    # both results unassigned and failed with an UnboundLocalError.
    raise ValueError('`padding` must be one of [\'center\', \'same\', '
                     f'\'valid\'], received ({padding}).')

  return n_frames, padded_length

def pad(x, frame_size, hop_size, padding='center',
        axis=1, mode='CONSTANT', constant_values=0):
  """Pad a tensor for strided framing such as tf.signal.frame.

  Args:
    x: Tensor to pad, any shape.
    frame_size: Size of frames for striding.
    hop_size: Striding, space between frames.
    padding: Type of padding to apply, ['valid', 'same', 'center']. 'valid' is
      a no-op. 'same' applies padding to the end such that
      n_frames = n_t / hop_size. 'center' applies padding to both ends such that
      each frame timestamp is centered and n_frames = n_t / hop_size + 1.
    axis: Axis along which to pad `x`. Forced to 0 for inputs of rank <= 1.
    mode: Padding mode for tf.pad(). One of "CONSTANT", "REFLECT", or
      "SYMMETRIC" (case-insensitive).
    constant_values: Passthrough kwarg for tf.pad().

  Returns:
    A padded version of `x` along axis, as a float32 tensor. Output sizes can
      be computed separately with get_framed_lengths.

  Raises:
    ValueError: If `padding` is not a supported mode, or if hop_size is
      larger than frame_size.
  """
  x = tf_float32(x)

  if padding == 'valid':
    return x

  # Validate the mode before any length computation, so an unsupported value
  # is reported here rather than failing inside get_framed_lengths.
  if padding not in ('same', 'center'):
    raise ValueError('`padding` must be one of [\'center\', \'same\', '
                     f'\'valid\'], received ({padding}).')

  if hop_size > frame_size:
    raise ValueError(f'During padding, frame_size ({frame_size})'
                     f' must be greater than hop_size ({hop_size}).')

  if len(x.shape) <= 1:
    axis = 0

  n_t = x.shape[axis]
  _, n_t_padded = get_framed_lengths(n_t, frame_size, hop_size, padding)
  pads = [[0, 0] for _ in range(len(x.shape))]

  if padding == 'same':
    # Pad only the end, up to the length needed for the last frame.
    pad_amount = int(n_t_padded - n_t)
    pads[axis] = [0, pad_amount]
  else:  # 'center'
    pad_amount = int(frame_size // 2)  # Symmetric even padding like librosa.
    pads[axis] = [pad_amount, pad_amount]

  return tf.pad(x, pads, mode=mode, constant_values=constant_values)



## Spectral Utilities

For calculating Short-Time Fourier Transform, Mel-spectrogram, Spectral Centroid, etc.

(adapted from https://github.com/magenta/ddsp/blob/main/ddsp/spectral_ops.py)

In [None]:
def spectral_centroid(wav, hop_length, sr):
  """Spectral centroid of `wav`, as computed by librosa.

  Args:
    wav: Audio waveform.
    hop_length: Hop between analysis frames, in samples.
    sr: Sample rate of `wav` in Hz.

  Returns:
    Whatever librosa.feature.spectral_centroid returns for these arguments.
  """
  return librosa.feature.spectral_centroid(
      y=wav, sr=sr, hop_length=hop_length)


def tf_stft(audio, win_length, hop_length, n_fft, pad_end=True):
  """Magnitude STFT via tf.signal.stft, returned as float32."""
  spectrum = tf.signal.stft(
      audio,
      frame_length=win_length,
      frame_step=hop_length,
      fft_length=n_fft,
      pad_end=pad_end)
  return tf.cast(tf.abs(spectrum), tf.float32)


def tf_mel(audio, sample_rate, win_length, hop_length, n_fft, num_mels, fmin=40,
           pad_end=True):
  """Mel spectrogram: magnitude STFT projected onto a mel filterbank.

  The filterbank spans `fmin` up to `sample_rate // 2` with `num_mels` bands.
  """
  mag = tf_stft(audio, win_length, hop_length, n_fft, pad_end=pad_end)
  n_bins = int(mag.shape[-1])
  mel_weights = tf.signal.linear_to_mel_weight_matrix(
      num_mel_bins=num_mels,
      num_spectrogram_bins=n_bins,
      sample_rate=sample_rate,
      lower_edge_hertz=fmin,
      upper_edge_hertz=sample_rate // 2)
  # Contract the frequency axis of the spectrogram with the filterbank.
  mel = tf.tensordot(mag, mel_weights, 1)
  # Set the static shape explicitly: leading dims of `mag` plus the mel dim.
  mel.set_shape(mag.shape[:-1].concatenate(mel_weights.shape[-1:]))
  return mel


def tf_log_mel(audio, sample_rate, win_length, hop_length, n_fft, num_mels,
               fmin=40, pad_end=True):
  """Log-mel spectrogram: `safe_log` applied to the output of `tf_mel`."""
  return safe_log(
      tf_mel(audio, sample_rate, win_length, hop_length, n_fft, num_mels,
             fmin=fmin, pad_end=pad_end))


def stft(audio, frame_size=2048, overlap=0.75, pad_end=True):
  """Complex STFT in TensorFlow (differentiable, batched).

  Args:
    audio: Waveform(s); a rank-3 input has its last (channel) axis squeezed.
    frame_size: Analysis window length in samples.
    overlap: Fraction of overlap between consecutive frames.
    pad_end: Passed through to tf.signal.stft.

  Returns:
    The output of tf.signal.stft.
  """
  audio = tf_float32(audio)
  # Remove channel dim if present.
  if len(audio.shape) == 3:
    audio = tf.squeeze(audio, axis=-1)

  hop = int(frame_size * (1.0 - overlap))
  return tf.signal.stft(
      signals=audio,
      frame_length=int(frame_size),
      frame_step=hop,
      fft_length=None,  # Use enclosing power of 2.
      pad_end=pad_end)


def stft_np(audio, frame_size=2048, overlap=0.75, pad_end=True):
  """STFT computed with librosa (non-differentiable), one example at a time.

  A 2-D input is treated as a batch and processed row by row; each result is
  transposed so frames come before frequency bins.
  """
  assert frame_size * overlap % 2.0 == 0.0
  hop_size = int(frame_size * (1.0 - overlap))
  is_2d = (len(audio.shape) == 2)

  if pad_end:
    # Pad along the time axis (axis 1 for batched input, axis 0 otherwise).
    audio = pad(audio, frame_size, hop_size, 'same', axis=is_2d).numpy()

  def stft_fn(y):
    spec = librosa.stft(
        y=y, n_fft=int(frame_size), hop_length=hop_size, center=False)
    return spec.T

  if is_2d:
    return np.stack([stft_fn(example) for example in audio])
  return stft_fn(audio)

# Feature Extraction

Feature extraction as implemented in MIDI-DDSP's DDSP Inference model



*   f0 extraction is performed using the pre-trained CREPE model (https://github.com/marl/crepe)
*   loudness curve is calculated via A-weighting of the power spectrum

These features are calculated for each training example and serve as input to the encoder model, in addition to the log-mel spectrogram.




In [None]:
def extract_f0(wav, frame_shift_ms=5, sr=16000, unvoice=True, no_log=False):
  """Extract f0 from audio using CREPE.

  Args:
    wav: Audio waveform.
    frame_shift_ms: Step between f0 estimates, in milliseconds (passed to
      CREPE as `step_size`).
    sr: Sample rate of `wav` in Hz. Only 16000 is accepted, so that is now the
      default (the previous default of 44100 always raised).
    unvoice: If True, zero out f0 on frames `predict_voicing` marks unvoiced.
    no_log: If True, silence CREPE's progress output.

  Returns:
    The f0 track in Hz, one value per analysis frame.

  Raises:
    RuntimeError: If `sr` is not 16000.
  """
  if sr != 16000:
    raise RuntimeError('CREPE method should use sr=16khz')
  _, frequency, confidence, _ = crepe.predict(
    wav, sr=sr,
    viterbi=True,
    step_size=frame_shift_ms,
    verbose=0 if no_log else 1)

  if not unvoice:
    return frequency

  # Voicing states are 0 (unvoiced) or 1 (voiced), so the product zeroes
  # the f0 of unvoiced frames.
  return frequency * predict_voicing(confidence)

def compute_loudness(audio,
                     sample_rate=16000,
                     frame_rate=250,
                     n_fft=512,
                     range_db=DB_RANGE,
                     ref_db=0.0,
                     use_tf=True,
                     padding='center'):
  """Perceptual loudness (A-weighted power) in dB.

  Function is differentiable if use_tf=True.

  Args:
    audio: Numpy ndarray or tensor. Shape [batch_size, audio_length] or
      [audio_length,].
    sample_rate: Audio sample rate in Hz.
    frame_rate: Rate of loudness frames in Hz.
    n_fft: Fft window size.
    range_db: Sets the dynamic range of loudness in decibels. The minimum
      loudness (per a frequency bin) corresponds to -range_db.
    ref_db: Sets the reference maximum perceptual loudness as given by
      (A_weighting + 10 * log10(abs(stft(audio))**2.0).
    use_tf: Make function differentiable by using tensorflow.
    padding: 'same', 'valid', or 'center'.

  Returns:
    Loudness in decibels. Shape [batch_size, n_frames] or [n_frames,].
  """
  # Backend selection: TensorFlow or NumPy versions of each primitive.
  if use_tf:
    lib, reduce_mean, stft_fn = tf, tf.reduce_mean, stft
  else:
    lib, reduce_mean, stft_fn = np, np.mean, stft_np

  # Frame the signal: the window is n_fft, the hop is derived from frame_rate.
  frame_size = n_fft
  hop_size = sample_rate // frame_rate
  audio = pad(audio, frame_size, hop_size, padding=padding)
  if not use_tf:
    audio = np.array(audio)

  # Give single examples a temporary batch dimension.
  is_1d = (len(audio.shape) == 1)
  if is_1d:
    audio = audio[lib.newaxis, :]

  # Power spectrogram.
  overlap = 1 - hop_size / frame_size
  s = stft_fn(audio, frame_size=frame_size, overlap=overlap, pad_end=False)
  power = lib.abs(s)**2

  # A-weighting is given in decibels; convert it to a linear power gain.
  frequencies = librosa.fft_frequencies(sr=sample_rate, n_fft=n_fft)
  a_weighting = librosa.A_weighting(frequencies)[lib.newaxis, lib.newaxis, :]
  power = power * 10**(a_weighting/10)

  # Average weighted power over frequency bins, then convert to dB.
  avg_power = reduce_mean(power, axis=-1)
  loudness = power_to_db(avg_power, ref_db=ref_db, range_db=range_db,
                         use_tf=use_tf)

  # Drop the temporary batch dimension again.
  if is_1d:
    return loudness[0]
  return loudness



# Data Preprocessing

In [None]:
# Mount Google Drive (Colab only) so the audio paths under /content/drive
# used by later cells are readable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def pad_feature_array(feature_array, multiple=1000):
  """
  Zero-pads a feature array along its first axis up to the next multiple of
  `multiple` frames.

  An array whose length is already an exact multiple is returned without any
  padding (it previously gained a full extra block of zeros).

  Args:
    feature_array: the extracted feature array; frames are on the first axis
    multiple: the block size, in frames, to pad up to (default 1000)
  Returns:
    padded: the zero-padded feature array, as a numpy array
  """
  features = np.asarray(feature_array)
  n_frames = features.shape[0]
  # Frames missing to reach the next multiple; 0 when already aligned.
  n_missing = -n_frames % multiple
  # The zero block matches every trailing dimension of the input.
  zeros = np.zeros((n_missing,) + features.shape[1:])
  padded = np.concatenate([features, zeros])

  return padded

def pad_audio(y, frame_size, padded_shape):
  """
    Zero-pads the audio so its length matches the padded feature array.

    Args:
      y: the audio as an array
      frame_size: number of audio samples per feature frame
      padded_shape: the shape of the padded feature array; only its first
        entry (the number of frames) is used
    Returns:
      padded: the audio followed by zeros, of length
        padded_shape[0] * frame_size
  """
  n_missing = padded_shape[0] * frame_size - len(y)
  return np.concatenate([y, np.zeros(n_missing)])


def preprocess_file(filename, frame_size=64, sr=16000):
  """
  Takes input audio, extracts features, and slices into examples.

  Args:
    filename: path to audio file (string)
    frame_size: the number of samples per processing frame, used as the hop
      size (int)
    sr: sample rate the audio is loaded at (int); extract_f0 only accepts
      16000

  Returns:
    split_output: a dict containing audio (shape=(m, 1000 * frame_size)),
      f0 (shape=(m, 1000)), loudness (shape=(m, 1000)), and
      log-mel (shape=(m, 1000, 64))
    output: a dict containing the raw audio, as well as f0, loudness, and
      log-mel for the full audio length
  """

  # load whole audio file at the requested rate (was hard-coded to 16000,
  # which silently ignored the `sr` argument)
  y, _ = librosa.load(filename, sr=sr)

  # processing parameters
  hop_length = frame_size
  win_length = hop_length * 2
  frame_shift_ms = 1000 / sr * frame_size  # hop expressed in milliseconds
  frame_rate = sr / frame_size             # feature frames per second

  # extract features
  f0_hz = extract_f0(y, frame_shift_ms=frame_shift_ms, sr=sr)
  loudness_db = compute_loudness(y, sample_rate=sr, frame_rate=frame_rate,
                                 n_fft=512)
  mel = tf_log_mel(y, sample_rate=sr, win_length=win_length,
                   hop_length=hop_length, n_fft=1024, num_mels=64)

  # pad every feature (and the audio) so it divides into 1000-frame examples
  f0_hz_padded = pad_feature_array(f0_hz)
  loudness_db_padded = pad_feature_array(loudness_db)
  mel_padded = pad_feature_array(mel)
  y_padded = pad_audio(y, frame_size, f0_hz_padded.shape)

  # slice into examples, reshape
  f0_hz_split = np.reshape(f0_hz_padded, (-1, 1000))
  loudness_db_split = np.reshape(loudness_db_padded, (-1, 1000))
  mel_split = np.reshape(mel_padded, (-1, 1000, 64))
  # The audio examples must come from the padded waveform. Reshaping
  # `mel_padded` here only happened to work because num_mels == frame_size,
  # and it returned mel values labelled as audio.
  audio_split = np.reshape(y_padded, (-1, 1000 * frame_size))

  split_output = {'audio': audio_split, 'f0_hz': f0_hz_split,
                  'loudness_db': loudness_db_split, 'mel': mel_split}
  output = {'audio': y, 'f0_hz': f0_hz, 'loudness_db': loudness_db,
            'mel': mel}

  return split_output, output


In [None]:
# rewriting files as 16kHz

def convert_to_16kHz(input_file, output_folder):
  """
  Converts input audio to 16kHz and writes it to `output_folder`.

  The output keeps the input's base name and extension, with '_16kHz'
  inserted before the extension. `output_folder` is created if it does not
  exist. Failures while saving are reported with a message, not raised.

  Args:
    input_file: path to audio file (string)
    output_folder: path to output folder (string)

  Returns:
    None
  """
  # sr=None keeps the file's native sample rate so it can be reported below.
  audio, sr = librosa.load(input_file, sr=None)

  if audio is None or len(audio) == 0:
    print(f"Failed to load audio from {input_file}. The audio data is empty.")
    return

  print(f"Loaded audio with sample rate: {sr} and shape: {audio.shape}")

  audio_resampled = librosa.resample(audio, orig_sr=sr, target_sr=16000)
  print(f"Resampled audio shape: {audio_resampled.shape}")

  # Build the output name: <original name>_16kHz<original extension>
  file_name, file_extension = os.path.splitext(os.path.basename(input_file))
  output_file = os.path.join(output_folder, f"{file_name}_16kHz{file_extension}")

  # Save the resampled audio to the new output file
  try:
    # Writing into a folder that does not exist fails, so create it first.
    os.makedirs(output_folder, exist_ok=True)
    # NOTE(review): whether soundfile can write this extension (e.g. .mp3)
    # depends on the installed libsndfile build -- confirm.
    soundfile.write(output_file, audio_resampled, 16000)
    print(f"Audio resampled and saved to {output_file}")
  except Exception as e:
    print(f"Error saving file: {e}")

def folder_to_16kHz(folder, output_folder):
  """
  Converts all .mp3 files in a folder to 16kHz

  Only regular files directly inside `folder` whose name ends in '.mp3'
  (case-insensitive) are converted; everything else is skipped.

  Args:
    folder: path to input folder (string)
    output_folder: path to output folder (string); created if missing

  Returns:
    None
  """
  # Ensure the output folder exists
  os.makedirs(output_folder, exist_ok=True)

  # Loop through all files in the input folder (sorted for a stable order)
  for file_name in sorted(os.listdir(folder)):
    # Full path to the file
    input_file = os.path.join(folder, file_name)

    # Adjust the extension check if other formats should be processed
    if os.path.isfile(input_file) and file_name.lower().endswith('.mp3'):
      print(f"Processing file: {input_file}")
      convert_to_16kHz(input_file, output_folder)

In [None]:
# Alex's Drive (test audio preprocessing)
# Run only the path cell that matches your Drive layout: each of these cells
# assigns `filepath`, so the last one executed wins.
folder = '/content/drive/MyDrive/Stanford/230/FiloBass/train_data_short_16kHz/'
filepath = folder + 'All-the-Things-You-Are_16kHz.mp3'

In [None]:
# nathan's drive
# Alternative location of the same test file; overrides any earlier `filepath`.
filepath = "/content/drive/MyDrive/FiloBass/train_data_short_16kHz/All-the-Things-You-Are_16kHz.mp3"

In [None]:
# added cell to load data from reid's drive
# Alternative location of the same test file; overrides any earlier `folder`
# and `filepath`.
folder = '/content/drive/MyDrive/230/train_data_short_16kHz/'
filepath = folder + 'All-the-Things-You-Are_16kHz.mp3'

In [None]:
# if needed in program, would convert to 16kHz and save in a different folder
# NOTE: `filepath` already ends in '_16kHz', so the output name gets the
# suffix twice (see the '..._16kHz_16kHz.mp3' path in the recorded output).

output_folder = '/content/drive/MyDrive/Basket'
convert_to_16kHz(filepath, output_folder)

Loaded audio with sample rate: 44100 and shape: (14880385,)
Resampled audio shape: (5398780,)
Error saving file: Error opening '/content/drive/MyDrive/Basket/All-the-Things-You-Are_16kHz_16kHz.mp3': System error.


In [None]:
# TEST: extract features from full audio file
y, _ = librosa.load(filepath, sr=16000)
print(y.shape)
# A 4 ms step at 16 kHz is a 64-sample hop, matching hop_length=64 below.
f0_hz = extract_f0(y,frame_shift_ms=4, sr=16000)
# frame_rate=250 gives the same hop: 16000 // 250 == 64 samples.
loudness_db = compute_loudness(y, sample_rate=16000, frame_rate=250, n_fft=512)
mel = tf_log_mel(y, sample_rate=16000, win_length=128, hop_length=64, n_fft=1024, num_mels=64)

(5398780,)


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-14-9e7780e453e6>", line 4, in <cell line: 4>
    f0_hz = extract_f0(y,frame_shift_ms=4, sr=16000)
  File "<ipython-input-10-6900b20e5358>", line 5, in extract_f0
    _, frequency, confidence, _ = crepe.predict(
  File "/usr/local/lib/python3.10/dist-packages/crepe/core.py", line 255, in predict
    activation = get_activation(audio, sr, model_capacity=model_capacity,
  File "/usr/local/lib/python3.10/dist-packages/crepe/core.py", line 212, in get_activation
    return model.predict(frames, verbose=verbose)
  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/keras/src/backend/tensorflow/trainer.py", line 510, in predict
    batch_output

TypeError: object of type 'NoneType' has no len()

In [None]:
# TEST: extract features and slice into examples
# returns split_output, a dict containing f0 (shape=(m, 1000)), loudness (shape=(m, 1000)), and log-mel (shape=(m, 1000, 64))
# also returns output, a dict containing the raw audio, as well as f0, loudness, and log-mel for the full audio length
# for the All the Things You Are file, this should return 85 examples (each 4s long)
# note: in the MIDI-DDSP paper, they use a batch size of 4 examples
# The full-length features are bound to `full_input` rather than `input`, so
# the Python builtin `input` is not shadowed for the rest of the session.
split_input, full_input = preprocess_file(filepath)
print(split_input['f0_hz'].shape)
print(split_input['loudness_db'].shape)
print(split_input['mel'].shape)
print(split_input["audio"].shape)

[1m2637/2637[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 6ms/step


  + 2 * np.log10(f_sq)


In [None]:
# TEST: dimension check
# Take the first 1000-frame example of each feature.
f0_hz = split_input['f0_hz'][0]
loudness_db = split_input['loudness_db'][0]
mel = split_input['mel'][0]

# Normalize pitch (as MIDI note number) and loudness by their ranges.
f0_hz_norm = hz_to_midi(f0_hz) / F0_RANGE
loudness_db_norm = loudness_db / DB_RANGE

# Joining two 1-D arrays on axis 0 places them end to end (2000 values), not
# side by side as per-frame features.
f0_loudness = np.concatenate([f0_hz_norm, loudness_db_norm], axis=0)
f0_loudness.shape

(2000,)


# DDSP-Synthesis

Utility functions needed to synthesize audio based on calculated DDSP synthesis parameters (f0, harmonic amplitudes, harmonic distribution, noise magnitudes)

Used in decoder to generate final "predicted" audio that serves as our output

In [None]:
from ddsp import synths, processors

def get_process_group(n_frames, frame_size=64, sample_rate=16000,
                      use_angular_cumsum=True):
  """Build a DDSP ProcessorGroup that sums a harmonic and a noise synth.

  Args:
    n_frames: Number of control frames per example.
    frame_size: Audio samples per control frame; the synths are created with
      n_frames * frame_size as their first argument (presumably the number of
      output samples -- confirm against the ddsp API).
    sample_rate: Audio sample rate in Hz, passed to both synths.
    use_angular_cumsum: Passed through to synths.Harmonic.

  Returns:
    A processors.ProcessorGroup whose DAG reads 'amplitudes',
    'harmonic_distribution', 'f0_hz' and 'noise_magnitudes' and adds the two
    synth signals together.
  """
  harmonic_synth = synths.Harmonic(n_frames * frame_size, sample_rate,
                                        use_angular_cumsum=use_angular_cumsum)
  noise_synth = synths.FilteredNoise(n_frames * frame_size, sample_rate)
  add = processors.Add(name='add')
  # Create ProcessorGroup.
  # Change here: Use string names for modules
  # Each node is (module name, list of input keys).
  dag = [('harmonic_synth', ['amplitudes', 'harmonic_distribution', 'f0_hz']),
         ('noise_synth', ['noise_magnitudes']),
         ('add', ['noise_synth/signal', 'harmonic_synth/signal'])]

  processor_group = processors.ProcessorGroup(dag=dag,
                                                   name='processor_group')

  # Add modules as attributes to the processor_group
  # (the string names used in `dag` above match these attribute names).
  processor_group.harmonic_synth = harmonic_synth
  processor_group.noise_synth = noise_synth
  processor_group.add = add

  return processor_group

In [None]:
# TEST: Synthesize audio from synth params using ProcessingGroup DAG

# Random (unseeded) toy controls for one example of 1000 frames; the values
# differ on every run.
f0_toy = np.random.rand(1,1000,1)
amps_toy = np.random.rand(1,1000,1)
harm_dist_toy = np.random.rand(1,1000,100)
noise_mag_toy = np.random.rand(1,1000,65)

synth_params = {
      'f0_hz': f0_toy,
      'amplitudes': amps_toy,
      'harmonic_distribution': harm_dist_toy,
      'noise_magnitudes': noise_mag_toy,
    }

processing_group = get_process_group(1000)

# Convert the raw parameters to synth controls, then render audio.
control_params = processing_group.get_controls(synth_params, verbose=False)
synth_audio = processing_group.get_signal(control_params)

# Only the last expression of a cell is displayed, so the bare `synth_audio`
# below shows nothing; the Audio widget is the cell output.
synth_audio
Audio(synth_audio, rate=16000)
#specplot(audio_out)

# Model

The autoencoder model described in MIDI-DDSP, implemented from scratch.

In [None]:
import tensorflow as tf
# Short aliases for Keras and its layers module, used by the model code below.
tfk = tf.keras
tfkl = tfk.layers

def ConvBlock(inputs, out_channels, pool_size=(2, 2)):
  """Two (3x3 conv -> batch norm -> ReLU) stages, then average pooling.

  Args:
    inputs: Input tensor for the first Conv2D layer.
    out_channels: Number of filters in both convolutions.
    pool_size: Pooling window of the final AveragePooling2D layer.

  Returns:
    The pooled output tensor.
  """
  x = inputs
  for _ in range(2):
    conv = tfkl.Conv2D(
        filters=out_channels,
        kernel_size=(3, 3),
        strides=(1, 1),
        padding='same',
        use_bias=False,  # The following batch norm supplies the offset.
        kernel_initializer=tfk.initializers.GlorotUniform())
    x = conv(x)
    x = tfkl.BatchNormalization(beta_initializer='zeros',
                                gamma_initializer='ones')(x)
    x = tfkl.ReLU()(x)

  return tfkl.AveragePooling2D(pool_size=pool_size, padding='same')(x)


def CNN(inputs, pool_size=(1,2), dropout=0.2, nhid=256):
  """Four ConvBlock + Dropout stages, flattened per frame and projected.

  Args:
    inputs: Input tensor for the first ConvBlock.
    pool_size: Pooling window used by every ConvBlock.
    dropout: Dropout rate applied after every ConvBlock.
    nhid: Width of the final Dense projection.

  Returns:
    A tensor reshaped to 1000 frames, with `nhid` features per frame.
  """
  x = inputs
  for n_channels in (64, 128, 256, 512):
    x = ConvBlock(x, out_channels=n_channels, pool_size=pool_size)
    x = tfkl.Dropout(rate=dropout)(x)

  # Merge all remaining axes into one feature vector per frame; the frame
  # count is fixed at 1000 here.
  x = tfkl.Reshape((1000, -1))(x)
  return tfkl.Dense(nhid)(x)

def DDSP_Encoder(inputs, nhid=256):
  """Encode the mel spectrogram, f0 and loudness into a per-frame latent.

  Args:
    inputs: Dict with keys 'mel', 'f0_hz' and 'loudness_db'. With the inputs
      built by DDSP_model these are (batch, 1000, 64, 1), (batch, 1000, 1)
      and (batch, 1000, 1).
    nhid: Hidden size of the Dense projections and of each LSTM direction.

  Returns:
    The bidirectional LSTM output sequence, one vector per frame.
  """
  z_cnn = CNN(inputs['mel'], nhid=nhid)

  # Join f0 and loudness as per-frame features on the last axis. Joining on
  # axis 0 stacked them along the batch dimension, so one input example came
  # out of the model as three.
  # NOTE(review): the inputs are used unnormalized here; the dimension-check
  # cell scales them by F0_RANGE / DB_RANGE -- confirm which is intended.
  x = tfkl.Concatenate(axis=-1)([inputs['f0_hz'], inputs['loudness_db']])
  x_z = tfkl.Dense(nhid)(x)

  # Combine the control features with the CNN features frame by frame.
  x_z_concat = tfkl.Concatenate(axis=-1)([x_z, z_cnn])
  z_out = tfkl.Bidirectional(
      tfkl.LSTM(units=nhid, return_sequences=True),
      name='bilstm')(x_z_concat)

  return z_out


def DDSP_Decoder(inputs, nharmonic=100, nnoise=65):
  """Project the latent sequence onto DDSP synthesizer parameters.

  Audio synthesis is not performed here; the returned parameters are turned
  into audio by a processor group (see get_process_group).

  Args:
    inputs: A pair `[z, data]`, where `z` is the encoder output and `data` is
      the model input dict (only `data['f0_hz']` is read).
    nharmonic: Size of the harmonic distribution output.
    nnoise: Size of the noise magnitudes output.

  Returns:
    Dict with keys 'f0_hz' (passed through from the input), 'amplitudes',
    'harmonic_distribution' and 'noise_magnitudes'.
  """
  z, data = inputs

  # One Dense head per synthesizer parameter.
  harmonic_amp = tfkl.Dense(1, bias_initializer='ones')(z)
  harmonic_distribution = tfkl.Dense(nharmonic)(z)
  noise_mag = tfkl.Dense(nnoise)(z)

  synth_params = {
      'f0_hz': data['f0_hz'],
      'amplitudes': harmonic_amp,
      'harmonic_distribution': harmonic_distribution,
      'noise_magnitudes': noise_mag,
    }

  return synth_params

def DDSP_model():
  """Assemble the encoder/decoder Keras model.

  Inputs (in order): f0_hz (1000, 1), loudness_db (1000, 1) and
  mel (1000, 64, 1). The output is the dict of synthesizer parameters
  produced by DDSP_Decoder.
  """
  inputs = {
      'f0_hz': tfk.Input(shape=(1000,1), name="f0_hz"),
      'loudness_db': tfk.Input(shape=(1000,1), name="loudness_db"),
      'mel': tfk.Input(shape=(1000, 64, 1), name="mel"),
  }

  latent = DDSP_Encoder(inputs)
  synth_params = DDSP_Decoder([latent, inputs])

  ordered_inputs = [inputs['f0_hz'], inputs['loudness_db'], inputs['mel']]
  return tfk.Model(inputs=ordered_inputs, outputs=synth_params)

# Build the model once; later cells reuse this `model` instance.
model = DDSP_model()
model.summary()

In [None]:
 # TEST (TO-DO): try generating output from model using single example

# get single example from preprocessed data above
test_example = {'f0_hz': split_input['f0_hz'][0], 'loudness_db': split_input['loudness_db'][0], 'mel': split_input['mel'][0]}
print(test_example['f0_hz'].shape)

# reshape to fit model input dims
# (a leading batch dimension of 1, plus a trailing channel dimension)
test_f0 = test_example['f0_hz'].reshape(1,1000,1)
test_loudness = test_example['loudness_db'].reshape(1,1000,1)
test_mel = test_example['mel'].reshape(1,1000,64,1)

# run model forward
# (inputs are passed in the order the model was built with: f0, loudness, mel)
pred_synth_params = model.predict([test_f0, test_loudness, test_mel])



(1000,)




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step


In [None]:
# Show which parameters the model returned and the shape of each one.
print(pred_synth_params.keys())
for key in ('f0_hz', 'amplitudes', 'harmonic_distribution', 'noise_magnitudes'):
  print(pred_synth_params[key].shape)

# Print the values of one parameter; swap the key to inspect another.
print(pred_synth_params['amplitudes'])

dict_keys(['f0_hz', 'amplitudes', 'harmonic_distribution', 'noise_magnitudes'])
(1, 1000, 1)
(3, 1000, 1)
(3, 1000, 100)
(3, 1000, 65)
[[[1.0759577 ]
  [1.0554085 ]
  [1.0336186 ]
  ...
  [0.27283245]
  [0.26308107]
  [0.24890745]]

 [[1.3089577 ]
  [1.3021046 ]
  [1.3114251 ]
  ...
  [1.1249958 ]
  [1.1310816 ]
  [1.1396164 ]]

 [[1.0427201 ]
  [1.0593628 ]
  [1.0635271 ]
  ...
  [1.0292844 ]
  [1.0254701 ]
  [1.0188692 ]]]


In [None]:
# Render audio from the parameters predicted by the model above.
processing_group = get_process_group(1000)

control_params = processing_group.get_controls(pred_synth_params, verbose=False)
synth_audio = processing_group.get_signal(control_params)

# Compare the batch dimension of the synthesized audio with the model input.
print(synth_audio.shape)
print(test_loudness.shape)

# The bare expression below is not displayed; only the Audio widget is.
synth_audio
Audio(synth_audio, rate=16000)

(3, 64000)
(1, 1000, 1)


# Training

Following MIDI-DDSP, we use a Multi-Scale Spectral Loss (MSS Loss) which calculates the L1 distance between the spectrograms of the original and reconstructed audio at multiple time-scales. We import the MSS Loss from DDSP library (https://github.com/magenta/ddsp/blob/main/ddsp/losses.py)

## Loss

In [None]:
# Loss function
from ddsp import losses

def spectral_loss(y_true, y_pred):
  """Spectral loss between target audio and audio synthesized from `y_pred`.

  Args:
    y_true: Target audio.
    y_pred: Synthesizer parameters accepted by the processor group's
      `get_controls` (the dict produced by the model).

  Returns:
    The value of `synth_spectral_loss` for the target and synthesized audio.
  """
  # A new processor group for 1000 frames is built on every call.
  group = get_process_group(1000)
  controls = group.get_controls(y_pred, verbose=False)
  return synth_spectral_loss(y_true, group.get_signal(controls))

# DDSP spectral loss with L1 distance, weighting the magnitude and
# log-magnitude terms equally. spectral_loss() looks this name up at call
# time, so defining it after the function is fine.
synth_spectral_loss = losses.SpectralLoss(loss_type='L1',
                                                        mag_weight=1.0,
                                                        logmag_weight=1.0)

In [None]:
## TEST: confirm the loss function works
# First 1000-frame audio example, compared against audio synthesized from the
# parameters predicted earlier.
true_audio = split_input['audio'][0]

spectral_loss(true_audio, pred_synth_params)

<tf.Tensor: shape=(), dtype=float32, numpy=127.99893>

## Compile the model

In [None]:
# NOTE(review): the model outputs a dict of synth parameters and spectral_loss
# expects that whole dict as y_pred -- confirm Keras passes it through as-is.
model.compile(optimizer='adam', loss=spectral_loss)

In [None]:
# TEST: get small excerpt of training data to try training on
reshaped = {'f0_hz': split_input['f0_hz'].reshape(-1,1000,1),
                'loudness_db': split_input['loudness_db'].reshape(-1,1000,1),
                'mel': split_input['mel'].reshape(-1,1000,64,1)}

# One dict per example, each holding that example's f0 / loudness / mel.
split_input_aslist = np.array([dict(zip(reshaped, vals)) for vals in zip(*reshaped.values())])

print(split_input_aslist[0]['f0_hz'].shape)

# Single-example batch: every array carries a leading batch dimension of 1.
test_f0 = split_input['f0_hz'][0].reshape(1,1000,1)
test_ld = split_input['loudness_db'][0].reshape(1,1000,1)
test_mel = split_input['mel'][0].reshape(1,1000,64,1)

test_input = [test_f0, test_ld, test_mel]

# The target needs the same leading batch dimension as the inputs, i.e.
# (1, 64000). A (64000, 1) target looks like 64000 examples to Keras, which
# rejects it as an x/y cardinality mismatch.
test_audio = split_input['audio'][0].reshape(1, 64000)

(1000, 1)


In [None]:
# Number of 1000-frame examples used for the mini training set.
n_train = 80

# n_test = 16

#X_train_mini = {'f0_hz': split_input['f0_hz'][:n_train].reshape(-1,1000,1),
#                'loudness_db': split_input['loudness_db'][:n_train].reshape(-1,1000,1),
#                'mel': split_input['mel'][:n_train].reshape(-1,1000,64,1)}

# Inputs as a list in the model's input order: f0, loudness, mel.
X_train_mini = [split_input['f0_hz'][:n_train].reshape(-1,1000,1), split_input['loudness_db'][:n_train].reshape(-1,1000,1), split_input['mel'][:n_train].reshape(-1,1000,64,1)]
#
# X_test_mini = {'f0_hz': split_input['f0_hz'][n_train:n_train+n_test].reshape(n_test,1000,1),
#                 'loudness_db': split_input['loudness_db'][n_train:n_train+n_test].reshape(n_test,1000,1),
#                 'mel': split_input['mel'][n_train:n_train+n_test].reshape(n_test,1000,64,1)}

# Targets: the audio examples aligned with the inputs above.
Y_train_mini = split_input['audio'][:n_train]
# Y_test_mini = split_input['audio'][n_train:n_train+n_test]


In [None]:
# TEST: try fitting the model
# x and y must have the same number of examples on their first axis. The run
# recorded below failed with "Data cardinality is ambiguous" because y had
# 64000 rows while each x array held 1 example.
history = model.fit(test_input, test_audio, epochs=5, batch_size=1)

ValueError: Data cardinality is ambiguous. Make sure all arrays contain the same number of samples.'x' sizes: 1, 1, 1
'y' sizes: 64000
