In [1]:
import os
import numpy as np
import librosa
import soundfile as sf
import ddsp
import ddsp.training
from ddsp.training import metrics
import gin
from IPython.display import Audio
import pickle
import tensorflow.compat.v2 as tf
from ddsp.training.postprocessing import (
    detect_notes, fit_quantile_transform
)

In [2]:
# Locations of the input recording, the rendered output and the trained model.
# NOTE: these are absolute, machine-specific paths; adjust before running elsewhere.
input_audio_path = r"C:\Users\prazw\Tune\test\voice4.wav"
output_audio_path = r"C:\Users\prazw\Tune\test\test3_output.wav"
model_path = r"C:\Users\prazw\Tune\model"

# Single source of truth for the sample rate used for loading and playback.
SAMPLE_RATE = 16000

# librosa resamples the file to the requested rate and returns that rate as `sr`.
audio, sr = librosa.load(input_audio_path, sr=SAMPLE_RATE)
print(f"Audio shape: {audio.shape}")
print(f"Sample rate: {sr}")

# Play back at the rate the audio was actually loaded with.
Audio(audio, rate=sr)

Audio shape: (180800,)
Sample rate: 16000


In [3]:
# Later code indexes `audio` as (batch, samples), so promote 1-D audio to
# shape (1, n_samples). The check keeps this cell safe to re-run.
if audio.ndim == 1:
  audio = audio[np.newaxis, :]

# Setup the session.
ddsp.spectral_ops.reset_crepe()

# Compute features.
audio_features = metrics.compute_audio_features(audio)

# Loudness may come back as a TensorFlow tensor or already as an ndarray;
# convert only when needed, then store it as a float32 NumPy array.
loudness_db = audio_features['loudness_db']
if hasattr(loudness_db, 'numpy'):
  loudness_db = loudness_db.numpy()
audio_features['loudness_db'] = np.asarray(loudness_db).astype(np.float32)



# Placeholder; the modified copy of the features is created further down.
audio_features_mod = None

# Optional statistics of the training dataset. They stay None when the file
# is missing or cannot be read, so later code must handle that case.
dataset_stats_file = r"C:\Users\prazw\Tune\dataset_statistics.pkl"
DATASET_STATS = None
try:
  if tf.io.gfile.exists(dataset_stats_file):
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with tf.io.gfile.GFile(dataset_stats_file, 'rb') as stats_file:
      DATASET_STATS = pickle.load(stats_file)
except Exception as load_error:
  print('Loading dataset statistics from pickle failed: {}.'.format(load_error))
print(DATASET_STATS)




# Load the gin configuration stored next to the trained model.
gin_file = os.path.join(model_path, 'operative_config-0.gin')
with gin.unlock_config():
  gin.parse_config_file(gin_file, skip_unknown=True)

# Ensure dimensions and sampling rates are equal: derive the hop size from the
# training configuration, then size everything to a whole number of hops of
# the current input.
time_steps_train = gin.query_parameter('F0LoudnessPreprocessor.time_steps')
n_samples_train = gin.query_parameter('Harmonic.n_samples')
hop_size = int(n_samples_train / time_steps_train)
time_steps = int(audio.shape[1] / hop_size)
n_samples = time_steps * hop_size

# Override the training-time lengths with the ones computed for this input.
gin_params = [
    f'Harmonic.n_samples = {n_samples}',
    f'FilteredNoise.n_samples = {n_samples}',
    f'F0LoudnessPreprocessor.time_steps = {time_steps}',
    'oscillator_bank.use_angular_cumsum = True',  # Avoids cumsum accumulation errors.
]
with gin.unlock_config():
  gin.parse_config(gin_params)

# Trim all input vectors to the lengths configured above.
audio_features.update({
    feature_name: audio_features[feature_name][:time_steps]
    for feature_name in ('f0_hz', 'f0_confidence', 'loudness_db')
})
audio_features['audio'] = audio_features['audio'][:, :n_samples]





# Set up the model just to predict audio given new conditioning.
model = ddsp.training.models.Autoencoder()
model.restore(model_path)

# Build the model by running a batch through it. The result is stored in a
# named variable rather than `_`, which IPython uses for the last cell output.
outputs = model(audio_features, training=False)

# Resynthesise the unmodified input as a baseline.
audio_gen = model.get_audio_from_outputs(outputs)
# Convert to a NumPy array if it is a TensorFlow tensor.
if hasattr(audio_gen, 'numpy'):
  audio_gen = audio_gen.numpy()

# Peak-normalise to prevent clipping. Silent output (peak == 0) is left
# untouched, since dividing by zero would fill the signal with NaNs.
peak = np.max(np.abs(audio_gen))
if peak > 0:
  audio_gen = audio_gen / peak

# State for the adjustment step: no note mask yet, and a copy of the features
# so the shift helpers (which modify arrays in place) leave the originals intact.
mask_on = None
audio_features_mod = {k: v.copy() for k, v in audio_features.items()}

## Helper functions.
def shift_ld(audio_features, ld_shift=-10.0):
  """Offset the 'loudness_db' feature by `ld_shift` (same units as the feature).

  The addition uses `+=`, so a NumPy array stored under 'loudness_db' is
  modified in place.

  Args:
    audio_features: Mapping holding a 'loudness_db' entry.
    ld_shift: Amount added to every loudness value.

  Returns:
    The same `audio_features` object that was passed in.
  """
  loudness = audio_features['loudness_db']
  loudness += ld_shift
  audio_features['loudness_db'] = loudness
  return audio_features


def shift_f0(audio_features, pitch_shift=0.0):
  """Transpose the 'f0_hz' feature by `pitch_shift` octaves and clip it.

  The scaling uses `*=`, so a NumPy array stored under 'f0_hz' is modified in
  place before the clipped result replaces it.

  Args:
    audio_features: Mapping holding an 'f0_hz' entry.
    pitch_shift: Number of octaves to shift by (f0 is multiplied by
      2 ** pitch_shift).

  Returns:
    The same `audio_features` object, with 'f0_hz' limited to the range
    [0 Hz, frequency of MIDI note 110].
  """
  f0_hz = audio_features['f0_hz']
  f0_hz *= 2.0 ** (pitch_shift)
  max_hz = librosa.midi_to_hz(110.0)
  audio_features['f0_hz'] = np.clip(f0_hz, 0.0, max_hz)
  return audio_features

def get_tuning_factor(f0_midi, f0_confidence, mask_on):
  """Get an offset, in semitones, to the most consistent set of chromatic intervals.

  Candidate offsets span -0.5..0.5 semitones in 1-cent steps. Each candidate is
  scored by two confidence-weighted costs: the mean distance of the pitch to the
  shifted chromatic grid, and the fraction of consecutive frames whose snapped
  pitch changes. Both costs are standardised and summed; the candidate with the
  lowest total wins.

  Args:
    f0_midi: 1-D array of pitch values in MIDI note numbers.
    f0_confidence: 1-D array of per-frame weights, same length as `f0_midi`.
    mask_on: Boolean mask selecting the frames to take into account.

  Returns:
    The selected tuning offset as a fraction of a semitone, or 0.0 when the
    mask selects no frames.
  """
  # Difference from midi offset by different tuning_factors.
  tuning_factors = np.linspace(-0.5, 0.5, 101)  # 1 cent divisions.

  f0_on = np.asarray(f0_midi)[mask_on]
  if f0_on.size == 0:
    # Nothing to analyse; averaging over zero frames would only yield NaNs.
    return 0.0
  weights = np.asarray(f0_confidence)[mask_on][:, np.newaxis]

  # Signed distance of every frame to the nearest grid note, in [-0.5, 0.5].
  midi_diffs = (f0_on[:, np.newaxis] - tuning_factors[np.newaxis, :]) % 1.0
  midi_diffs[midi_diffs > 0.5] -= 1.0

  ## Computes minimum adjustment distance.
  cost_diffs = np.mean(weights * np.abs(midi_diffs), axis=0)

  ## Computes minimum "note" transitions.
  f0_at = f0_on[:, np.newaxis] - midi_diffs
  if f0_on.shape[0] > 1:
    deltas = (np.diff(f0_at, axis=0) != 0.0).astype(float)
    cost_deltas = np.mean(weights[:-1] * deltas, axis=0)
  else:
    # A single frame has no transitions to count.
    cost_deltas = np.zeros_like(cost_diffs)

  def norm(x):
    """Standardise `x`; a constant curve carries no preference, so map it to 0."""
    centered = x - np.mean(x)
    std = np.std(x)
    # Without this guard a zero spread gives 0/0 = NaN, which poisons the total
    # cost and makes argmin always pick the first candidate (-0.5).
    return centered / std if std > 0 else np.zeros_like(centered)

  # Tuning factor is minimum cost.
  cost = norm(cost_deltas) + norm(cost_diffs)
  return tuning_factors[np.argmin(cost)]


def auto_tune(f0_midi, tuning_factor, mask_on, amount=0.0, chromatic=False):
  """Pull f0 towards chromatic or major-scale notes.

  Args:
    f0_midi: 1-D NumPy array of pitch values in MIDI note numbers.
    tuning_factor: Offset of the chromatic grid in semitones. It shifts the
      grid in chromatic mode; in scale mode it is only reported in the printed
      message.
    mask_on: Boolean mask of frames used to infer the key (scale mode only).
    amount: Fraction of the distance to the target note that is removed
      (0.0 leaves the pitch unchanged, 1.0 snaps fully).
    chromatic: If True, snap to the nearest semitone of the shifted grid;
      otherwise infer the best-fitting major key and snap to its notes.

  Returns:
    Array of adjusted pitch values, `f0_midi - amount * distance_to_target`.
  """
  if chromatic:
    # Signed distance to the nearest grid semitone, wrapped into (-0.5, 0.5].
    midi_diff = (f0_midi - tuning_factor) % 1.0
    midi_diff[midi_diff > 0.5] -= 1.0
    return f0_midi - amount * midi_diff

  # Major-scale degrees repeated over 10 octaves, then transposed to all 12 keys.
  degrees = np.array([0, 2, 4, 5, 7, 9, 11])
  octave_offsets = 12 * np.arange(10)
  major_scale = (octave_offsets[:, np.newaxis] + degrees[np.newaxis, :]).ravel()
  all_scales = major_scale[np.newaxis, :] + np.arange(12)[:, np.newaxis]

  # Pick the key whose notes are closest, on average, to the masked frames.
  voiced = f0_midi[mask_on]
  # [time, scale, note]
  dist_tsn = np.abs(
      voiced[:, np.newaxis, np.newaxis] - all_scales[np.newaxis, :, :])
  # [time, scale] -> [scale]
  mean_dist_s = dist_tsn.min(axis=-1).mean(axis=0)
  scale_idx = np.argmin(mean_dist_s)
  key_names = ('C', 'Db', 'D', 'Eb', 'E', 'F', 'Gb',
               'G', 'Ab', 'A', 'Bb', 'B', 'C')
  scale = key_names[scale_idx]

  # Signed distance of every frame (masked or not) to its nearest scale note.
  offsets_tn = f0_midi[:, np.newaxis] - all_scales[scale_idx][np.newaxis, :]
  nearest = np.argmin(np.abs(offsets_tn), axis=-1)
  midi_diff = offsets_tn[np.arange(offsets_tn.shape[0]), nearest]
  print('Autotuning... \nInferred key: {}  '
        '\nTuning offset: {} cents'.format(scale, int(tuning_factor * 100)))

  # Adjust the midi signal.
  return f0_midi - amount * midi_diff


# Play the baseline resynthesis at the sample rate the input was loaded with.
Audio(audio_gen, rate=sr)

  + 2 * np.log10(f_sq)
  frames /= np.std(frames, axis=1)[:, np.newaxis]


{'mean_pitch': 570.4347841366474, 'quantile_transform': QuantileTransformer(), 'loudness_db': array([ -90.04965,  -90.04965,  -90.04965, ..., -100.     , -100.     ,
       -100.     ], dtype=float32)}


  + 2 * np.log10(f_sq)


In [4]:
# Tunable parameters for the adjustment step.
ADJUST = True  # Flag for automatic adjustment
quiet = 40  # Attenuation (dB) applied to parts without detected notes
threshold = 1.0  # Threshold for note detection (passed to detect_notes)
pitch_shift = 1  # Manual pitch shift (in octaves)
loudness_shift = -30  # Manual loudness shift (in dB)
autotune = 0.3  # Auto-tune amount passed to auto_tune; 0 disables auto-tune

# Start from a fresh copy of the unmodified features so this cell can be
# re-run without accumulating earlier shifts.
audio_features_mod = {k: v.copy() for k, v in audio_features.items()}
print(audio_features_mod)



# Automatic adjustment needs the dataset statistics; without them
# DATASET_STATS is None and indexing it would raise a TypeError.
if ADJUST and DATASET_STATS is not None:
  # Detect sections that are "on".
  mask_on, note_on_value = detect_notes(audio_features['loudness_db'],
                                        audio_features['f0_confidence'],
                                        threshold)

  if np.any(mask_on):
    # Shift the pitch register by a whole number of octaves towards the
    # dataset's mean pitch.
    # NOTE(review): the difference below is taken in MIDI units, so
    # 'mean_pitch' is presumably a MIDI value - confirm how the statistics
    # file was generated.
    target_mean_pitch = DATASET_STATS['mean_pitch']
    pitch = ddsp.core.hz_to_midi(audio_features['f0_hz'])
    mean_pitch = np.mean(pitch[mask_on])
    p_diff = target_mean_pitch - mean_pitch
    p_diff_octave = p_diff / 12.0
    # Floor when the gap exceeds 1.5 octaves, otherwise round up.
    round_fn = np.floor if p_diff_octave > 1.5 else np.ceil
    p_diff_octave = round_fn(p_diff_octave)
    audio_features_mod = shift_f0(audio_features_mod, p_diff_octave)

    # Quantile shift the note_on parts.
    _, loudness_norm = fit_quantile_transform(
        audio_features['loudness_db'],
        mask_on,
        inv_quantile=DATASET_STATS['quantile_transform'])

    # Turn down the note_off parts.
    mask_off = np.logical_not(mask_on)
    loudness_norm[mask_off] -= quiet * (1.0 - note_on_value[mask_off][:, np.newaxis])
    loudness_norm = np.reshape(loudness_norm, audio_features['loudness_db'].shape)

    audio_features_mod['loudness_db'] = loudness_norm

    # Auto-tune (skipped when the amount is 0).
    if autotune:
      f0_midi = np.array(ddsp.core.hz_to_midi(audio_features_mod['f0_hz']))
      tuning_factor = get_tuning_factor(f0_midi, audio_features_mod['f0_confidence'], mask_on)
      f0_midi_at = auto_tune(f0_midi, tuning_factor, mask_on, amount=autotune)
      audio_features_mod['f0_hz'] = ddsp.core.midi_to_hz(f0_midi_at)
  else:
    print('\nSkipping auto-adjust (no notes detected or ADJUST box empty).')

else:
  print('\nSkipping auto-adujst (box not checked or no dataset statistics found).')



# Manual shifts, applied on top of any automatic adjustment.
audio_features_mod = shift_ld(audio_features_mod, loudness_shift)
audio_features_mod = shift_f0(audio_features_mod, pitch_shift)
print(audio_features_mod)

# Resynthesise from the modified features (falling back to the originals).
af = audio_features if audio_features_mod is None else audio_features_mod
outputs = model(af, training=False)

audio_gen = model.get_audio_from_outputs(outputs)
# Convert to a NumPy array if it is a TensorFlow tensor.
if hasattr(audio_gen, 'numpy'):
  audio_gen = audio_gen.numpy()

# Peak-normalise to prevent clipping. Silent output (peak == 0) is left
# untouched, since dividing by zero would fill the signal with NaNs.
peak = np.max(np.abs(audio_gen))
if peak > 0:
  audio_gen = audio_gen / peak

# Play the result at the sample rate the input was loaded with.
Audio(audio_gen, rate=sr)

{'audio': array([[ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
        -6.8325401e-11, -8.0804516e-11, -2.9713849e-11]], dtype=float32), 'loudness_db': array([-80., -80., -80., ..., -80., -80., -80.], dtype=float32), 'f0_hz': array([   0.    ,    0.    ,    0.    , ..., 1948.4619, 1947.5004,
       1964.4318], dtype=float32), 'f0_confidence': array([0.        , 0.        , 0.        , ..., 0.14567743, 0.14390795,
       0.07461278], dtype=float32)}
Autotuning... 
Inferred key: G  
Tuning offset: 0 cents
{'audio': array([[ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
        -6.8325401e-11, -8.0804516e-11, -2.9713849e-11]], dtype=float32), 'loudness_db': array([-150., -150., -150., ..., -150., -150., -150.], dtype=float32), 'f0_hz': array([  18.460367,   18.460367,   18.460367, ..., 3913.0898  ,
       3911.7366  , 3935.5115  ], dtype=float32), 'f0_confidence': array([0.        , 0.        , 0.        , ..., 0.14567743, 0.14390795,
       0.07461278], dtype=float32)}


  + 2 * np.log10(f_sq)
