This notebook began as a replication of the Colab demo and has grown into a more general data-exploration notebook for the synthesized and real WAV files, as well as the audio parameters that DDSP extracts from them.

In [None]:
import numpy as np

In [None]:
def generate_twinkle(base=500, length=1250):
    """Build the f0 contour of the opening phrase of "Twinkle Twinkle".

    Every note is a constant-frequency run of frames at ``base`` times a
    fixed interval ratio. After the melody, the contour is padded with
    ``base`` until it is ``length`` frames long.

    Args:
        base: frequency of the root note; all other notes are multiples
            of it.
        length: total number of frames in the returned contour. It must be
            at least the melody length (800 frames), otherwise the padding
            size is negative and numpy raises ValueError.

    Returns:
        A tuple ``(f0, melody_frames)``: the padded float64 contour and the
        number of frames occupied by the melody itself (the index at which
        the padding starts).
    """
    # Interval ratios relative to the root note.
    root = 1.0
    major_second = 1.1224653607
    major_third = 1.259913999
    perfect_fourth = 1.3348494983
    perfect_fifth = 1.4983086479
    major_sixth = 1.6817964644

    # (ratio, frames) for each note, in playing order.
    score = (
        (root, 50), (root, 50),                      # B  B
        (perfect_fifth, 50), (perfect_fifth, 50),    # F# F#
        (major_sixth, 50), (major_sixth, 50),        # G# G#
        (perfect_fifth, 100),                        # F# (held)
        (perfect_fourth, 50), (perfect_fourth, 50),  # E  E
        (major_third, 50), (major_third, 50),        # D# D#
        (major_second, 50), (major_second, 50),      # C# C#
        (root, 100),                                 # B  (held)
    )
    ratios, frames = zip(*score)

    # One frequency per note, each repeated for that note's frame count.
    melody = np.repeat(base * np.asarray(ratios, dtype=np.float64), frames)
    melody_frames = melody.shape[0]

    # Hold the root note for whatever remains of the requested length.
    tail = np.full(length - melody_frames, base, dtype=np.float64)
    return np.concatenate((melody, tail)), melody_frames

def generate_loud(length=1250, decay=True):
    """Build a loudness (dB) envelope aligned with the melody's note onsets.

    Each note starts at -50. With ``decay`` truthy the note ramps linearly
    from -50 to ``-50 + decay_rate * span`` over its ``span`` frames (both
    endpoints included). Otherwise the note is held at -50 for the first
    70% of its span and is followed by a rest at -100. Everything after
    the last onset (frame 800) is filled with -100.

    Args:
        length: total number of frames in the returned envelope. It must be
            at least 800, otherwise the tail size is negative and numpy
            raises ValueError.
        decay: choose the decaying ramp (truthy) or the gated note/rest
            shape (falsy).

    Returns:
        A 1-D float64 array of ``length`` loudness values.
    """
    onsets = [0, 50, 100, 150, 200, 250, 300, 400, 450, 500, 550, 600, 650, 700, 800]
    note_db = -50
    silence_db = -100.0
    decay_rate = -0.25  # total drop over a note is decay_rate * span
    sounding_fraction = 0.7  # share of each span that is held when gating

    pieces = []
    for onset, following in zip(onsets, onsets[1:]):
        span = following - onset
        if decay:
            pieces.append(np.linspace(note_db, note_db + decay_rate * span, span))
            continue
        held = int(sounding_fraction * span)
        pieces.append(np.full(held, float(note_db)))
        pieces.append(np.full(span - held, silence_db))

    # Silence from the final onset to the end of the envelope.
    pieces.append(np.full(length - onsets[-1], silence_db))
    return np.concatenate(pieces)
    
# In the notebook: build the synthetic control signals with the default
# length of 1250 frames.
# generate_twinkle returns the padded f0 contour together with the index at
# which the melody ends; generate_loud(decay=False) selects the gated
# note/rest loudness shape instead of the decaying ramp.
f0_hz, stopidx = generate_twinkle()
loudness_db = generate_loud(decay=False)

In [None]:
# Flat loudness envelope: -40 for the 800 melody frames, then -100 for the
# remaining 450 frames (1250 frames in total).
loudness_db = np.concatenate((np.full(800, -40.0), np.full(450, -100.0)))

In [None]:
import matplotlib.pyplot as plt
# First figure: the synthetic f0 contour, plotted against array index.
plt.plot(f0_hz)
plt.title('Fundamental Frequency (No Heuristic Modification)')
# Open a second figure so the loudness curve gets its own axes; the title
# and plot calls below act on this new current figure.
plt.figure()
plt.title('Loudness (Partial Heuristic Modification)')
plt.plot(loudness_db)

In [None]:
# Recording to analyse.
# NOTE(review): this is an absolute filesystem path (it appears to be specific
# to the original author's environment) — point it at a local file or URL
# before running.
AUDIO_URL_OR_FP = '/juice/scr/rjcaste/curis/wavegenie/notebooks/20af160e2b6d96f89016425f2e776910-35.wav'
# Excerpt boundaries; these are passed to load_audio as
# start_time_seconds / end_time_seconds.
START_TIME = 0
END_TIME = 5

In [None]:
# Load audio

from wavegenie.audio_io import load_audio, save_wav
from wavegenie.util import preview_audio, DDSP_DEFAULT_FS_AUDIO

# Load the START_TIME..END_TIME excerpt of the input file as one channel.
# NOTE(review): the second positional argument is presumably the target
# sample rate and `normalize=True` presumably peak-normalizes — confirm
# against load_audio. The returned `fs` is not used in this cell.
audio, fs = load_audio(
    AUDIO_URL_OR_FP,
    DDSP_DEFAULT_FS_AUDIO,
    num_channels=1,
    normalize=True,
    start_time_seconds=START_TIME,
    end_time_seconds=END_TIME)

# Optional: listen to the loaded excerpt / write it to disk (disabled).
#preview_audio(audio)
#save_wav('input_16k_mono_f32.wav', audio, DDSP_DEFAULT_FS_AUDIO)

In [None]:
# Extract synthesis parameters

import matplotlib.pyplot as plt

from wavegenie.util import extract_ddsp_synthesis_parameters
import time

# Time the extraction with perf_counter: it is monotonic and high
# resolution, so the reported duration cannot be skewed by wall-clock
# adjustments the way a time.time() difference can.
start = time.perf_counter()
audio_parameters = extract_ddsp_synthesis_parameters(audio)
print('took {:.3g} seconds'.format(time.perf_counter() - start))

In [None]:
import numpy as np

In [None]:
from wavegenie.viz_utils import standard_plot, alt_plot, get_percent
def alt_plot(audio_parameters, shade_param=None, cutoff_percentile=0.5, plot_freqs=False, waveform=None,
             duration=2.4):
    """Plot f0, loudness and f0 confidence, with optional overlays.

    Three figures are shown one after another:

    1. f0, optionally with vertical shading driven by another parameter,
       a waveform overlay and horizontal reference-note lines;
    2. loudness, optionally with the waveform overlay;
    3. f0 confidence, against frame index.

    All overlays on the first two figures share the same 0..``duration``
    x axis as the curves they annotate.

    Args:
        audio_parameters: mapping holding the 'f0_hz', 'loudness_db' and
            'f0_confidence' arrays (and ``shade_param`` when it is given).
        shade_param: key of the parameter used to shade the f0 plot
            vertically, or None for no shading.
        cutoff_percentile: -1 shades every frame with an opacity
            proportional to the parameter value (at most 0.5); any other
            value shades the regions where the parameter exceeds
            ``get_percent(values, cutoff_percentile)``.
        plot_freqs: draw musical note frequencies as horizontal lines on
            the f0 plot.
        waveform: audio to overlay on the f0 and loudness plots, or None
            to skip the overlay.
        duration: extent of the x axis; defaults to the previously
            hard-coded 2.4 (presumably seconds — confirm).
    """

    def _frame_width(n_frames):
        # x distance between neighbouring frames; np.linspace includes both
        # endpoints, hence n - 1 intervals (guarded for a single frame).
        return duration / max(n_frames - 1, 1)

    def _overlay_waveform(reference):
        # Per the original note, the waveform has 64 samples for every
        # parameter frame, so every 64th sample is kept. The signal is
        # scaled by 100 and lifted to get_percent(reference, 0.1) so that it
        # sits near the curve it is drawn over.
        decimated = waveform.flatten()[::64]
        plt.plot(np.linspace(0, duration, decimated.shape[0]),
                 100 * decimated + get_percent(reference, 0.1),
                 label='waveform')

    f0_hz = audio_parameters['f0_hz']

    # ---- Figure 1: f0 with overlays ----
    plt.figure(figsize=(14, 8))
    plt.plot(np.linspace(0, duration, f0_hz.shape[0]), f0_hz, label='f0')

    if shade_param is not None:
        shade_values = audio_parameters[shade_param]
        width = _frame_width(len(shade_values))
        if cutoff_percentile == -1:
            # Smooth shading: opacity grows linearly with the value.
            low, high = shade_values.min(), shade_values.max()
            spread = high - low
            for i, value in enumerate(shade_values):
                # A constant parameter has no contrast to show; skip the 0/0.
                alpha = (value - low) / (2 * spread) if spread else 0.0
                plt.axvspan(i * width, (i + 1) * width, alpha=alpha)
        else:
            cutoff = get_percent(shade_values, cutoff_percentile)
            # NOTE(review): contiguous_regions is neither defined nor
            # imported in the visible part of this notebook; it is assumed
            # to yield (start, stop) frame-index pairs — confirm.
            for region in contiguous_regions(shade_values > cutoff):
                plt.axvspan(region[0] * width, region[1] * width, alpha=0.3)

    title = 'f0 Hz'
    if shade_param is not None:
        title += ', shaded by {} ({})'.format(
            shade_param, 'smoothly' if cutoff_percentile == -1 else cutoff_percentile)
    plt.title(title)

    if waveform is not None:
        _overlay_waveform(f0_hz)

    if plot_freqs:
        frequencies = {'_A': 220,
                       '_B': 246,
                       '_C': 261,
                       '_D': 293,
                       '_E': 329,
                       '_F': 349,
                       '_G': 392,
                       'A': 440,
                       'B': 493,
                       'C': 523,
                       'D': 587,
                       'D#': 622,
                       'E': 659,
                       'F': 698,
                       'F#': 740,
                       'G': 784,
                       'A_': 880}
        for note, f in frequencies.items():
            # Span exactly the plotted x range so the lines do not stretch
            # the axis beyond the f0 curve.
            plt.plot([0, duration], [f, f], label='{}{}'.format(note, f))

    plt.legend()
    plt.show()

    # ---- Figure 2: loudness ----
    loudness_db = audio_parameters['loudness_db']
    plt.figure(figsize=(14, 8))
    plt.plot(np.linspace(0, duration, loudness_db.shape[0]), loudness_db, label='loudness_db')
    # The overlay is optional here as well: with the default waveform=None
    # there is nothing to flatten.
    if waveform is not None:
        _overlay_waveform(loudness_db)
    plt.legend()
    plt.show()

    # ---- Figure 3: f0 confidence (frame index on the x axis) ----
    plt.figure(figsize=(14, 8))
    plt.plot(audio_parameters['f0_confidence'])
    plt.show()
# Plot the current audio_parameters with the resynthesized audio overlaid.
# NOTE(review): `resynth` is only assigned in a later cell of this notebook
# (the resynthesis cell), so that cell must have been run before this one.
alt_plot(audio_parameters, waveform=resynth)

In [None]:
# NOTE(review): this import rebinds standard_plot and alt_plot to the
# wavegenie.util versions, so the call below does not use an alt_plot that
# was defined earlier in the notebook session.
from wavegenie.util import standard_plot, alt_plot

# Plot the parameters with the reference note frequencies drawn on top.
alt_plot(audio_parameters, plot_freqs=True)

In [None]:
# Load model

from wavegenie.util import load_ddsp_model

# Load the DDSP model identified by 'Violin' (presumably a pretrained violin
# timbre — confirm); `model` is consumed by the resynthesis cells.
model = load_ddsp_model('Violin')

In [None]:
# Swap in the synthetic control signals, cast to float32, as the parameters
# to synthesize from. Only these two keys are present afterwards.
audio_parameters = {
    'f0_hz': f0_hz.astype('float32'),
    'loudness_db': loudness_db.astype('float32'),
}

In [None]:
# Resynthesize parameters

from wavegenie.util import synthesize_ddsp_audio
from wavegenie.util import preview_audio

# Render audio from the current audio_parameters with the loaded model.
resynth = synthesize_ddsp_audio(model, audio_parameters)

# Listen to the result, then write it to disk. save_wav and
# DDSP_DEFAULT_FS_AUDIO come from the imports in the "Load audio" cell.
preview_audio(resynth)
save_wav('output_16k_mono_f32.wav', resynth, DDSP_DEFAULT_FS_AUDIO)

In [None]:
# Resynthesize parameters (repeat run; result kept in `reresynth`)

from wavegenie.util import synthesize_ddsp_audio

# Render audio again from the current audio_parameters.
reresynth = synthesize_ddsp_audio(model, audio_parameters)

preview_audio(reresynth)
# Save the audio rendered by this cell. The original wrote `resynth` here,
# which re-saved the earlier result instead of the one just previewed.
save_wav('output_16k_mono_f32.wav', reresynth, DDSP_DEFAULT_FS_AUDIO)

In [None]:
# Resynthesize parameters (repeat run; result kept in `reresynth`)

from wavegenie.util import synthesize_ddsp_audio

# Render audio again from the current audio_parameters.
reresynth = synthesize_ddsp_audio(model, audio_parameters)

preview_audio(reresynth)
# Save the audio rendered by this cell. The original wrote `resynth` here,
# which re-saved the earlier result instead of the one just previewed.
save_wav('output_16k_mono_f32.wav', reresynth, DDSP_DEFAULT_FS_AUDIO)

In [None]:
# Listen to the loaded input excerpt (`audio`) for comparison.
preview_audio(audio)

In [None]:
# Listen to the loaded input excerpt (`audio`) once more.
preview_audio(audio)