In [1]:
from collections import Counter
import glob
import os.path

import tensorflow as tf
import ddsp
import IPython
import numpy as np
import scipy.fft
import scipy.io.wavfile
import wavio



In [2]:
# Ask TensorFlow to grow GPU memory on demand on every visible GPU
gpus = tf.config.experimental.list_physical_devices('GPU')
if len(gpus) > 0:
    try:
        for device in gpus:
            tf.config.experimental.set_memory_growth(device, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as err:
        # Report the configuration failure instead of aborting the notebook
        print(err)

1 Physical GPUs, 1 Logical GPUs


## Load data

In [3]:
data_dir = 'data/robot'
max_length_seconds = 1.0
peak_offset_seconds = 0.1

# Preprocess recordings and assign weak labels.
# Recordings are globbed as <data_dir>/<directory>/<name>.wav. The directory
# name is split on '-' into four fields: the first two name the tool and the
# last two name the struck object (e.g. 'ceramic-chopstick-steel-bowl').
files = sorted(glob.glob(os.path.join(data_dir, '*/*.wav')))
if not files:
    # Fail here with a clear message instead of an obscure error further down
    raise FileNotFoundError("No .wav recordings found under '%s'" % data_dir)
tool_dict = {}    # tool name -> integer ID, in order of first appearance
object_dict = {}  # object name -> integer ID, in order of first appearance
tool_ids = []     # per-recording tool ID
object_ids = []   # per-recording object ID
clips = []
maxes = []        # per-recording peak magnitude before normalization
for f in files:
    # Extract and assign tool and object IDs for weak labels.
    # The parent directory name is taken with os.path so this also works on
    # platforms where normpath produces a separator other than '/'.
    normed = os.path.normpath(f)
    desc = os.path.basename(os.path.dirname(normed)).split('-')
    tool = desc[0] + '-' + desc[1]
    struck_object = desc[2] + '-' + desc[3]
    tool_ids.append(tool_dict.setdefault(tool, len(tool_dict)))
    object_ids.append(object_dict.setdefault(struck_object, len(object_dict)))

    # Load and normalize recording
    wav = wavio.read(f)
    fs = wav.rate
    clip = wav.data
    clip = clip[:, 0]  # first column only (assumed to be channel 0)
    clip = clip - np.mean(clip)  # remove the DC offset
    peak = np.max(np.abs(clip))
    maxes.append(peak)
    if peak > 0:
        # A silent recording has peak 0; leave it as zeros rather than NaNs
        clip = clip / peak
    # Find peak for aligning start and end of clip
    # and initializing optimization of peak timing
    peak_index = np.argmax(np.abs(clip))
    peak_offset = peak_index - peak_offset_seconds * fs
    clip = clip[int(max(0, peak_offset)):]
    clips.append(clip)

tool_count = len(tool_dict)
object_count = len(object_dict)

# Truncate every clip to a common length: the shortest clip, capped at
# max_length_seconds (using the sample rate of the last file read)
min_length = min([len(c) for c in clips])
min_length = min(int(max_length_seconds * fs), min_length)
clips = [c[:min_length] for c in clips]
clips = np.array(clips)
num_clips = clips.shape[0]

print('Unfiltered Tools')
print(tool_dict)
print(tool_ids)
print('Unfiltered Objects')
print(object_dict)
print(object_ids)

# Filter recordings down to the loudest clips (at most num_per_combo) per
# pairing of tool and object, ranked by peak magnitude before normalization
num_per_combo = 2
maxes = np.array(maxes, dtype=np.float32)
np_tool_ids = np.array(tool_ids, dtype=np.int32)
np_object_ids = np.array(object_ids, dtype=np.int32)
temp_clips = np.array(clips, dtype=np.float32)
clips = []
tool_ids = []
object_ids = []
for i in range(len(tool_dict)):
    for j in range(len(object_dict)):
        combo_indices = np.logical_and(np_tool_ids==i, np_object_ids==j)
        combo_maxes = maxes[combo_indices]
        combo_clips = temp_clips[combo_indices]
        # argsort is ascending, so the last entries are the loudest
        max_indices = np.argsort(combo_maxes)[-num_per_combo:]
        clips.extend(combo_clips[max_indices])
        # Add exactly one label per selected clip: a pairing with fewer than
        # num_per_combo recordings must not receive extra labels, or the
        # labels would no longer line up with the rows of `clips`
        num_selected = len(max_indices)
        tool_ids.extend([i] * num_selected)
        object_ids.extend([j] * num_selected)
clips = np.array(clips, dtype=np.float32)
num_clips = clips.shape[0]

print('Filtered Tools')
print(tool_dict)
print(tool_ids)
print('Filtered Objects')
print(object_dict)
print(object_ids)
# Reverse lookups from integer ID back to name
inv_tool_dict = {v: k for k, v in tool_dict.items()}
inv_object_dict = {v: k for k, v in object_dict.items()}

Unfiltered Tools
{'ceramic-chopstick': 0, 'polycarbonate-spoon': 1, 'steel-fork': 2, 'wood-spoon': 3}
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
Unfiltered Objects
{'ceramic-mug': 0, 'polycarbonate-cup': 1, 'steel-bowl': 2, 'wood-holder': 3}
[0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3]
Filtered Tools
{'ceramic-chopstick': 0, 'polycarbonate-spoon': 1, 'steel-fork': 2, 'wood-spoon': 3}
[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3]
Filtered Objects
{'ceramic-mug': 0, 'polycar

In [4]:
# Sample clip: print its weak labels and play the preprocessed recording
ind = 0
print('Clip index:\t%i'%ind)
print("Tool:\t\t'%s'"%inv_tool_dict[tool_ids[ind]])
# Closing quote added so the object name is quoted like the tool name
print("Object:\t\t'%s'"%inv_object_dict[object_ids[ind]])
IPython.display.Audio(data=clips[ind, :], rate=fs)

Clip index:	0
Tool:		'ceramic-chopstick'
Object:		'ceramic-mug


## Set up differentiable modules and optimization variables

In [5]:
# Instantiate DiffImpact modules
# Use internal sampling rate 2x the original recording sampling rate
sample_factor = 2
internal_sample_rate = fs * sample_factor

# Modal impulse-response synthesizer; its length is 1.5 s at the internal rate
modal_fir = ddsp.synths.ModalFIR(
    n_samples=int(1.5 * fs * sample_factor),
    sample_rate=internal_sample_rate,
    initial_bias=-1.5,
    hz_max=18000.0,
    freq_scale_fn=ddsp.core.frequencies_critical_bands,
    freq_scale='mel',
)

# Impact profile generator covering the full clip length at the internal rate
impact = ddsp.synths.Impact(
    sample_rate=internal_sample_rate,
    n_samples=clips.shape[1] * sample_factor,
    max_impact_frequency=20,
    mag_scale_fn=ddsp.core.exp_sigmoid,
)

# Reverb whose controls are passed in explicitly (trainable=False) by generate_audio
reverb = ddsp.effects.FilteredNoiseExpDecayReverb(
    trainable=False,
    reverb_length=96000,
    decay_initial_bias=4.0,
    n_filter_banks=64,
    add_dry=True,
)

In [6]:
# Initialize variables to optimize
tf.random.set_seed(0)

# Impact profile variables: one non-negative value per clip
per_clip_shape = (clips.shape[0], 1, 1)
mag = tf.Variable(
    tf.reshape([0.1] * clips.shape[0], per_clip_shape),
    dtype=tf.float32,
    constraint=tf.keras.constraints.NonNeg(),
)
# Scale tau and peak times to lessen the size of optimization steps
tau_factor = 0.001
peak_time_factor = 0.01
# Initialize peak times with peak of recording waveform magnitude
initial_peak_time = (np.argmax(np.abs(clips), axis=1) / fs / peak_time_factor).astype(np.float32)
peak_time = tf.Variable(
    tf.reshape(initial_peak_time, per_clip_shape),
    dtype=tf.float32,
    constraint=tf.keras.constraints.NonNeg(),
)
tau = tf.Variable(
    tf.reshape([0.1] * clips.shape[0], per_clip_shape),
    dtype=tf.float32,
    constraint=tf.keras.constraints.NonNeg(),
)
# From here on the factor is a tensor, as used inside generate_audio
peak_time_factor = tf.constant(peak_time_factor, dtype=tf.float32)

# Environment noise variables: magnitudes shared by all clips, one scale per clip
noise_magnitudes = tf.Variable(tf.zeros((1, 100), dtype=tf.float32))
noise_scale = tf.Variable(
    [[0.1]] * num_clips,
    dtype=tf.float32,
    constraint=tf.keras.constraints.NonNeg(),
)

# Modal response variables
num_frequencies = 128
# Tool modal response variables (half as many modes as the objects)
tool_id_counts = Counter(tool_ids)
tool_modal_shape = (len(tool_id_counts), 1, num_frequencies // 2)
frequencies0 = tf.Variable(tf.random.normal(tool_modal_shape), dtype=tf.float32)
dampings0 = tf.Variable(tf.random.normal(tool_modal_shape) - 1.0, dtype=tf.float32)
gains0 = tf.Variable(tf.random.normal(tool_modal_shape), dtype=tf.float32)
# Object modal response variables
object_id_counts = Counter(object_ids)
object_modal_shape = (len(object_id_counts), 1, num_frequencies)
frequencies1 = tf.Variable(tf.random.normal(object_modal_shape), dtype=tf.float32)
dampings1 = tf.Variable(tf.random.normal(object_modal_shape) - 1.0, dtype=tf.float32)
gains1 = tf.Variable(tf.random.normal(object_modal_shape), dtype=tf.float32)

# Reverb variables: one set per tool, gains start at 1 and decays at -1
reverb_shape = (len(tool_id_counts), 1, 16)
reverb_gains = tf.Variable(tf.ones(reverb_shape, dtype=tf.float32))
reverb_decay = tf.Variable(-tf.ones(reverb_shape, dtype=tf.float32))

# Acceleration sound scale variable, one per clip
acceleration_scale = tf.Variable(
    [[0.1]] * num_clips,
    dtype=tf.float32,
    constraint=tf.keras.constraints.NonNeg(),
)

# Optimization step counter
step_count = 0

## Forward generation and loss functions

In [7]:
# Multi-resolution spectral loss used to compare synthesized and recorded audio
loss = ddsp.losses.SpectralLoss(loss_type='L1', logmag_weight=0.1,
                               fft_sizes=(2048, 1024, 512, 256, 128))

# Lower bound added to the impact width: three samples at the impact
# module's sample rate
tau_min = 3.0 / impact.sample_rate
def get_single_impact(magnitude, peak_time, tau):
    """Build the impact profile for each clip.

    Args:
        magnitude: impact magnitude that scales the profile. In this notebook
            it is the (num_clips, 1, 1) variable ``mag``.
        peak_time: time of the impact peak, passed to ``impact.hertz_gaussian``.
        tau: unscaled impact width; it is multiplied by ``tau_factor`` and
            offset by ``tau_min`` before use.

    Returns:
        ``magnitude * impact.hertz_gaussian(...)`` summed over axis 2.
    """
    # Use the `magnitude` argument rather than the global `mag`, so the
    # function honours what the caller passes in
    return tf.reduce_sum(magnitude * impact.hertz_gaussian(peak_time, tau * tau_factor + tau_min), axis=2)

def generate_audio(with_noise=True, with_tool_modal=True, with_accel=True, with_object=True, with_reverb=True):
    """Synthesize impact audio for every clip from the current variables.

    Each flag switches one component of the synthesized mix on or off.

    Args:
        with_noise: add filtered environment noise.
        with_tool_modal: include the tool's modal response to the impact.
        with_accel: add the scaled impact profile itself (acceleration sound).
        with_object: include the struck object's modal response.
        with_reverb: apply the tool-specific reverb to the mix.

    Returns:
        The synthesized audio, resampled to ``clips.shape[1]`` samples per clip.
    """
    batch_size = clips.shape[0]
    internal_length = clips.shape[1] * sample_factor

    # Impact profile shared by the modal and acceleration components
    force_profile = get_single_impact(mag, peak_time_factor * peak_time, tau)

    def modal_sound(gains, frequencies, dampings, ids):
        """Convolve the impact profile with the modal response selected per clip."""
        controls = modal_fir.get_controls(gains, frequencies, dampings)
        responses = modal_fir.get_signal(controls['gains'], controls['frequencies'], controls['dampings'])
        return ddsp.core.fft_convolve(force_profile, tf.gather(responses, ids))

    # Tool modal impact sound, or silence when that component is disabled
    if not with_tool_modal:
        audio = tf.zeros((batch_size, internal_length), dtype=tf.float32)
    else:
        audio = modal_sound(gains0, frequencies0, dampings0, tool_ids)

    # Object modal impact sound
    if with_object:
        audio = audio + modal_sound(gains1, frequencies1, dampings1, object_ids)

    # Environment noise: uniform noise shaped by magnitudes shared across clips
    if with_noise:
        white_noise = tf.random.uniform((batch_size, internal_length), minval=-1.0, maxval=1.0)
        shared_magnitudes = ddsp.core.exp_sigmoid(noise_magnitudes - 3.5)
        per_clip_magnitudes = tf.tile(tf.reshape(shared_magnitudes, (1, -1)), (batch_size, 1))
        filtered_noise = ddsp.core.frequency_filter(white_noise, per_clip_magnitudes, 64)
        audio = audio + noise_scale * filtered_noise

    # Acceleration noise, preventing contribution from collapsing to 0
    if with_accel:
        audio = audio + force_profile * (acceleration_scale + 0.0000001)

    # Tool-specific reverb
    if with_reverb:
        clip_reverb_gains = tf.gather(reverb_gains, tool_ids)
        clip_reverb_decay = tf.gather(reverb_decay, tool_ids)
        reverb_controls = reverb.get_controls(audio, clip_reverb_gains, clip_reverb_decay)
        audio = reverb.get_signal(audio, reverb_controls['ir'])

    # Downsample from internal sampling rate to recording sampling rate
    return ddsp.core.resample(audio, clips.shape[1], 'linear')

def loss_func():
    """Return the spectral loss between the recordings and the full synthesis."""
    target = tf.stop_gradient(clips)
    synthesized = generate_audio()
    return loss.call(target, synthesized)

# Adam optimizer with gradient-norm clipping, shared by all optimization steps
opt = tf.keras.optimizers.Adam(learning_rate=0.001, clipnorm=1.0)

## Optimization process

In [None]:
iters = 6000
check_interval = 50
# Every variable the optimizer is allowed to update
optimized_variables = [
    noise_magnitudes,
    frequencies0, dampings0, gains0,
    frequencies1, dampings1, gains1,
    tau, mag,
    reverb_gains, reverb_decay,
    acceleration_scale, peak_time, noise_scale,
]
for i in range(iters):
    # Report the loss every check_interval iterations of this run;
    # step_count keeps counting across repeated runs of this cell
    if i % check_interval == 0:
        print('Step\t%i:\t%0.4f'%(step_count, loss_func()))
    opt.minimize(loss_func, var_list=optimized_variables)
    step_count += 1
print('Step\t%i:\t%0.4f'%(step_count, loss_func()))

Step	0:	1.3448
Step	50:	1.1299
Step	100:	1.0359
Step	150:	0.9793
Step	200:	0.9377
Step	250:	0.9012
Step	300:	0.8692
Step	350:	0.8422
Step	400:	0.8198
Step	450:	0.8005
Step	500:	0.7843
Step	550:	0.7731
Step	600:	0.7656
Step	650:	0.7585
Step	700:	0.7535
Step	750:	0.7491
Step	800:	0.7444
Step	850:	0.7392
Step	900:	0.7378
Step	950:	0.7327
Step	1000:	0.7293
Step	1050:	0.7261
Step	1100:	0.7199
Step	1150:	0.7167
Step	1200:	0.7129
Step	1250:	0.7109
Step	1300:	0.7079
Step	1350:	0.7034
Step	1400:	0.7028
Step	1450:	0.7011
Step	1500:	0.6952
Step	1550:	0.6937
Step	1600:	0.6915
Step	1650:	0.6908
Step	1700:	0.6887
Step	1750:	0.6850
Step	1800:	0.6847
Step	1850:	0.6824
Step	1900:	0.6790
Step	1950:	0.6776
Step	2000:	0.6763
Step	2050:	0.6748
Step	2100:	0.6735
Step	2150:	0.6716
Step	2200:	0.6694
Step	2250:	0.6677
Step	2300:	0.6647
Step	2350:	0.6632
Step	2400:	0.6637
Step	2450:	0.6610
Step	2500:	0.6601
Step	2550:	0.6571
Step	2600:	0.6550
Step	2650:	0.6538
Step	2700:	0.6499
Step	2750:	0.6482
Step	2800:	0.64

## Sample and save outputs

In [None]:
# Report the current loss, then play one synthesized clip with only the tool
# components enabled (no object response, no environment noise)
print('Current total loss:\t%0.4f'%loss_func())
ind = 28
print('Clip index:\t%i'%ind)
print("Tool:\t\t'%s'"%inv_tool_dict[tool_ids[ind]])
# Closing quote added so the object name is quoted like the tool name
print("Object:\t\t'%s'"%inv_object_dict[object_ids[ind]])
audio = generate_audio(with_object=False, with_tool_modal=True, with_accel=True, with_reverb=True, with_noise=False)
IPython.display.Audio(data=audio[ind, :], rate=fs)

In [None]:
output_dir = '../output/separation/synthesized000'
# makedirs creates any missing parent directories as well (os.mkdir fails if
# they are absent) and exist_ok=True makes re-running this cell harmless
os.makedirs(output_dir, exist_ok=True)

def save_audio(audio, output_prefix):
    """Write each row of ``audio`` to a 16-bit WAV file in ``output_dir``.

    Files are named ``<output_prefix><index>.wav`` with a zero-padded,
    three-digit row index and are written at sample rate ``fs``. Every row is
    divided by the same value, the largest absolute sample in the whole batch,
    so that this sample maps to 32767.

    Args:
        audio: 2-D array-like indexed as ``audio[clip, sample]``. This
            notebook passes both a NumPy array and ``generate_audio`` results.
        output_prefix: filename prefix identifying what the batch contains.
    """
    audio = np.asarray(audio)
    # The batch peak does not depend on the row, so compute it once
    peak = np.max(np.abs(audio))
    if peak == 0:
        # Silent batch: every sample is zero, so avoid 0/0 producing NaNs
        peak = 1.0
    for i in range(audio.shape[0]):
        filename = os.path.join(output_dir, '%s%03i.wav'%(output_prefix, i))
        scipy.io.wavfile.write(filename, fs, (32767 * audio[i, :] / peak).astype(np.int16))

# Save the recordings, then one synthesized batch per component combination
save_audio(clips, 'original')

# (output prefix, generate_audio flags) for each batch, in the order saved
render_settings = [
    ('full', dict(with_object=True, with_tool_modal=True, with_accel=True, with_reverb=True, with_noise=True)),
    ('denoised', dict(with_object=True, with_tool_modal=True, with_accel=True, with_reverb=True, with_noise=False)),
    ('tool-modal', dict(with_object=False, with_tool_modal=True, with_accel=False, with_reverb=False, with_noise=False)),
    ('tool-modal-accel', dict(with_object=False, with_tool_modal=True, with_accel=True, with_reverb=True, with_noise=False)),
    ('object-modal', dict(with_object=True, with_tool_modal=False, with_accel=False, with_reverb=False, with_noise=False)),
    ('object-modal-accel', dict(with_object=True, with_tool_modal=False, with_accel=True, with_reverb=True, with_noise=False)),
]
for prefix, flags in render_settings:
    save_audio(generate_audio(**flags), prefix)