In [None]:
%load_ext autoreload
%autoreload 2

import os, sys
import glob, pickle, yaml

# Add the parent of the current working directory to the import path.
# Presumably the project-local modules imported further down (dataloader,
# callbacks, train_supervised, timbre_transfer) live there — verify.
PROJECT_DIR = os.path.dirname(os.getcwd())
sys.path.append(PROJECT_DIR)

import numpy as np
import matplotlib.pyplot as plt
import scipy.signal as signal
import librosa
import librosa.display

from dataloader import make_supervised_dataset
from callbacks import CustomWandbCallback, ModelCheckpoint
from train_supervised import make_supervised_model
from timbre_transfer import transfer_timbre_from_path, load_model_from_config

def print_plot_play(x, Fs=16000, text='', normalize=False):
    """Print a short summary of a signal, plot its waveform and embed a player.

    Parameters
    ----------
    x : array with ``shape`` and ``dtype`` attributes
        Audio samples; the x-axis of the plot spans ``x.shape[0]`` samples.
    Fs : int
        Sampling rate handed to the audio widget and shown in the summary.
    text : str
        Caption printed above the summary line.
    normalize : bool
        Forwarded unchanged to ``IPython.display.Audio``.
    """
    from IPython.display import Audio, display

    # Textual summary first, so it appears above the figure.
    print('%s\n' % (text))
    print('Fs = %d, x.shape = %s, x.dtype = %s' % (Fs, x.shape, x.dtype))

    # Waveform plot on an explicit figure/axes pair.
    fig, ax = plt.subplots(figsize=(8, 2))
    ax.plot(x, color='gray')
    ax.set_xlim([0, x.shape[0]])
    ax.set_xlabel('Time (samples)')
    ax.set_ylabel('Amplitude')
    fig.tight_layout()
    plt.show()

    # Playback widget.
    display(Audio(data=x, rate=Fs, normalize=normalize))

# Load a Model

In [None]:
# Select the run whose YAML config is loaded. Alternative runs are kept as
# comments; the first one used to be a live assignment that was immediately
# overwritten by the next line.
#config_path = '../wandb/run-20210818_235757-tknchpfs/files/Supervised_Violin/Supervised_Violin.yaml'
config_path = "../wandb/run-20210818_235746-36ggudop/files/Supervised_Latent_Violin/Supervised_Latent_Violin.yaml"
#config_path = "/kuacc/users/hbalim15/ddsp/wandb/run-20210817_170423-5v6s3iu1/files/Supervised_Violin_/Supervised_Violin_.yaml"
# NOTE(security): yaml.load with FullLoader can construct more than plain data
# types. Only load config files you trust; prefer yaml.safe_load if the config
# holds nothing but plain mappings, lists and scalars.
with open(config_path) as file:
    config = dict(yaml.load(file, Loader=yaml.FullLoader))

In [None]:
# Obtain the model from the parsed YAML config through the project's
# load_model_from_config helper (imported from timbre_transfer).
model = load_model_from_config(config)

# Transfer Timbre

In [None]:
# Run the timbre transfer on one input clip and listen to the result.
input_path = "../audio_clips/singing.mp3"
resynth = transfer_timbre_from_path(
    model,
    input_path,
    pitch_shift=24,  # presumably semitones (two octaves up) — TODO confirm
    # NOTE(review): the config's 'encoder' entry is passed as `mfcc`;
    # presumably it is only used for its truthiness — confirm.
    mfcc=config['model']['encoder'],
)
print_plot_play(resynth)

# Export Audio

In [None]:
# Write the resynthesised audio to disk under the given title.
# NOTE(review): as far as this notebook shows, `write_audio` is never imported
# and `RUN_NAME` is not assigned by any cell above this one, so the cell fails
# with NameError on a fresh Restart & Run All — confirm where both come from.
output_title = "violin_to_violin-VIDOuble.wav"
write_audio(resynth, output_title, RUN_NAME, normalize=True)

# Build the model by hand for one specific run and load its checkpoint.
# This rebinds `model`, replacing whatever an earlier cell assigned to it.
# (A stray leading comma before RUN_NAME made this block a SyntaxError.)
# NOTE(review): F0LoudnessPreprocessor, DecoderWithoutLatent, MultiLoss and
# SupervisedAutoencoder are not imported in the visible part of the notebook;
# import them before running this.
RUN_NAME = 'Supervised_Violin_ouz_Multiloss'

encoder_timesteps = 250
decoder_timesteps = 1000

preprocessor = F0LoudnessPreprocessor(timesteps=encoder_timesteps)
encoder = None #SupervisedEncoder(timesteps=encoder_timesteps)
decoder = DecoderWithoutLatent(timesteps=decoder_timesteps)
loss = MultiLoss() # SpecLoss()
# A loss named 'SpecLoss' tracks only 'spec_loss'; any other loss tracks
# 'spec_loss', 'perc_loss' and 'total_loss'.
tracker_names = ['spec_loss'] if loss.name=='SpecLoss' else ['spec_loss', 'perc_loss', 'total_loss']
model = SupervisedAutoencoder(preprocessor=preprocessor,
                            encoder=encoder,
                            decoder=decoder,
                            loss_fn=loss,
                            tracker_names=tracker_names,
                            add_reverb=True)
# Checkpoint path is relative to the current working directory.
model.load(RUN_NAME+"/994/model.ckpt")

In [None]:
from postprocessing import process_track
import librosa

In [None]:
# Load the input clip resampled to 16 kHz, then extract its features with
# process_track using the loaded model.
input_title = "singing.mp3"
track, _ = librosa.load(os.path.join("../audio_clips", input_title), sr=16000)
f = process_track(
    track,
    mfcc=config['model']['encoder'],
    pitch_shift=2,
    model=model,
)

In [None]:
# Inspect the shape of the 'loudness_db' entry returned by process_track.
f['loudness_db'].shape

In [None]:
# Same track again, this time with every process_track option spelled out and
# no model attached. This rebinds `f` from the earlier cell.
f = process_track(
    track,
    mfcc=True,
    loudness_nfft=2048,
    frame_size=64000,
    Fs=16000,
    frame_rate=250,
    pitch_shift=2,
    model=None,
    normalize=True,
)

# Is something off with the loudness?

In [None]:
# List the top-level keys of `a`.
# NOTE(review): `a` is not assigned by any cell visible in this notebook, so
# this relies on leftover kernel state — confirm where it comes from.
a.keys()

In [None]:
# Largest value of the 'f0_scaled' input.
# NOTE(review): `a` is not assigned by any cell visible in this notebook.
a['inputs']['f0_scaled'].numpy().max()

In [None]:
# Largest value of the 'f0_hz' input.
# NOTE(review): `a` is not assigned by any cell visible in this notebook.
a['inputs']['f0_hz'].numpy().max()

In [None]:
# Peak-normalise by the largest absolute sample so the signal stays in [-1, 1].
# Dividing by the positive maximum alone can leave negative peaks below -1,
# which IPython's Audio rejects when normalize=False (print_plot_play's default).
print_plot_play(resynth/np.abs(resynth).max(),16000)

In [None]:
# Peak-normalise by the largest absolute sample so the signal stays in [-1, 1].
# Dividing by the positive maximum alone can leave negative peaks below -1,
# which IPython's Audio rejects when normalize=False (print_plot_play's default).
print_plot_play(resynth/np.abs(resynth).max(),16000)

In [None]:
# Load the source clip at 16 kHz. The path is relative to the notebook's
# directory, matching the "../audio_clips" location used by the other cells.
track, fs = librosa.load("../audio_clips/singing.mp3",sr=16000)
# Pitch-shifted copy, 24 steps up (semitones with librosa's default of 12 bins
# per octave). `sr` is passed by keyword: librosa >= 0.10 makes it
# keyword-only, and older releases accept the keyword form too.
track_T = librosa.effects.pitch_shift(track, sr=fs, n_steps=24)

# NOTE(review): `feature_extractor` is not imported or defined in the cells
# visible in this notebook — confirm where it comes from.
features = feature_extractor(track)

In [None]:
# Mean of all 'loudness_db' values extracted from the single input track.
np.mean(features["loudness_db"].reshape(-1))

In [None]:
# Keep only the first of the three values returned by make_violin_set;
# presumably that is the training split — confirm against dataloader.
# `make_datasets` is imported here but not used in the visible cells.
from dataloader import make_datasets, make_violin_set
train,_,_ = make_violin_set()

In [None]:
# Gather every 'loudness_db' value of the training set into one flat array.
# Each batch is flattened on its own and the pieces are concatenated, so a
# final batch of a different size no longer breaks the conversion (stacking
# ragged batches with np.array fails or yields an object array).
loudness_chunks = [np.asarray(batch["loudness_db"]).reshape(-1) for batch in train]
# An empty dataset still yields an empty 1-D array.
loudness = np.concatenate(loudness_chunks) if loudness_chunks else np.array([])

In [None]:
# Mean 'loudness_db' over the whole training set, for comparison with the
# single-track mean computed from `features`.
np.mean(loudness)

# Real Science 

# NOTE(review): `plot_spectrogram`, `plot_waveform_spectrogram` and
# `extract_dB_spectrogram` are not imported or defined in the cells visible in
# this notebook. The two bare names below only reference the objects.
plot_spectrogram
plot_waveform_spectrogram

# Spectrogram of the resynthesised audio. The meaning of the positional
# arguments (8192, 1024, 512) is set by extract_dB_spectrogram — confirm there.
dB_spectrogram = extract_dB_spectrogram(resynth, 8192, 1024, 512, center=True)

In [None]:
# Two stacked panels sharing the time axis: the `dB_spectrogram` array on a
# log-frequency axis on top, the waveform of `resynth` below.
fig, ax = plt.subplots(figsize=(20,8), nrows=2, sharex=True, constrained_layout=True) #, dpi=50

librosa.display.specshow(dB_spectrogram, sr=fs, hop_length=512, x_axis='time', y_axis='log', ax=ax[0])

# librosa 0.9 introduced `waveshow` and 0.10 removed `waveplot`; use whichever
# the installed version provides so the cell runs on old and new releases.
_plot_waveform = getattr(librosa.display, "waveshow", None) or librosa.display.waveplot
_plot_waveform(resynth, sr=fs, ax=ax[1])

In [None]:
# Spectrogram of the original input `track` on a log-frequency axis.
# This rebinds `dB_spectrogram`, `fig` and `ax` from earlier cells.
# NOTE(review): `extract_dB_spectrogram` is not imported or defined in the
# cells visible here; the meaning of 8192 and 1024 is set by its signature.
hop_length=512
dB_spectrogram = extract_dB_spectrogram(track, 8192, 1024, hop_length, center=True)

fig, ax = plt.subplots(figsize=(20,8))
# No `ax` is passed, so specshow draws on the current axes (the one just created).
librosa.display.specshow(dB_spectrogram, sr=16000, hop_length=hop_length, x_axis='time', y_axis='log')
plt.show()

In [None]:
# Pitch-shifted input, peak-normalised by its largest absolute sample so it
# stays in [-1, 1] (the positive maximum alone can leave negative peaks < -1).
print_plot_play(track_T/np.abs(track_T).max(),fs)

In [None]:
# Resynthesised audio, peak-normalised by its largest absolute sample so it
# stays in [-1, 1] (the positive maximum alone can leave negative peaks < -1).
print_plot_play(resynth/np.abs(resynth).max(),fs)

In [None]:
# Original input, peak-normalised by its largest absolute sample so it stays
# in [-1, 1] (the positive maximum alone can leave negative peaks < -1).
print_plot_play(track/np.abs(track).max(), fs,'track')

# Reconstruction

### Since the reconstructed 4*1sec frames are already framed internally, we just concatenate them.

In [None]:
def generate_windowed_frames(x, frame_size, window_type):
    """Cut ``x`` into frames and multiply each frame by a window.

    Parameters
    ----------
    x : array-like
        Signal handed to ``frame_generator``.
    frame_size : int
        Frame length, used for both the framing and the window.
    window_type : str
        Window name understood by ``scipy.signal.get_window``.

    Returns
    -------
    list
        One windowed frame per frame produced by ``frame_generator``.
    """
    frames = frame_generator(x, frame_size=frame_size)
    # fftbins=False requests the symmetric variant of the window.
    taper = signal.get_window(window_type, frame_size, fftbins=False)
    return [chunk * taper for chunk in frames]

def reconstruct(windowed_frames, frame_size):
    """
    Overlap-add method with 50% overlap.

    Stitches windowed frames back into one signal: the first half of the first
    frame, then for every adjacent pair the second half of the earlier frame
    added to the first half of the later one, and finally the second half of
    the last frame.

    Parameters
    ----------
    windowed_frames : sequence of 1-D array-like
        Already-windowed frames, each ``frame_size`` samples long.
    frame_size : int
        Samples per frame. Frames are split at ``frame_size // 2``, so an even
        value is expected.

    Returns
    -------
    numpy.ndarray
        The 1-D reconstructed signal; an empty array when no frames are given.
    """
    import numpy as np

    frames = [np.asarray(frame) for frame in windowed_frames]
    if not frames:
        return np.array([])

    half = frame_size // 2

    pieces = [frames[0][:half]]  # first frame's beginning
    for current, following in zip(frames, frames[1:]):
        pieces.append(current[half:] + following[:half])
    # Take the tail from the last frame itself. The loop index used before
    # pointed at the second-to-last frame and was undefined for a single frame.
    pieces.append(frames[-1][half:])

    return np.concatenate(pieces)

In [None]:
# Compare three ways of stitching the synthesised segments in `audio_synth`.
# NOTE(review): `audio_synth` is not created by any cell visible in this
# notebook — confirm where it comes from.
#
# Every result is peak-normalised by its largest absolute sample, and each
# division creates a new array. The previous in-place `/=` on
# `audio_synth.reshape(-1)` (a view) rescaled `audio_synth` itself.
fs = 16000
frame_size = int((8/1000)*fs)  # 8 ms worth of samples

# 1) each audio_synth segment is overlap-added separately and concatenated at the end
segment_reconstructions = []
for synth in audio_synth:
    windowed_frames = generate_windowed_frames(synth, frame_size, 'hamming')
    reconstruction = reconstruct(windowed_frames, frame_size)
    segment_reconstructions.append(reconstruction)

separate_reconstruction = np.concatenate(segment_reconstructions)
separate_reconstruction = separate_reconstruction / np.max(np.abs(separate_reconstruction)) # normalize


# 2) all audio_synth segments are merged and overlap-added together
windowed_frames = generate_windowed_frames(audio_synth.reshape(-1), frame_size, 'hamming')
joint_reconstruction = reconstruct(windowed_frames, frame_size)
joint_reconstruction = joint_reconstruction / np.max(np.abs(joint_reconstruction))


# 3) simple concat
simple_reconstruction = audio_synth.reshape(-1)
simple_reconstruction = simple_reconstruction / np.max(np.abs(simple_reconstruction))

In [None]:
import scipy
# `k_size` is only used by the disabled moving-average smoothing below.
k_size = 5
#smooth = scipy.signal.convolve(audio_synth.reshape(-1,),np.ones(k_size)/k_size)
smooth = audio_synth.reshape(-1,)
# Peak-normalise by the largest absolute sample so `norm` stays in [-1, 1].
# The positive maximum alone can leave negative peaks below -1.
norm = smooth/np.abs(smooth).max()
print_plot_play(norm, 16000)

In [None]:
# Save the normalised signal as a 16 kHz WAV file in the current working directory.
# NOTE(review): scipy writes the array with its own dtype; if `norm` is float64
# the file is a 64-bit float WAV, which some players may not open — confirm.
from scipy.io.wavfile import write
write("reverb.wav", 16000, norm)

In [None]:
# Listen to the variant where all segments were merged before overlap-add.
print_plot_play(joint_reconstruction, 16000)

In [None]:
# Listen to the variant where each segment was overlap-added on its own.
print_plot_play(separate_reconstruction, 16000)

In [None]:
# Scratch arithmetic: 64 * 250 = 16000, the sample rate used throughout.
# Presumably 64 samples per frame at a frame rate of 250 — confirm.
64*250