In [2]:
import sys
import os
import torch
from einops import rearrange
import numpy as np
import torchaudio
import matplotlib.pyplot as plt

from scipy.io.wavfile import write

sys.path.append('../')

from src.datamodule.maestro_datamodule import MaestroDataModule

# Maestro Reference

In [15]:
datamodule = MaestroDataModule(root_dir=os.environ["MAESTRO_DATASET_DIR"], batch_size=64, num_workers=4, sample_length=44100*5)
datamodule.setup()
dataloader = datamodule.val_dataloader()
dataloader.shuffle = True
data_iter = iter(dataloader)
batch = next(data_iter)


Found 962 files. Getting durations
self.sr=44100, min: 5, max: inf
Keeping 962 of 962 files
Found 137 files. Getting durations
self.sr=44100, min: 5, max: inf
Keeping 137 of 137 files
Found 177 files. Getting durations
self.sr=44100, min: 5, max: inf
Keeping 177 of 177 files


In [None]:

for i, audio in enumerate(batch):
    audio = audio.transpose(0, 1)
    torchaudio.save(f"../eval/maestro/test/test_{i}.wav", audio, sample_rate=44100)

## MAESTRO LVL1

In [21]:
from src.model.jukebox_vqvae import JukeboxVQVAEModel

vqvae = JukeboxVQVAEModel().to("cpu")
vqvae.eval()

#batch = batch.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
embeddings = vqvae.encode(batch, 1)
batch_lvl1 = vqvae.decode(embeddings, 1)

for i, audio in enumerate(batch_lvl1):    
    audio = audio.transpose(0, 1)
    torchaudio.save(f"../eval/maestro/validation_lvl1/test_{i}.wav", audio, sample_rate=44100)

Loading Jukebox VAE from /usr/stud/steiger/.cache/huggingface/jukebox-1b-lyrics/JukeboxVQVAE.pt


## LVL0

In [23]:
from src.model.jukebox_vqvae import JukeboxVQVAEModel

vqvae = JukeboxVQVAEModel().to("cpu")
vqvae.eval()

#batch = batch.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
embeddings = vqvae.encode(batch, 0)
batch_lvl1 = vqvae.decode(embeddings, 0)

for i, audio in enumerate(batch_lvl1):    
    audio = audio.transpose(0, 1)
    torchaudio.save(f"../eval/maestro/validation_lvl1/test_{i}.wav", audio, sample_rate=44100)

Loading Jukebox VAE from /usr/stud/steiger/.cache/huggingface/jukebox-1b-lyrics/JukeboxVQVAE.pt


In [24]:
# move all files from eval/maestro/validatation_lvl1 that do not contain "16k" to eval/maestro/validation_lvl0

for f in Path("../eval/maestro/validation_lvl1").glob("*"):
    if "16k" not in f.name:
        f.rename(f"../eval/maestro/validation_lvl0/{f.name}")

In [3]:
from pathlib import Path

p = Path("../eval/maestro/")

In [11]:
for f in p.glob("*/*"):
    if "16k" not in f.name:
        f.unlink()
    if "16k.wav_16k" in f.name:
        f.unlink()
    if "16k_16k" in f.name:
        f.unlink()

In [23]:
for f in p.glob("*/*"):
    # rename test_20.wav_16k.wav to test_20_16k.wav
    f.rename(f.parent / f.name.replace(".wav_16k", "_16k"))

In [12]:
def resample_files(source_dir):
    files = list(Path(source_dir).glob("**/*.wav"))
    for f in files:
        if "16k" not in f.name:
            if (f.parent / f"{f.stem}_16k.wav").exists():
                continue
            # resample with ffmpeg: ffmpeg -i '{}' -ar 16000 -ac 1 -vn -c:a pcm_s16le -y '{}'
            print(f"Resampling {f} to 16k")
            os.system(f"/usr/stud/steiger/miniconda3/bin/ffmpeg -i '{f}' -ar 16000 -ac 1 -vn -c:a pcm_s16le -y '{f.parent / f.stem}_16k.wav'")
            os.remove(f)


In [25]:
resample_files("../eval/maestro/validation_lvl0")

Resampling ../eval/maestro/validation_lvl0/test_49.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_10.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_62.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_41.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_22.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_11.wav to 16k


ffmpeg version 4.2.2 Copyright (c) 2000-2019 the FFmpeg developers
  built with gcc 7.3.0 (crosstool-NG 1.23.0.449-a04d0)
  configuration: --prefix=/usr/stud/steiger/miniconda3 --cc=/tmp/build/80754af9/ffmpeg_1587154242452/_build_env/bin/x86_64-conda_cos6-linux-gnu-cc --disable-doc --enable-avresample --enable-gmp --enable-hardcoded-tables --enable-libfreetype --enable-libvpx --enable-pthreads --enable-libopus --enable-postproc --enable-pic --enable-pthreads --enable-shared --enable-static --enable-version3 --enable-zlib --enable-libmp3lame --disable-nonfree --enable-gpl --enable-gnutls --disable-openssl --enable-libopenh264 --enable-libx264
  libavutil      56. 31.100 / 56. 31.100
  libavcodec     58. 54.100 / 58. 54.100
  libavformat    58. 29.100 / 58. 29.100
  libavdevice    58.  8.100 / 58.  8.100
  libavfilter     7. 57.100 /  7. 57.100
  libavresample   4.  0.  0 /  4.  0.  0
  libswscale      5.  5.100 /  5.  5.100
  libswresample   3.  5.100 /  3.  5.100
  libpostproc    55.  

Resampling ../eval/maestro/validation_lvl0/test_20.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_43.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_8.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_1.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_42.wav to 16k


ffmpeg version 4.2.2 Copyright (c) 2000-2019 the FFmpeg developers
  built with gcc 7.3.0 (crosstool-NG 1.23.0.449-a04d0)
  configuration: --prefix=/usr/stud/steiger/miniconda3 --cc=/tmp/build/80754af9/ffmpeg_1587154242452/_build_env/bin/x86_64-conda_cos6-linux-gnu-cc --disable-doc --enable-avresample --enable-gmp --enable-hardcoded-tables --enable-libfreetype --enable-libvpx --enable-pthreads --enable-libopus --enable-postproc --enable-pic --enable-pthreads --enable-shared --enable-static --enable-version3 --enable-zlib --enable-libmp3lame --disable-nonfree --enable-gpl --enable-gnutls --disable-openssl --enable-libopenh264 --enable-libx264
  libavutil      56. 31.100 / 56. 31.100
  libavcodec     58. 54.100 / 58. 54.100
  libavformat    58. 29.100 / 58. 29.100
  libavdevice    58.  8.100 / 58.  8.100
  libavfilter     7. 57.100 /  7. 57.100
  libavresample   4.  0.  0 /  4.  0.  0
  libswscale      5.  5.100 /  5.  5.100
  libswresample   3.  5.100 /  3.  5.100
  libpostproc    55.  

Resampling ../eval/maestro/validation_lvl0/test_31.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_28.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_3.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_0.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_61.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_12.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_40.wav to 16k


Guessed Channel Layout for Input Stream #0.0 : mono
Input #0, wav, from '../eval/maestro/validation_lvl0/test_12.wav':
  Duration: 00:00:05.00, bitrate: 1411 kb/s
    Stream #0:0: Audio: pcm_f32le ([3][0][0][0] / 0x0003), 44100 Hz, mono, flt, 1411 kb/s
Stream mapping:
  Stream #0:0 -> #0:0 (pcm_f32le (native) -> pcm_s16le (native))
Press [q] to stop, [?] for help
Output #0, wav, to '../eval/maestro/validation_lvl0/test_12_16k.wav':
  Metadata:
    ISFT            : Lavf58.29.100
    Stream #0:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 16000 Hz, mono, s16, 256 kb/s
    Metadata:
      encoder         : Lavc58.54.100 pcm_s16le
size=     156kB time=00:00:04.99 bitrate= 256.1kbits/s speed= 709x    
video:0kB audio:156kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.048751%
ffmpeg version 4.2.2 Copyright (c) 2000-2019 the FFmpeg developers
  built with gcc 7.3.0 (crosstool-NG 1.23.0.449-a04d0)
  configuration: --prefix=/usr/stud/steiger/miniconda3 --cc=/tmp/build/80

Resampling ../eval/maestro/validation_lvl0/test_24.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_32.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_9.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_60.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_30.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_57.wav to 16k


ffmpeg version 4.2.2 Copyright (c) 2000-2019 the FFmpeg developers
  built with gcc 7.3.0 (crosstool-NG 1.23.0.449-a04d0)
  configuration: --prefix=/usr/stud/steiger/miniconda3 --cc=/tmp/build/80754af9/ffmpeg_1587154242452/_build_env/bin/x86_64-conda_cos6-linux-gnu-cc --disable-doc --enable-avresample --enable-gmp --enable-hardcoded-tables --enable-libfreetype --enable-libvpx --enable-pthreads --enable-libopus --enable-postproc --enable-pic --enable-pthreads --enable-shared --enable-static --enable-version3 --enable-zlib --enable-libmp3lame --disable-nonfree --enable-gpl --enable-gnutls --disable-openssl --enable-libopenh264 --enable-libx264
  libavutil      56. 31.100 / 56. 31.100
  libavcodec     58. 54.100 / 58. 54.100
  libavformat    58. 29.100 / 58. 29.100
  libavdevice    58.  8.100 / 58.  8.100
  libavfilter     7. 57.100 /  7. 57.100
  libavresample   4.  0.  0 /  4.  0.  0
  libswscale      5.  5.100 /  5.  5.100
  libswresample   3.  5.100 /  3.  5.100
  libpostproc    55.  

Resampling ../eval/maestro/validation_lvl0/test_26.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_4.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_21.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_15.wav to 16k


ffmpeg version 4.2.2 Copyright (c) 2000-2019 the FFmpeg developers
  built with gcc 7.3.0 (crosstool-NG 1.23.0.449-a04d0)
  configuration: --prefix=/usr/stud/steiger/miniconda3 --cc=/tmp/build/80754af9/ffmpeg_1587154242452/_build_env/bin/x86_64-conda_cos6-linux-gnu-cc --disable-doc --enable-avresample --enable-gmp --enable-hardcoded-tables --enable-libfreetype --enable-libvpx --enable-pthreads --enable-libopus --enable-postproc --enable-pic --enable-pthreads --enable-shared --enable-static --enable-version3 --enable-zlib --enable-libmp3lame --disable-nonfree --enable-gpl --enable-gnutls --disable-openssl --enable-libopenh264 --enable-libx264
  libavutil      56. 31.100 / 56. 31.100
  libavcodec     58. 54.100 / 58. 54.100
  libavformat    58. 29.100 / 58. 29.100
  libavdevice    58.  8.100 / 58.  8.100
  libavfilter     7. 57.100 /  7. 57.100
  libavresample   4.  0.  0 /  4.  0.  0
  libswscale      5.  5.100 /  5.  5.100
  libswresample   3.  5.100 /  3.  5.100
  libpostproc    55.  

Resampling ../eval/maestro/validation_lvl0/test_27.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_47.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_29.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_56.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_35.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_58.wav to 16k


ffmpeg version 4.2.2 Copyright (c) 2000-2019 the FFmpeg developers
  built with gcc 7.3.0 (crosstool-NG 1.23.0.449-a04d0)
  configuration: --prefix=/usr/stud/steiger/miniconda3 --cc=/tmp/build/80754af9/ffmpeg_1587154242452/_build_env/bin/x86_64-conda_cos6-linux-gnu-cc --disable-doc --enable-avresample --enable-gmp --enable-hardcoded-tables --enable-libfreetype --enable-libvpx --enable-pthreads --enable-libopus --enable-postproc --enable-pic --enable-pthreads --enable-shared --enable-static --enable-version3 --enable-zlib --enable-libmp3lame --disable-nonfree --enable-gpl --enable-gnutls --disable-openssl --enable-libopenh264 --enable-libx264
  libavutil      56. 31.100 / 56. 31.100
  libavcodec     58. 54.100 / 58. 54.100
  libavformat    58. 29.100 / 58. 29.100
  libavdevice    58.  8.100 / 58.  8.100
  libavfilter     7. 57.100 /  7. 57.100
  libavresample   4.  0.  0 /  4.  0.  0
  libswscale      5.  5.100 /  5.  5.100
  libswresample   3.  5.100 /  3.  5.100
  libpostproc    55.  

Resampling ../eval/maestro/validation_lvl0/test_2.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_54.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_39.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_6.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_25.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_17.wav to 16k


ffmpeg version 4.2.2 Copyright (c) 2000-2019 the FFmpeg developers
  built with gcc 7.3.0 (crosstool-NG 1.23.0.449-a04d0)
  configuration: --prefix=/usr/stud/steiger/miniconda3 --cc=/tmp/build/80754af9/ffmpeg_1587154242452/_build_env/bin/x86_64-conda_cos6-linux-gnu-cc --disable-doc --enable-avresample --enable-gmp --enable-hardcoded-tables --enable-libfreetype --enable-libvpx --enable-pthreads --enable-libopus --enable-postproc --enable-pic --enable-pthreads --enable-shared --enable-static --enable-version3 --enable-zlib --enable-libmp3lame --disable-nonfree --enable-gpl --enable-gnutls --disable-openssl --enable-libopenh264 --enable-libx264
  libavutil      56. 31.100 / 56. 31.100
  libavcodec     58. 54.100 / 58. 54.100
  libavformat    58. 29.100 / 58. 29.100
  libavdevice    58.  8.100 / 58.  8.100
  libavfilter     7. 57.100 /  7. 57.100
  libavresample   4.  0.  0 /  4.  0.  0
  libswscale      5.  5.100 /  5.  5.100
  libswresample   3.  5.100 /  3.  5.100
  libpostproc    55.  

Resampling ../eval/maestro/validation_lvl0/test_13.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_34.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_53.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_37.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_52.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_59.wav to 16k


size=     156kB time=00:00:04.99 bitrate= 256.1kbits/s speed= 778x    
video:0kB audio:156kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.048751%
ffmpeg version 4.2.2 Copyright (c) 2000-2019 the FFmpeg developers
  built with gcc 7.3.0 (crosstool-NG 1.23.0.449-a04d0)
  configuration: --prefix=/usr/stud/steiger/miniconda3 --cc=/tmp/build/80754af9/ffmpeg_1587154242452/_build_env/bin/x86_64-conda_cos6-linux-gnu-cc --disable-doc --enable-avresample --enable-gmp --enable-hardcoded-tables --enable-libfreetype --enable-libvpx --enable-pthreads --enable-libopus --enable-postproc --enable-pic --enable-pthreads --enable-shared --enable-static --enable-version3 --enable-zlib --enable-libmp3lame --disable-nonfree --enable-gpl --enable-gnutls --disable-openssl --enable-libopenh264 --enable-libx264
  libavutil      56. 31.100 / 56. 31.100
  libavcodec     58. 54.100 / 58. 54.100
  libavformat    58. 29.100 / 58. 29.100
  libavdevice    58.  8.100 / 58.  8.100
  libavfilter   

Resampling ../eval/maestro/validation_lvl0/test_14.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_19.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_44.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_7.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_63.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_16.wav to 16k


ffmpeg version 4.2.2 Copyright (c) 2000-2019 the FFmpeg developers
  built with gcc 7.3.0 (crosstool-NG 1.23.0.449-a04d0)
  configuration: --prefix=/usr/stud/steiger/miniconda3 --cc=/tmp/build/80754af9/ffmpeg_1587154242452/_build_env/bin/x86_64-conda_cos6-linux-gnu-cc --disable-doc --enable-avresample --enable-gmp --enable-hardcoded-tables --enable-libfreetype --enable-libvpx --enable-pthreads --enable-libopus --enable-postproc --enable-pic --enable-pthreads --enable-shared --enable-static --enable-version3 --enable-zlib --enable-libmp3lame --disable-nonfree --enable-gpl --enable-gnutls --disable-openssl --enable-libopenh264 --enable-libx264
  libavutil      56. 31.100 / 56. 31.100
  libavcodec     58. 54.100 / 58. 54.100
  libavformat    58. 29.100 / 58. 29.100
  libavdevice    58.  8.100 / 58.  8.100
  libavfilter     7. 57.100 /  7. 57.100
  libavresample   4.  0.  0 /  4.  0.  0
  libswscale      5.  5.100 /  5.  5.100
  libswresample   3.  5.100 /  3.  5.100
  libpostproc    55.  

Resampling ../eval/maestro/validation_lvl0/test_18.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_50.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_38.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_23.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_51.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_36.wav to 16k


ffmpeg version 4.2.2 Copyright (c) 2000-2019 the FFmpeg developers
  built with gcc 7.3.0 (crosstool-NG 1.23.0.449-a04d0)
  configuration: --prefix=/usr/stud/steiger/miniconda3 --cc=/tmp/build/80754af9/ffmpeg_1587154242452/_build_env/bin/x86_64-conda_cos6-linux-gnu-cc --disable-doc --enable-avresample --enable-gmp --enable-hardcoded-tables --enable-libfreetype --enable-libvpx --enable-pthreads --enable-libopus --enable-postproc --enable-pic --enable-pthreads --enable-shared --enable-static --enable-version3 --enable-zlib --enable-libmp3lame --disable-nonfree --enable-gpl --enable-gnutls --disable-openssl --enable-libopenh264 --enable-libx264
  libavutil      56. 31.100 / 56. 31.100
  libavcodec     58. 54.100 / 58. 54.100
  libavformat    58. 29.100 / 58. 29.100
  libavdevice    58.  8.100 / 58.  8.100
  libavfilter     7. 57.100 /  7. 57.100
  libavresample   4.  0.  0 /  4.  0.  0
  libswscale      5.  5.100 /  5.  5.100
  libswresample   3.  5.100 /  3.  5.100
  libpostproc    55.  

Resampling ../eval/maestro/validation_lvl0/test_33.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_45.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_48.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_55.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_46.wav to 16k
Resampling ../eval/maestro/validation_lvl0/test_5.wav to 16k


size=     156kB time=00:00:04.99 bitrate= 256.1kbits/s speed= 698x    
video:0kB audio:156kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.048751%
ffmpeg version 4.2.2 Copyright (c) 2000-2019 the FFmpeg developers
  built with gcc 7.3.0 (crosstool-NG 1.23.0.449-a04d0)
  configuration: --prefix=/usr/stud/steiger/miniconda3 --cc=/tmp/build/80754af9/ffmpeg_1587154242452/_build_env/bin/x86_64-conda_cos6-linux-gnu-cc --disable-doc --enable-avresample --enable-gmp --enable-hardcoded-tables --enable-libfreetype --enable-libvpx --enable-pthreads --enable-libopus --enable-postproc --enable-pic --enable-pthreads --enable-shared --enable-static --enable-version3 --enable-zlib --enable-libmp3lame --disable-nonfree --enable-gpl --enable-gnutls --disable-openssl --enable-libopenh264 --enable-libx264
  libavutil      56. 31.100 / 56. 31.100
  libavcodec     58. 54.100 / 58. 54.100
  libavformat    58. 29.100 / 58. 29.100
  libavdevice    58.  8.100 / 58.  8.100
  libavfilter   