In [1]:
import torchaudio
import platform

# Choose the torchaudio I/O backend for the current operating system:
# "soundfile" on Windows, "sox_io" everywhere else.
backend = "soundfile" if platform.system() == "Windows" else "sox_io"
if backend == "soundfile":
    # Switch off the legacy soundfile interface before selecting the backend.
    torchaudio.USE_SOUNDFILE_LEGACY_INTERFACE = False
torchaudio.set_audio_backend(backend)

  'The interface of "soundfile" backend is planned to change in 0.8.0 to '


In [2]:
import yaml
import json
import torch
from pathlib import Path
import IPython.display as ipd
from audio_datasets.preprocessing import get_mel_spectro_transform

# Load the Configuration

In [None]:
def _load_yaml(path):
    """Parse the YAML file at ``path`` and return the resulting object.

    The file is opened in a ``with`` block so its handle is closed as soon as
    parsing finishes instead of being left open for the rest of the session.
    """
    # NOTE(review): FullLoader can construct more than plain YAML types; if the
    # config files only hold plain data, yaml.safe_load would be stricter.
    with open(path) as stream:
        return yaml.load(stream, Loader=yaml.FullLoader)


config_dir = Path('configs')
conf = _load_yaml(config_dir / 'config.yaml')
# The main config names a second YAML file; its contents are merged into
# conf['data'] below.
conf_file_ds = _load_yaml(config_dir / conf['data']['config_file'])

# Merge both mappings; keys from the second file win on conflicts.
conf['data'] = {**conf['data'], **conf_file_ds}
conf['device'] = "cuda" if torch.cuda.is_available() else "cpu"
conf['env']['world_size'] = 1
conf['env']['use_data_parallel'] = False

# Notebook-specific overrides of the values loaded from disk.
conf['train']['batch_size'] = 32
conf['model']['apc']['prenet']['num_layers'] = 5
conf['model']['apc']['rnn']['num_layers'] = 4

conf['data']['augmentation']['use_augmentation'] = False
conf['masking']['add_metadata'] = False
conf['masking']['n_frames'] = 120
conf['masking']['k_frames'] = 25
# NOTE(review): presumably the name of a saved checkpoint/run -- confirm
# against the code that consumes 'load_weights'.
conf['load_weights'] = 'treasured-deluge-60_backup'

# default=str stringifies any value that json cannot serialize natively.
print(json.dumps(conf, indent=2, default=str))

# Load the File and Convert it to a Mel-Spectrogram

In [4]:
# Build the mel-spectrogram transform from the config, then place it on the CPU.
mel_transform = get_mel_spectro_transform(conf)
transform = mel_transform.to('cpu')

In [5]:
import os

# Audio file to analyse. The default is the original machine-specific location;
# set the TIMIT_SAMPLE_PATH environment variable to point at the file on other
# machines without editing the notebook.
file_path = os.environ.get(
    'TIMIT_SAMPLE_PATH',
    'D:/Projekte/temporal-speech-context/data/TIMIT/SA2.WAV',
)

In [6]:
# torchaudio.load returns a (waveform, sample_rate) pair; only the waveform is
# used further down, the file's own sample rate is not checked here.
waveform, _file_sample_rate = torchaudio.load(file_path)

# Convert the raw samples to a mel spectrogram and report its dimensions.
mel_spectro = transform(waveform)
print(mel_spectro.shape)

torch.Size([1, 80, 216])


  normalized, onesided, return_complex)
  normalized, onesided, return_complex)


In [7]:
# Play the original waveform at the sample rate given in the config.
sample_rate = conf['data']['transform']['sample_rate']
waveform_player = ipd.Audio(waveform, rate=sample_rate)
ipd.display(waveform_player)

# Reconstruct Signal using Librosa

In [8]:
from librosa.feature.inverse import mel_to_audio

# All STFT settings come from the same config section as the forward transform.
transform_conf = conf['data']['transform']

# mel_spectro[0] drops the leading axis (size 1 in the printed shape) so that
# librosa receives a 2-D array.
librosa_audio = mel_to_audio(
    mel_spectro[0].numpy(),
    hop_length=transform_conf['hop_length'],
    sr=transform_conf['sample_rate'],
    n_fft=transform_conf['n_fft'],
)
ipd.display(ipd.Audio(librosa_audio, rate=transform_conf['sample_rate']))

# Reconstruct Signal using PyTorch

In [9]:
from torchaudio.transforms import InverseMelScale, GriffinLim

# Read the STFT settings from the config instead of repeating them as literals,
# so this reconstruction uses the same values as the librosa one.
transform_conf = conf['data']['transform']
sample_rate = transform_conf['sample_rate']
n_fft = transform_conf['n_fft']
hop_length = transform_conf['hop_length']

inverse_mel = InverseMelScale(
    n_stft=n_fft // 2 + 1,         # one-sided STFT bin count (257 for n_fft=512)
    n_mels=mel_spectro.shape[-2],  # second-to-last axis holds the mel bins (80 in the printed shape)
    sample_rate=sample_rate,
    # NOTE(review): frequency range kept as originally hardcoded -- confirm it
    # matches the range used by the forward mel transform.
    f_min=0.0,
    f_max=8000,
)
# NOTE(review): win_length kept at the original literal 400 because no config
# key for it is visible here -- confirm against the forward transform.
griffin_lim = GriffinLim(n_fft=n_fft, win_length=400, hop_length=hop_length)

# Mel spectrogram -> linear-frequency spectrogram -> waveform, then play it.
reconstructed = griffin_lim(inverse_mel(mel_spectro))
ipd.display(ipd.Audio(reconstructed, rate=sample_rate))

  normalized, onesided, length, return_complex)


In [10]:
# Show the dimensions of the mel spectrogram as the cell output.
mel_spectro.size()

torch.Size([1, 80, 216])