In [1]:
import IPython.display as ipd 
from whisperweranalysis.mel_utilities import audio2mel, plot_mel, griffin_lim

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
import sys
import json
import torch
sys.path.insert(0, './hifigan')
from hifigan.env import AttrDict
from hifigan.models import Generator

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# notebook still runs (more slowly) on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
%load_ext autoreload
%autoreload 2

## audio2mel(filename)

Converts a .wav file to a mel spectrogram.

**Please note: if the audio does not have a sample rate of 22050 Hz, it will be resampled to 22050 Hz.**

In [4]:
# Input recording used by the audio2mel and griffin_lim examples below.
filename = "data/arctic_temp.wav"

In [5]:
# Compute the mel spectrogram; the trailing expression displays its shape as
# the cell output.
mel = audio2mel(filename)
mel.shape

torch.Size([80, 339])

## plot_mel(mel, title)

Plot the converted mel spectrogram

In [6]:
# Plot the spectrogram held in `mel`, passing an explicit title.
plot_mel(mel, title="Test Mel")

## griffin_lim(filename)

Reads an audio file, applies an STFT, and converts the result back to a waveform using the Griffin-Lim algorithm.

In [7]:
# griffin_lim returns two values: the first is played back in the next cell,
# and the second (`sr`) is passed there as the playback rate.
copy_synthesis, sr = griffin_lim(filename)

In [8]:
# Embed an audio player for the Griffin-Lim result, using the rate returned
# alongside it.
ipd.display(ipd.Audio(copy_synthesis, rate=sr))

In [9]:
# Locations of the HiFi-GAN assets: the repository folder, the generator
# checkpoint, and the JSON configuration that lives inside the folder.
hifigan_loc = 'hifigan/'
hifi_checkpoint_file = 'generator_v1_T2'
config_file = os.path.join(hifigan_loc, 'config_v1.json')

# Read the configuration as text, then parse it into a plain dictionary.
with open(config_file) as f:
    data = f.read()
json_config = json.loads(data)


def load_checkpoint(filepath, device):
    """Load a serialized checkpoint from disk.

    Args:
        filepath: Path to a checkpoint file readable by ``torch.load``.
        device: Passed to ``torch.load`` as ``map_location``.

    Returns:
        The object stored in the checkpoint file, exactly as returned by
        ``torch.load``.

    Raises:
        FileNotFoundError: If ``filepath`` is not an existing regular file.
    """
    print(filepath)
    # Raise explicitly rather than `assert`: assertions are stripped when Python
    # runs with -O, and a bare AssertionError does not say which path failed.
    if not os.path.isfile(filepath):
        raise FileNotFoundError(
            "Checkpoint file not found: '{}'".format(filepath)
        )
    print("Loading '{}'".format(filepath))
    # NOTE(security): torch.load unpickles the file contents; only load
    # checkpoints obtained from a trusted source.
    checkpoint_dict = torch.load(filepath, map_location=device)
    print("Complete.")
    return checkpoint_dict


# Wrap the parsed config so its entries can be read as attributes (h.seed below).
h = AttrDict(json_config)
# Seed PyTorch's RNG from the config before the generator is constructed.
torch.manual_seed(h.seed)
generator = Generator(h).to(device)
# The generator weights are read from the checkpoint's 'generator' entry.
state_dict_g = load_checkpoint(hifi_checkpoint_file, device)
generator.load_state_dict(state_dict_g['generator'])
# Inference only: switch to eval mode, then strip weight normalisation.
# NOTE(review): remove_weight_norm() runs after load_state_dict(), presumably
# because the checkpoint holds weight-normalised parameters; keep this order
# unless confirmed otherwise.
generator.eval()
generator.remove_weight_norm()

generator_v1_T2
Loading 'generator_v1_T2'
Complete.
Removing weight norm...


In [11]:
# Vocode the mel spectrogram with the HiFi-GAN generator.
# .to(device) follows the device the generator was moved to instead of
# hard-coding CUDA; unsqueeze(0) adds a leading dimension (assumed to be the
# batch axis expected by the generator; confirm against hifigan.models).
mel_output = mel.unsqueeze(0).to(device).float()
# No gradients are needed for playback, so skip autograd bookkeeping.
with torch.no_grad():
    audio = generator(mel_output)
# NOTE(review): 22050 is the sample rate the audio2mel section says inputs are
# resampled to; confirm it also matches the HiFi-GAN config.
ipd.display(ipd.Audio(audio[0].detach().cpu().numpy(), rate=22050))