In [1]:
%%capture
!git clone https://github.com/huawei-noah/Speech-Backbones.git
%cd Speech-Backbones/DiffVC/
!pip install -q webrtcvad

In [3]:
!gdown 1qpgI41wNXFcH-iKq1Y42JlBC9j0je8PW
!mv /content/Speech-Backbones/DiffVC/g_02500000 /content/Speech-Backbones/DiffVC/checkpts/vocoder
!mv /content/Speech-Backbones/DiffVC/checkpts/vocoder/g_02500000 /content/Speech-Backbones/DiffVC/checkpts/vocoder/generator

Downloading...
From: https://drive.google.com/uc?id=1qpgI41wNXFcH-iKq1Y42JlBC9j0je8PW
To: /content/Speech-Backbones/DiffVC/g_02500000
100% 55.8M/55.8M [00:01<00:00, 30.5MB/s]
mv: cannot stat '/content/g_02500000': No such file or directory
mv: cannot stat '/content/Speech-Backbones/DiffVC/checkpts/vocoder/g_02500000': No such file or directory


In [2]:
!gdown 18Xbme0CTVo58p2vOHoTQm8PBGW7oEjAy
!mv vc_libritts_wodyn.pt /content/Speech-Backbones/DiffVC/checkpts/vc/
!gdown 12s9RPmwp9suleMkBCVetD8pub7wsDAy4
!mv vc_vctk_wodyn.pt /content/Speech-Backbones/DiffVC/checkpts/vc/

Downloading...
From (original): https://drive.google.com/uc?id=18Xbme0CTVo58p2vOHoTQm8PBGW7oEjAy
From (redirected): https://drive.google.com/uc?id=18Xbme0CTVo58p2vOHoTQm8PBGW7oEjAy&confirm=t&uuid=be596097-1f14-4d60-92a1-9b5ba1900ed8
To: /content/Speech-Backbones/DiffVC/vc_libritts_wodyn.pt
100% 505M/505M [00:05<00:00, 86.7MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=12s9RPmwp9suleMkBCVetD8pub7wsDAy4
From (redirected): https://drive.google.com/uc?id=12s9RPmwp9suleMkBCVetD8pub7wsDAy4&confirm=t&uuid=3df4462c-5f5b-48f2-863a-1fa60af54bae
To: /content/Speech-Backbones/DiffVC/vc_vctk_wodyn.pt
100% 505M/505M [00:08<00:00, 57.7MB/s]


In [4]:
%load_ext autoreload
%autoreload 2
import argparse
import json
import os
import numpy as np
import IPython.display as ipd
from tqdm import tqdm
from scipy.io.wavfile import write

import torch
# Later cells branch on this flag to place models and tensors on the GPU.
use_gpu = torch.cuda.is_available()

import librosa
from librosa.core import load
from librosa.filters import mel as librosa_mel_fn
# Mel filterbank (80 bands, 0-8000 Hz) for 22050 Hz audio and a 1024-point FFT;
# get_mel multiplies STFT magnitudes by this matrix.
mel_basis = librosa_mel_fn(sr=22050, n_fft=1024, n_mels=80, fmin=0, fmax=8000)

# NOTE(review): `params` and `model` are presumably the DiffVC modules found in
# the current working directory - confirm the setup cell's %cd has been run.
import params
from model import DiffVC

import sys
# Extend the import path so that `env` and `models` below can be resolved
# (presumably from the bundled hifi-gan/ directory - verify).
sys.path.append('hifi-gan/')
from env import AttrDict
from models import Generator as HiFiGAN

# Extend the import path so that the `encoder` package below can be resolved
# (presumably from the bundled speaker_encoder/ directory - verify).
sys.path.append('speaker_encoder/')
from encoder import inference as spk_encoder
from pathlib import Path

In [5]:
def get_mel(wav_path):
    """Compute the log-mel spectrogram of an audio file.

    The audio is loaded at 22050 Hz, trimmed to a whole number of 256-sample
    hops, reflect-padded by 384 samples on each side and analysed with a
    1024-point Hann STFT (no centering). Magnitudes are projected through the
    module-level ``mel_basis`` and converted to natural-log scale with a floor
    of 1e-5.

    Args:
        wav_path: Path of the audio file to analyse.

    Returns:
        numpy array holding the log-mel spectrogram, one row per mel band of
        ``mel_basis`` and one column per STFT frame.
    """
    sample_rate, fft_size, hop = 22050, 1024, 256
    samples, _ = load(wav_path, sr=sample_rate)
    # Drop trailing samples so the signal length is a multiple of the hop size.
    usable = (samples.shape[0] // hop) * hop
    # 384 == (1024 - 256) / 2: manual padding used together with center=False.
    samples = np.pad(samples[:usable], 384, mode='reflect')
    spectrum = librosa.core.stft(samples, n_fft=fft_size, hop_length=hop,
                                 win_length=fft_size, window='hann', center=False)
    # The small constant keeps the square root away from exact zero.
    magnitude = np.sqrt(np.real(spectrum) ** 2 + np.imag(spectrum) ** 2 + (1e-9))
    mel = np.matmul(mel_basis, magnitude)
    return np.log(np.clip(mel, a_min=1e-5, a_max=None))

def get_embed(wav_path):
    """Return the speaker embedding of the utterance stored at ``wav_path``.

    The file is preprocessed with ``spk_encoder.preprocess_wav`` and the result
    is passed to ``spk_encoder.embed_utterance``; its output is returned as is.
    """
    return spk_encoder.embed_utterance(spk_encoder.preprocess_wav(wav_path))

def noise_median_smoothing(x, w=5):
    """Cap every element of a 1-D array by the median of its neighbourhood.

    For each position ``i`` the median of the ``2 * w + 1`` values centred on
    ``x[i]`` is computed (the array is edge-padded so border elements have a
    full window) and the output is ``min(x[i], median)``. Isolated peaks are
    therefore pulled down to the local median while other values are kept.

    Args:
        x: 1-D numpy array to smooth. It is not modified.
        w: Half-width of the median window; ``w=0`` returns a plain copy.

    Returns:
        A new array with the same shape and dtype as ``x``.
    """
    y = np.copy(x)
    padded = np.pad(x, w, "edge")
    for i in range(y.shape[0]):
        med = np.median(padded[i:i + 2 * w + 1])
        # After padding, the original x[i] sits at padded[i + w], the centre of
        # the window above. (Indexing i + w + 1 would compare against the next
        # element and shift the result by one position.)
        y[i] = min(padded[i + w], med)
    return y

def mel_spectral_subtraction(mel_synth, mel_source, spectral_floor=0.02, silence_window=5, smoothing_window=5):
    """Denoise a log-mel spectrogram by spectral subtraction.

    The quietest run of ``silence_window`` consecutive frames is located in
    ``mel_source``. The per-band minimum energy of ``mel_synth`` over the same
    frames is taken as the noise estimate, optionally smoothed across bands,
    and subtracted from the energy of every frame of ``mel_synth``.

    Args:
        mel_synth: 2-D array (bands x frames) of log-magnitude mel values to
            denoise. It is not modified.
        mel_source: 2-D array (bands x frames) of log-magnitude mel values
            used only to locate the quietest window.
        spectral_floor: Fraction of the noise energy used as the lower bound
            of the denoised energy, so the logarithm never sees values <= 0.
        silence_window: Number of consecutive frames in the silence window.
        smoothing_window: Half-width passed to ``noise_median_smoothing`` for
            smoothing the noise estimate, or ``None`` to skip smoothing.

    Returns:
        Array with the shape and dtype of ``mel_synth`` holding the denoised
        log-magnitude mel spectrogram.
    """
    mel_len = mel_source.shape[-1]
    # Start from +inf so the quietest window is always found, whatever the
    # absolute energy scale of the input is.
    energy_min = np.inf
    i_min = 0
    # "+ 1" so that the window ending on the very last frame is examined too.
    for i in range(mel_len - silence_window + 1):
        energy_cur = np.sum(np.exp(2.0 * mel_source[:, i:i + silence_window]))
        if energy_cur < energy_min:
            i_min = i
            energy_min = energy_cur
    # exp(2 * log_magnitude) converts log-magnitudes to energies.
    estimated_noise_energy = np.min(np.exp(2.0 * mel_synth[:, i_min:i_min + silence_window]), axis=-1)
    if smoothing_window is not None:
        estimated_noise_energy = noise_median_smoothing(estimated_noise_energy, smoothing_window)
    # Subtract the per-band noise estimate from all frames at once.
    noise_column = estimated_noise_energy[:, None]
    signal_subtract_noise = np.exp(2.0 * mel_synth) - noise_column
    estimated_signal_energy = np.maximum(signal_subtract_noise, spectral_floor * noise_column)
    # Write into a copy so the result keeps the dtype of mel_synth.
    mel_denoised = np.copy(mel_synth)
    mel_denoised[...] = np.log(np.sqrt(estimated_signal_energy))
    return mel_denoised

In [6]:
# Build the DiffVC voice conversion network and restore its pretrained weights.
vc_path = 'checkpts/vc/vc_libritts_wodyn.pt'  # checkpoint of the voice conversion model

generator = DiffVC(
    params.n_mels, params.channels, params.filters, params.heads,
    params.layers, params.kernel, params.dropout, params.window_size,
    params.enc_dim, params.spk_dim, params.use_ref_t, params.dec_dim,
    params.beta_min, params.beta_max,
)
# Without a GPU the checkpoint tensors have to be remapped to the CPU.
vc_load_kwargs = {} if use_gpu else {'map_location': 'cpu'}
if use_gpu:
    generator = generator.cuda()
generator.load_state_dict(torch.load(vc_path, **vc_load_kwargs))
generator.eval()

print(f'Number of parameters: {generator.nparams}')

Number of parameters: 126259128


In [10]:
# Build the HiFi-GAN vocoder from its JSON config and restore its weights.
hfg_path = 'checkpts/vocoder/'  # directory holding config.json and the generator checkpoint

with open(hfg_path + 'config.json') as f:
    h = AttrDict(json.load(f))

hifigan_universal = HiFiGAN(h)
# Without a GPU the checkpoint tensors have to be remapped to the CPU.
hfg_load_kwargs = {} if use_gpu else {'map_location': 'cpu'}
if use_gpu:
    hifigan_universal = hifigan_universal.cuda()
# The checkpoint is a dict; the weights are stored under its 'generator' key.
hifigan_universal.load_state_dict(torch.load(hfg_path + 'generator', **hfg_load_kwargs)['generator'])

_ = hifigan_universal.eval()
hifigan_universal.remove_weight_norm()



Removing weight norm...


In [11]:
# Load the pretrained speaker encoder on the GPU when one is available.
enc_model_fpath = Path('checkpts/spk_encoder/pretrained.pt')  # speaker encoder checkpoint
spk_encoder.load_model(enc_model_fpath, device="cuda" if use_gpu else "cpu")

Loaded encoder "pretrained.pt" trained to step 1564501


In [34]:
# loading source and reference wavs, calculating mel-spectrograms and speaker embeddings
# src_path = 'example/Male_English_P3_2.wav' # path to source utterance
# tgt_path = 'example/Female_English_P2_1.wav' # path to reference utterance

def convert(src_path, tgt_path):
    """Convert the voice of a source utterance to that of a reference speaker.

    Mel-spectrograms of both files and the speaker embedding of the reference
    are computed, the DiffVC ``generator`` is run for 30 reverse-diffusion
    steps in 'ml' mode, and the generated mel-spectrogram is denoised with
    ``mel_spectral_subtraction``.

    Args:
        src_path: Path of the source utterance (the speech to be converted).
        tgt_path: Path of the reference utterance (the voice to imitate).

    Returns:
        Float tensor with a leading batch dimension of 1 holding the denoised
        converted log-mel spectrogram, on the GPU when ``use_gpu`` is true.
    """
    device = torch.device('cuda' if use_gpu else 'cpu')

    mel_source = torch.from_numpy(get_mel(src_path)).float().unsqueeze(0).to(device)
    mel_source_lengths = torch.LongTensor([mel_source.shape[-1]]).to(device)

    mel_target = torch.from_numpy(get_mel(tgt_path)).float().unsqueeze(0).to(device)
    mel_target_lengths = torch.LongTensor([mel_target.shape[-1]]).to(device)

    embed_target = torch.from_numpy(get_embed(tgt_path)).float().unsqueeze(0).to(device)

    # Inference only: do not record an autograd graph for the diffusion steps.
    with torch.no_grad():
        _, mel_ = generator.forward(mel_source, mel_source_lengths, mel_target, mel_target_lengths,
                                    embed_target, n_timesteps=30, mode='ml')

    mel_synth_np = mel_.cpu().detach().squeeze().numpy()
    # The silence window must be located in the *source* mel-spectrogram;
    # the generated one was previously passed for both arguments.
    mel_source_np = mel_source.cpu().detach().squeeze().numpy()
    mel = torch.from_numpy(
        mel_spectral_subtraction(mel_synth_np, mel_source_np, smoothing_window=1)
    ).float().unsqueeze(0)
    return mel.to(device)

import soundfile as sf
def convert_and_save(src_path, tgt_path):
    """Convert ``src_path`` to the voice of ``tgt_path`` and save the audio.

    The converted mel-spectrogram returned by ``convert`` is vocoded with
    ``hifigan_universal`` and written as a 22050 Hz file named
    ``example/C_<source name>_<reference name>.wav``, where each name is the
    input file name without its directory and final extension.

    Args:
        src_path: Path of the source utterance.
        tgt_path: Path of the reference utterance.
    """
    mel = convert(src_path, tgt_path)
    with torch.no_grad():
        audio = hifigan_universal.forward(mel).cpu().squeeze().clamp(-1, 1)
    # Path.stem strips only the final extension, so names that contain extra
    # dots (e.g. "take.2.wav") are no longer cut at the first dot.
    src_name = Path(src_path).stem
    tgt_name = Path(tgt_path).stem
    save_path = f'example/C_{src_name}_{tgt_name}.wav'
    # Hand soundfile a numpy array rather than a torch tensor.
    sf.write(save_path, audio.numpy(), 22050)

In [35]:
# (source utterance, reference utterance) pairs to convert and write to disk.
conversion_pairs = [
    ('example/AliBandari.wav', 'example/OldMan.wav'),
    ('example/Female_Persian_P1_1.wav', 'example/OldMan.wav'),
    ('example/Female_Persian_P1_2.wav', 'example/Whisper.wav'),
    ('example/Ghaderpanah.wav', 'example/Female_Persian_P1_2.wav'),
    ('example/Ghaderpanah.wav', 'example/Whisper.wav'),
    ('example/ZhoPodcast.wav', 'example/OldMan.wav'),
]
for source_wav, reference_wav in conversion_pairs:
    convert_and_save(source_wav, reference_wav)

In [27]:
# (source utterance, reference utterance) pairs to convert and play inline.
demo_pairs = [
    ('example/Male_English_P3_2.wav', 'example/Female_English_P2_1.wav'),
    ('example/Female_English_P2_1.wav', 'example/Female_English_P1_1.wav'),
    ('example/Female_English_P2_1.wav', 'example/Male_English_P2_2.wav'),
    ('example/Male_English_P2_2.wav', 'example/Male_English_P3_2.wav'),
]

# src_path, tgt_path, mel and audio stay defined after the loop and hold the
# values of the last pair.
for src_path, tgt_path in demo_pairs:
    mel = convert(src_path, tgt_path)
    with torch.no_grad():
        audio = hifigan_universal.forward(mel).cpu().squeeze().clamp(-1, 1)
    display(ipd.Audio(audio, rate=22050))

In [22]:
# source utterance (vocoded)
# mel_source only exists inside convert(), so rebuild it here from the most
# recently used src_path; otherwise this cell fails on a fresh kernel.
mel_source = torch.from_numpy(get_mel(src_path)).float().unsqueeze(0)
if use_gpu:
    mel_source = mel_source.cuda()
with torch.no_grad():
    audio = hifigan_universal.forward(mel_source).cpu().squeeze().clamp(-1, 1)
ipd.display(ipd.Audio(audio, rate=22050))

In [23]:
# reference utterance (vocoded)
# mel_target only exists inside convert(), so rebuild it here from the most
# recently used tgt_path; otherwise this cell fails on a fresh kernel.
mel_target = torch.from_numpy(get_mel(tgt_path)).float().unsqueeze(0)
if use_gpu:
    mel_target = mel_target.cuda()
with torch.no_grad():
    audio = hifigan_universal.forward(mel_target).cpu().squeeze().clamp(-1, 1)
ipd.display(ipd.Audio(audio, rate=22050))

In [24]:
# Vocode the most recently converted mel-spectrogram and play it inline.
with torch.no_grad():
    audio = torch.clamp(hifigan_universal.forward(mel).cpu().squeeze(), -1, 1)
ipd.display(ipd.Audio(data=audio, rate=22050))