Clone the repo from GitHub and fetch its submodules

In [None]:
# Fetch the Tacotron 2 fork used by this notebook.
!git clone https://github.com/Ryan-Rudes/tacotron2
# Every later cell uses paths relative to the repository root, so work from there.
%cd tacotron2
# Register and check out the repository's git submodules.
# NOTE(review): presumably this is what provides the waveglow/ directory added to sys.path later - confirm.
!git submodule init; git submodule update

Create the training and validation file lists from the dataset metadata (90/10 split)

In [None]:
import pandas as pd

# Dataset inputs and the file lists produced by this cell.
# NOTE(review): the file lists are presumably the ones train.py reads via hparams - confirm.
METADATA_CSV = "../../input/johnoliver/metadata.csv"
WAV_PATH_TEMPLATE = '../../input/johnoliver/wav/%05d.wav'
TRAIN_FILELIST = 'filelists/audio_text_train_filelist.txt'
VAL_FILELIST = 'filelists/audio_text_test_filelist.txt'

# Positions (not names) of the columns used from metadata.csv: the numeric clip
# id that names the wav file, and the transcript text.
CLIP_ID_COLUMN = 0
TEXT_COLUMN = 4


def write_filelist(rows, path):
    """Write one ``<wav path>|<transcript>`` line to ``path`` per row of ``rows``.

    Args:
        rows: DataFrame whose column ``CLIP_ID_COLUMN`` holds the numeric clip
            id and whose column ``TEXT_COLUMN`` holds the transcript string.
        path: Destination text file; it is overwritten.
    """
    # Write UTF-8 explicitly so transcripts with non-ASCII characters do not
    # depend on the platform's default encoding.
    with open(path, 'w', encoding='utf-8') as f:
        # Plain positional tuples: unlike unpacking an iterrows() row into a
        # fixed number of names, this keeps working if columns are appended.
        for row in rows.itertuples(index=False, name=None):
            filepath = WAV_PATH_TEMPLATE % row[CLIP_ID_COLUMN]
            f.write(filepath + '|' + row[TEXT_COLUMN] + '\n')


metadata = pd.read_csv(METADATA_CSV)
# Keep only the rows whose boolean 'include' column is set.
metadata = metadata[metadata['include']]

# The first 90% of the remaining rows (in file order) train; the rest validate.
total = len(metadata)
split = 0.9
train = int(total * split)

train_metadata = metadata[:train]
val_metadata = metadata[train:]

write_filelist(train_metadata, TRAIN_FILELIST)
write_filelist(val_metadata, VAL_FILELIST)

Install dependencies

In [None]:
!pip install tensorflow==1.15
!pip install unidecode
!pip install inflect

Download pretrained model from Google Drive for transfer learning

In [None]:
!pip install gdown
!gdown https://drive.google.com/uc?id=1c5ZTuT7J08wLUoVZ2KkUs_VdZuJ86ZqA

Train the model for 1000 epochs with a batch size of 32

In [None]:
# Start training from the checkpoint passed with -c (presumably the file fetched by
# the gdown cell above - confirm the downloaded filename). Checkpoints go to outdir/
# (the inference section loads outdir/checkpoint_<step>) and logs go to logdir/.
# NOTE(review): --warm_start presumably restores only the model weights rather than
# resuming the full training state - confirm against train.py.
!python train.py --output_directory=outdir --log_directory=logdir -c tacotron2_statedict.pt --warm_start

## Inference
### Synthesize generated audio samples from text

In [None]:
from multiprocessing import Pool
import matplotlib.pylab as plt
%matplotlib inline
from tqdm.notebook import tqdm
import IPython.display as ipd
from time import time, sleep
import scipy.io.wavfile
import numpy as np
import matplotlib
import torch
import sys

# Make modules that live inside the waveglow/ directory importable by bare name.
# NOTE(review): presumably `denoiser` (imported below) lives there - confirm.
sys.path.append('waveglow/')

from audio_processing import griffin_lim
from layers import TacotronSTFT, STFT
from hparams import create_hparams
from text import text_to_sequence
from denoiser import Denoiser
from train import load_model
from model import Tacotron2

In [None]:
def plot_data(data, figsize=(16, 4)):
    """Show each 2-D array in ``data`` as an image, side by side in one row.

    Args:
        data: Sequence of 2-D arrays (anything ``Axes.imshow`` accepts).
        figsize: Size of the whole figure in inches, as ``(width, height)``.
    """
    # squeeze=False makes `axes` a 2-D array even for a single panel; without it
    # plt.subplots(1, 1) returns a bare Axes and indexing it fails.
    fig, axes = plt.subplots(1, len(data), figsize=figsize, squeeze=False)
    for ax, image in zip(axes[0], data):
        # origin='lower' puts row 0 of the array at the bottom of the plot.
        ax.imshow(image, aspect='auto', origin='lower', interpolation='none')

In [None]:
# Start from the repository's default hyperparameters.
hparams = create_hparams()
# Override the sampling rate; audio playback later in the notebook uses the same rate.
# NOTE(review): confirm that 22000 Hz matches the sampling rate of the training wavs.
hparams.sampling_rate = 22000

In [None]:
# Ask which training step to restore; checkpoint files are named
# outdir/checkpoint_<step>.
checkpoint = int(input("Enter steps at latest checkpoint: "))
checkpoint_path = f"outdir/checkpoint_{checkpoint}"

# Build the network from the hyperparameters, then restore the saved weights.
# The loaded checkpoint is not bound to a name, so it can be freed right away.
model = load_model(hparams)
model.load_state_dict(torch.load(checkpoint_path)['state_dict'])

# Move to the GPU, switch to evaluation mode and cast to half precision.
# Each call modifies the module in place and returns it; assigning the result
# to `_` only keeps the cell from echoing the module's repr.
_ = model.cuda()
_ = model.eval()
_ = model.half()

In [None]:
# Download a file from Google Drive into the current directory.
# NOTE(review): presumably the WaveGlow checkpoint that the next cell loads
# (waveglow_256channels_universal_v5.pt) - confirm the downloaded filename.
!gdown https://drive.google.com/uc?id=1rpK8CzAAirq9sWZhe9nlfvxMF1dRgFbF

In [None]:
# Load the WaveGlow vocoder stored under the 'model' key of the checkpoint.
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
waveglow_path = 'waveglow_256channels_universal_v5.pt'
waveglow = torch.load(waveglow_path)['model']

# GPU, evaluation mode, half precision; each call modifies the module in place.
waveglow.cuda()
waveglow.eval()
waveglow.half()

# Cast every module in `convinv` back to float32 after the half() cast above.
for invertible_conv in waveglow.convinv:
    invertible_conv.float()

# Denoiser built from the loaded vocoder; used when synthesizing audio.
denoiser = Denoiser(waveglow)

In [None]:
def synthesize(word, n=5, cleaners=['english_cleaners'], sigma=0.666, strength=0.01):
    """Synthesize speech for ``word`` using the notebook's global models.

    Uses the module-level ``model`` to predict mel spectrograms, ``waveglow``
    to turn them into audio, and ``denoiser`` to post-process that audio.

    Args:
        word: Text to synthesize.
        n: Unused; kept so existing calls that pass it keep working.
        cleaners: Cleaner names forwarded to ``text_to_sequence``. The default
            list is shared between calls and is never mutated here.
        sigma: Forwarded to ``waveglow.infer``.
        strength: Forwarded to ``denoiser``.

    Returns:
        Tuple ``(audio, mel_outputs, mel_outputs_postnet, alignments)``.
        ``audio`` is the denoised audio of the first batch item as a Python
        list of floats. The other three are NumPy arrays holding the first
        batch item of the corresponding ``model.inference`` output.
    """
    # Encode the text and add a leading batch dimension of size 1.
    sequence = np.array(text_to_sequence(word, cleaners))[None, :]
    sequence = torch.from_numpy(sequence).cuda().long()

    # Run the whole pipeline without autograd bookkeeping, including the
    # vocoder and the denoiser.
    with torch.no_grad():
        mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
        audio = waveglow.infer(mel_outputs_postnet, sigma=sigma)
        audio_denoised = denoiser(audio, strength=strength)[:, 0]

    # Return the denoised audio so `strength` actually takes effect; previously
    # it was computed and then discarded in favour of the raw vocoder output.
    audio = audio_denoised[0].data.cpu().numpy().tolist()
    mel_outputs = mel_outputs.float().data.cpu().numpy()[0]
    mel_outputs_postnet = mel_outputs_postnet.float().data.cpu().numpy()[0]
    alignments = alignments.float().data.cpu().numpy()[0]
    return audio, mel_outputs, mel_outputs_postnet, alignments

In [None]:
# Sentences synthesized in the demo below: news-style headlines, tongue
# twisters and a few deliberately hard-to-pronounce phrases.
tests = [
    "Scientists at the CERN laboratory say they have discovered a new particle.",
    "The state of Florida reports a surge in coronavirus deaths as restrictions are upended.",
    "How much wood would a woodchuck chuck if a woodchuck could chuck wood?",
    "A woodchuck would chuck all the wood it could chuck if a woodchuck could chuck wood.",
    "Peter Piper picked a peck of pickled peppers. How many pickled peppers did Peter Piper pick?",
    "Sally sells seashells by the seashore. The shells she sells are seashells I'm sure.",
    "The blue lagoon is a nineteen eighty American romance adventure film.",
    "George Washington was the first President of the United States.",
    "Basilar membrane and otolaryngology are not auto-correlations.",
    "Biden holds first foreign meeting with Canada's Justin Trudeau.",
]

In [None]:
# Synthesize every test sentence and show it as a heading followed by an audio player.
for text in tests:
    # NOTE: `n` is accepted by `synthesize` but does not affect its output.
    audio, mel_outputs, mel_outputs_postnet, alignments = synthesize(text, n=15)
    ipd.display_html(ipd.HTML(f"""
    <h3>{text}</h3>
    <br/>
    """))
    # Play back at the sampling rate configured on `hparams` rather than
    # repeating the literal, so the two values cannot drift apart.
    ipd.display(ipd.Audio(data=audio, rate=hparams.sampling_rate, autoplay=False))