# Matcha-TTS: A fast TTS architecture with conditional flow matching
---
[Shivam Mehta](https://www.kth.se/profile/smehta), [Ruibo Tu](https://www.kth.se/profile/ruibo), [Jonas Beskow](https://www.kth.se/profile/beskow), [Éva Székely](https://www.kth.se/profile/szekely), and [Gustav Eje Henter](https://people.kth.se/~ghe/)

We introduce Matcha-TTS, a new encoder-decoder architecture for speedy TTS acoustic modelling, trained using optimal-transport conditional flow matching (OT-CFM). This yields an ODE-based decoder capable of high output quality in fewer synthesis steps than models trained using score matching. Careful design choices additionally ensure each synthesis step is fast to run. The method is probabilistic, non-autoregressive, and learns to speak from scratch without external alignments. Compared to strong pre-trained baseline models, the Matcha-TTS system has the smallest memory footprint, rivals the speed of the fastest models on long utterances, and attains the highest mean opinion score in a listening test.

Demo Page: https://shivammehta25.github.io/Matcha-TTS \
Code: https://github.com/shivammehta25/Matcha-TTS




In [56]:
# Set the CUDA_VISIBLE_DEVICES environment variable to "0" for this kernel.
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_VISIBLE_DEVICES=0


In [57]:
# Adjust the model's optimizer function.
# I think it may live in the models file, or perhaps somewhere in the decoder.
# TODO: the original note stops here with an unfinished "If ..." sentence.
import datetime as dt
from pathlib import Path

import IPython.display as ipd
import numpy as np
import soundfile as sf
import torch
from tqdm.auto import tqdm

# Hifigan imports
from matcha.hifigan.config import v1
from matcha.hifigan.denoiser import Denoiser
from matcha.hifigan.env import AttrDict
from matcha.hifigan.models import Generator as HiFiGAN
# Matcha imports
from phonemizer.backend.espeak.wrapper import EspeakWrapper
_ESPEAK_LIBRARY = r'/opt/homebrew/Cellar/espeak-ng/1.52.0/lib/libespeak-ng.dylib'
EspeakWrapper.set_library(_ESPEAK_LIBRARY)
from matcha.models.matcha_tts import MatchaTTS
from matcha.text import sequence_to_text, text_to_sequence
from matcha.utils.model import denormalize
from matcha.utils.utils import get_user_data_dir, intersperse
from jiwer import wer
import speech_recognition as sr
from scipy.io.wavfile import write
import io

In [58]:
# Load autoreload in mode 2 so that code changes are reflected in the notebook
# in real time, with no need to restart the kernel.
%load_ext autoreload
%autoreload 2
# Show matplotlib figures inline in the cell output.
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [59]:
# Use the CUDA device when torch reports one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Filepaths

In [60]:
# Checkpoint files loaded by load_model / load_vocoder below.
# NOTE(review): both are absolute paths on one particular machine; change them
# before running this notebook anywhere else.
MATCHA_CHECKPOINT = "/Users/vudinhthi2304/Desktop/Matcha_TTS_Modification/matcha_ljspeech.ckpt"
HIFIGAN_CHECKPOINT = "/Users/vudinhthi2304/Desktop/Matcha_TTS_Modification/generator_v1"
# Folder (relative path) that save_to_folder writes the synthesised files into.
OUTPUT_FOLDER = "synth_output"

## Load Matcha-TTS

In [61]:
def load_model(checkpoint_path):
    """Restore a MatchaTTS model from ``checkpoint_path`` and return it in eval mode.

    The checkpoint is loaded with ``map_location`` set to the notebook-level
    ``device``.
    """
    matcha = MatchaTTS.load_from_checkpoint(checkpoint_path, map_location=device)
    matcha.eval()
    return matcha
def count_params(x):
    """Return the summed ``numel()`` of ``x.parameters()`` as a comma-grouped string."""
    total = sum(p.numel() for p in x.parameters())
    return f"{total:,}"


# Load the Matcha-TTS checkpoint and print its total parameter count.
model = load_model(MATCHA_CHECKPOINT)
print(f"Model loaded! Parameter count: {count_params(model)}")

Model loaded! Parameter count: 18,204,193


## Load HiFi-GAN (Vocoder)

In [62]:
def load_vocoder(checkpoint_path):
    """Build a HiFi-GAN generator from the ``v1`` config and load its weights.

    The weights come from the ``'generator'`` entry of the checkpoint at
    ``checkpoint_path``. The generator is placed on the notebook-level
    ``device``, switched to eval mode and has its weight norm removed before
    it is returned.
    """
    hparams = AttrDict(v1)
    generator = HiFiGAN(hparams).to(device)
    checkpoint = torch.load(checkpoint_path, map_location=device)
    generator.load_state_dict(checkpoint['generator'])
    generator.eval()
    generator.remove_weight_norm()
    return generator

# Instantiate the HiFi-GAN vocoder and a Denoiser built on top of it
# (the denoiser is used by to_waveform).
vocoder = load_vocoder(HIFIGAN_CHECKPOINT)
denoiser = Denoiser(vocoder, mode='zeros')

Removing weight norm...


### Helper functions to synthesise

In [63]:
@torch.inference_mode()
def process_text(text: str):
    """Convert raw ``text`` into the tensors that ``synthesise`` feeds to the model.

    Returns a dict with:
        x_orig: the input text, unchanged.
        x: long tensor of symbol ids (after ``intersperse(..., 0)``) with a
            leading batch axis, on the notebook-level ``device``.
        x_lengths: one-element long tensor holding the last-axis size of ``x``.
        x_phones: the ids of ``x`` mapped back to text by ``sequence_to_text``.
    """
    symbol_ids = intersperse(text_to_sequence(text, ['english_cleaners2'])[0], 0)
    x = torch.tensor(symbol_ids, dtype=torch.long, device=device)[None]
    x_lengths = torch.tensor([x.shape[-1]], dtype=torch.long, device=device)
    return dict(
        x_orig=text,
        x=x,
        x_lengths=x_lengths,
        x_phones=sequence_to_text(x.squeeze(0).tolist()),
    )


@torch.inference_mode()
def synthesise(text, spks=None):
    """Run ``model.synthesise`` on ``text`` and return its output plus text metadata.

    ``n_timesteps``, ``temperature`` and ``length_scale`` are read from the
    notebook namespace at call time. The returned dict is the model output
    extended with ``start_t`` (a timestamp taken after text processing and
    before the model call) and every key produced by ``process_text``.
    """
    text_processed = process_text(text)
    start_t = dt.datetime.now()
    result = model.synthesise(
        text_processed['x'],
        text_processed['x_lengths'],
        n_timesteps=n_timesteps,
        temperature=temperature,
        spks=spks,
        length_scale=length_scale,
    )
    # Fold the timestamp and the processed-text fields into the model output.
    result.update(start_t=start_t, **text_processed)
    return result

@torch.inference_mode()
def to_waveform(mel, vocoder):
    """Vocode ``mel`` with ``vocoder`` and return the waveform tensor on the CPU.

    The vocoder output is clamped to [-1, 1], run through the notebook-level
    ``denoiser`` with strength 0.00025, then moved to the CPU and squeezed.
    """
    raw_audio = vocoder(mel).clamp(-1, 1)
    denoised = denoiser(raw_audio.squeeze(0), strength=0.00025)
    return denoised.cpu().squeeze()
    
def save_to_folder(filename: str, output: dict, folder: str):
    """Save ``output['mel']`` and ``output['waveform']`` inside ``folder``.

    ``folder`` (including missing parents) is created when it does not exist.
    The mel is moved to the CPU and written with ``np.save`` under
    ``filename``. The waveform is written to ``<filename>.wav`` at 22050 Hz
    with the 'PCM_24' subtype.
    """
    out_dir = Path(folder)
    out_dir.mkdir(parents=True, exist_ok=True)
    mel_array = output['mel'].cpu().numpy()
    np.save(out_dir / f'{filename}', mel_array)
    sf.write(out_dir / f'{filename}.wav', output['waveform'], 22050, 'PCM_24')

# Function to convert audio to text
def audio_to_text(audio_data):
    """Transcribe a waveform tensor via ``Recognizer.recognize_google``.

    The samples are scaled by 32767 and cast to int16, written as a 22050 Hz
    WAV into an in-memory buffer (no file is saved), and read back through
    ``sr.AudioFile`` for recognition. Returns the recognised text.
    """
    # Convert to 16-bit PCM format
    pcm16 = (audio_data.numpy() * 32767).astype(np.int16)

    # Keep the WAV in memory and rewind it so the reader starts at the header.
    wav_bytes = io.BytesIO()
    write(wav_bytes, 22050, pcm16)
    wav_bytes.seek(0)

    recognizer = sr.Recognizer()
    with sr.AudioFile(wav_bytes) as source:
        recorded = recognizer.record(source)
        transcript = recognizer.recognize_google(recorded)

    return transcript


## Setup text to synthesise

In [64]:
# Input utterances; the synthesis loop iterates over this list one entry at a time.
texts = [
    "I only half heard what the sheriff said. My attention was fully occupied by the dogs down in Ten Mile Valley below us. I couldn't see them but I could hear them bugling down there in the cedar thickets. Baying slowly and mournfully, they were searching for the lost trail. A creek ran down the middle of the valley. Probably Tommy Sonofagun had crossed the creek and thrown the bloodhounds off his trail. Tommy might be a moron but he had enough animal cunning to lose a pack of hounds that were after him."
]

### Hyperparameters

In [65]:
## Number of ODE solver steps (forwarded to model.synthesise as n_timesteps)
n_timesteps = 10

## Speaking-rate control (forwarded to model.synthesise as length_scale)
length_scale=1.0

## Sampling temperature (forwarded to model.synthesise as temperature)
temperature = 0.667

## Synthesis

In [66]:
# Synthesise every entry of `texts`, time it, transcribe the result and score
# the transcript against the input text with jiwer's `wer`.
outputs, rtfs = [], []
rtfs_w = []
wers = []
for i, text in enumerate(tqdm(texts)):
    # A speaker can be selected by passing `spks` as the second argument, e.g.
    # torch.tensor([15], device=device, dtype=torch.long).unsqueeze(0)
    output = synthesise(text)
    output['waveform'] = to_waveform(output['mel'], vocoder)

    # Compute Real Time Factor (RTF) with HiFi-GAN: elapsed seconds since
    # `start_t` divided by the audio duration (samples / 22050).
    t = (dt.datetime.now() - output['start_t']).total_seconds()
    rtf_w = t * 22050 / (output['waveform'].shape[-1])

    hypothesis = audio_to_text(output['waveform'])

    # Store the score under its own name. Assigning it to `wer` would rebind
    # the imported jiwer function to a float, so the next iteration (or a
    # re-run of this cell) would fail with "'float' object is not callable".
    # NOTE(review): reference and hypothesis are compared as-is, with no
    # case/punctuation normalisation; confirm that is the intended metric.
    wer_score = wer(text, hypothesis)

    ## Pretty print
    print(f"{'*' * 53}")
    print(f"Input text - {i}")
    print(f"{'-' * 53}")
    print(output['x_orig'])
    print(f"{'*' * 53}")
    print(f"Phonetised text - {i}")
    print(f"{'-' * 53}")
    print(output['x_phones'])
    print(f"{'*' * 53}")
    print(f"RTF:\t\t{output['rtf']:.6f}")
    print(f"RTF Waveform:\t{rtf_w:.6f}")
    print(f"{'*' * 53}")
    print(f"Word Error Rate:\t{wer_score:.6f}")
    rtfs.append(output['rtf'])
    rtfs_w.append(rtf_w)
    wers.append(wer_score)

    ## Display the synthesised waveform
    ipd.display(ipd.Audio(output['waveform'], rate=22050))

    ## Save the generated waveform
    save_to_folder(i, output, OUTPUT_FOLDER)

# Aggregate statistics over all synthesised utterances.
print(f"Number of ODE steps: {n_timesteps}")
print(f"Mean RTF:\t\t\t\t{np.mean(rtfs):.6f} ± {np.std(rtfs):.6f}")
print(f"Mean RTF Waveform (incl. vocoder):\t{np.mean(rtfs_w):.6f} ± {np.std(rtfs_w):.6f}")
print(f"Mean WER:\t\t\t\t{np.mean(wers):.6f} ± {np.std(wers):.6f}")

  0%|          | 0/1 [00:00<?, ?it/s]

*****************************************************
Input text - 0
-----------------------------------------------------
I only half heard what the sheriff said. My attention was fully occupied by the dogs down in Ten Mile Valley below us. I couldn't see them but I could hear them bugling down there in the cedar thickets. Baying slowly and mournfully, they were searching for the lost trail. A creek ran down the middle of the valley. Probably Tommy Sonofagun had crossed the creek and thrown the bloodhounds off his trail. Tommy might be a moron but he had enough animal cunning to lose a pack of hounds that were after him.
*****************************************************
Phonetised text - 0
-----------------------------------------------------
_ˈ_a_ɪ_ _ˈ_o_ʊ_n_l_i_ _h_ˈ_æ_f_ _h_ˈ_ɜ_ː_d_ _w_ʌ_t_ _ð_ə_ _ʃ_ˈ_ɛ_ɹ_ɪ_f_ _s_ˈ_ɛ_d_._ _m_a_ɪ_ _ɐ_t_ˈ_ɛ_n_ʃ_ə_n_ _w_ʌ_z_ _f_ˈ_ʊ_l_i_ _ˈ_ɑ_ː_k_j_ʊ_p_ˌ_a_ɪ_d_ _b_a_ɪ_ _ð_ə_ _d_ˈ_ɑ_ː_ɡ_z_ _d_ˌ_a_ʊ_n_ _ɪ_n_ _t_ˈ_ɛ_n_ _m_ˈ_a_ɪ_l_ _v_ˈ_æ_l_i_ _b_ᵻ_l_ˈ

Number of ODE steps: 10
Mean RTF:				0.076301 ± 0.000000
Mean RTF Waveform (incl. vocoder):	0.367887 ± 0.000000
Mean WER:				0.287234 ± 0.000000
