In [73]:
import glob
import numpy as np
import os
import pandas as pd

In [74]:
# Root folder that holds every audio sample referenced by the demo page.
base_dir = "./audio_files/"

# One sub-folder per demo section; each path keeps its trailing slash because
# later code appends "<dataset>/<file>" by plain string concatenation.
intro_dir = f"{base_dir}Intro/"
ref_dir = f"{base_dir}Reference_Mel/"
first_dir = f"{base_dir}First/"
last_dir = f"{base_dir}Last/"
incpos_dir = f"{base_dir}IncreasingPOS/"
decpos_dir = f"{base_dir}DecreasingPOS/"

# Dataset sub-folders and the emotion labels used as table columns.
datasets = ["Blizzard"]
emos = [
    "Angry",
    "Happy",
    "Sad",
    "Surprise",
]

In [75]:
def ordinal(n: int):
    """Return ``n`` followed by its English ordinal suffix (e.g. ``21`` -> ``"21st"``).

    Numbers whose last two digits are 11, 12 or 13 always take "th";
    otherwise the last digit decides between "st", "nd", "rd" and "th".
    """
    is_teen = 11 <= n % 100 <= 13
    if is_teen:
        ending = 'th'
    else:
        # Only 1, 2 and 3 have a special ending; every other last digit is "th".
        ending = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')
    return f"{n}{ending}"

def GetHTML(indices, mode, title, description, section_list, headers, columns, add_text=False, text_width=4, text_bold=True):
    """Print one ``<div class="section">`` block of the demo page to stdout.

    The block consists of a title, a description paragraph and, for every
    entry of ``section_list``, a table with one row per sample index and one
    audio player per column.

    Args:
        indices: Mapping from a section entry to the sample indices to render
            (looked up as ``indices[sec]``).
        mode: Mode key forwarded to ``get_section_title``, ``get_header``,
            ``get_text`` and ``get_file``.
        title: Text placed inside the ``<h1>`` element.
        description: Text placed inside the ``<p class="body">`` element.
        section_list: Sections to render; a per-section ``<h2>`` heading is
            printed only when there is more than one.
        headers: Keys turned into column headings via ``get_header``.
        columns: Keys passed to ``get_file`` to locate each audio file.
        add_text: When true, each row starts with a text cell from
            ``get_text`` and the header row gets ``text_width`` empty cells.
        text_width: ``colspan`` of the text cell / number of empty header cells.
        text_bold: When true, the row text is wrapped in ``<h4>``.

    Returns:
        None. All markup is written with ``print`` as it is produced.
    """
    def emit(depth, markup):
        # One output line: `depth` leading tabs followed by the markup.
        print("\t" * depth + markup)

    emit(1, '<div class="section">')
    emit(2, '<hr class="class-1" />')
    emit(2, f'<h1>{title}</h1>')
    emit(2, '<hr class="class-1" />')
    emit(2, f'<p class="body">{description}</p>')
    emit(2, '<hr>')

    for sec in section_list:
        # A sub-heading is only needed to tell several sections apart.
        if len(section_list) > 1:
            emit(2, f'<h2>{get_section_title(sec, mode)}</h2>')
            emit(2, '<hr>')

        emit(2, '<div class="table-container">')
        emit(3, '<table>')

        # Header row: blank cells above the text column, then one per header.
        emit(4, '<tr>')
        if add_text:
            for _ in range(text_width):
                emit(5, '<th></th>')
        for head in headers:
            emit(5, f'<th>{get_header(head, mode, sec)}</th>')
        emit(4, '</tr>')

        # The section entry doubles as the dataset key.
        for idx in indices[sec]:
            emit(4, '<tr>')
            if add_text:
                cell_text = get_text(idx, mode)
                if text_bold:
                    cell_text = f'<h4>{cell_text}</h4>'
                emit(5, f'<td colspan="{text_width}">{cell_text}</td>')
            for col in columns:
                emit(4, '<td class="center-align">')
                emit(5, '<div class="audio-container">')
                emit(6, '<audio controls>')
                # Resolved only after the opening tags are written, so a
                # failed lookup leaves the partial markup already printed.
                wav_path = get_file(sec, idx, col, mode, sec)
                emit(7, f'<source src="{wav_path}" type="audio/wav" />')
                emit(7, 'Your browser does not support the audio element.')
                emit(6, '</audio>')
                emit(5, '</div>')
                emit(4, '</td>')
            emit(4, '</tr>')
            print()

        emit(3, '</table>')
        emit(2, '</div>')
        emit(2, '<hr>')
    emit(1, '</div>')
    
def get_file(dataset, idx, col, mode="ref", sec=None):
    """Return the path of the audio file shown in one table cell.

    A glob pattern is built from the directory that belongs to ``mode`` plus
    ``dataset``, ``idx`` and ``col`` (and ``sec`` for the modes that use it);
    the first match in sorted order is returned.

    Args:
        dataset: Dataset sub-folder name. For the "ref" and "ran" modes only
            "ESD" and "Blizzard" are supported.
        idx: Sample index that prefixes the file name.
        col: Column key that ends the file name (before ``.wav``).
        mode: Selects the base directory and the file-name layout.
        sec: Extra file-name component used by "utinc", "utmix" and "pos".

    Returns:
        The matching path as returned by ``glob.glob``.

    Raises:
        ValueError: ``mode`` is unknown, or ``dataset`` is not supported for
            the "ref"/"ran" modes.
        FileNotFoundError: No file matches the generated pattern.

    Note:
        ``ran_dir``, ``utbase_dir``, ``utinc_dir``, ``utmix_dir`` and
        ``pos_dir`` are not defined in the notebook cells next to this
        function; unless they are defined elsewhere, the corresponding modes
        raise ``NameError``.
    """
    plain_name = f"{idx}_{col}.wav"
    section_name = f"{idx}_{sec}_{col}.wav"

    # Directory globals are only touched inside the selected branch, so an
    # undefined directory for an unused mode never matters.
    if mode in ("ref", "ran"):
        if dataset == "ESD":
            # ESD file names carry one extra field between index and column.
            name = f"{idx}_*_{col}.wav"
        elif dataset == "Blizzard":
            name = plain_name
        else:
            raise ValueError(f"unsupported dataset {dataset!r} for mode {mode!r}")
        root = ref_dir if mode == "ref" else ran_dir
    elif mode == "utbase":
        root, name = utbase_dir, plain_name
    elif mode == "utinc":
        root, name = utinc_dir, section_name
    elif mode == "utmix":
        root, name = utmix_dir, f"{idx}_{sec}_Surprise_{col}.wav"
    elif mode == "first":
        root, name = first_dir, plain_name
    elif mode == "last":
        root, name = last_dir, plain_name
    elif mode == "pos":
        root, name = pos_dir, section_name
    elif mode == "incpos":
        root, name = incpos_dir, plain_name
    elif mode == "decpos":
        root, name = decpos_dir, plain_name
    elif mode == "intro":
        root, name = intro_dir, plain_name
    else:
        raise ValueError(f"unsupported mode: {mode!r}")

    pattern = root + f"{dataset}/" + name
    # glob returns matches in arbitrary order; sorting makes the choice
    # reproducible when the wildcard pattern matches more than one file.
    matches = sorted(glob.glob(pattern))
    if not matches:
        raise FileNotFoundError(f"no audio file matches {pattern!r}")
    return matches[0]

In [76]:
# Dataset sections rendered on the page.
datasets = [
    "Blizzard",
]

# Keys of the two audio variants shown side by side.
modifications = [
    "original",
    "modified",
]

# Keys of the systems being compared, in display order.
modelnames = [
    "Test",
    "FS",
    "BERT",
    "SED_FS",
    "SED_BERT",
]

In [77]:
# Explicit mode -> directory table; replaces the former eval(f"{mode}_dir")
# lookup so the directories in use are visible and no code is evaluated
# from a string.
mode_dirs = {
    "intro": intro_dir,
    "ref": ref_dir,
    "first": first_dir,
    "last": last_dir,
    "incpos": incpos_dir,
    "decpos": decpos_dir,
}

# indices[mode][dataset] -> sorted, de-duplicated sample indices, taken from
# the integer prefix (text before the first "_") of every .wav file name.
indices = {}
for mode, mode_dir in mode_dirs.items():
    indices[mode] = {}
    # NOTE: `dataset` is kept as a plain module-level loop variable on
    # purpose: get_text reads the global `dataset` left behind by this loop.
    for dataset in datasets:
        filelists = glob.glob(mode_dir + f"{dataset}/*.wav")
        indices[mode][dataset] = sorted(
            {int(os.path.basename(path).split("_")[0]) for path in filelists}
        )

# Display

In [78]:
def get_section_title(sec, mode="ref"):
    """Return the sub-heading for section ``sec`` in the given ``mode``.

    None of the supported modes defines a sub-heading, so the result is
    always ``None``. GetHTML interpolates this value into an ``<h2>`` element
    when it renders more than one section.

    Args:
        sec: Section entry (currently unused).
        mode: One of "ref", "first", "last", "incpos", "decpos", "intro".

    Returns:
        ``None`` for every supported mode.

    Raises:
        ValueError: ``mode`` is not one of the supported modes (previously
            this surfaced as an ``UnboundLocalError``).
    """
    supported_modes = ("ref", "first", "last", "incpos", "decpos", "intro")
    if mode not in supported_modes:
        raise ValueError(f"unsupported mode: {mode!r}")
    return None

def get_header(head, mode="ref", sec=None):
    """Return the column heading shown for ``head`` in the given ``mode``.

    Args:
        head: Column key (a model name, a modification name or an emotion).
        mode: Page section the table belongs to.
        sec: Accepted for call compatibility; not used.

    Returns:
        The display label. For "ref" and "intro" it comes from a fixed
        table; for "first"/"last" the key "original" becomes "Original" and
        any other key is used as is; for "incpos"/"decpos" the key is used
        as is.

    Raises:
        ValueError: ``mode`` is unknown, or ``head`` has no label in the
            "ref"/"intro" tables (previously an ``UnboundLocalError``).
    """
    if mode == "ref":
        labels = {
            "Test": "Target Audio",
            "FS": "FS",
            "BERT": "FS + BE + PT",
            "SED_FS": "FS + ED (ours)",
            "SED_BERT": "FS + ED + BE + PT (ours)",
        }
    elif mode == "intro":
        labels = {
            "original": "Original Audio",
            "modified": "Edited Audio",
        }
    elif mode in ("first", "last"):
        return "Original" if head == "original" else f"{head}"
    elif mode in ("incpos", "decpos"):
        return f"{head}"
    else:
        raise ValueError(f"unsupported mode: {mode!r}")

    if head not in labels:
        raise ValueError(f"no header label for {head!r} in mode {mode!r}")
    return labels[head]

def get_text(idx, mode, dataset_name=None):
    """Return the text shown in the first cell of a table row.

    The text is loaded from ``<mode dir><dataset>/<idx>_text.npy`` and every
    full-width comma is replaced by ", ".

    Args:
        idx: Sample index that prefixes the ``_text.npy`` file name.
        mode: "first", "last" or "intro".
        dataset_name: Dataset sub-folder to read from. When omitted, the
            module-level ``dataset`` variable is used, which is what this
            function always relied on; pass it explicitly to avoid depending
            on that leftover loop variable.

    Returns:
        For "intro": the text itself. For "first": the text split on
        whitespace with the words at positions 1 and 3 wrapped in braces.
        For "last": the text split on single spaces with the words at
        positions -4 and -2 wrapped in braces.

    Raises:
        ValueError: ``mode`` is not supported (previously an
            ``UnboundLocalError``).
        FileNotFoundError: The text file does not exist.
    """
    if mode == "first":
        text_dir, marked = first_dir, [1, 3]
    elif mode == "last":
        text_dir, marked = last_dir, [-4, -2]
    elif mode == "intro":
        text_dir, marked = intro_dir, None
    else:
        raise ValueError(f"unsupported mode: {mode!r}")

    if dataset_name is None:
        # Backward-compatible fallback to the notebook-level `dataset`.
        dataset_name = dataset

    pattern = text_dir + f"{dataset_name}/{idx}_text.npy"
    matches = glob.glob(pattern)
    if not matches:
        raise FileNotFoundError(f"no text file matches {pattern!r}")
    text = np.load(matches[0]).item().replace("，", ", ")

    if marked is None:
        return text

    # The two modes intentionally keep their original tokenisation:
    # "first" splits on any whitespace, "last" on single spaces. The former
    # `start` computation in the "last" branch was never used and is gone.
    words = text.split() if mode == "first" else text.split(" ")
    return get_partial_label(words, marked)

In [79]:
def get_partial_label(words, selected):
    """Join ``words`` with spaces, wrapping the selected ones in braces.

    Args:
        words: Sequence of word strings.
        selected: Word positions to mark. Negative positions count from the
            end of ``words``. The list is not modified (the previous
            implementation rewrote negative entries in place).

    Returns:
        The words joined by single spaces, with each selected word rendered
        as ``{word}``. An empty ``words`` gives an empty string. Positions
        that fall outside ``words`` are ignored.
    """
    total = len(words)
    # Resolve negative positions into a private set instead of the caller's list.
    marked = {pos + total if pos < 0 else pos for pos in selected}
    return " ".join(
        "{" + word + "}" if position in marked else word
        for position, word in enumerate(words)
    )

In [80]:
# Each stanza below sets the parameters of one page section and prints its
# HTML. Descriptions are written as adjacent string literals: the earlier
# backslash continuations dropped the space at several line breaks
# ("Controlin", "1.0,while", "granularlywithin").

# --- Intro: original vs. edited audio ---
mode = "intro"
title = "Audio Samples"
description = (
    "This demo accompanies our paper titled 'Hierarchical Emotion Prediction and Control "
    "in Text-to-speech Synthesis', in which we introduce a novel "
    "emotion representation called Sequential Emotion Distribution (Sequential ED). "
    "This representation facilitates modeling of emotion intensity at varying granularities. "
    "The following audio files showcase examples of our model's capabilities. "
    "You can try more audio samples in the following sections."
)
section_list = datasets
headers = modifications
columns = modifications
GetHTML(indices[mode], mode, title, description, section_list, headers, columns, True, 2, True)

# --- Section 1: comparison of the systems ---
mode = "ref"
title = "Section 1: Emotion Expressiveness: Reproducibility"
description = (
    "The first section presents a demo to evaluate our model's "
    "reproducibility. The Sequential ED is predicted from the linguistic embedding "
    "generated by the encoder and used to reproduce the audio. We selected FastSpeech2 "
    "as a baseline. We define the following symbols: (1) FS: FastSpeech2, "
    "(2) ED: Integration of sequential ED predictor, (3) BE: Replacement "
    "with a BERT-based encoder, (4) PT: Pretraining "
    "the encoder with a large text corpus."
)
section_list = datasets
headers = modelnames
columns = modelnames
GetHTML(indices[mode], mode, title, description, section_list, headers, columns)

# --- Section 2: 2nd and 4th words ---
mode = "first"
title = "Section 2: Maximizing Emotion Intensities of the 2nd and the 4th Words"
description = (
    "In this section, we demonstrate the control of emotion "
    "intensities in a specific segment of speech, primarily focusing on the "
    "2nd and the 4th words. We elevate the word-level emotion intensities of "
    "these words, and the corresponding phoneme-level intensities, to 1.0, "
    "while maintaining the other emotion intensities constant. "
    "This showcases our model's ability to adjust emotion granularly "
    "within specified speech segments."
)
section_list = datasets
headers = ["original"] + emos
columns = ["original"] + emos
GetHTML(indices[mode], mode, title, description, section_list, headers, columns, True, 4)

# --- Section 3: 2nd and 4th words counted from the end ---
mode = "last"
title = "Section 3: Maximizing Emotion Intensities of the 'Last' 2nd and the 4th Words"
description = (
    "In this section, we demonstrate the control of emotion "
    "intensities in a specific segment of speech, primarily focusing on the "
    "LAST 2nd and the 4th words. We elevate the word-level emotion intensities of "
    "these words, and the corresponding phoneme-level intensities, to 1.0, "
    "while maintaining the other emotion intensities constant. "
    "This showcases our model's ability to adjust emotion granularly "
    "within specified speech segments."
)
section_list = datasets
headers = ["original"] + emos
columns = ["original"] + emos
GetHTML(indices[mode], mode, title, description, section_list, headers, columns, True, 4)

# --- Section 4: increasing intensity ---
mode = "incpos"
title = "Section 4: Gradually Increasing Emotion Intensity throughout a speech"
description = (
    "In this section, we exhibit our model's capability to "
    "gradually INCREASE intensities of one of the emotions throughout a speech. "
    "We maintain the utterance-level intensities and word and phoneme-level intensities of "
    "non-selected emotions."
)
section_list = datasets
headers = emos
columns = emos
GetHTML(indices[mode], mode, title, description, section_list, headers, columns, False)

# --- Section 5: decreasing intensity ---
mode = "decpos"
title = "Section 5: Gradually Decreasing Emotion Intensity throughout a speech"
description = (
    "In this section, we exhibit our model's capability to "
    "gradually DECREASE intensities of one of the emotions throughout a speech. "
    "We maintain the utterance-level intensities and word and phoneme-level intensities of "
    "non-selected emotions."
)
section_list = datasets
headers = emos
columns = emos
GetHTML(indices[mode], mode, title, description, section_list, headers, columns, False)

	<div class="section">
		<hr class="class-1" />
		<h1>Audio Samples</h1>
		<hr class="class-1" />
		<p class="body">This demo accompanies our paper titled 'Hierarchical Emotion Prediction and Controlin Text-to-speech Synthesis', in which we introduce a novel emotion representation called Sequential Emotion Distribution (Sequential ED). This representation facilitates modeling of emotion intensity at varying granularities. The following audio files showcase examples of our model's capabilities. You can try more audio samples in the following sections.</p>
		<hr>
		<div class="table-container">
			<table>
				<tr>
					<th></th>
					<th></th>
					<th>Original Audio</th>
					<th>Edited Audio</th>
				</tr>
				<tr>
					<td colspan="2"><h4>Increase 'Happy' intensity of the first two words ()</h4></td>
				<td class="center-align">
					<div class="audio-container">
						<audio controls>
							<source src="./audio_files/Intro/Blizzard/10_original.wav" type="audio/wav" />
							Your b