In [78]:
import glob
import numpy as np
import os
import pandas as pd

In [109]:
# Root folder that holds every audio sample referenced by the demo page.
base_dir = "./audio_files/"
datasets = ["ESD"]

# One sub-folder per demo section. The `<mode>_dir` variable names matter:
# the index-building cell refers to these variables by name.
intro_dir = f"{base_dir}Intro/"
ref_dir = f"{base_dir}Reference_Mel/"
ran_dir = f"{base_dir}RandomED/"
utbase_dir = f"{base_dir}Utterance/BaseEmotion/"
utinc_dir = f"{base_dir}Utterance/IncreasingEmotion/"
utmix_dir = f"{base_dir}Utterance/MixedEmotion/"
first_dir = f"{base_dir}First/"
last_dir = f"{base_dir}Last/"
pos_dir = f"{base_dir}POS/"
incpos_dir = f"{base_dir}IncreasingPOS/"
decpos_dir = f"{base_dir}DecreasingPOS/"

# Emotion labels used as column headers and as parts of audio file names.
emos = ["Angry", "Happy", "Sad", "Surprise"]

In [110]:
def ordinal(n: int):
    """Return ``n`` followed by its English ordinal suffix (e.g. ``1st``, ``12th``).

    Numbers whose last two digits are 11, 12 or 13 always take ``th``;
    otherwise the suffix is chosen from the last digit.
    """
    # 11th, 12th and 13th are the exceptions to the last-digit rule.
    if 11 <= (n % 100) <= 13:
        return str(n) + 'th'

    last_digit = n % 10
    by_digit = ('th', 'st', 'nd', 'rd')
    # Digits 4-9 fall outside the table and share the 'th' suffix with 0.
    ending = by_digit[last_digit] if last_digit < 4 else 'th'
    return str(n) + ending

def GetHTML(indices, mode, title, description, section_list, headers, columns, add_text=False, text_width=4, text_bold=True):
    """Print one ``<div class="section">`` block of the demo page to stdout.

    The block contains a title, a description paragraph, and one table of
    audio players per entry of ``section_list``.

    Args:
        indices: Mapping from dataset name to the sample indices to list.
        mode: Demo mode; forwarded to ``get_section_title``, ``get_header``,
            ``get_text`` and ``get_file``.
        title: Text placed inside the ``<h1>`` element.
        description: Text placed inside the ``<p class="body">`` element.
        section_list: One table is printed per entry. A ``<h2>`` heading is
            printed for each entry only when there is more than one.
        headers: Keys passed to ``get_header`` to label the table columns.
        columns: Keys passed to ``get_file``; one audio cell per key and row.
        add_text: When true, each row starts with a text cell from ``get_text``
            and the header row starts with ``text_width`` empty cells.
        text_width: ``colspan`` of the text cell.
        text_bold: When true, the row text is wrapped in ``<h4>``.
    """
    def emit(depth, markup):
        # Each output line is indented with `depth` tab characters.
        print("\t" * depth + markup)

    emit(1, '<div class="section">')
    emit(2, '<hr class="class-1" />')
    emit(2, f'<h1>{title}</h1>')
    emit(2, '<hr class="class-1" />')
    emit(2, f'<p class="body">{description}</p>')
    emit(2, '<hr>')

    for sec in section_list:
        # Sub-headings are only useful when several tables are shown.
        if len(section_list) > 1:
            emit(2, f'<h2>{get_section_title(sec, mode)}</h2>')
            emit(2, '<hr>')

        emit(2, '<div class="table-container">')
        emit(3, '<table>')

        # Header row: optional blank cells above the text column, then labels.
        emit(4, '<tr>')
        if add_text:
            for _ in range(text_width):
                emit(5, '<th></th>')
        for head in headers:
            emit(5, f'<th>{get_header(head, mode, sec)}</th>')
        emit(4, '</tr>')

        # In "ref"/"ran" mode the section entry is the dataset name itself;
        # every other mode uses the ESD dataset.
        dataset = sec if mode in ("ref", "ran") else "ESD"

        for idx in indices[dataset]:
            emit(4, '<tr>')
            if add_text:
                row_text = get_text(idx, mode)
                cell_body = f'<h4>{row_text}</h4>' if text_bold else row_text
                emit(5, f'<td colspan="{text_width}">{cell_body}</td>')
            for col in columns:
                emit(4, '<td class="center-align">')
                emit(5, '<div class="audio-container">')
                emit(6, '<audio controls>')
                audio_path = get_file(dataset, idx, col, mode, sec)
                emit(7, f'<source src="{audio_path}" type="audio/wav" />')
                emit(7, 'Your browser does not support the audio element.')
                emit(6, '</audio>')
                emit(5, '</div>')
                emit(4, '</td>')
            emit(4, '</tr>')
            print()

        emit(3, '</table>')
        emit(2, '</div>')
        emit(2, '<hr>')
    emit(1, '</div>')
    
def get_file(dataset, idx, col, mode="ref", sec=None):
    """Return the path of the audio file shown in one table cell.

    The file is located by globbing inside the directory of the given mode.

    Args:
        dataset: Dataset sub-folder name ("ESD" or "Blizzard").
        idx: Sample index; the leading component of the file name.
        col: Column key; the trailing component of the file name.
        mode: Demo mode selecting the directory and the file-name layout.
        sec: Section key; part of the file name for "utinc", "utmix" and "pos".

    Returns:
        The matching path. If several files match, the lexicographically
        smallest one is returned so the result does not depend on the
        arbitrary order in which ``glob`` lists files.

    Raises:
        ValueError: If ``mode`` is not supported, or ``dataset`` is not
            supported in "ref"/"ran" mode.
        FileNotFoundError: If no file matches the expected pattern.
    """
    if mode in ("ref", "ran"):
        if dataset == "ESD":
            # ESD file names carry one extra component between index and column.
            filename = f"{idx}_*_{col}.wav"
        elif dataset == "Blizzard":
            filename = f"{idx}_{col}.wav"
        else:
            raise ValueError(f"Unsupported dataset for mode {mode!r}: {dataset!r}")
        root = ref_dir if mode == "ref" else ran_dir
    elif mode == "utbase":
        root, filename = utbase_dir, f"{idx}_{col}.wav"
    elif mode == "utinc":
        root, filename = utinc_dir, f"{idx}_{sec}_{col}.wav"
    elif mode == "utmix":
        root, filename = utmix_dir, f"{idx}_{sec}_Surprise_{col}.wav"
    elif mode == "first":
        root, filename = first_dir, f"{idx}_{col}.wav"
    elif mode == "last":
        root, filename = last_dir, f"{idx}_{col}.wav"
    elif mode == "pos":
        root, filename = pos_dir, f"{idx}_{sec}_{col}.wav"
    elif mode == "incpos":
        root, filename = incpos_dir, f"{idx}_{col}.wav"
    elif mode == "decpos":
        root, filename = decpos_dir, f"{idx}_{col}.wav"
    elif mode == "intro":
        root, filename = intro_dir, f"{idx}_{col}.wav"
    else:
        raise ValueError(f"Unsupported mode: {mode!r}")

    pattern = root + f"{dataset}/{filename}"
    matches = glob.glob(pattern)
    if not matches:
        raise FileNotFoundError(f"No audio file matches {pattern!r}")
    return min(matches)

In [111]:
# Datasets shown in a section: both of them, or ESD only.
all_datasets = ["Blizzard", "ESD"]
ESD_datasets = ["ESD"]

# Column keys used by the individual demo sections (all kept as strings
# because they are substituted into audio file names).
modifications = ["original", "modified"]
seed_list = list(map(str, range(5)))
intensities = ["0.2", "0.8", "1.4", "2.0"]
positions = list(map(str, range(3)))

modelnames = ["Test", "GST", "SED"]

In [112]:
# Directory of each demo mode. An explicit table avoids resolving the
# `<mode>_dir` variables through eval() on a constructed string.
mode_dirs = {
    "intro": intro_dir,
    "ref": ref_dir,
    "ran": ran_dir,
    "utbase": utbase_dir,
    "utinc": utinc_dir,
    "utmix": utmix_dir,
    "first": first_dir,
    "last": last_dir,
    "pos": pos_dir,
    "incpos": incpos_dir,
    "decpos": decpos_dir,
}

# indices[mode][dataset] -> sorted list of the unique sample indices found
# among the .wav files of that mode and dataset.
#
# NOTE: `mode`, `datasets` and `dataset` stay bound at module level after this
# loop (ending as "decpos", ESD_datasets and "ESD"). Keep these loop variable
# names; other cells may read them.
indices = {}
for mode, mode_dir in mode_dirs.items():
    # Only the "ref" and "ran" sections are built for every dataset.
    datasets = all_datasets if mode in ("ref", "ran") else ESD_datasets
    indices[mode] = {}
    for dataset in datasets:
        filelists = sorted(glob.glob(mode_dir + f"{dataset}/*.wav"))
        # The text before the first "_" of each file name is the sample index;
        # several files share one index, so de-duplicate before sorting.
        indices[mode][dataset] = sorted(
            {int(os.path.basename(p).split("_")[0]) for p in filelists}
        )

# Display

In [130]:
def get_section_title(sec, mode="ref"):
    """Return the sub-heading text for one table of a demo section.

    Args:
        sec: Section key (a dataset name in "ref"/"ran" mode, an emotion
            name in "utinc"/"utmix"/"pos" mode).
        mode: Demo mode.

    Returns:
        The heading string, or ``None`` for modes that have no sub-heading
        ("utbase", "first", "last", "incpos", "decpos", "intro").

    Raises:
        ValueError: If ``mode`` is not one of the supported modes.
    """
    if mode in ("ref", "ran"):
        # Known dataset keys are expanded to their full names; any other key
        # is shown unchanged.
        full_names = {
            "ESD": "Emotional Speech Database (ESD)",
            "Blizzard": "Blizzard Challenge 2013",
        }
        return f"Dataset: {full_names.get(sec, sec)}"
    if mode in ("utinc", "pos"):
        return f"Emotion: {sec}"
    if mode == "utmix":
        return f"Fixed Emotion: {sec} --- Changed Emotion: Surprise"
    if mode in ("utbase", "first", "last", "incpos", "decpos", "intro"):
        return None
    raise ValueError(f"Unsupported mode: {mode!r}")

def get_header(head, mode="ref", sec=None):
    """Return the column label shown in a table header cell.

    Args:
        head: Column key (model name, seed, emotion, intensity, position
            index, or "original"/"modified", depending on ``mode``).
        mode: Demo mode.
        sec: Section key; used in the label for "utinc" and "utmix".

    Returns:
        The label string.

    Raises:
        ValueError: If ``mode`` is unsupported, or ``head`` is not a known
            key in "ref" or "intro" mode.
    """
    if mode == "ref":
        model_labels = {
            "Test": "Target Audio",
            "GST": "FastSpeech2 + GST (Baseline)",
            "SED": "FastSpeech2 + Sequential ED (ours)",
        }
        if head not in model_labels:
            raise ValueError(f"Unsupported header for mode 'ref': {head!r}")
        return model_labels[head]
    if mode == "ran":
        return f"Random Seed {head}"
    if mode in ("utbase", "first", "last"):
        # Only the "original" key is capitalised; emotion names are shown as-is.
        return "Original" if head == "original" else f"{head}"
    if mode == "utinc":
        return f"{sec} Intensity: {head}"
    if mode == "utmix":
        return f"{sec}: 2.0 --- Surprise: {head}"
    if mode == "pos":
        # Position k labels three consecutive ordinals starting at 3k + 1.
        start = int(head) * 3 + 1
        return f"{ordinal(start)} {ordinal(start+1)} {ordinal(start+2)}"
    if mode in ("incpos", "decpos"):
        return f"{head}"
    if mode == "intro":
        intro_labels = {"original": "Original Audio", "modified": "Edited Audio"}
        if head not in intro_labels:
            raise ValueError(f"Unsupported header for mode 'intro': {head!r}")
        return intro_labels[head]
    raise ValueError(f"Unsupported mode: {mode!r}")

def get_text(idx, mode, dataset="ESD"):
    """Return the text shown in the first cell of a table row.

    The text is loaded from ``<mode dir>/<dataset>/<idx>_text.npy`` and every
    full-width comma is replaced by ", ".

    Args:
        idx: Sample index; the leading component of the file name.
        mode: Demo mode; one of "first", "last", "pos" or "intro".
        dataset: Dataset sub-folder to read from. It is an explicit parameter
            so the function does not depend on a module-level ``dataset``
            variable happening to exist.

    Returns:
        * "first": the text with its first three words wrapped in ``{}``.
        * "last": the text with its trailing words wrapped in ``{}`` (three
          words, or four when the final space-separated token is a single
          character).
        * "pos" / "intro": the text itself.

    Raises:
        ValueError: If ``mode`` is not supported.
        FileNotFoundError: If the text file does not exist.
    """
    if mode == "first":
        text_dir = first_dir
    elif mode == "last":
        text_dir = last_dir
    elif mode == "pos":
        text_dir = pos_dir
    elif mode == "intro":
        text_dir = intro_dir
    else:
        raise ValueError(f"Unsupported mode: {mode!r}")

    pattern = text_dir + f"{dataset}/{idx}_text.npy"
    matches = glob.glob(pattern)
    if not matches:
        raise FileNotFoundError(f"No text file matches {pattern!r}")

    # The .npy file holds a single string; .item() unwraps it.
    text = np.load(matches[0]).item()
    text = text.replace("，", ", ")

    if mode == "first":
        words = text.split()
        return f'{{{" ".join(words[:3])}}} {" ".join(words[3:])}'
    if mode == "last":
        words = text.split(" ")
        # A one-character final token is not counted as one of the three
        # words, so the highlighted span starts one token earlier.
        start = -4 if len(words[-1]) == 1 else -3
        return f'{" ".join(words[:start])} {{{" ".join(words[start:])}}}'
    return text

In [135]:
# Introductory section: original vs. edited audio with a caption per row.
mode = "intro"
title = "Audio Samples"
description = (
    "This demo accompanies our paper titled 'Fine-Grained Quantitative "
    "Control of Emotion Rendering for Speech Generation', in which we introduce a novel "
    "emotion representation called Sequential Emotion Distribution (Sequential ED). "
    "This representation facilitates modeling of emotion intensity at varying granularities. "
    "The following audio files showcase examples of our model's capabilities, "
    "illustrating both global and fine-grained emotion adjustments. "
    "You can try more audio samples in the following sections."
)
section_list = ESD_datasets
headers = columns = modifications
GetHTML(
    indices[mode], mode, title, description, section_list, headers, columns,
    add_text=True, text_width=2, text_bold=True,
)

# Section 1: target audio vs. baseline vs. our model, for every dataset.
mode = "ref"
title = "Section 1: Emotion Expressiveness: Reproducibility"
# Each fragment ends with a space so that words on adjacent source lines are
# not fused together in the rendered paragraph.
description = (
    "The first section presents a demo to evaluate our model's "
    "reproducibility. Our model extracts the Sequential ED from the target audio "
    "and uses it to reproduce the audio. We selected Global Style Tokens (GST) "
    "as a baseline, integrating it with FastSpeech2. This model derives the prosody "
    "embedding from the mel-spectrogram of the target audio."
)
section_list = all_datasets
headers = modelnames
columns = modelnames
GetHTML(indices[mode], mode, title, description, section_list, headers, columns)

# Section 2: one column per random seed, for every dataset.
mode = "ran"
title = "Section 2: Generation using randomly generated Sequential Emotion Distribution"
description = (
    "This section presents audio files synthesized from randomly generated "
    "Sequential ED, which doesn't necessitate input audio. Our random generation scheme, "
    "detailed in the paper, relies on reference dataset statistics. "
    "To investigate stability, we employed five distinct random seeds."
)
section_list = all_datasets
headers = columns = seed_list
GetHTML(indices[mode], mode, title, description, section_list, headers, columns)

# Section 3: the original audio followed by one column per emotion.
mode = "utbase"
title = "Section 3: Utterance-level Control: Maximize Emotion Intensity"
description = (
    "This section includes a demo where we amplify the utterance-level "
    "emotion in each audio file. Specifically, the intensities of the indicated emotions "
    "are heightened to 2.0, while other emotions are set to 0.0. The emotions at word and "
    "phoneme levels remain unchanged."
)
section_list = ESD_datasets
headers = ["original", *emos]
columns = ["original", *emos]
GetHTML(indices[mode], mode, title, description, section_list, headers, columns)

# Section 4: one table per emotion, one column per intensity level.
mode = "utinc"
title = "Section 4: Utterance-level Control: Increase Emotion Intensity"
description = (
    "In this section, we provide a demo where we generate the audio files to "
    "display varying emotion intensities for a given emotion, specifically at levels "
    "0.2, 0.8, 1.4, and 2.0. The utterance-level intensities of all other emotions are set to 0.0, "
    "while the emotions at the word and phoneme levels remained unchanged."
)

section_list = emos
headers = columns = intensities
GetHTML(indices[mode], mode, title, description, section_list, headers, columns)

# mode = "utmix"
# title = "Section 5: Utterance-level Control (Mixed Emotion)"
# description = "This is the description of the reference mel"
# section_list = emos[:-1]
# headers = intensities
# columns = intensities
# GetHTML(indices[mode], mode, title, description, section_list, headers, columns)

# Section 5: emotion applied to the first three words; rows carry the transcript.
mode = "first"
title = "Section 5: Maximizing Emotion Intensities of the 'First' Three Words"
# Each fragment ends with a space so that words on adjacent source lines are
# not fused together in the rendered paragraph.
description = (
    "In this section, we demonstrate the control of emotion "
    "intensities in a specific segment of speech, primarily focusing on the "
    "FIRST three words. We elevate the word-level emotion intensities of "
    "these words, and the corresponding phoneme-level intensities, to 1.0, "
    "while maintaining the other emotion intensities constant. "
    "This showcases our model's ability to adjust emotion granularly "
    "within specified speech segments."
)
section_list = ESD_datasets
headers = ["original"] + emos
columns = ["original"] + emos
GetHTML(indices[mode], mode, title, description, section_list, headers, columns, True, 4)

# Section 6: emotion applied to the last three words; rows carry the transcript.
mode = "last"
title = "Section 6: Maximizing Emotion Intensities of the 'Last' Three Words"
# Each fragment ends with a space so that words on adjacent source lines are
# not fused together in the rendered paragraph.
description = (
    "In this section, we demonstrate the control of emotion "
    "intensities in a specific segment of speech, primarily focusing on the "
    "LAST three words. We elevate the word-level emotion intensities of "
    "these words, and the corresponding phoneme-level intensities, to 1.0, "
    "while maintaining the other emotion intensities constant. "
    "This showcases our model's ability to adjust emotion granularly "
    "within specified speech segments."
)
section_list = ESD_datasets
headers = ["original"] + emos
columns = ["original"] + emos
GetHTML(indices[mode], mode, title, description, section_list, headers, columns, True, 4)

# mode = "pos"
# title = "Section 8: Partial Control: Different Positions"
# description = "This is the description of the reference mel"
# section_list = emos
# headers = positions
# columns = positions
# GetHTML(indices[mode], mode, title, description, section_list, headers, columns, True)

# Section 7: one column per emotion, intensity rising through the utterance.
mode = "incpos"
title = "Section 7: Gradually Increasing Emotion Intensity throughout a speech"
description = (
    "In this section, we exhibit our model's capability to "
    "gradually INCREASE emotion intensities throughout a speech. "
    "We maintain the utterance-level intensities of the other emotions "
    "at 0.0, while word and phoneme-level intensities are randomly generated "
    "as detailed in Section 2 of this demo, and further described in our paper."
)
section_list = ESD_datasets
headers = columns = emos
GetHTML(indices[mode], mode, title, description, section_list, headers, columns, add_text=False)

# Section 8: one column per emotion, intensity falling through the utterance.
mode = "decpos"
title = "Section 8: Gradually Decreasing Emotion Intensity throughout a speech"
description = (
    "In this section, we exhibit our model's capability to "
    "gradually DECREASE emotion intensities throughout a speech. "
    "We maintain the utterance-level intensities of the other emotions "
    "at 0.0, while word and phoneme-level intensities are randomly generated "
    "as detailed in Section 2 of this demo, and further described in our paper."
)
section_list = ESD_datasets
headers = columns = emos
GetHTML(indices[mode], mode, title, description, section_list, headers, columns, add_text=False)

	<div class="section">
		<hr class="class-1" />
		<h1>Audio Samples</h1>
		<hr class="class-1" />
		<p class="body">This demo accompanies our paper titled 'Fine-Grained Quantitative Control of Emotion Rendering for Speech Generation', in which we introduce a novel emotion representation called Sequential Emotion Distribution (Sequential ED). This representation facilitates modeling of emotion intensity at varying granularities. The following audio files showcase examples of our model's capabilities, illustrating both global and fine-grained emotion adjustments. You can try more audio samples in the following sections.</p>
		<hr>
		<div class="table-container">
			<table>
				<tr>
					<th></th>
					<th></th>
					<th>Original Audio</th>
					<th>Edited Audio</th>
				</tr>
				<tr>
					<td colspan="2"><h4>Utterance-level Control: 'Happy' -> 'Angry' (Female)</h4></td>
				<td class="center-align">
					<div class="audio-container">
						<audio controls>
							<source src="./audio_fi

	<div class="section">
		<hr class="class-1" />
		<h1>Section 7: Gradually Increasing Emotion Intensity throughout a speech</h1>
		<hr class="class-1" />
		<p class="body">In this section, we exhibit our model's capability to gradually INCREASE emotion intensities throughout a speech. We maintain the utterance-level intensities of the other emotions at 0.0, while word and phoneme-level intensities are randomly generated as detailed in Section 2 of this demo, and further described in our paper.</p>
		<hr>
		<div class="table-container">
			<table>
				<tr>
					<th>Angry</th>
					<th>Happy</th>
					<th>Sad</th>
					<th>Surprise</th>
				</tr>
				<tr>
				<td class="center-align">
					<div class="audio-container">
						<audio controls>
							<source src="./audio_files/IncreasingPOS/ESD/90_Angry.wav" type="audio/wav" />
							Your browser does not support the audio element.
						</audio>
					</div>
				</td>
				<td class="center-align">
					<div class="audio-container">
						