In [None]:
import glob
import numpy as np
import os
import pandas as pd

In [None]:
# Root folder that holds every group of demo audio clips.
base_dir = "./audio_files/"

# One sub-folder per page section; the names follow the "<mode>_dir" pattern
# used by the cells further down.
intro_dir = f"{base_dir}Intro/"
ref_dir = f"{base_dir}Reference_Mel/"
long_dir = f"{base_dir}Long/"
utt_dir = f"{base_dir}Utterance/"
# Currently disabled sections:
# incpos_dir = base_dir + "IncreasingPOS/"
# decpos_dir = base_dir + "DecreasingPOS/"

# Dataset sub-folder names (two separate lists on purpose: `blizzard` is used
# by the appendix section, `datasets` everywhere else).
blizzard = ["Blizzard"]
datasets = ["Blizzard"]

# Emotion categories; they double as column keys and as file-name suffixes.
emos = ["Angry", "Happy", "Sad", "Surprise"]

In [None]:
def ordinal(n: int):
    """Return ``n`` as a string with its English ordinal suffix appended.

    Numbers whose last two digits are 11, 12 or 13 always take "th";
    otherwise the last digit picks "st", "nd", "rd" or "th".
    """
    last_two_digits = n % 100
    if 11 <= last_two_digits <= 13:
        ending = 'th'
    else:
        last_digit = n % 10
        # Digits 4-9 share the final 'th' slot of the lookup tuple.
        slot = last_digit if last_digit < 4 else 4
        ending = ('th', 'st', 'nd', 'rd', 'th')[slot]
    return f"{n}{ending}"

def GetHTML(indices, mode, title, description, section_list, headers, columns, add_text=False, text_width=4, text_bold=True, filename="", dataset=None):
    """Print one ``<div class="section">`` of the demo page to stdout.

    Args:
        indices: mapping from dataset key to the sample ids to list as rows.
        mode: section kind; forwarded to get_section_title / get_header /
            get_text / get_file.
        title: text of the section's ``<h1>``.
        description: text of the section's ``<p class="body">``.
        section_list: one table is printed per entry; an ``<h2>`` sub-title is
            printed only when there is more than one entry.
        headers: keys turned into column headers through get_header.
        columns: keys turned into audio file paths through get_file.
        add_text: when true, prepend a text cell (from get_text) to every row
            and ``text_width`` empty header cells to the header row.
        text_width: colspan of the text cell / number of empty header cells.
        text_bold: wrap the row text in ``<h4>`` when true.
        filename: forwarded unchanged to get_file.
        dataset: key into ``indices``; when None, each section name is used
            as the dataset key for its own table.
    """
    def emit(depth, markup):
        # Every output line is `depth` tab characters followed by the markup.
        print("\t" * depth + markup)

    show_subtitles = len(section_list) > 1

    emit(1, '<div class="section">')
    emit(2, '<hr class="class-1" />')
    emit(2, f'<h1>{title}</h1>')
    emit(2, '<hr class="class-1" />')
    emit(2, f'<p class="body">{description}</p>')
    emit(2, '<hr>')

    for sec in section_list:
        if show_subtitles:
            emit(2, f'<h2>{get_section_title(sec, mode)}</h2>')
            emit(2, '<hr>')

        emit(2, '<div class="table-container">')
        emit(3, '<table>')

        # Header row: optional blank cells above the text column, then labels.
        emit(4, '<tr>')
        if add_text:
            for _ in range(text_width):
                emit(5, '<th></th>')
        for head in headers:
            emit(5, f'<th>{get_header(head, mode, sec)}</th>')
        emit(4, '</tr>')

        # Without an explicit dataset, the section name doubles as the key.
        row_dataset = sec if dataset is None else dataset

        for idx in indices[row_dataset]:
            emit(4, '<tr>')
            if add_text:
                row_text = get_text(idx, mode, row_dataset)
                if text_bold:
                    emit(5, f'<td colspan="{text_width}"><h4>{row_text}</h4></td>')
                else:
                    emit(5, f'<td colspan="{text_width}">{row_text}</td>')
            for col in columns:
                emit(4, '<td class="center-align">')
                file = get_file(row_dataset, idx, col, mode, sec, filename)
                if file == "blank":
                    # get_file's marker for "no clip for this column".
                    emit(5, '<p>---</p>')
                else:
                    emit(5, '<div class="audio-container">')
                    emit(6, '<audio controls>')
                    emit(7, f'<source src="{file}" type="audio/wav" />')
                    emit(7, 'Your browser does not support the audio element.')
                    emit(6, '</audio>')
                    emit(5, '</div>')
                emit(4, '</td>')
            emit(4, '</tr>')
            print()

        emit(3, '</table>')
        emit(2, '</div>')
        emit(2, '<hr>')
    emit(1, '</div>')


In [None]:
# Column keys of the intro table: the clip before and after editing.
modifications = [
    "original",
    "modified",
]

# Column keys of the comparison tables; the entries after "Test" are also
# used as section names for the "long" and "utt" tables.
modelnames = [
    "Test",
    "MsEI",
    "HED",
]
# Alternative model set:
# modelnames = ["Test", "SED_FS", "SED_BERT"]

In [None]:
# Folder of audio clips for each page section ("mode").
# An explicit table replaces the former eval(f"{mode}_dir") lookup: it is
# easier to read and does not execute a constructed string.
# Other modes that existed at some point: "first", "last", "incpos", "decpos".
mode_dirs = {
    "intro": intro_dir,
    "ref": ref_dir,
    "long": long_dir,
    "utt": utt_dir,
}

# indices[mode][dataset] -> sorted list of the distinct sample ids on disk.
indices = {}
for mode, mode_dir in mode_dirs.items():
    indices[mode] = {}
    for dataset in datasets:
        filelists = glob.glob(mode_dir + f"{dataset}/*.wav")
        # The sample id is the integer before the first "_" of the file name;
        # several clips share one id, hence the set.
        sample_ids = {int(os.path.basename(p).split("_")[0]) for p in filelists}
        indices[mode][dataset] = sorted(sample_ids)
indices

# Display

In [None]:
def get_section_title(sec, mode="ref"):
    """Return the sub-title shown above the table of section ``sec``.

    Args:
        sec: section key; a dataset name for the dataset-oriented modes, a
            model name for the model-oriented modes.
        mode: page section kind.

    Returns:
        ``"Dataset: <name>"`` for modes "intro", "ref", "incpos", "decpos";
        ``"Model: <name>"`` for modes "long", "utt"; ``None`` for "first" and
        "last". Known keys are expanded to their display names, unknown keys
        are shown as-is.

    Raises:
        ValueError: for any other mode (this used to surface as an
            UnboundLocalError on the return statement).
    """
    if mode in ("intro", "ref", "incpos", "decpos"):
        dataset_names = {
            "ESD": "Emotional Speech Database (ESD)",
            "Blizzard": "Blizzard Challenge 2013",
        }
        return f"Dataset: {dataset_names.get(sec, sec)}"

    if mode in ("long", "utt"):
        model_names = {
            "MsEI": "MsEmoTTS (baseline)",
            "HED": "Hierarchical ED (ours)",
        }
        return f"Model: {model_names.get(sec, sec)}"

    if mode in ("first", "last"):
        # These sections have no sub-title.
        return None

    raise ValueError(f"Unsupported mode for get_section_title: {mode!r}")

def get_header(head, mode="ref", sec=None):
    """Return the column-header label for key ``head`` in a ``mode`` table.

    Args:
        head: column key (a model name, an emotion name, or
            "original"/"modified").
        mode: page section kind.
        sec: section key; accepted for call-site symmetry, not used here.

    Returns:
        The display label. Keys without a dedicated label are returned as
        ``str(head)`` in every mode, so additional model names render instead
        of failing with an UnboundLocalError as they did before.

    Raises:
        ValueError: if ``mode`` is not one of the supported modes.
    """
    raw_label = f"{head}"

    if mode == "ref":
        ref_labels = {
            "Test": "Target Audio",
            "MsEI": "MsEmoTTS",
            "HED": "Hierarchical ED (ours)",
        }
        return ref_labels.get(head, raw_label)

    if mode in ("first", "last", "long", "utt"):
        return "Original" if head == "original" else raw_label

    if mode in ("incpos", "decpos"):
        return raw_label

    if mode == "intro":
        intro_labels = {
            "original": "Original Audio",
            "modified": "Edited Audio",
        }
        return intro_labels.get(head, raw_label)

    raise ValueError(f"Unsupported mode for get_header: {mode!r}")

def _load_transcript(directory, dataset, idx):
    """Load the transcript stored at ``<directory><dataset>/<idx>_text.npy``.

    Full-width commas ("，") are replaced by ", ". Raises IndexError when the
    file does not exist (the glob result is empty).
    """
    text_path = glob.glob(directory + f"{dataset}/{idx}_text.npy")[0]
    return np.load(text_path).item().replace("，", ", ")


def get_text(idx, mode, dataset="Blizzard"):
    """Return the transcript shown next to sample ``idx`` in a ``mode`` table.

    Args:
        idx: sample id (prefix of the ``*_text.npy`` file name).
        mode: page section kind; selects the folder and the highlighting.
        dataset: dataset sub-folder name.

    Returns:
        For "utt" and "intro": the plain transcript.
        For "first", "last" and "long": the transcript with selected words
        wrapped in curly braces by get_partial_label. The selected word
        positions are 1 and 3 ("first"), -4 and -2 ("last"), or the indices
        stored in ``<idx>_chidx.npy`` ("long").

    Raises:
        ValueError: if ``mode`` is not supported (previously an
            UnboundLocalError).
        IndexError: if a required ``.npy`` file is missing.

    NOTE(review): ``first_dir`` and ``last_dir`` are not defined in the
    visible configuration cell; modes "first"/"last" need them to exist.
    """
    if mode == "first":
        # Whitespace split here, single-space split below: kept as in the
        # original, which treated the two modes differently.
        words = _load_transcript(first_dir, dataset, idx).split()
        return get_partial_label(words, [1, 3])

    if mode == "last":
        words = _load_transcript(last_dir, dataset, idx).split(" ")
        return get_partial_label(words, [-4, -2])

    if mode == "long":
        words = _load_transcript(long_dir, dataset, idx).split(" ")
        chidx = list(np.load(glob.glob(long_dir + f"{dataset}/{idx}_chidx.npy")[0]))
        return get_partial_label(words, chidx)

    if mode == "utt":
        return _load_transcript(utt_dir, dataset, idx)

    if mode == "intro":
        return _load_transcript(intro_dir, dataset, idx)

    raise ValueError(f"Unsupported mode for get_text: {mode!r}")

In [None]:
def get_partial_label(words, selected):
    """Join ``words`` with spaces, wrapping the selected ones in curly braces.

    Args:
        words: sequence of word strings.
        selected: word positions to mark; negative values count from the end
            of ``words``. Any iterable of integers is accepted and it is left
            unmodified (the previous version rewrote the caller's list in
            place).

    Returns:
        The joined string, e.g. ``"{a} b {c}"``; ``""`` for empty ``words``.
    """
    total = len(words)
    # Normalise negative positions into a fresh set instead of mutating input.
    marked = {total + pos if pos < 0 else pos for pos in selected}

    pieces = [
        "{" + word + "}" if position in marked else word
        for position, word in enumerate(words)
    ]
    return " ".join(pieces)

In [None]:
def get_file(dataset, idx, col, mode="ref", sec=None, filename=""):
    """Return the path of the audio clip for one table cell.

    Args:
        dataset: dataset sub-folder name.
        idx: sample id (file-name prefix).
        col: column key (model, emotion or modification name).
        mode: page section kind; selects the folder and file-name pattern.
        sec: section key, part of the file name in some modes.
        filename: extra file-name suffix, used only in mode "ref" for columns
            other than "Test".

    Returns:
        The first path matching the mode's pattern, or the string "blank" in
        mode "long" when the requested clip is missing but at least one
        emotion clip exists for the same sample and section.

    Raises:
        ValueError: unknown ``mode``, or a dataset other than "ESD"/"Blizzard"
            in modes "ref"/"ran" (both previously an UnboundLocalError).
        IndexError: no file matches the pattern (all modes except "long").
        AssertionError: mode "long" and no clip at all exists for the sample
            and section.

    NOTE(review): only intro_dir, ref_dir, long_dir and utt_dir are defined in
    the visible configuration cell; the other ``*_dir`` globals must exist
    before their modes are used.
    """
    if mode == "ref":
        if dataset not in ("ESD", "Blizzard"):
            raise ValueError(f"Unsupported dataset for mode 'ref': {dataset!r}")
        # The target audio ("Test") carries no suffix; model outputs do.
        suffix = "" if col == "Test" else f"_{filename}"
        return glob.glob(ref_dir + f"{dataset}/{idx}_{col}{suffix}.wav")[0]

    if mode == "ran":
        if dataset == "ESD":
            return glob.glob(ran_dir + f"{dataset}/{idx}_*_{col}.wav")[0]
        if dataset == "Blizzard":
            return glob.glob(ran_dir + f"{dataset}/{idx}_{col}.wav")[0]
        raise ValueError(f"Unsupported dataset for mode 'ran': {dataset!r}")

    if mode == "utbase":
        return glob.glob(utbase_dir + f"{dataset}/{idx}_{col}.wav")[0]
    if mode == "utinc":
        return glob.glob(utinc_dir + f"{dataset}/{idx}_{sec}_{col}.wav")[0]
    if mode == "utmix":
        return glob.glob(utmix_dir + f"{dataset}/{idx}_{sec}_Surprise_{col}.wav")[0]
    if mode == "first":
        return glob.glob(first_dir + f"{dataset}/{idx}_{col}.wav")[0]
    if mode == "last":
        return glob.glob(last_dir + f"{dataset}/{idx}_{col}.wav")[0]
    if mode == "pos":
        return glob.glob(pos_dir + f"{dataset}/{idx}_{sec}_{col}.wav")[0]
    if mode == "incpos":
        return glob.glob(incpos_dir + f"{dataset}/{idx}_{col}.wav")[0]
    if mode == "decpos":
        return glob.glob(decpos_dir + f"{dataset}/{idx}_{col}.wav")[0]
    if mode == "intro":
        return glob.glob(intro_dir + f"{dataset}/{idx}_{col}.wav")[0]

    if mode == "long":
        matches = glob.glob(long_dir + f"{dataset}/{idx}_{sec}_{col}.wav")
        if matches:
            return matches[0]
        # The clip is missing: acceptable only if the sample has at least one
        # emotion clip for this section (the cell is then rendered empty).
        has_emotion_clip = any(
            glob.glob(long_dir + f"{dataset}/{idx}_{sec}_{emo}.wav")
            for emo in emos
        )
        if has_emotion_clip:
            return "blank"
        # Explicit raise (not `assert`) so the check survives `python -O`.
        raise AssertionError("Wrong file")

    if mode == "utt":
        return glob.glob(utt_dir + f"{dataset}/{idx}_{sec}_{col}.wav")[0]

    raise ValueError(f"Unsupported mode for get_file: {mode!r}")

In [None]:
# Each stanza below sets the section parameters and prints one HTML section
# of the demo page to stdout through GetHTML.

# --- Introduction: original vs. edited audio, with the transcript per row ---
mode = "intro"
title = "Audio Samples"
description = "\
This demo accompanies our paper titled 'Fine-Grained Quantitative Emotion Editing for Speech Generation', in which we introduce a novel \
emotion representation called Hierarchical Emotion Distribution (Hierarchical ED). \
This representation facilitates modeling of emotion intensity at varying granularities. \
The following audio files showcase examples of our model's capabilities. \
You can try more audio samples in the following sections."
section_list = datasets
headers = modifications
columns = modifications
GetHTML(indices[mode], mode, title, description, section_list, headers, columns, add_text=True, text_width=2, text_bold=True)

# --- Section 1: clips whose file names carry the "Audio" suffix ---
mode = "ref"
title = "Section 1: Emotion Expressiveness: Reproducibility via Ground-Truth Emotion Information"
description = "The first section presents a demo to evaluate our model's \
reproducibility. The Hierarchical ED of our model and the emotion intensities of MsEmoTTS \
are obtained from the reference audio."
section_list = datasets
headers = modelnames
columns = modelnames
GetHTML(indices[mode], mode, title, description, section_list, headers, columns, filename="Audio")

# --- Section 2: one table per model (all models after "Test") ---
mode = "long"
title = "Section 2: Maximizing Emotion Intensities of Two Long Words"
description = "In this section, we demonstrate the control of emotion \
intensities in a specific segment of speech, primarily focusing on two words \
with the longest phoneme sequence. We elevate the word-level emotion intensities of \
these words, and the corresponding phoneme-level intensities, to 1.0, \
while maintaining the other emotion intensities constant. \
This showcases our model's ability to adjust emotion granularly \
within specified speech segments. The modified words are represented in the curly brackets. \
The MsEmoTTS baseline is only able to change the intensity of the ground-truth emotion so \
only one emotion control is available."
section_list = modelnames[1:]
headers = ["original"] + emos
columns = ["original"] + emos
GetHTML(indices[mode], mode, title, description, section_list, headers, columns, add_text=True, text_width=4, dataset="Blizzard")

# --- Section 3: last model only, one column per emotion ---
mode = "utt"
title = "Section 3: Maximizing Utterance-level Emotion Intensities (Hierarchical ED)"
description = "In this section, we demonstrate the control of utterance-level emotion \
intensities. \
We elevate the emotion intensities of the emotion to 1.0 \
while maintaining the other emotion intensities at 0.0. \
We only present our model's demo since MsEmoTTS does not cover this control."
section_list = modelnames[2:]
headers = emos
columns = emos
GetHTML(indices[mode], mode, title, description, section_list, headers, columns, add_text=True, text_width=4, dataset="Blizzard")

# --- Appendix: same layout as Section 1, clips with the "Text" suffix ---
mode = "ref"
title = "Appendix: Emotion Expressiveness: Reproducibility via Predicted Emotion Information"
description = "The appendix section presents a demo to show our model's \
reproducibility without using the reference audio. The Hierarchical ED and the emotion intensity of MsEmoTTS \
are predicted from the linguistic embedding \
generated by the linguistic encoder and used to reproduce the audio."
section_list = blizzard
headers = modelnames
columns = modelnames
GetHTML(indices[mode], mode, title, description, section_list, headers, columns, filename="Text")