In [None]:
import glob
import os
import numpy as np

# Root folder of the audio clips referenced by the generated page.
# Alternative root kept for reference: "./audio/"
base_dir = "./audiosamples/"
other_dir = "../seq2seq-vc/datasetgeneration/LLM_responses/08-Others_multi-lingual_text/"
# Zero-padded, four-digit speaker IDs "0011" .. "0020".
speakers = [f"{speaker_idx:04d}" for speaker_idx in range(11, 21)]

def read_file(path, encodings=None):
    """Read a text file, trying several encodings until one decodes without error.

    Args:
        path: Path of the text file to read.
        encodings: Optional iterable of codec names to try, in order. When omitted,
            ``utf-8``, ``utf-16``, ``iso-8859-1`` and ``cp1252`` are tried.

    Returns:
        A ``(lines, encoding)`` tuple: the list returned by ``readlines()`` and the
        name of the codec that decoded the file.

    Raises:
        UnicodeError: If none of the candidate encodings can decode the file.
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    if encodings is None:
        # NOTE: iso-8859-1 assigns a character to every byte value, so it never fails;
        # with the default order the trailing cp1252 entry is therefore never reached.
        encodings = ('utf-8', 'utf-16', 'iso-8859-1', 'cp1252')
    for encoding in encodings:
        try:
            with open(path, 'r', encoding=encoding) as file:
                return file.readlines(), encoding
        except UnicodeError:
            # UnicodeDecodeError is a subclass of UnicodeError; try the next codec.
            continue
    raise UnicodeError("Failed to decode file with any of the specified encodings.")
    
# Load the transcription and emotion label of every utterance of the selected speakers.
# Both dictionaries are keyed by the utterance file name found in the first field of a line.
transcription_dir = "../Dataset/ESD/"
transcriptions = {}
emotions = {}
for spk in speakers:
    path = transcription_dir+spk+f"/{spk}.txt"
    # Read each file exactly once. A failure is reported and then re-raised,
    # because the rest of this loop cannot proceed without the file content.
    try:
        text, used_encoding = read_file(path)
    except Exception as e:
        print(str(e))
        raise
    print(f"File read successfully with encoding: {used_encoding}")

    for txt in text:
        # Skip blank lines and lines that hold a single token.
        if len(txt.split())<2:
            continue
        # Strip only the line terminator, so a final line without a trailing
        # newline does not lose the last character of its emotion label.
        line = txt.rstrip("\n")
        try:
            fn, tcp, emotion = line.split("\t")
        except ValueError:
            # Fallback for lines whose file name is separated from the transcription
            # by a space instead of a tab.
            fn, _sep, remainder = line.partition(" ")
            tcp, emotion = remainder.split("\t")

        transcriptions[fn] = tcp
        emotions[fn] = emotion

In [None]:
# Text shown in the browser tab (webtitle) and as the page heading (title).
webtitle = "Hierarchical ED Demo Page"
title = "Hierarchical Control of Emotion Rendering in Speech Synthesis"
# Paper abstract, written one sentence per literal; adjacent literals form a single string.
abstract = (
    "Emotional text-to-speech synthesis (TTS) aims to generate realistic emotional speech from input text. "
    "However, quantitatively controlling multi-level emotion rendering remains challenging. "
    "In this paper, we propose a diffusion-based emotional TTS framework with a novel approach for emotion intensity modeling to facilitate fine-grained control over emotion rendering at the phoneme, word, and utterance levels. "
    "We introduce a hierarchical emotion distribution (ED) extractor that captures a quantifiable ED embedding across different speech segment levels. "
    "Additionally, we explore various acoustic features and assess their impact on emotion intensity modeling. "
    "During TTS training, the hierarchical ED embedding effectively captures the variance in emotion intensity from the reference audio and correlates it with linguistic and speaker information. "
    "During inference, the TTS model not only generates emotional speech but also quantitatively controls the emotion rendering over the speech constituents. "
    "Both objective and subjective evaluations demonstrate the effectiveness of our framework in terms of speech quality, emotional expressiveness, and hierarchical emotion control."
)
github_url = "https://github.com/shinshoji01/HED-project-page"
# URL prefix prepended to the stylesheet/script folder and to the figure path below.
# Alternative prefix kept for reference: "/"
base_repo_dir = "/HED-Demo/"
style_dir = f"{base_repo_dir}statics/"
fig_path = f"{base_repo_dir}images/emotion_intensity.png"

# Opening part of the generated page. The backslash right after each opening
# triple quote suppresses the leading newline of the literal.
initial = (
    # Document head: metadata, page title and stylesheet links.
    f"""\
<!DOCTYPE html>
<html>
  <head>
    <meta charset="utf-8">
    <meta content="IE=edge" http-equiv="X-UA-Compatible">
    <meta content="width=device-width, initial-scale=1" name="viewport">
    <title>{webtitle}</title>
    <link href="{style_dir}bootstrap-5.2.3-dist/css/bootstrap.min.css" rel="stylesheet">
    <link href="{style_dir}my.css" rel="stylesheet">
  </head>
"""
    # Body opening plus the introduction card: heading, authors, figure, abstract, repository link.
    # The outer "container" and "row" divs are left open here.
    f"""\
  <body>
    <div class="container">
      <div class="row">
        <div class="container pt-5 mt-5 shadow p-5 mb-5 bg-white rounded">
          <div class="text-center">
            <h2>{title}</h2>
            <br>
            <h5 class="mb-4">Sho Inoue<sup>1,2</sup>, Kun Zhou<sup>3</sup>, Shuai Wang<sup>2†</sup>, Haizhou Li<sup>1,2</sup></h5>
              <p>
                <sup>1</sup>School of Data Science, <sup>2</sup>Shenzhen Research Institute of Big Data<br>
                The Chinese University of Hong Kong, Shenzhen (CUHK-Shenzhen), Shenzhen, China<br>
                <sup>3</sup>Alibaba Group, Singapore
              </p>
          </div>
          <br>
          <figure class="text-center">
            <img src="{fig_path}" alt="overall diagram of the pipeline" class="img-fluid" style="width: 900px; height: auto;">
          </figure>
          <br>
          <h3>Abstract</h3>
          <p class="lead">
          {abstract}
          </p>
          <p class="lead">You can visit the project page of this paper: <a href="{github_url}">Github Repository</a>.
          </p>
        </div>
"""
)

# Closing part of the generated page: closes the "row" and "container" divs and
# loads the jQuery and Bootstrap scripts from the static-asset folder.
closure = f"""
      </div>
    </div>
    <script src="{style_dir}jquery/jquery-3.7.1.slim.min.js"></script>
    <script src="{style_dir}bootstrap-5.2.3-dist/bootstrap.min.js"></script>
"""[1:]
# Plain (non-f) string so the JavaScript braces need no escaping.
# The inline script pauses and rewinds every other <audio> element whenever one starts playing.
# It is placed inside <body>: a <script> element after </body> is not valid HTML.
closure += """
    <script>
      $(function(){
          $("audio").on("play", function() {
              $("audio").not(this).each(function(index, audio) {
                  audio.pause();
                  audio.currentTime = 0;
              });
          });
      });
    </script>
  </body>
</html>
"""[1:]

In [None]:
# Row label of each compared system, keyed by the system's output folder name.
# The insertion order is the row order used in every experiment table.
_SYSTEM_LABELS = {
    "msemotts_OpenSMILE_esd": "MsEmoTTS (Baseline)",
    "relative-attributes_OpenSMILE_esd": "SVM-based HED (Baseline)",
    "OpenSMILE-OpenSMILE-WavLM_SER-globaloutput-GRL_esd_fcn": "Proposed w/ SER",
    "OpenSMILE-OpenSMILE-WavLM_EPR-globaloutput-GRL_esd_fcn": "Proposed w/ EPR",
}


def _experiment_rows(reference_label, folder, skip=(), with_cut=False):
    """Return the ordered ``{audio sub-folder: row label}`` mapping of one experiment.

    The first row is always ``ground_truth``. With ``with_cut=True`` every system row is
    followed by a ``---cut`` row labelled ``___OnlyModified___``.
    """
    rows = {"ground_truth": reference_label}
    for system_dir, system_label in _SYSTEM_LABELS.items():
        if system_dir in skip:
            continue
        rows[f"{folder}/{system_dir}"] = system_label
        if with_cut:
            rows[f"{folder}/{system_dir}---cut"] = "___OnlyModified___"
    return rows


experiments = {
    "expressiveness": _experiment_rows("Reference Speech", "expressiveness"),
    "utterance": _experiment_rows("Ground Truth", "utterance_remain", skip=("msemotts_OpenSMILE_esd",)),
    "word": _experiment_rows("Ground Truth", "word-phoneme", with_cut=True),
}

In [None]:
def wavfiletext(wavfile):
    """Return the HTML of one table cell that holds an audio player for ``wavfile``.

    Args:
        wavfile: Path or URL inserted verbatim into the ``src`` attribute.

    Returns:
        The ``<td>`` ... ``</td>`` markup as a string. Every fragment ends with a newline
        followed by four spaces (the indentation of its closing quotes).
    """
    cell_open = """\
                      <td>
    """
    player = f"""\
                        <audio controls="" preload="none" style="width: 240px">
                          <source src="{wavfile}" type="audio/wav">
                        </audio>
    """
    cell_close = """\
                      </td>
    """
    return "".join((cell_open, player, cell_close))

In [None]:
# Utterance IDs available for the expressiveness section: the part of each matching
# file name that precedes the first "-", de-duplicated and sorted.
sample_paths = glob.glob("./audio/" + "expressiveness/relative-attributes_OpenSMILE_esd/00*")
unique_ids = sorted({os.path.basename(sample_path).split("-")[0] for sample_path in sample_paths})
# Seeded draw of 20 distinct IDs; the selection is kept in the order drawn (not re-sorted).
np.random.seed(0)
filenames = list(np.random.choice(unique_ids, 20, replace=False))
# Utterances used in the intensity-control sections.
controlfns = ["0014_000732", "0015_000726", "0013_000731", "0018_000037"]

# Controlled emotions, the intensity values rendered for each, and the heading colour per emotion.
emos = ["Angry", "Happy", "Sad", "Surprise"]
intensities = [0.0, 0.4, 0.6, 1.0]
emotioncolors = dict(zip(emos, ["red", "orange", "blue", "green"]))


def _underline(sentence, words):
    """Wrap the first occurrence of each word of ``words`` in ``<u>`` tags."""
    for word in words:
        sentence = sentence.replace(word, f"<u>{word}</u>", 1)
    return sentence


# Transcription -> the three words that are underlined in the word-level section.
_underlined_words = {
    "Who is been repeating all that hard stuff to you?": ("repeating", "hard", "stuff"),
    "Let's make the noise a snake.": ("Let's", "noise", "snake"),
    "All smile were real and the happier，the more sincere .": ("smile", "happier", "sincere"),
    "I think it'll encourage me.": ("think", "it'll", "encourage"),
}
# Transcription -> same transcription with the selected words underlined.
modifiedparts = {
    sentence: _underline(sentence, words)
    for sentence, words in _underlined_words.items()
}

In [None]:
# Heading of each section, keyed by experiment id.
titles = dict(
    expressiveness="Section 1: Emotion Expressiveness: Reproducibility via Ground-Truth Emotion Information (Emotion Transfer)",
    utterance="Section 2: Utterance-level Emotion Intensity Control",
    word="Section 3: Word-level Emotion Intensity Control",
)
# Introductory HTML of each section, keyed by experiment id. Each value is a single
# line of text (no newline characters) assembled from adjacent string literals.
explanations = {
    "expressiveness": (
        "The first section presents a demo to evaluate our model's reproducibility. The Hierarchical ED of our proposed models and the emotion intensities of MsEmoTTS are obtained by the reference audio. We compared the following five samples: "
        "<ul> "
        "  <li><b>Reference Speech</b>: The reference audio<br></li> "
        "  <li><b>MsEmoTTS (Baseline)</b><br></li> "
        "  <li><b>SVM-based HED (Baseline)</b>: Hierarchical ED is obtained by SVM-based relative functions<br></li> "
        "  <li><b>Proposed w/ SER</b>: Hierarchical ED is obtained by the proposed SER-based relative functions<br></li> "
        "  <li><b>Proposed w/ EPR</b>: Hierarchical ED is obtained by the proposed EPR-based relative functions<br></li> "
        "</ul> "
    ),
    "utterance": (
        "In this section, we demonstrate the control of utterance-level emotion intensities. We set the emotion intensities of the emotion to the values in the first row while maintaining the other emotion intensities constant. We don't present MsEmoTTS since it does not cover this control. "
        "<ul> "
        "  <li><b>Ground Truth</b>: The ground-truth audio<br></li> "
        "  <li><b>SVM-based HED (Baseline)</b>: Hierarchical ED is obtained by SVM-based relative functions<br></li> "
        "  <li><b>Proposed w/ SER</b>: Hierarchical ED is obtained by the proposed SER-based relative functions<br></li> "
        "  <li><b>Proposed w/ EPR</b>: Hierarchical ED is obtained by the proposed EPR-based relative functions<br></li> "
        "</ul> "
    ),
    "word": (
        "In this section, we demonstrate the control of emotion intensities in a specific segment of speech, primarily focusing on <u>three words with underscores</u>. We set the word-level emotion intensities of these words, and the corresponding phoneme-level intensities, to the values in the first row, while maintaining the other emotion intensities constant. This showcases our model's ability to adjust emotion granularly within specified speech segments. We also present the speech samples focusing on the modified parts ('___OnlyModified___').<br>"
        "The MsEmoTTS baseline is only able to change the intensity of the ground-truth emotion so only one emotion control is available. "
        "<ul> "
        "  <li><b>Ground Truth</b>: The ground-truth audio<br></li> "
        "  <li><b>MsEmoTTS (Baseline)</b>: Only one emotion is available.<br></li> "
        "  <li><b>SVM-based HED (Baseline)</b>: Hierarchical ED is obtained by SVM-based relative functions<br></li> "
        "  <li><b>Proposed w/ SER</b>: Hierarchical ED is obtained by the proposed SER-based relative functions<br></li> "
        "  <li><b>Proposed w/ EPR</b>: Hierarchical ED is obtained by the proposed EPR-based relative functions<br></li> "
        "</ul> "
    ),
}
# Utterance IDs shown in each section: the sampled IDs for the expressiveness section,
# the control utterances (same list object) for both intensity-control sections.
filenames_dir = dict(
    expressiveness=filenames,
    utterance=controlfns,
    word=controlfns,
)

In [None]:
# Build the HTML of every experiment section.
#   wavfiles: every audio path referenced by the generated markup.
#   extexts:  experiment id -> HTML of that section.
# Each template fragment below ends with the whitespace that precedes its closing
# quotes; that whitespace is part of the emitted text, so keep the fragments as written.
wavfiles = []
extexts = {}
for exid in experiments:
    # Section card: title and explanation.
    text = f"""
            <div class="container pt-5 mt-5 shadow p-5 mb-5 bg-white rounded">
              <h2>{titles[exid]}</h2>
              <p class="lead">
                    {explanations[exid]}
                    </p>
    """[1:]
    # The control sections get one table per emotion; the expressiveness section
    # gets a single table (placeholder emotion "noemotion").
    emotion_list = emos if exid in ["utterance", "word"] else ["noemotion"]
    for emotion in emotion_list:
        if exid in ["utterance", "word"]:
            text += f"""
                  <br><hr><br>
                  <h4>{exid[0].upper()}{exid[1:]}-level Control: Emotion: <span style='color:{emotioncolors[emotion]};'>{emotion}</span></h4>
                  <br>
        """[1:]
        text += f"""
                  <div class="table-responsive" style="overflow-x: scroll">
                    <table class="table table-sm">
        """[1:]
        # Header row: utterance IDs.
        text += f"""
                      <thead>
                        <tr>
                          <th scope="col">ID</th>
        """[1:]
        for fn in filenames_dir[exid]:
            # One column per utterance for the expressiveness section, one column per
            # intensity otherwise. Every non-expressiveness experiment is covered by the
            # else branch, so `repeat` never depends on a value left over from a
            # previous loop iteration.
            if exid=="expressiveness":
                repeat = 1
            else:
                repeat = len(intensities)
            for col_idx in range(repeat):
                # Only the first column of an utterance carries its ID.
                labeltext = fn if col_idx == 0 else ""
                text += f"""
                          <th scope="col">{labeltext}</th>
        """[1:]
        text += """
                        </tr>
                      </thead>
        """[1:]

        # First body row: transcription plus emotion label or intensity.
        text += """
                      <tbody>
                        <tr>
                          <th scope="row" style="position: sticky; left: 0; z-index:10; opacity: 1.0; background-color: white;">Text</th>
        """[1:]
        for fn in filenames_dir[exid]:
            if exid=="expressiveness":
                labeltext = f"<b>Text</b>: {transcriptions[fn]}<br><b>Emotion</b>: {emotions[fn]}"
                text += f"""
                          <td>
                            <p>{labeltext}</p>
                          </td>
        """[1:]
            else:
                for intensity in intensities:
                    if exid=="utterance":
                        labeltext = f"<b>Text</b>: {transcriptions[fn]}<br><span style='color:{emotioncolors[emotion]};'><b>Intensity: {intensity}</b></span>"
                    elif exid=="word":
                        # The word-level section shows the transcription with the controlled words underlined.
                        labeltext = f"<b>Text</b>: {modifiedparts[transcriptions[fn]]}<br><span style='color:{emotioncolors[emotion]};'><b>Intensity: {intensity}</b></span>"
                    text += f"""
                          <td>
                            <p>{labeltext}</p>
                          </td>
        """[1:]
        text += """
                        </tr>
        """[1:]

        # One row of audio players per compared system.
        for key in experiments[exid]:
            text += f"""
                        <tr>
                          <th scope="row" style="position: sticky; left: 0; z-index:10; opacity: 1.0; background-color: white;">{experiments[exid][key]}</th>
        """[1:]
            for fn in filenames_dir[exid]:
                # A "---cut" suffix in the key selects the "-cut" file variant; the folder is the part before it.
                wavbasefile = base_dir + key.split("---")[0] + f"/{fn}.wav"
                if exid=="expressiveness":
                    # Use "<fn>.wav" when it exists, otherwise "<fn>-0.wav" (not checked for existence).
                    if os.path.exists(wavbasefile):
                        wavfile = wavbasefile
                    else:
                        wavfile = wavbasefile[:-4] + "-0.wav"
                    wavfiles += [wavfile]
                    text = text + wavfiletext(wavfile)
                else:
                    for intensity in intensities:
                        if os.path.exists(wavbasefile):
                            wavfile = wavbasefile
                        else:
                            if "---cut" in key:
                                wavfile = wavbasefile[:-4] + f"-{emotion}-{intensity}-0-cut.wav"
                            else:
                                wavfile = wavbasefile[:-4] + f"-{emotion}-{intensity}-0.wav"
                        if os.path.exists(wavfile):
                            wavfiles += [wavfile]
                            text = text + wavfiletext(wavfile)
                        else:
                            # No audio for this system/emotion/intensity: emit an empty cell.
                            text += """
                          <td>
                              <p></p>
                          </td>
        """[1:]
            text += """
                        </tr>
        """[1:]

        text += """
                      </tbody>
                    </table>
                  </div>
                  <p class="lead">* please scroll horizontally to explore additional columns in the table.</p>
        """[1:]
    text += """
                </div>
    """[1:]
    extexts[exid] = text
    # Drop the first four characters of every line to reduce the template indentation.
    extexts[exid] = "\n".join([a[4:] for a in extexts[exid].split("\n")])

In [None]:
# Assemble the full page: opening part, every experiment section in order, closing part.
wholetext = ""
wholetext += initial
for exid in experiments:
    wholetext += extexts[exid]
wholetext += closure
# Write explicitly as UTF-8: the page declares charset="utf-8" and contains non-ASCII
# characters, so the platform default encoding must not be relied on. The context
# manager closes the file even if the write fails.
with open("index.html", "w", encoding="utf-8") as f:
    f.write(wholetext)
print(wholetext)

In [None]:
import shutil

# De-duplicate and sort the audio paths referenced by the page.
wavfiles = sorted(set(wavfiles))

# Set `copy` to True to copy each referenced file from the "audio" tree into the
# "audiosamples" tree when it is missing there.
copy = False
tgtdir = "./audiosamples/"
for path in wavfiles:
    src = path.replace("audiosamples", "audio")
    if not copy:
        continue
    # Nothing to do when the source is missing or the target already exists.
    if not os.path.exists(src) or os.path.exists(path):
        continue
    os.makedirs(os.path.dirname(path), exist_ok=True)
    shutil.copy(src, path)