In [None]:
import glob
import os
import numpy as np

# Prefix prepended to every audio path that is built for the page.
base_dir = "./audio/"
# base_dir = "./audiosamples/"
# other_dir = "../seq2seq-vc/datasetgeneration/LLM_responses/08-Others_multi-lingual_text/"
# Zero-padded, four-digit speaker ids "0011" .. "0020".
speakers = [f"{idx:04d}" for idx in range(11, 21)]

def read_file(path):
    """Read a text file, trying a list of encodings until one decodes it.

    Args:
        path: Path of the text file to read.

    Returns:
        A ``(lines, encoding)`` tuple: the list produced by ``readlines()``
        and the name of the encoding that decoded the file.  A leading
        byte-order mark, if one survives decoding, is removed from the
        first line.

    Raises:
        Exception: If no encoding in the list can decode the file.
        OSError: Propagated unchanged if the file cannot be opened.
    """
    # cp1252 has to be tried before iso-8859-1: latin-1 maps every possible
    # byte, so it never fails and anything listed after it is unreachable.
    encodings = ['utf-8', 'utf-16', 'cp1252', 'iso-8859-1']
    for encoding in encodings:
        try:
            with open(path, 'r', encoding=encoding) as file:
                lines = file.readlines()
        except UnicodeError:  # UnicodeDecodeError is a subclass
            continue
        # The plain 'utf-8' codec keeps a BOM as U+FEFF, which would end up
        # glued to the first field of the first line.
        if lines and lines[0].startswith('\ufeff'):
            lines[0] = lines[0][1:]
        return lines, encoding
    raise Exception("Failed to decode file with any of the specified encodings.")
    
# Transcriptions shipped as a pickled object inside a .npy file; .item()
# unwraps it (it is used as a mapping "file id -> text" below).
# NOTE(review): allow_pickle=True unpickles the file, which can run arbitrary
# code -- only load a file you produced or trust.
transcription_dir = "./audio/text_dir_LibriTTS.npy"
transcriptions = np.load(transcription_dir, allow_pickle=True).item()

# Add the per-speaker transcript files "<spk>/<spk>.txt".  Each useful line is
# "<file id>\t<text>\t<emotion>"; some lines separate the id with a space.
transcription_dir = "../Dataset/ESD/"
emotions = {}
for spk in speakers:
    path = transcription_dir+spk+f"/{spk}.txt"
    try:
        text, used_encoding = read_file(path)
    except Exception as e:
        # Report the problem and stop here; the file is read exactly once
        # (a second, unguarded read would only raise the same error again).
        print(str(e))
        raise
    print(f"File read successfully with encoding: {used_encoding}")

    for txt in text:
        if len(txt.split())<2:
            continue
        # Remove only the line terminator.  Slicing off the last character
        # would truncate the emotion label of a final line without newline.
        line = txt.rstrip("\r\n")
        try:
            fn, tcp, emotion = line.split("\t")
        except ValueError:
            # The file id is followed by a space instead of a tab.
            fn, sep, rest = line.partition(" ")
            tcp, emotion = rest.split("\t")

        transcriptions[fn] = tcp
        emotions[fn] = emotion
        
def wavfiletext(wavfile):
    """Return the HTML of one table cell containing an audio player.

    Args:
        wavfile: Value written into the ``src`` attribute of the ``<source>``
            element (inserted as-is, without escaping).

    Returns:
        A multi-line string: ``<td>``, the ``<audio>`` element and ``</td>``,
        followed by a final line of four spaces.
    """
    # (indent width, markup) per output line.  The widths reproduce the exact
    # whitespace of the page template, including the four-space tail that
    # precedes the <audio> and </td> lines.
    rows = [
        (22, "<td>"),
        (28, '<audio controls="" preload="none" style="width: 240px">'),
        (26, f'<source src="{wavfile}" type="audio/wav">'),
        (24, "</audio>"),
        (26, "</td>"),
        (4, ""),
    ]
    return "\n".join(" " * width + markup for width, markup in rows)

In [None]:
# --- Page metadata -----------------------------------------------------------
webtitle = "Multi-Step Prediction of Hierarchical Emotion Distribution Demo Page"
title = "Prediction and Control of Hierarchical Emotion Distribution for Text-to-Speech Synthesis"
abstract = "This paper extends our previous work by investigating hierarchical emotion distribution (ED) for achieving multi-level quantitative control of emotion rendering in text-to-speech synthesis (TTS). We introduce a novel multi-step hierarchical ED prediction module that quantifies emotion variance at the utterance, word, and phoneme levels. By predicting emotion variance in a multi-step manner, our TTS framework leverages global emotional context to refine local emotional variations, thereby capturing the intrinsic hierarchical structure of speech emotion.  Our approach is validated through its integration into a variance adaptor and an external module design compatible with various TTS systems.  Both objective and subjective evaluations demonstrate that our framework significantly enhances emotional expressiveness and enables precise control of emotion rendering across multiple speech granularities."
github_url = "https://github.com/shinshoji01/multi-step-prediction-HED-project-page"
# URL prefix under which the page is served; every asset path starts with it.
# base_repo_dir = "/"
base_repo_dir = "/multi-step-prediction-HED/"
style_dir = base_repo_dir + "statics/"
fig_path = base_repo_dir + "images/emotion_intensity.png"

# --- Page header: <head>, title block, figure and abstract -------------------
# The leading newline of each triple-quoted template is dropped with [1:].
initial = f"""
<!DOCTYPE html>
<html>
  <head>
    <meta charset="utf-8">
    <meta content="IE=edge" http-equiv="X-UA-Compatible">
    <meta content="width=device-width, initial-scale=1" name="viewport">
    <title>{webtitle}</title>
    <link href="{style_dir}bootstrap-5.2.3-dist/css/bootstrap.min.css" rel="stylesheet">
    <link href="{style_dir}my.css" rel="stylesheet">
  </head>
  <body>
    <div class="container">
      <div class="row">
        <div class="container pt-5 mt-5 shadow p-5 mb-5 bg-white rounded">
          <div class="text-center">
            <h2>{title}</h2>
            <br>
            <h5 class="mb-4">Sho Inoue<sup>1,2</sup>, Kun Zhou<sup>3</sup>, Shuai Wang<sup>2†</sup>, Haizhou Li<sup>1,2</sup></h5>
              <p>
                <sup>1</sup>School of Data Science, <sup>2</sup>Shenzhen Research Institute of Big Data<br>
                The Chinese University of Hong Kong, Shenzhen (CUHK-Shenzhen), Shenzhen, China<br>
                <sup>3</sup>Alibaba Group, Singapore
              </p>
          </div>
          <br>
          <figure class="text-center">
            <img src="{fig_path}" alt="overall diagram of the pipeline" class="img-fluid" style="width: 900px; height: auto;">
          </figure>
          <br>
          <h3>Abstract</h3>
          <p class="lead">
          {abstract}
          </p>
        </div>
"""[1:]

# --- Page footer: closes the containers opened in `initial` ------------------
closure = f"""
      </div>
    </div>
    <script src="{style_dir}jquery/jquery-3.7.1.slim.min.js"></script>
    <script src="{style_dir}bootstrap-5.2.3-dist/bootstrap.min.js"></script>
"""[1:]
# Plain (non-f) string so the JavaScript braces need no escaping.  The inline
# script sits inside <body>: a <script> after </body> is not valid HTML.
# It pauses and rewinds every other player when one starts playing.
closure += """
    <script>
      $(function(){
          $("audio").on("play", function() {
              $("audio").not(this).each(function(index, audio) {
                  audio.pause();
                  audio.currentTime = 0;
              });
          });
      });
    </script>
  </body>
</html>
"""[1:]

In [None]:
# Candidate ids: transcription keys made of at least three "_"-separated
# fields, in sorted order so the seeded draw below is reproducible.
filenames = sorted(name for name in transcriptions if name.count("_") >= 2)
np.random.seed(0)
# Draw 20 distinct ids for the expressiveness section.
filenames = list(np.random.choice(filenames, 20, replace=False))
# filenames.sort()
# controlfns = ["0014_000732", "0015_000726", "0013_000731", "0018_000037"]
# Utterances shown in the intensity-control sections.
controlfns = ["0015_000011", "0014_000354", "0018_000009", "0020_000018"]

# Emotions and intensity values shown in the control sections.
emos = ["Angry", "Happy", "Sad", "Surprise"]
intensities = [0.0, 0.4, 0.6, 1.0]
# Colour used when an emotion name or intensity is printed on the page.
emotioncolors = dict(zip(emos, ["red", "orange", "blue", "green"]))


def _underline(sentence, words):
    """Return ``sentence`` with the first occurrence of each word wrapped in <u> tags."""
    for word in words:
        sentence = sentence.replace(word, f"<u>{word}</u>", 1)
    return sentence


# Transcription -> same sentence with the three edited words underlined.
modifiedparts = {
    sentence: _underline(sentence, words)
    for sentence, words in [
        ("Who is been repeating all that hard stuff to you?", ("repeating", "hard", "stuff")),
        ("Let's make the noise a snake.", ("Let's", "noise", "snake")),
        ("All smile were real and the happier，the more sincere .", ("smile", "happier", "sincere")),
        ("I think it'll encourage me.", ("think", "it'll", "encourage")),
        ("Chapter ten a warm welcome.", ("Chapter", "warm", "welcome")),
        ("They went up to the dark mass job had pointed out.", ("went", "dark", "pointed")),
        ("On the twenty second of last march.", ("twenty", "second", "march")),
        ("Then sadly it is much farther.", ("sadly", "much", "farther")),
    ]
}

In [None]:
# Heading of each section, keyed by section id.
titles = dict(
    expressiveness="Section 1: Emotion Expressiveness: Reproducibility via Ground-Truth Emotion Information (Emotion Transfer)",
    utterance="Section 2: Utterance-level Emotion Intensity Control",
    word="Section 3: Word-level Emotion Intensity Control",
)
# Introductory HTML of each section.  Every entry is a single line of text:
# the adjacent literals below are concatenated without newlines.
explanations = {
    "expressiveness": (
        "The first section presents a demo to evaluate our model's reproducibility. The Hierarchical EDs are obtained by the reference audio for those models with 'GT'. Instead, the Hierarchical EDs are predicted by textual cues for those with 'Pred'. 'External' and 'VA' denote different integration methods for ED modeling within TTS frameworks. In the 'External' approach, we employ a model-agnostic pipeline and incorporate a hierarchical ED embedding following text processing. In contrast, the 'VA' approach integrates hierarchical ED modeling directly within the variance adaptor of FastSpeech2: "
        "<ul> "
        "  <li><b>Reference Speech</b>: The reference audio<br></li> "
        "  <li><b>GT | External</b>: 'External' Model with Ground-Truth Hierarchical ED<br></li> "
        "  <li><b>GT | VA (Baseline)</b>: 'VA' Model with Ground-Truth Hierarchical ED<br></li> "
        "  <li><b>GT | VA(Multi-Step)</b>: 'VA(Multi-Step)' Model with Ground-Truth Hierarchical ED<br></li> "
        "  <li><b>Pred | External | One-Step (Baseline)</b>: 'External' Model with Predicted Hierarchical ED from One-Step Predictor<br></li> "
        "  <li><b>Pred | External | Multi-Step</b>: 'External' Model with Predicted Hierarchical ED from Multi-Step Predictor<br></li> "
        "  <li><b>Pred | VA (Baseline)</b>: 'VA' Model with Predicted Hierarchical ED by One-Step Predictor<br></li> "
        "  <li><b>Pred | VA(Multi-Step)</b>: 'VA(Multi-Step)' Model with Predicted Hierarchical ED by Multi-Step Predictor<br></li> "
        "</ul> "
    ),
    "utterance": (
        "In this section, we demonstrate the control of utterance-level emotion intensities. We set the emotion intensities of the emotion to the values in the first row while setting the other emotion intensities 0.0. "
        "<ul> "
        "  <li><b>Ground Truth</b>: The Ground Truth sample reconstructed by Vocoder<br></li> "
        "  <li><b>External</b>: 'External' Model<br></li> "
        "  <li><b>VA(Multi-Step)</b>: 'VA(Multi-Step)' Model<br></li> "
        "</ul> "
    ),
    "word": (
        "In this section, we demonstrate the control of emotion intensities in a specific segment of speech, primarily focusing on <u>three words with underscores</u>. We set the word-level emotion intensities of these words, and the corresponding phoneme-level intensities, to the values in the first row, while maintaining the other emotion intensities constant. This showcases our model's ability to adjust emotion granularly within specified speech segments. We also present the speech samples focusing on the modified parts ('___OnlyModified___'). "
        "<ul> "
        "  <li><b>Ground Truth</b>: The Ground Truth sample reconstructed by Vocoder<br></li> "
        "  <li><b>External</b>: 'External' Model<br></li> "
        "  <li><b>VA(Multi-Step)</b>: 'VA(Multi-Step)' Model<br></li> "
        "</ul> "
    ),
}
# Utterance ids shown in each section (the two control sections share a list).
filenames_dir = dict(
    expressiveness=filenames,
    utterance=controlfns,
    word=controlfns,
)

In [None]:
# Section id -> {audio sub-directory key: row label shown in the table}.
# Insertion order is the order of the sections and of the rows on the page.
# A "---" suffix in a key is not part of the directory name for the control
# sections; it only distinguishes rows that share a directory.
experiments = {}

experiments["expressiveness"] = {
    "expressiveness/ground-truth": "Reference Speech",
    # ground-truth hierarchical ED
    "expressiveness/External/gtHED": "GT | External",
    "expressiveness/Variance-Adaptor/gtHED": "GT | VA",
    "expressiveness/Variance-Adaptor_seq/gtHED": "GT | VA(Multi-Step)",
    # predicted hierarchical ED, external integration
    "expressiveness/External/predHED---External---noseq": "Pred | External | One-Step",
    "expressiveness/External/predHED---External": "Pred | External | Multi-Step",
    # predicted hierarchical ED, variance-adaptor integration
    "expressiveness/Variance-Adaptor/predHED---Variance-Adaptor": "Pred | VA",
    "expressiveness/Variance-Adaptor_seq/predHED---Variance-Adaptor_seq": "Pred | VA(Multi-Step)",
}

experiments["utterance"] = {
    "control/ground-truth": "Ground Truth",
    "control/External_Embedding/utterance": "External",
    "control/Variance-Adaptor_Embedding_seq/utterance": "VA(Multi-Step)",
}

experiments["word"] = {
    "control/ground-truth": "Ground Truth",
    "control/External_Embedding/words-phonemes": "External",
    "control/External_Embedding/words-phonemes---cut": "___OnlyModified___",
    "control/Variance-Adaptor_Embedding_seq/words-phonemes": "VA(Multi-Step)",
    "control/Variance-Adaptor_Embedding_seq/words-phonemes---cut": "___OnlyModified___",
}

In [None]:
# Build the HTML of every section.  Each section holds one table (the control
# sections hold one table per emotion): rows are the systems listed in
# `experiments`, columns are the utterances listed in `filenames_dir`
# (one column per intensity for the control sections).
wavfiles = []  # every audio path that is actually embedded in the page
extexts = {}   # section id -> finished HTML fragment

# Placeholder cell written wherever an expected audio file does not exist.
emptycell = """
                          <td>
                              <p></p>
                          </td>
        """[1:]

for exid in experiments:
    text = f"""
            <div class="container pt-5 mt-5 shadow p-5 mb-5 bg-white rounded">
              <h2>{titles[exid]}</h2>
              <p class="lead">
                    {explanations[exid]}
                    </p>
    """[1:]
    is_control = exid in ["utterance", "word"]
    emotion_list = emos if is_control else ["noemotion"]
    # Number of columns each utterance occupies.  Derived from exid on every
    # pass so that no section depends on a value left over from the one
    # processed before it.
    repeat = 1 if exid=="expressiveness" else len(intensities)

    for emotion in emotion_list:
        if is_control:
            text += f"""
                  <br><hr><br>
                  <h4>{exid[0].upper()}{exid[1:]}-level Control: Emotion: <span style='color:{emotioncolors[emotion]};'>{emotion}</span></h4>
                  <br>
        """[1:]
        text += f"""
                  <div class="table-responsive" style="overflow-x: scroll">
                    <table class="table table-sm">
        """[1:]

        # ---- header row: the utterance id above its first column ----------
        text += f"""
                      <thead>
                        <tr>
                          <th scope="col">ID</th>
        """[1:]
        for fn in filenames_dir[exid]:
            for col in range(repeat):
                labeltext = fn if col==0 else ""
                text += f"""
                          <th scope="col">{labeltext}</th>
        """[1:]
        text += """
                        </tr>
                      </thead>
        """[1:]

        # ---- first body row: transcription (and intensity) per column -----
        text += """
                      <tbody>
                        <tr>
                          <th scope="row" style="position: sticky; left: 0; z-index:10; opacity: 1.0; background-color: white;">Text</th>
        """[1:]
        for fn in filenames_dir[exid]:
            if exid=="expressiveness":
                labeltext = f"<b>Text</b>: {transcriptions[fn]}<br>"
                text += f"""
                          <td>
                            <p>{labeltext}</p>
                          </td>
        """[1:]
            else:
                for intensity in intensities:
                    if exid=="utterance":
                        labeltext = f"<b>Text</b>: {transcriptions[fn]}<br><span style='color:{emotioncolors[emotion]};'><b>Intensity: {intensity}</b></span>"
                    elif exid=="word":
                        # Word-level control shows the sentence with the
                        # edited words underlined.
                        labeltext = f"<b>Text</b>: {modifiedparts[transcriptions[fn]]}<br><span style='color:{emotioncolors[emotion]};'><b>Intensity: {intensity}</b></span>"
                    text += f"""
                          <td>
                            <p>{labeltext}</p>
                          </td>
        """[1:]
        text += """
                        </tr>
        """[1:]

        # ---- one row of audio players per system --------------------------
        for key in experiments[exid]:
            text += f"""
                        <tr>
                          <th scope="row" style="position: sticky; left: 0; z-index:10; opacity: 1.0; background-color: white;">{experiments[exid][key]}</th>
        """[1:]
            for fn in filenames_dir[exid]:
                if exid=="expressiveness":
                    d1, d2 = fn.split("_")[:2]
                    wavbasefile = base_dir + f"{key}/dev-clean/{d1}/{d2}/{fn}.wav"
                    # The sample is stored either as "<fn>.wav" or "<fn>-0.wav".
                    candidates = [wavbasefile, wavbasefile[:-4] + "-0.wav"]
                    wavfile = next((c for c in candidates if os.path.exists(c)), None)
                    if wavfile is None:
                        # Never fall back to the path found for a previous
                        # cell: that would embed the wrong audio silently.
                        print(f"Missing audio file: {wavbasefile}")
                        text += emptycell
                    else:
                        wavfiles += [wavfile]
                        text = text + wavfiletext(wavfile)
                else:
                    spk = fn.split("_")[0]
                    # Row keys may carry a "---" suffix that is not part of
                    # the directory name.
                    keyname = key.split("---")[0]
                    if "ground-truth" in key:
                        wavbasefile = base_dir + f"{keyname}/{spk}/{emotions[fn]}/evaluation/{fn}.wav"
                    else:
                        wavbasefile = base_dir + f"{keyname}/{spk}/{emotions[fn]}/evaluation/{fn}"
                    for intensity in intensities:
                        if os.path.exists(wavbasefile):
                            # Same file for every intensity (the ground-truth
                            # path does not depend on emotion or intensity).
                            wavfile = wavbasefile
                        else:
                            if "---cut" in key:
                                wavfile = wavbasefile + f"---{emotion}---{intensity}___split.wav"
                            else:
                                wavfile = wavbasefile + f"---{emotion}---{intensity}.wav"
                        if os.path.exists(wavfile):
                            wavfiles += [wavfile]
                            text = text + wavfiletext(wavfile)
                        else:
                            text += emptycell
            text += """
                        </tr>
        """[1:]

        text += """
                      </tbody>
                    </table>
                  </div>
                  <p class="lead">* please scroll horizontally to explore additional columns in the table.</p>
        """[1:]
    text += """
                </div>
    """[1:]
    # Drop the first four characters of every line (the indentation that the
    # templates above carry because they are written inside this loop).
    extexts[exid] = "\n".join([a[4:] for a in text.split("\n")])

In [None]:
# Assemble the page: header, the sections in the order of `experiments`, footer.
wholetext = ""
wholetext += initial
for exid in list(experiments.keys()):
    wholetext += extexts[exid]
wholetext += closure
# The page declares <meta charset="utf-8"> and contains non-ASCII characters,
# so the encoding is set explicitly instead of relying on the platform
# default; the context manager closes the file even if the write fails.
with open("index.html", "w", encoding="utf-8") as f:
    f.write(wholetext)
print(wholetext)

In [None]:
# import shutil
# files = glob.glob("./audio/control/*/*/*/*/evaluation/*")
# files.sort()
# delete_files = []
# save_files = []
# for path in files:
#     if np.array([fn in path for fn in controlfns]).sum()>0:
#         save_files += [path]
#     else:
#         delete_files += [path]
# for path in delete_files:
#     os.remove(path)

In [None]:
# import shutil

# wavfiles = list(set(wavfiles))
# wavfiles.sort()

# copy = False
# tgtdir = "./audiosamples/"
# for path in wavfiles:
#     savepath = tgtdir + path[len(base_dir):]
#     if copy:
#         os.makedirs(os.path.dirname(savepath), exist_ok=True)
#         shutil.copy(path, savepath)