In [430]:
import glob
import html
import os

import numpy as np

In [449]:
# Input locations, all relative to the notebook's working directory.
# Each value ends with "/" because later cells build paths by plain string
# concatenation (e.g. transcription_dir + f"{fn}.npy").
base_dir, transcription_dir, other_dir = (
    # Prefix of the .wav URLs written into the <source src="..."> attributes.
    "./audio/",
    # Per-utterance .npy files read for the "Text" row of every table.
    "../seq2seq-vc/datasetgeneration/LLM_responses/08-A_multi-lingual_text/",
    # Per-utterance .npy files read when a section shows text inside each cell.
    "../seq2seq-vc/datasetgeneration/LLM_responses/08-Others_multi-lingual_text/",
)

In [468]:
# Page-level metadata substituted into the HTML skeleton below.
webtitle = "MAcST Demo Page"
title = "MAcST: Multi-Accent Speech Synthesis via Text Transliteration for Accent Conversion"
abstract = "In accented voice conversion or accent conversion, we seek to convert the accent in speech from one another while preserving speaker identity and semantic content. In this study, we formulate a novel method for creating multi-accented speech samples, thus pairs of accented speech samples by the same speaker, through text transliteration for training accent conversion systems. We begin by generating transliterated text with a Large Language Model (LLM), which are then fed into multilingual TTS models to synthesize accented English speech. As a reference system, we built a sequence-to-sequence model on the synthetic parallel corpus for accent conversion. We validated the proposed method for both native and non-native English speakers. Subjective and objective evaluations further validate our dataset's effectiveness in accent conversion studies. "
github_url = "https://github.com/shinshoji01/MAcST-project-page"
# URL prefix prepended to every stylesheet/script reference in the templates.
style_dir = "/MAcST-Demo/statics/"
# style_dir = "/statics/"

# Opening part of the page: <head>, the outer container/row and the abstract card.
# The [1:] slice drops the newline that follows the opening triple quote.
initial = f"""
<!DOCTYPE html>
<html>
  <head>
    <meta charset="utf-8">
    <meta content="IE=edge" http-equiv="X-UA-Compatible">
    <meta content="width=device-width, initial-scale=1" name="viewport">
    <title>{webtitle}</title>
    <link href="{style_dir}bootstrap-5.2.3-dist/css/bootstrap.min.css" rel="stylesheet">
    <link href="{style_dir}my.css" rel="stylesheet">
  </head>
  <body>
    <div class="container">
      <div class="row">
        <div class="container pt-5 mt-5 shadow p-5 mb-5 bg-white rounded">
          <h2 style="text-align: center">{title}<br>
          </h2><br>
          <h3>Abstract</h3>
          <p class="lead">
          {abstract}
          </p>
          <p class="lead">You can visit the project page of this paper: <a href="{github_url}">Github Repository</a>.
          </p>
        </div>
"""[1:]

# Closing part of the page: closes the row/container opened in `initial` and
# loads the script libraries.
# NOTE(review): the stylesheet is referenced under "bootstrap-5.2.3-dist/css/"
# while the script below has no "js/" folder in its path -- confirm against the
# actual layout of the statics directory.
closure = f"""
      </div>
    </div>
    <script src="{style_dir}jquery/jquery-3.7.1.slim.min.js"></script>
    <script src="{style_dir}bootstrap-5.2.3-dist/bootstrap.min.js"></script>
"""[1:]
# Plain (non-f) string because the JavaScript below is full of literal braces.
# The handler pauses and rewinds every other <audio> element when one starts
# playing. It is emitted inside <body>, after the jQuery include it relies on;
# a <script> placed after </body> is a parse error that browsers merely recover
# from.
closure += """
    <script>
      $(function(){
          $("audio").on("play", function() {
              $("audio").not(this).each(function(index, audio) {
                  audio.pause();
                  audio.currentTime = 0;
              });
          });
      });
    </script>
  </body>
</html>
"""[1:]

In [469]:
def _spaccent_rows(accent, l2_speakers):
    """Return the ordered {audio folder key: row label} mapping for one accent section.

    The section always starts with the SLT ground truth, SLT synthesized with
    English text and SLT synthesized with `accent` text, followed by a
    ground-truth / synthesized pair for every speaker in `l2_speakers`.
    """
    rows = {
        "CMU-ARCTIC/SLT": "Ground Truth (SLT/American)",
        "PD-AST/SLT/English": "SPAccenT (SLT/American)",
        f"PD-AST/SLT/{accent}": f"SPAccenT (SLT/{accent})",
    }
    for speaker in l2_speakers:
        rows[f"L2-ARCTIC/{speaker}"] = f"Ground Truth ({speaker}/{accent})"
        rows[f"PD-AST/{speaker}/{accent}"] = f"SPAccenT ({speaker}/{accent})"
    return rows


# Section id -> ordered mapping of audio folder key -> label of the table row.
# Insertion order matters: it is the order of sections on the page and of rows
# within each table. Keys use "/" separators; the table builder turns them into
# folder names by replacing "/" with "___".
experiments = {
    "hindi": _spaccent_rows("Hindi", ("ASI", "TNI")),
    "korean": _spaccent_rows("Korean", ("HKK", "YDCK")),
    "ac": {
        "CMU-ARCTIC/SLT": "Ground Truth (SLT/American/source)",
        "PD-AST/SLT/English": "SPAccenT (SLT/American)",
        "PD-AST/SLT/Hindi": "SPAccenT (SLT/Hindi/target)",
        "VTN_fine-tuning_nocondition_gt2syn_hifiganmelhifiganmel_hubert_norepeating/100000": "AC w/o Data Augmentation",
        "VTN_fine-tuning_nocondition_mix2synVCTK3hr_hifiganmelhifiganmel_hubert_norepeating/100000": "AC w/ Data Augmentation (ours)",
    },
    "others": {
        "Others/English": "American",
        "Others/Hindi": "Hindi",
        "Others/Korean": "Korean",
        "Others/Japanese": "Japanese",
        "Others/Russian": "Russian",
        "Others/Arabic": "Arabic",
        # Currently disabled rows:
        # "Others/French": "French",
        # "Others/Mandarin": "Mandarin",
    },
}

In [470]:
# Pool of utterance IDs the demo samples are drawn from. The list is frozen
# here; the commented line shows how it can be rebuilt from the audio folder.
# filenames = [os.path.basename(a)[:-4] for a in glob.glob(base_dir + "CMU-ARCTIC___SLT/*")]
candidate_ids = [
    'arctic_a0024', 'arctic_a0029', 'arctic_a0058', 'arctic_a0073',
    'arctic_a0085', 'arctic_a0092', 'arctic_a0099', 'arctic_a0131',
    'arctic_a0152', 'arctic_a0170', 'arctic_a0202', 'arctic_a0210',
    'arctic_a0245', 'arctic_a0258', 'arctic_a0274', 'arctic_a0315',
    'arctic_a0369', 'arctic_a0374', 'arctic_a0378', 'arctic_a0384',
    'arctic_a0389', 'arctic_a0449', 'arctic_a0544', 'arctic_a0545',
    'arctic_a0552', 'arctic_a0556', 'arctic_a0561', 'arctic_b0008',
    'arctic_b0019', 'arctic_b0047', 'arctic_b0067', 'arctic_b0083',
    'arctic_b0113', 'arctic_b0142', 'arctic_b0147', 'arctic_b0163',
    'arctic_b0190', 'arctic_b0210', 'arctic_b0224', 'arctic_b0253',
    'arctic_b0257', 'arctic_b0296', 'arctic_b0317', 'arctic_b0318',
    'arctic_b0339', 'arctic_b0372', 'arctic_b0380', 'arctic_b0381',
    'arctic_b0469', 'arctic_b0474', 'arctic_b0497', 'arctic_b0508',
]

# Fixed seed so the same 20 distinct utterances are picked on every run; the
# legacy global RNG is kept on purpose, since another generator would select a
# different subset.
np.random.seed(0)
filenames = sorted(np.random.choice(candidate_ids, 20, replace=False))

# Hand-picked utterance IDs, kept in sorted order.
otherfns = sorted(
    ['arctic_a0058', 'arctic_a0085', 'arctic_a0210', 'arctic_a0561', 'arctic_b0019']
)

In [471]:
# Section id -> heading text rendered in the <h3> of that section's card.
titles = {
    "hindi": "Speech Samples of SPAccenT (Hindi Accent)",
    "korean": "Speech Samples of SPAccenT (Korean Accent)",
    "ac": "Speech Samples of Accent Conversion (Hindi Accent)",
    "others": "Other Accents from SPAccenT",
}
# Section id -> HTML fragment inserted verbatim (not escaped) into the lead
# paragraph of the card, so the markup inside these strings is rendered.
# The backslash at the end of each source line sits inside the string literal
# and joins the lines, so every value is one single line of text. Keep it that
# way: the table builder strips the first four characters of every output line,
# which would damage a value that contained a real newline.
# NOTE(review): the hindi/korean texts mention "curly brackets", while the row
# labels in `experiments` use parentheses, e.g. "SPAccenT (SLT/Hindi)" --
# confirm the intended wording.
explanations = {
    "hindi": "\
This section includes audio samples generated by our proposed generation method, SPAccenT. \
There are three speakers involved in this section.<br> \
<ul> \
  <li><b>SLT</b>: American speaker from CMU-ARCTIC <br></li> \
  <li><b>ASI</b>: Hindi speaker from L2-ARCTIC <br></li> \
  <li><b>TNI</b>: Hindi speaker from L2-ARCTIC <br></li> \
</ul> \
In SPAccenT, the languages in curly brackets indicate the transliteration languages. Each column contains the original transcription and its Hindi-transliterated text.\
",
    "korean": "\
This section includes audio samples generated by our proposed generation method, SPAccenT. \
There are three speakers involved in this section.<br> \
<ul> \
  <li><b>SLT</b>: American speaker from CMU-ARCTIC <br></li> \
  <li><b>HKK</b>: Korean speaker from L2-ARCTIC <br></li> \
  <li><b>YDCK</b>: Korean speaker from L2-ARCTIC <br></li> \
</ul> \
In SPAccenT, the languages in curly brackets indicate the transliteration languages. Each column contains the original transcription and its Korean-transliterated text.\
",
    "ac": "\
This section includes audio samples generated by Accent Conversion (AC) Models in Hindi Accent. \
The input is a native accent while the output is a Hindi accent. \
We used SLT for the whole experiment. \
We compared two models: \
the first (<b>AC w/o Data Augmentation</b>) was trained on paired data with ground-truth input from CMU-ARCTIC and synthetic target output from SPAccenT; \
the second model (<b>AC w/ Data Augmentation (ours)</b>) incorporated additional synthetic speech pairs from SPAccenT. We augmented our dataset with 1 hour from ARCTIC transcriptions and an extra 3 hours from VCTK to generate American and Hindi-accented speech pairs. \
",
    "others": "\
This section includes audio samples of additional accents, generated by SPAccenT including Japanese, Russian, and Arabic. \
We used a speaker provided by <a href='https://elevenlabs.io/'>11ElevenLabs</a>. \
Each audio sample is accompanied by the transliterated text corresponding to the accent. \
",
}
# Section id -> key of the transliterated text shown next to the original
# sentence in the "Text" row. A falsy value (False) means the row shows only
# the original English sentence.
accents = dict(hindi="Hindi", korean="Korean", ac="Hindi", others=False)

# Section id -> whether each audio cell also shows text underneath the player.
# When True, the text is looked up by the last "/"-separated part of the row
# key (e.g. "Japanese" for "Others/Japanese").
textincell = dict(hindi=False, korean=False, ac=False, others=True)
# Section id -> utterance IDs used as the table columns of that section.
# Every section refers to the very same `filenames` list object.
# NOTE(review): `otherfns` is defined alongside `filenames` but is not used by
# any visible cell -- confirm whether "others" was meant to use it instead.
filenames_dir = dict.fromkeys(("hindi", "korean", "ac", "others"), filenames)

In [472]:
# Build one HTML "card" per experiment: heading, explanation and a horizontally
# scrollable table with one column per utterance and one row per system.
#
# Layout convention used by every template below: the literal starts with a
# newline that is dropped with [1:], and its closing quotes sit on a line
# indented by four spaces. Each chunk therefore ends with "\n    ", which pushes
# the first line of the following chunk four columns to the right. The final
# pass at the bottom of the loop removes that four-space prefix again.

# Cache of decoded .npy payloads, keyed by path. It is recreated on every run of
# this cell, so files changed on disk are picked up after a re-run.
npy_cache = {}


def _load_texts(directory, fn):
    """Return the object stored in `directory + fn + ".npy"`, loading it at most once.

    NOTE: allow_pickle=True unpickles the file content, which can execute
    arbitrary code -- only point this at files you trust.
    """
    path = directory + f"{fn}.npy"
    if path not in npy_cache:
        npy_cache[path] = np.load(path, allow_pickle=True).item()
    return npy_cache[path]


def _esc(value):
    """Escape &, < and > so text loaded from disk cannot break the page markup."""
    return html.escape(str(value), quote=False)


extexts = {}
for exid in experiments:
    sample_ids = filenames_dir[exid]
    accent = accents[exid]

    # --- card opening and the table header (one column per utterance ID) ---
    # titles/explanations are author-written HTML and are inserted as-is.
    text = f"""
            <div class="container pt-5 mt-5 shadow p-5 mb-5 bg-white rounded">
              <h3>{titles[exid]}</h3>
              <p class="lead">
                    {explanations[exid]}
                    </p>
              <div class="table-responsive" style="overflow-x: scroll">
                <table class="table table-sm">
    """[1:]
    text += """
                  <thead>
                    <tr>
                      <th scope="col">ID</th>
    """[1:]
    for fn in sample_ids:
        text += f"""
                      <th scope="col">{fn}</th>
    """[1:]
    text += """
                    </tr>
                  </thead>
    """[1:]
    headerfn = text

    # --- "Text" row: the sentence (and, if configured, its transliteration) ---
    text = """
                  <tbody>
                    <tr>
                      <th scope="row" style="position: sticky; left: 0; z-index:10; opacity: 1.0; background-color: white;">Text</th>
    """[1:]
    for fn in sample_ids:
        data = _load_texts(transcription_dir, fn)
        original = _esc(data['Original English'])
        if accent:
            transliterated = _esc(data[accent])
            text += f"""
                      <td>
                        <p><b>Original</b>: {original}<br><br><b>Transliterated</b>: {transliterated}</p>
                      </td>
    """[1:]
        else:
            text += f"""
                      <td>
                        <p>{original}</p>
                      </td>
    """[1:]
    text += """
                    </tr>
    """[1:]
    headertext = text

    # --- one row per system, one audio player per utterance ---
    text = ""
    for key, label in experiments[exid].items():
        # "A/B/C" -> folder "A___B___C" below base_dir.
        audio_dir = base_dir + key.replace("/", "___")
        # Last path component doubles as the lookup key for per-cell text.
        text_key = key.rsplit("/", 1)[-1]
        text += f"""
                    <tr>
                      <th scope="row" style="position: sticky; left: 0; z-index:10; opacity: 1.0; background-color: white;">{label}</th>
    """[1:]
        for fn in sample_ids:
            wavfile = audio_dir + f"/{fn}.wav"
            text += """
                      <td>
    """[1:]
            text += f"""
                        <audio controls="" preload="none" style="width: 240px">
                          <source src="{wavfile}" type="audio/wav">
                        </audio>
    """[1:]
            if textincell[exid]:
                integrated_text = _esc(_load_texts(other_dir, fn)[text_key])
                text += f"""
                        <p>{integrated_text}</p>
    """[1:]
            text += """
                      </td>
    """[1:]
        text += """
                    </tr>
    """[1:]
    body = text

    # --- table and card closing ---
    tableclosure = """
                  </tbody>
                </table>
              </div>
              <p class="lead">* please scroll horizontally to explore additional columns in the table.</p>
            </div>
    """[1:]

    card = headerfn + headertext + body + tableclosure
    # Remove the four-space shift described at the top. Only an actual
    # four-space prefix is removed, so a continuation line coming from a
    # multi-line text value keeps its leading characters.
    extexts[exid] = "\n".join(
        line[4:] if line.startswith("    ") else line for line in card.split("\n")
    )

In [473]:
# Assemble the final page: opening skeleton, one card per experiment (in the
# order of the `experiments` dict), then the closing markup.
wholetext = initial + "".join(extexts[exid] for exid in experiments) + closure

# Write explicitly as UTF-8: the page declares <meta charset="utf-8">, and the
# platform default encoding is not guaranteed to be able to represent the
# (presumably non-ASCII) transliterated text. The with-block closes the file
# even if the write fails.
with open("index.html", "w", encoding="utf-8") as f:
    f.write(wholetext)
print(wholetext)

<!DOCTYPE html>
<html>
  <head>
    <meta charset="utf-8">
    <meta content="IE=edge" http-equiv="X-UA-Compatible">
    <meta content="width=device-width, initial-scale=1" name="viewport">
    <title>MAcST Demo Page</title>
    <link href="/MAcST-Demo/statics/bootstrap-5.2.3-dist/css/bootstrap.min.css" rel="stylesheet">
    <link href="/MAcST-Demo/statics/my.css" rel="stylesheet">
  </head>
  <body>
    <div class="container">
      <div class="row">
        <div class="container pt-5 mt-5 shadow p-5 mb-5 bg-white rounded">
          <h2 style="text-align: center">MAcST: Multi-Accent Speech Synthesis via Text Transliteration for Accent Conversion<br>
          </h2><br>
          <h3>Abstract</h3>
          <p class="lead">
          In accented voice conversion or accent conversion, we seek to convert the accent in speech from one another while preserving speaker identity and semantic content. In this study, we formulate a novel method for creating multi-accented speech samples, thus