In [1]:
import warnings
warnings.filterwarnings("ignore")
import glob
import os
from tqdm import tqdm
import numpy as np
import librosa
import pandas as pd

import sys
sys.path.append("../../cuhksz-phd/sho_util/")
from pyfiles.sound import play_audio

In [2]:
import re
from whisper.normalizers.english import EnglishNumberNormalizer, EnglishSpellingNormalizer, remove_symbols_and_diacritics
# keep numbers, -, and '
class EnglishTextNormalizer:
    """Rule-based English transcript normalizer that keeps digits, hyphens and apostrophes.

    Calling an instance lowercases the text, drops bracketed/parenthesized
    spans and filler words, expands contractions and title abbreviations via
    ``self.replacers``, strips symbols other than ``.%$¢€£-'``, applies the
    spelling normalizer imported from ``whisper.normalizers.english``, and
    collapses runs of whitespace.

    Number standardization and the generic ``'s`` -> `` is`` expansion are
    deliberately disabled (both are commented out below), so digits are kept
    as written and possessive/contracted ``'s`` is left untouched.
    """
    def __init__(self) -> None:
        # Filler words that are deleted outright (matched as whole words).
        self.ignore_patterns = r"\b(hmm|mm|mhm|mmm|uh|um)\b"
        # regex -> replacement pairs. __call__ applies them in insertion order,
        # so the specific rules (e.g. "won't") must stay ahead of the generic
        # suffix rules at the bottom (e.g. "n't").
        self.replacers = {
            # common contractions
            r"\bwon't\b": "will not",
            r"\bcan't\b": "can not",
            r"\blet's\b": "let us",
            r"\bain't\b": "aint",
            r"\by'all\b": "you all",
            r"\bwanna\b": "want to",
            r"\bgotta\b": "got to",
            r"\bgonna\b": "going to",
            r"\bi'ma\b": "i am going to",
            r"\bimma\b": "i am going to",
            r"\bwoulda\b": "would have",
            r"\bcoulda\b": "could have",
            r"\bshoulda\b": "should have",
            r"\bma'am\b": "madam",
            # contractions in titles/prefixes
            # (replacements end with a space; the final whitespace collapse in
            # __call__ folds any doubled spaces back into one)
            r"\bmr\b": "mister ",
            r"\bmrs\b": "missus ",
            r"\bst\b": "saint ",
            r"\bdr\b": "doctor ",
            r"\bprof\b": "professor ",
            r"\bcapt\b": "captain ",
            r"\bgov\b": "governor ",
            r"\bald\b": "alderman ",
            r"\bgen\b": "general ",
            r"\bsen\b": "senator ",
            r"\brep\b": "representative ",
            r"\bpres\b": "president ",
            r"\brev\b": "reverend ",
            r"\bhon\b": "honorable ",
            r"\basst\b": "assistant ",
            r"\bassoc\b": "associate ",
            r"\blt\b": "lieutenant ",
            r"\bcol\b": "colonel ",
            r"\bjr\b": "junior ",
            r"\bsr\b": "senior ",
            r"\besq\b": "esquire ",
            # perfect tenses, ideally it should be any past participles, but it's harder..
            r"'d been\b": " had been",
            r"'s been\b": " has been",
            r"'d gone\b": " had gone",
            r"'s gone\b": " has gone",
            r"'d done\b": " had done",  # "'s done" is ambiguous
            r"'s got\b": " has got",
            # general contractions
            r"n't\b": " not",
            r"'re\b": " are",
            # r"'s\b": " is",
            r"'d\b": " would",
            r"'ll\b": " will",
            r"'t\b": " not",
            r"'ve\b": " have",
            r"'m\b": " am",
        }
        # Constructed but currently unused: the call in __call__ is commented out.
        self.standardize_numbers = EnglishNumberNormalizer()
        self.standardize_spellings = EnglishSpellingNormalizer()

    def __call__(self, s: str) -> str:
        """Return the normalized form of ``s``.

        The steps run in a fixed order and later steps rely on earlier ones
        (e.g. contractions are expanded before symbols are stripped), so do
        not reorder them. The result is not stripped, so it may begin or end
        with a single space.
        """
        s = s.lower()

        s = re.sub(r"[<\[][^>\]]*[>\]]", "", s)  # remove words between brackets
        s = re.sub(r"\(([^)]+?)\)", "", s)  # remove words between parenthesis
        s = re.sub(self.ignore_patterns, "", s)
        # s = re.sub(r"\s+'", "'", s)  # when there's a space before an apostrophe

        # Dicts preserve insertion order, so rules fire in the order written above.
        for pattern, replacement in self.replacers.items():
            s = re.sub(pattern, replacement, s)

        s = re.sub(r"(\d),(\d)", r"\1\2", s)  # remove commas between digits
        s = re.sub(r"\.([^0-9]|$)", r" \1", s)  # remove periods not followed by numbers
        s = remove_symbols_and_diacritics(s, keep=".%$¢€£-'")  # keep numeric symbols

        # s = self.standardize_numbers(s)
        s = self.standardize_spellings(s)

        # now remove prefix/suffix symbols that are not preceded/followed by numbers
        s = re.sub(r"[.$¢€£]([^0-9])", r" \1", s)
        s = re.sub(r"([^0-9])%", r"\1 ", s)

        s = re.sub(r"\s+", " ", s)  # replace any successive whitespaces with a space
        
        return s
# Shared module-level instance used by the cells below.
normalizer = EnglishTextNormalizer()

# Transcription size (word counts)

- LibriTTS

In [2]:
# Total number of whitespace-separated words across all normalized
# transcripts of LibriTTS-R train-clean-100.
dataset_dir = "/mntcephfs/lee_dataset/tts/LibriTTS_R/"
files = glob.glob(dataset_dir+"train-clean-100/*/*/*.normalized.txt")

num = 0
for path in tqdm(files):
    # `with` closes each handle right after reading; a bare open() in this
    # loop would leave one descriptor open per transcript until GC.
    with open(path, "r") as f:
        text = f.read()
    num += len(text.split())
print(num)

100%|██████████| 33233/33233 [00:16<00:00, 2070.32it/s]

574015





In [3]:
# Per-speaker statistics over "short" utterances (<= 15 words) in
# LibriTTS-R train-clean-100:
#   num      - total words in the speaker's short utterances
#   numfiles - number of short utterances
dataset_dir = "/mntcephfs/lee_dataset/tts/LibriTTS_R/"
speakers = sorted(os.path.basename(a) for a in glob.glob(dataset_dir+"train-clean-100/*"))
nums = []
# NOTE: keep the loop variable named `spk`; a later cell reads its final value.
for spk in tqdm(speakers):
    files = sorted(glob.glob(dataset_dir+f"train-clean-100/{spk}/*/*.normalized.txt"))
    num = 0
    numfiles = 0
    for path in files:
        # Close every transcript as soon as it has been read.
        with open(path, "r") as f:
            text = f.read()
        a = len(text.split())
        if a<=15:
            num += a
            numfiles += 1
    nums += [[spk, num, numfiles]]

# Build the frame straight from the records so `num`/`numfiles` are integer
# columns. Going through np.array() would coerce every value to a string and
# require casting the columns back afterwards.
df = pd.DataFrame(nums, columns=["spk", "num", "numfiles"])
df.sort_values("numfiles", ascending=False)

100%|██████████| 247/247 [00:01<00:00, 184.23it/s]


Unnamed: 0,spk,num,numfiles
170,6209,2574,354
147,5393,1868,252
168,6147,2147,250
217,8088,2179,250
116,4195,1872,237
...,...,...,...
64,2764,3,2
25,163,18,2
87,332,11,2
154,5688,3,1


In [31]:
# Sorted transcript paths for a single speaker.
# NOTE(review): `spk` and `dataset_dir` are not defined in this cell; they are
# whatever an earlier cell left in the kernel (for `spk`, the last value of a
# previous loop). Confirm this is intentional before relying on the result.
files = sorted(glob.glob(dataset_dir+f"train-clean-100/{spk}/*/*.normalized.txt"))

In [36]:
# For every speaker with more than `minnum` short utterances, pick `minnum`
# short (<= 15 words) transcripts in a reproducible shuffled order.
#   shortfiles[spk] - list of selected transcript paths
#   num             - total words over the selected transcripts only
dataset_dir = "/mntcephfs/lee_dataset/tts/LibriTTS_R/"
minnum = 10
spk_list = df[df["numfiles"]>minnum]["spk"].values
shortfiles = {}
num = 0
for spk in tqdm(spk_list):
    shortfiles[spk] = []
    files = glob.glob(dataset_dir+f"train-clean-100/{spk}/*/*.normalized.txt")
    files.sort()
    # Re-seed per speaker so each speaker's shuffle is independent of how many
    # speakers were processed before it.
    np.random.seed(0)
    files = np.random.choice(files, len(files), False)
    now = 0  # number of transcripts selected so far for this speaker
    for path in files:
        # Check the limit *before* reading the next file: otherwise the word
        # count of one extra, unselected transcript leaks into `num`.
        if now>=minnum:
            break
        with open(path, "r") as f:
            text = f.read()
        a = len(text.split())
        if a>15:
            continue
        num += a
        now += 1
        shortfiles[spk] += [path]
        # print(text)
        # wavpath =  path.split(".normali")[0]+".wav"
        # play_audio(wavpath, 22050)

100%|██████████| 220/220 [00:04<00:00, 46.23it/s]


In [42]:
# Show every selected transcript followed by its normalized form (indented).
for spk in shortfiles:
    for path in shortfiles[spk]:
        # Close the handle immediately instead of leaking one per transcript.
        with open(path, "r") as f:
            text = f.read()
        print(text)
        print("    ", normalizer(text))

Oh, mr Cuthbert!!!"
     oh mister cuthbert
"Oh, I'm so glad.
     oh i am so glad
But it isn't-it's firmly fastened at one end.
     but it is not-it's firmly fastened at one end
Orchard Slope's the name of his place.
     orchard slope's the name of his place
"No, she didn't-really she didn't.
     no she did not-really she did not
But you can't where you are.
     but you can not where you are
But I'm glad to think of getting home.
     but i am glad to think of getting home
They had simply rounded a curve in the road and found themselves in the "Avenue."
     they had simply rounded a curve in the road and found themselves in the avenue
I can't feel exactly perfectly happy because-well, what color would you call this?"
     i can not feel exactly perfectly happy because-well what color would you call this
Maybe they were out of boys of the brand you wanted."
     maybe they were out of boys of the brand you wanted
"The weakness of a murderer!"
     the weakness of a murderer
Come, 

In [140]:
# Total number of selected transcripts across all speakers in `spk_list`.
num = sum(len(shortfiles[spk]) for spk in spk_list)
num

1461

- ARCTIC

In [111]:
# Word count of the "English" entry of every .npy record, summed at the end.
# Each file is loaded with allow_pickle=True and unwrapped with .item(), so
# only load files from a trusted source (unpickling can execute code).
dataset_dir = "./SPAT/transliteration/"
files = glob.glob(dataset_dir+"*.npy")
num = []
for path in tqdm(files):
    record = np.load(path, allow_pickle=True).item()
    num.append(len(record["English"].split()))
np.array(num).sum()

100%|██████████| 1132/1132 [00:00<00:00, 2225.98it/s]


10055

- VCTK

In [3]:
# Scan all VCTK transcripts and keep those with at most 15 words.
#   shortfiles - paths of the short transcripts
#   num        - total words over the short transcripts
dataset_dir = "/mntcephfs/data/audiow/shoinoue/Dataset/VCTK/"
files = glob.glob(dataset_dir+"txt/*/*.txt")
print(len(files))
num = 0
shortfiles = []
for path in tqdm(files):
    # Close each file right after reading; tens of thousands are visited.
    with open(path, "r") as f:
        text = f.read()
    a = len(text.split())
    if a<=15:
        num += a  # reuse the count instead of splitting the text again
        shortfiles += [path]
print(num)

44583


100%|██████████| 44583/44583 [00:32<00:00, 1371.67it/s]

299795





In [4]:
# Draw a reproducible sample of 4500 short VCTK transcripts, print each one
# with its normalized form, and save {utterance id: text} to disk.
shortfiles.sort()  # fixed order so the seeded draw below is reproducible
np.random.seed(0)
# Sampling without replacement: raises ValueError if fewer than 4500 paths exist.
files = np.random.choice(shortfiles, 4500, False)
num = 0
textdir = {}
for path in files:
    bn = os.path.basename(path)[:-4]  # drop the 4-character ".txt" suffix
    # Close the handle immediately instead of leaking one per transcript.
    with open(path, "r") as f:
        text = f.read()
    text = " ".join(text.split())  # collapse all whitespace to single spaces
    textdir[bn] = text
    num += len(text.split())
    print(text)
    print("    ", normalizer(text))
# Make sure the target directory exists so the save cannot fail after the loop.
os.makedirs("./SPAT/VCTK", exist_ok=True)
# np.save pickles the dict into an object array; read it back with
# np.load(..., allow_pickle=True).item().
np.save("./SPAT/VCTK/transcriptions.npy", textdir)

My heart is with the artist.
     my heart is with the artist
She said they would appeal.
     she said they would appeal
My place is in the kitchen.
     my place is in the kitchen
The technology looks excellent.
     the technology looks excellent
I didn't play well last year.
     i did not play well last year
Clearly we are looking at this very carefully.
     clearly we are looking at this very carefully
It's just not funny!
     it's just not funny
I came to complete.
     i came to complete
The following are the principal provisions.
     the following are the principal provisions
Of course, it is a tough assignment.
     of course it is a tough assignment
My Dad is a hero.
     my dad is a hero
That, though, was again according to the manager McCall.
     that though was again according to the manager mccall
She started to put on weight.
     she started to put on weight
We want to see the maximum of change.
     we want to see the maximum of change
IS Tony Blair on drugs?
    

# SPAT speech size (total duration in minutes)

In [63]:
# Total duration, in minutes, of all wav files in the SPAT English set.
dataset_dir = "/mntcephfs/lab_data/shoinoue/Dataset/PD-AST/SLT/English/wav/"
files = glob.glob(dataset_dir + "*.wav")
sec = 0
for path in tqdm(files):
    # sr=None keeps each file's native sampling rate. The default would
    # resample every file to 22050 Hz, which is slow and only approximates
    # the true duration.
    x, sr = librosa.load(path, sr=None)
    sec += len(x)/sr
print(sec/60)

100%|██████████| 1132/1132 [01:29<00:00, 12.70it/s]

55.445768707483005



