In [None]:
import warnings
warnings.filterwarnings("ignore")

import opensmile
import glob
import numpy as np
import librosa
import os
from tqdm import tqdm
import tgt

def get_words_phones_dir(textgrid):
    """Group the phone labels of a TextGrid under the word they belong to.

    The "words" and "phones" tiers are walked in parallel: phones are consumed
    in order and attached to the current word until a phone is reached whose
    end time equals the end time of that word.

    Args:
        textgrid: object exposing ``get_tier_by_name(name)``; the returned tier
            must have an ``intervals`` sequence whose items carry ``end_time``
            and ``text`` attributes.

    Returns:
        A tuple ``(word_dir, text_w, text_p)`` where ``word_dir`` maps
        ``"<word index>-<word text>"`` to the list of phone labels of that
        word, and ``text_w`` / ``text_p`` are the ``[end_time, text]`` pairs of
        the word and phone tiers respectively.

    Raises:
        IndexError: if the phones tier runs out before a phone ending exactly
            at the current word's end time has been found.
    """
    def _end_times_and_labels(tier_name):
        # Flatten one tier into [end_time, text] pairs, in tier order.
        tier = textgrid.get_tier_by_name(tier_name)
        return [[entry.end_time, entry.text] for entry in tier.intervals]

    text_w = _end_times_and_labels("words")
    text_p = _end_times_and_labels("phones")

    word_dir = {}
    cursor = 0  # position in text_p; shared by all words, so phones are never revisited
    for position, (word_end, label) in enumerate(text_w):
        # The index prefix keeps repeated words apart.
        members = []
        word_dir[f"{position}-"+label] = members
        closed = False
        while not closed:
            phone_end, phone_label = text_p[cursor]
            members.append(phone_label)
            cursor += 1
            # The word is complete once a phone ends at the word's end time.
            closed = word_end == phone_end
    return word_dir, text_w, text_p

In [None]:
###########################################
########## Adjustable Parameters ##########
###########################################

fs = 16000                        # sampling rate (Hz) handed to openSMILE
dataset_dir = "../Dataset/ESD/"   # root of the wav corpus (trailing slash required)
feature_dir = "../Features/ESD/"  # root under which extracted features are written
# Root of the TextGrid tree. Derived from dataset_dir so that pointing
# dataset_dir at another location does not leave the TextGrid lookups behind.
textgrid_dir = dataset_dir + "textgrid_corpus_directory/"
depth = 3                         # number of directory levels between dataset_dir and the audio files
reset = True                      # True: recompute everything; False: skip files whose outputs already exist

###########################################
###########################################
###########################################

# Map every path found `depth` directories below dataset_dir to the TextGrid
# expected for it: same relative location (last `depth` directories + file
# name) below textgrid_dir, with ".wav" swapped for ".TextGrid". The swap is
# applied to the relative part only, so the configured roots are never altered.
wav2tgt = {
    path: textgrid_dir + "/".join(path.split("/")[-(depth+1):]).replace(".wav", ".TextGrid")
    for path in glob.glob(dataset_dir + "*/"*depth + "*")
}

# openSMILE extractor configured for the eGeMAPSv02 feature set at the
# functionals level.
smile = opensmile.Smile(
    feature_set=opensmile.FeatureSet.eGeMAPSv02,
    feature_level=opensmile.FeatureLevel.Functionals,
    sampling_rate=fs,
)

notexists = []  # TextGrid paths that were looked up but are missing on disk
files = sorted(glob.glob(dataset_dir+"*/"*depth+"*.wav"))
# NOTE(review): only the first 1750 files (in sorted order) are processed by
# this cell; the limit is kept as-is -- confirm it is still intended.
for path in tqdm(files[:1750]):
    # Outputs mirror the wav file's own location: its last `depth` directory
    # components are re-created below feature_dir. This must be derived from
    # `path` (not files[0]); otherwise every file is written into -- and the
    # skip check below looks in -- the directory of the first file.
    dn = "/".join(path.split("/")[-(depth+1):-1])+"/"
    bn = os.path.basename(path)[:-4]  # file name without its 4-character extension
    feature_path = f"{feature_dir}opensmile/{dn}{bn}.npy"
    worddir_path = f"{feature_dir}words_phones_dir/{dn}{bn}.npy"
    # Unless a full recomputation is requested, skip files whose two outputs exist.
    if not(reset) and os.path.exists(feature_path) and os.path.exists(worddir_path):
        continue

    tg_path = wav2tgt[path]
    try:
        textgrid = tgt.read_textgrid(tg_path)
    except FileNotFoundError:
        # Remember the missing alignment and carry on with the next file.
        notexists.append(tg_path)
        continue
    word_dir = get_words_phones_dir(textgrid)[0]

    # Requesting sr=fs makes librosa resample only when the file's native rate
    # differs, so a separate sr=None probe followed by a second load is not needed.
    audio = librosa.load(path, sr=fs)[0]

    # Utterance level: a single openSMILE call over the span from the start of
    # the first word interval to the end of the last one.
    word_tier = textgrid.get_tier_by_name("words")
    start = int(word_tier.intervals[0].start_time*fs)
    end = int(word_tier.intervals[-1].end_time*fs)
    alignments = {"utterance": np.array(smile.process_signal(audio[start:end], fs))}

    # Word and phone level: one openSMILE call per interval; the first row of
    # each result is kept and the rows are stacked in tier order.
    for align in ["words", "phones"]:
        tier = textgrid.get_tier_by_name(align)
        collections = []
        for interval in tier.intervals:
            segmented_x = audio[int(interval.start_time*fs):int(interval.end_time*fs)]
            collections.append(np.array(smile.process_signal(segmented_x, fs))[0])
        alignments[align] = np.array(collections)

    os.makedirs(os.path.dirname(feature_path), exist_ok=True)
    os.makedirs(os.path.dirname(worddir_path), exist_ok=True)
    # Both payloads are Python dicts; np.save stores them as pickled objects.
    np.save(feature_path, alignments)
    np.save(worddir_path, word_dir)

print("The following files are not processed due to missing files of TextGrid")
print(notexists)

In [None]:
# When False, features are still computed for every file but nothing is
# written to disk.
save = True

notexists = []  # TextGrid paths that were looked up but are missing on disk
files = sorted(glob.glob(dataset_dir+"*/"*depth+"*.wav"))
for path in tqdm(files):
    # Outputs mirror the wav file's own location: its last `depth` directory
    # components are re-created below feature_dir.
    dn = "/".join(path.split("/")[-(depth+1):-1])+"/"
    bn = os.path.basename(path)[:-4]  # file name without its 4-character extension
    feature_path = f"{feature_dir}opensmile/{dn}{bn}.npy"
    worddir_path = f"{feature_dir}words_phones_dir/{dn}{bn}.npy"

    # Work is needed when a reset is requested or either output is missing.
    needs_work = reset or not (os.path.exists(feature_path) and os.path.exists(worddir_path))
    if not needs_work:
        continue

    tg_path = wav2tgt[path]
    try:
        textgrid = tgt.read_textgrid(tg_path)
    except FileNotFoundError:
        # Remember the missing alignment and carry on with the next file.
        notexists.append(tg_path)
        continue
    word_dir = get_words_phones_dir(textgrid)[0]

    # Load at the file's native rate first; reload with resampling only when
    # that rate is not fs.
    audio, native_rate = librosa.load(path, sr=None)
    if native_rate != fs:
        audio = librosa.load(path, sr=fs)[0]

    # Utterance level: a single openSMILE call over the span from the start of
    # the first word interval to the end of the last one.
    word_tier = textgrid.get_tier_by_name("words")
    first_sample = int(word_tier.intervals[0].start_time*fs)
    last_sample = int(word_tier.intervals[-1].end_time*fs)
    alignments = {
        "utterance": np.array(smile.process_signal(audio[first_sample:last_sample], fs)),
    }

    # Word and phone level: one openSMILE call per interval; the first row of
    # each result is kept and the rows are stacked in tier order.
    for level in ("words", "phones"):
        rows = [
            np.array(smile.process_signal(audio[int(iv.start_time*fs):int(iv.end_time*fs)], fs))[0]
            for iv in textgrid.get_tier_by_name(level).intervals
        ]
        alignments[level] = np.array(rows)

    if save:
        # Create both target directories before writing either file.
        for out_path in (feature_path, worddir_path):
            os.makedirs(os.path.dirname(out_path), exist_ok=True)
        np.save(feature_path, alignments)
        np.save(worddir_path, word_dir)