In [1]:
%load_ext autoreload
%autoreload 2

This notebook extracts voicing-related `openSMILE` low-level descriptors (LLDs) from the parsed (i.e., normalized and converted to 16 kHz mono; see [this notebook](0.4_Parse_Audio_Data.ipynb)) audio data.

In [2]:
import sys
from pathlib import Path
from typing import List, Optional

import pandas as pd
from IPython.display import display
from tqdm.auto import tqdm
from scipy.io import wavfile
import numpy as np
import opensmile
from multiprocessing import Pool
import traceback
from typing import Tuple

sys.path.append("..")
from sgs_utils.path_conf import loc_data_dir, interim_speech_data_dir


In [3]:
# Load the cleaned session table, i.e. the parquet file that the 0.1_EDA notebook
# writes to the local data directory.
df_session = pd.read_parquet(loc_data_dir / "df_session_tot_cleaned.parquet")


In [4]:
# openSMILE extractor for the emobase feature set at low-level-descriptor (LLD)
# level. This set provides the "voiceProb_sma" feature.
lld_emobase = opensmile.Smile(
    feature_set=opensmile.FeatureSet.emobase,
    feature_level=opensmile.FeatureLevel.LowLevelDescriptors,
)

# openSMILE extractor for the ComParE 2016 feature set at LLD level. This set
# provides the "voicingFinalUnclipped_sma" feature.
lld_compare = opensmile.Smile(
    feature_set=opensmile.FeatureSet.ComParE_2016,
    feature_level=opensmile.FeatureLevel.LowLevelDescriptors,
)

### Whole file duration

In [5]:
def _extract_parse_smile_df(s: opensmile.Smile, f: Path) -> pd.DataFrame:
    """Run one openSMILE extractor on a parsed ``.npy`` file and tag the result.

    The signal is loaded with ``np.load`` and processed at a fixed sampling rate
    of 16 kHz. The index returned by openSMILE is flattened into regular columns
    and the following metadata columns, all derived from the file path, are added:

    * ``pic_name`` -- part of the file name before the first ``"__"``
    * ``time_str`` -- part after the first ``"__"``, cut at the first ``"."``
    * ``DB``       -- name of the directory holding the file
    * ``ID``       -- last ``"__"``-separated token of the grandparent directory name

    Args:
        s: The configured openSMILE extractor.
        f: Path of the ``.npy`` file; its name must contain ``"__"``.

    Returns:
        One row per openSMILE output frame, with the metadata columns appended.
    """
    name_parts = f.name.split("__")
    metadata = {
        "pic_name": name_parts[0],
        "time_str": name_parts[1].split(".")[0],
        "DB": f.parent.name,
        "ID": f.parent.parent.name.split("__")[-1],
    }

    signal = np.load(f)
    lld = s.process_signal(signal, sampling_rate=16_000, file=f)
    # Turn the index levels into plain columns; `file` is cast to plain strings.
    lld = lld.reset_index(drop=False)
    lld["file"] = lld["file"].astype("str")

    # Scalars are broadcast over all rows; the dict order fixes the column order.
    for column, value in metadata.items():
        lld[column] = value
    return lld


def _extract_opensmile_f(file: Path) -> Tuple[pd.DataFrame, ...]:
    """Extract the emobase and ComParE LLDs of a single parsed audio file.

    Args:
        file: Path of the parsed ``.npy`` file.

    Returns:
        A 2-tuple ``(emobase_lld, compare_lld)``; the order matters, as the
        results are later split by position.
    """
    extractors = (lld_emobase, lld_compare)
    return tuple(_extract_parse_smile_df(smile, f=file) for smile in extractors)


# NOTE how we use here the parsed numpy files
npy_files = list(interim_speech_data_dir.glob("full_dur_16khz_norm/*/*/*.npy"))
if not npy_files:
    # Fail early with a clear message instead of a late "No objects to
    # concatenate" error when the per-file results are concatenated.
    raise FileNotFoundError(
        "No files match 'full_dur_16khz_norm/*/*/*.npy' under "
        f"{interim_speech_data_dir}"
    )

out = None
with Pool(processes=6) as pool:
    # Results arrive in completion order (not in `npy_files` order); every
    # item is the tuple returned by `_extract_opensmile_f`.
    results = pool.imap_unordered(_extract_opensmile_f, npy_files)
    results = tqdm(results, total=len(npy_files))
    try:
        out = list(results)
    except BaseException:
        # BaseException (rather than Exception) so that a KeyboardInterrupt also
        # stops the workers; otherwise `pool.join()` below would wait for them.
        # The error is re-raised: continuing with `out = None` would only fail
        # later with an unrelated TypeError that hides the real cause.
        pool.terminate()
        raise
    finally:
        pool.close()
        pool.join()


def _parse_concat_df(df_conc: pd.DataFrame) -> pd.DataFrame:
    df_conc["DB"] = df_conc["DB"].astype("category")
    df_conc["pic_name"] = df_conc["pic_name"].astype("category")
    df_conc["ID"] = df_conc["ID"].astype("category")
    return df_conc


# Every item of `out` is an (emobase, compare) pair; stack each position into a
# single frame and convert its identifier columns to categoricals.
df_emobase_lld = _parse_concat_df(
    pd.concat([emobase_df for emobase_df, _compare_df in out], ignore_index=True)
)
df_compare_lld = _parse_concat_df(
    pd.concat([compare_df for _emobase_df, compare_df in out], ignore_index=True)
)

# Remove the raw per-file results and the helper functions from the namespace.
del out, _extract_opensmile_f, _parse_concat_df, _extract_parse_smile_df


  0%|          | 0/3005 [00:00<?, ?it/s]

In [None]:
# We filter out the LLD features that are not relevant and join both LLDs in a single dataframe.
# The analysis will be performed in the 0.5.1. Opensmile voicing analysis notebook.

In [8]:
# Parse both the compare and emobase LLD: keep only the identifying columns and
# the voicing-related features of each set.
# NOTE: the "end" column is dropped on purpose, as both LLD's have a different
# window size.
id_columns = {"start", "time_str", "pic_name", "DB", "ID"}
compare_keep = id_columns | {
    "voicingFinalUnclipped_sma",
    "logHNR_sma",
    "F0final_sma",
}
emobase_keep = id_columns | {
    "voiceProb_sma",
    "pcm_intensity_sma",
    "pcm_loudness_sma",
    "F0_sma",
}

df_compare_lld = df_compare_lld.drop(
    columns=[col for col in df_compare_lld.columns if col not in compare_keep]
)
df_emobase_lld = df_emobase_lld.drop(
    columns=[col for col in df_emobase_lld.columns if col not in emobase_keep]
)

# `time_str` takes few distinct values compared to the number of rows, so it is
# stored as a categorical in both frames.
df_emobase_lld["time_str"] = df_emobase_lld["time_str"].astype("category")
df_compare_lld["time_str"] = df_compare_lld["time_str"].astype("category")


In [9]:
# TODO -> these can be removed
# Checkpoint: persist both filtered LLD frames so that the merge can be run
# without repeating the feature extraction.
emo_lld_path = loc_data_dir / 'voiced_emo_lld.parquet'
comp_lld_path = loc_data_dir / 'voiced_comp_lld.parquet'
df_emobase_lld.to_parquet(emo_lld_path, engine="fastparquet")
df_compare_lld.to_parquet(comp_lld_path, engine="fastparquet")

In [3]:
# TODO -> these can be removed
# Checkpoint: reload both filtered LLD frames from the local data directory.
emo_lld_path = loc_data_dir / 'voiced_emo_lld.parquet'
comp_lld_path = loc_data_dir / 'voiced_comp_lld.parquet'
df_emobase_lld = pd.read_parquet(emo_lld_path, engine="fastparquet")
df_compare_lld = pd.read_parquet(comp_lld_path, engine="fastparquet")

In [4]:
# Combine both LLD sets: rows are matched on the file identifiers plus the frame
# `start`, and only rows present in both frames are kept (inner join).
merge_keys = ["ID", "DB", "pic_name", "time_str", "start"]
df_voiced = df_compare_lld.merge(df_emobase_lld, how="inner", on=merge_keys)

# Frame start expressed in seconds; `start` is parsed through its string form.
df_voiced['time_s'] = pd.TimedeltaIndex(
    df_voiced['start'].astype('str')
).total_seconds()

df_voiced.to_parquet(loc_data_dir / 'voiced_lld.parquet', engine="fastparquet")