In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
from pathlib import Path
from typing import List, Optional

import pandas as pd
from IPython.display import display
from tqdm.auto import tqdm
from scipy.io import wavfile
import numpy as np
import torchaudio
import noisereduce

sys.path.append("..")
# opensmile
import opensmile

from sgs_utils.path_conf import (
    loc_data_dir,
    interim_speech_data_dir,
    speech_data_session_dir,
)
import seaborn as sns

# Widen the pandas display limits so large frames are shown in full
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500


In [3]:
# Read the cleaned session table and report how many sessions it contains
df_session = pd.read_parquet(loc_data_dir / "df_session_tot_cleaned.parquet")
print(len(df_session))


3257


In [4]:
# Keep only the sessions whose recording is longer than 15 s and resolve each
# of them to its WAV file on disk.
long_enough_mask = df_session.wav_duration_s > 15
display(long_enough_mask.value_counts())


def _resolve_wav_path(row: pd.Series) -> Path:
    """Return the first WAV file matching the session described by ``row``.

    The file is searched below ``speech_data_session_dir`` with the pattern
    ``*<ID>/<DB>/<pic_name>__<time_str>.wav``.

    Raises
    ------
    FileNotFoundError
        If no file matches the pattern (instead of an opaque ``IndexError``).
    """
    pattern = f"*{row.ID}/{row.DB}/{row.pic_name}__{row.time_str}.wav"
    matches = list(speech_data_session_dir.glob(pattern))
    if not matches:
        raise FileNotFoundError(
            f"No WAV file matching '{pattern}' in {speech_data_session_dir}"
        )
    return matches[0]


# `.values` yields a numpy object array of Path instances (not a python list)
wav_paths: np.ndarray = (
    df_session[long_enough_mask].apply(_resolve_wav_path, axis=1).values
)


True     3197
False      60
Name: wav_duration_s, dtype: int64

# Extracting features

useful links:
* [opensmile config folder](https://github.com/audeering/opensmile/tree/v3.0.0/config)
* difference between GeMAPS versions [here](https://github.com/audeering/opensmile/blob/v3.0.0/config/gemaps/CHANGES.txt)

**note**: `eGeMAPS` is an _extended_ version of the GeMAPS

`feature_level`:
* `Functionals`: global segment based features (1 feature per segment)
* `LowLevelDescriptor`: sliding window features (1 feature per window)

In [6]:
# define the feature extraction configs
func_gemaps = opensmile.Smile(
    feature_set=opensmile.FeatureSet.GeMAPSv01b,
    feature_level=opensmile.FeatureLevel.Functionals,
)

func_compare = opensmile.Smile(
    feature_set=opensmile.FeatureSet.ComParE_2016,
    feature_level=opensmile.FeatureLevel.Functionals,
)

func_emobase = opensmile.Smile(
    feature_set=opensmile.FeatureSet.emobase,
    feature_level=opensmile.FeatureLevel.Functionals,
)


In [7]:
from functional import seq

# List the GeMAPS functionals whose name mentions jitter or shimmer
[
    feature_name
    for feature_name in func_gemaps.feature_names
    if "jitter" in feature_name.lower() or "shimmer" in feature_name.lower()
]


['jitterLocal_sma3nz_amean',
 'jitterLocal_sma3nz_stddevNorm',
 'shimmerLocaldB_sma3nz_amean',
 'shimmerLocaldB_sma3nz_stddevNorm']

In [8]:
from multiprocessing import Pool
import traceback
from typing import Tuple


### Whole file duration

In [9]:
def _extract_parse_smile_df(s: opensmile.Smile, wav_file_path: Path) -> pd.DataFrame:
    """Extract openSMILE features for one WAV file under four input variants.

    The variants are stacked row-wise and labelled in the ``input_type`` column:

    * ``"wav"``: the file itself, via ``s.process_file``
    * ``"arr"``: the waveform loaded with ``torchaudio.load(normalize=True)``
    * ``"arr_16k"``: that waveform resampled to 16 kHz
    * ``"arr_16k_nr_s"``: the 16 kHz waveform after ``noisereduce.reduce_noise``

    Parameters
    ----------
    s : opensmile.Smile
        The configured feature extractor.
    wav_file_path : Path
        Path of the form ``.../<prefix>__<ID>/<DB>/<pic_name>__<time_str>.wav``;
        the ``ID``, ``DB``, ``pic_name`` and ``time_str`` columns are parsed
        from it.

    Returns
    -------
    pd.DataFrame
        One row per input variant, or an empty frame when the file cannot be
        loaded or is not longer than 10 seconds.
    """
    # Best-effort loading: unreadable or too short files are skipped. Only
    # `Exception` is caught so that KeyboardInterrupt / SystemExit still
    # propagate, and the duration check no longer relies on `assert` (which is
    # stripped when python runs with -O).
    try:
        arr, fs = torchaudio.load(wav_file_path, normalize=True)
        too_short = arr.size(1) / fs <= 10
    except Exception:
        return pd.DataFrame()
    if too_short:
        return pd.DataFrame()

    arr_16k = (
        torchaudio.functional.resample(arr, orig_freq=fs, new_freq=16_000)
        .numpy()
        .ravel()
    )
    # NOTE(review): ravel() flattens the loaded tensor into 1D; for a
    # multi-channel file this would concatenate the channels one after the
    # other - presumably all recordings are mono, TODO confirm.
    arr = arr.numpy().ravel()

    # Metadata encoded in the file path, shared by all input variants
    pic_name = wav_file_path.name.split("__")[0]
    time_str = wav_file_path.name.split("__")[1].split(".")[0]
    db_name = wav_file_path.parent.name
    subject_id = wav_file_path.parent.parent.name.split("__")[-1]

    def _annotate(df_feat: pd.DataFrame, input_type: str) -> pd.DataFrame:
        # Flatten the opensmile index and attach the path metadata + variant label
        df_feat = df_feat.reset_index(drop=False)
        df_feat["file"] = df_feat["file"].astype("str")
        df_feat["pic_name"] = pic_name
        df_feat["time_str"] = time_str
        df_feat["DB"] = db_name
        df_feat["ID"] = subject_id
        df_feat["input_type"] = input_type
        return df_feat

    df_feat_list = []

    # 1. Parse the WAV file
    df_feat_list.append(_annotate(s.process_file(file=wav_file_path), "wav"))

    # 2. Parse the signal at its original sampling rate
    df_feat_list.append(
        _annotate(s.process_signal(arr, sampling_rate=fs, file=wav_file_path), "arr")
    )

    # 3. Parse the 16KHz signal
    df_feat_list.append(
        _annotate(
            s.process_signal(arr_16k, sampling_rate=16_000, file=wav_file_path),
            "arr_16k",
        )
    )

    # 4. Parse the processed (noise-reduced) 16KHz signal
    arr_16k_nr_s = noisereduce.reduce_noise(
        arr_16k, sr=16_000, prop_decrease=0.85, n_fft=256, n_std_thresh_stationary=0.75
    )
    df_feat_list.append(
        _annotate(
            s.process_signal(arr_16k_nr_s, sampling_rate=16_000, file=wav_file_path),
            "arr_16k_nr_s",
        )
    )

    return pd.concat(df_feat_list, axis=0, ignore_index=True)


def _extract_opensmile_f(file: Path) -> Tuple[pd.DataFrame, ...]:
    """Compute the global (utterance-level) feature frames for one WAV file.

    Returns a tuple with one frame per enabled feature set; currently only the
    GeMAPS functionals (slot 0) are enabled.
    """
    gemaps_df = _extract_parse_smile_df(func_gemaps, wav_file_path=file)
    # Disabled feature sets (would occupy slot 1 and 2 of the returned tuple):
    # compare_df = _extract_parse_smile_df(func_compare, wav_file_path=file)
    # emobase_df = _extract_parse_smile_df(func_emobase, wav_file_path=file)
    return (gemaps_df,)


# Extract the features in parallel. `out` collects one tuple of feature frames
# per processed file (in completion order, as `imap_unordered` is used).
out = None
with Pool(processes=6) as pool:
    # Only every 30th WAV file is processed
    wav_files = wav_paths[::30]

    results = pool.imap_unordered(_extract_opensmile_f, wav_files)
    results = tqdm(results, total=len(wav_files))
    try:
        out = list(results)
    except BaseException:
        # Stop the workers right away (also on a kernel interrupt) and surface
        # the real error here, instead of continuing with `out = None` and
        # failing further down with an unrelated TypeError.
        pool.terminate()
        raise
    finally:
        pool.close()
        pool.join()


def _parse_concat_df(df_conc: pd.DataFrame) -> pd.DataFrame:
    df_conc["DB"] = df_conc["DB"].astype("category")
    df_conc["pic_name"] = df_conc["pic_name"].astype("category")
    df_conc["ID"] = df_conc["ID"].astype("category")
    return df_conc


# Stack the per-file results into one frame per feature set
# (tuple slot 0 holds the GeMAPS functionals)
df_gemaps_func = _parse_concat_df(
    pd.concat([file_result[0] for file_result in out], ignore_index=True)
)
# df_compare_func = _parse_concat_df(pd.concat([o[1] for o in out], ignore_index=True))
# df_emobase_func = _parse_concat_df(pd.concat([o[2] for o in out], ignore_index=True))

# Remove the raw results and the cell-local helpers from the namespace
del out
del _extract_opensmile_f, _parse_concat_df, _extract_parse_smile_df


  0%|          | 0/107 [00:00<?, ?it/s]

In [10]:
# For each feature frame: correlate, per feature, the values extracted from the
# WAV file (reference) with the values extracted from every other input variant.
for df in [df_gemaps_func]:  # df_emobase_func]:
    input_types = df["input_type"].unique()
    reference_input_type = "wav"

    reference = df[df.input_type == reference_input_type].set_index("file")
    series_list = []
    # Iterate in first-appearance order rather than over a set: set ordering of
    # strings differs between kernel runs, which shuffled the table columns.
    for input_type in [t for t in input_types if t != reference_input_type]:
        # Not named `input`, which would shadow the builtin for later cells
        candidate = df[df.input_type == input_type].set_index("file")
        series_list.append(
            reference.corrwith(candidate).rename(
                f"corr_{reference_input_type}_{input_type}"
            )
        )

    cmap = sns.color_palette("coolwarm", as_cmap=True)
    display(
        pd.concat(series_list, axis=1)
        .round(3)
        .style.background_gradient(cmap=cmap, vmin=-1, vmax=1)
    )


Unnamed: 0,corr_wav_arr,corr_wav_arr_16k_nr_s,corr_wav_arr_16k
F0semitoneFrom27.5Hz_sma3nz_amean,1.0,0.871,0.889
F0semitoneFrom27.5Hz_sma3nz_stddevNorm,1.0,0.696,0.842
F0semitoneFrom27.5Hz_sma3nz_percentile20.0,1.0,0.691,0.703
F0semitoneFrom27.5Hz_sma3nz_percentile50.0,1.0,0.802,0.873
F0semitoneFrom27.5Hz_sma3nz_percentile80.0,1.0,0.764,0.917
F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2,1.0,0.486,0.619
F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope,1.0,0.488,0.616
F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope,1.0,0.433,0.462
F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope,1.0,0.085,0.481
F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope,1.0,0.048,0.2
