# Speech Feature Extraction using OpenSMILE (GeMAPSv01b config)

In [1]:
%load_ext autoreload
%autoreload 2

import sys
import traceback
from multiprocessing import Pool
from pathlib import Path
from typing import List, Optional, Tuple

import numpy as np
import opensmile
import pandas as pd
from IPython.display import display
from tqdm.auto import tqdm

sys.path.append("..")
from sgs_utils.path_conf import interim_speech_data_dir, loc_data_dir


# Extracting features

useful links:
* [opensmile config folder](https://github.com/audeering/opensmile/tree/v3.0.0/config)
* difference between GeMAPS versions [here](https://github.com/audeering/opensmile/blob/v3.0.0/config/gemaps/CHANGES.txt)

**note**: `eGeMAPS` is an _extended_ version of the GeMAPS

`feature-level`:
* `Functionals`: global segment-based features (one feature vector per segment)
* `LowLevelDescriptors`: sliding-window features (one feature vector per window)

In [2]:
# Read the session-level dataframe that carries the VAD metadata
df_session: pd.DataFrame = pd.read_parquet(
    loc_data_dir / "df_session_tot_cleaned_VAD.parquet"
)

# openSMILE extractors at the `Functionals` feature level
func_gemaps: opensmile.Smile = opensmile.Smile(
    feature_level=opensmile.FeatureLevel.Functionals,
    feature_set=opensmile.FeatureSet.GeMAPSv01b,
)

func_compare: opensmile.Smile = opensmile.Smile(
    feature_level=opensmile.FeatureLevel.Functionals,
    feature_set=opensmile.FeatureSet.ComParE_2016,
)

# How many sessions have a `VAD_sliced_duration_s` above 15 (True) or not (False)
df_session["VAD_sliced_duration_s"].gt(15).value_counts()


True     2909
False     348
Name: VAD_sliced_duration_s, dtype: int64

In [3]:
# Helper functions
def _extract_parse_smile_duration(
    s: opensmile.Smile,
    arr_path: Path,
    start_s: Optional[float] = None,
    end_s: Optional[float] = None,
) -> pd.DataFrame:
    """Run an openSMILE extractor on a stored waveform and attach path metadata.

    Parameters
    ----------
    s:
        The configured openSMILE extractor whose ``process_signal`` is called.
    arr_path:
        Path to the ``.npy`` file holding the waveform. Its name must look like
        ``<pic_name>__<time_str>.<ext>``; the parent folder name is used as
        ``DB`` and the last ``__``-separated token of the grandparent folder
        name as ``ID``.
    start_s, end_s:
        Optional start / end positions which are forwarded unchanged to
        ``process_signal``; ``None`` leaves the respective bound unset.

    Returns
    -------
    pd.DataFrame
        The extracted features with the index moved into regular columns and
        the extra columns ``pic_name``, ``time_str``, ``DB`` and ``ID``.
    """
    # NOTE(review): the sampling rate is hard-coded; the data folders are named
    # "*_16khz", so 16 kHz is presumably correct for all inputs - confirm.
    sr = 16_000

    # Load the waveform exactly once (the file used to be read twice, with the
    # first copy never being used).
    wav_arr = np.load(str(arr_path.absolute()))

    df_feat = s.process_signal(
        signal=wav_arr,
        sampling_rate=sr,
        # only used as a label in the output; the `.wav` file itself is not read here
        file=str(arr_path.with_suffix(".wav")),
        start=start_s,
        end=end_s,
    )

    # Turn the index levels (among which `file`) into ordinary columns
    df_feat = df_feat.reset_index(drop=False)
    df_feat["file"] = df_feat["file"].astype(str)

    # Metadata encoded in the file name and its parent folders
    name_parts = arr_path.name.split("__")
    df_feat["pic_name"] = name_parts[0]
    df_feat["time_str"] = name_parts[1].split(".")[0]
    df_feat["DB"] = arr_path.parent.name
    df_feat["ID"] = arr_path.parent.parent.name.split("__")[-1]
    return df_feat


def _parse_concat_df(df_conc: pd.DataFrame) -> pd.DataFrame:
    df_conc["DB"] = df_conc["DB"].astype("category")
    df_conc["pic_name"] = df_conc["pic_name"].astype("category")
    df_conc["ID"] = df_conc["ID"].astype("category")
    return df_conc


## **`Functionals`**: `Fixed duration`

In [4]:
# Length, in seconds, of the fixed-duration excerpts (first / last part of a
# session) for which the functionals are extracted; also used as the minimum
# `VAD_sliced_duration_s` a session needs in order to be included.
DURATION_S = 15

In [5]:
def _extract_opensmile_f_duration(file_start_end) -> Tuple[pd.DataFrame, ...]:
    """Extract the GeMAPS functionals for one ``(file, start, end)`` work item.

    The three values arrive packed in a single tuple so that the function can
    be mapped directly over a list of work items. The result is a 1-tuple that
    holds the GeMAPS functionals dataframe.
    """
    arr_path, start_s, end_s = file_start_end
    df_gemaps = _extract_parse_smile_duration(
        func_gemaps, arr_path=arr_path, start_s=start_s, end_s=end_s
    )
    # The ComParE functionals are currently disabled. To enable them, append
    #   _extract_parse_smile_duration(
    #       func_compare, arr_path=arr_path, start_s=start_s, end_s=end_s
    #   )
    # to the returned tuple.
    return (df_gemaps,)

### First 15 seconds

In [6]:
# Construct the work items: one (file path, start, end) tuple per session.
# Here the excerpt is the first part of the recording, i.e. [0, DURATION_S] seconds.
# Only sessions whose VAD-sliced duration exceeds DURATION_S are kept.
arr_file_start_end: List[Tuple[Path, float, float]] = [
    (
        interim_speech_data_dir
        / "VAD_slice_16khz"
        / r.ID
        / r.DB
        / f"{r.pic_name}__{r.time_str}.npy",
        0,
        DURATION_S,
    )
    for _, r in df_session[df_session.VAD_sliced_duration_s > DURATION_S].iterrows()
]
print(len(arr_file_start_end))

out: List = []
with Pool(processes=8) as pool:
    results = pool.imap_unordered(_extract_opensmile_f_duration, arr_file_start_end)
    results = tqdm(results, total=len(arr_file_start_end))
    try:
        out = list(results)
    except BaseException:
        # Stop the workers right away (also on a keyboard interrupt) and
        # re-raise, so the cell halts with the real error instead of failing
        # further down on an empty `out`.
        pool.terminate()
        raise
    finally:
        pool.close()
        pool.join()

df_gemaps_func_dur_start = _parse_concat_df(
    pd.concat([o[0] for o in out], ignore_index=True)
)
# df_compare_func_dur_start = _parse_concat_df(
#     pd.concat([o[1] for o in out], ignore_index=True)
# )

# free the intermediate objects
del out, pool, results, arr_file_start_end


2909


  0%|          | 0/2909 [00:00<?, ?it/s]

In [7]:
# Persist the GeMAPS functionals of the first DURATION_S seconds
df_gemaps_func_dur_start.to_parquet(
    loc_data_dir.joinpath(f"df_gemaps_{DURATION_S}s_start.parquet"),
    engine="fastparquet",
)

# df_compare_func_dur_start.to_parquet(
#     loc_data_dir / f"df_compare_{DURATION_S}s_start.parquet",
#     engine="fastparquet",
# )


### Last 15 seconds

In [8]:
# Construct the work items: one (file path, start, end) tuple per session.
# Here the excerpt is the last part of the recording, i.e.
# [VAD_sliced_duration_s - DURATION_S, VAD_sliced_duration_s] seconds.
# Only sessions whose VAD-sliced duration exceeds DURATION_S are kept.
arr_file_start_end: List[Tuple[Path, float, float]] = [
    (
        interim_speech_data_dir
        / "VAD_slice_16khz"
        / r.ID
        / r.DB
        / f"{r.pic_name}__{r.time_str}.npy",
        r.VAD_sliced_duration_s - DURATION_S,
        r.VAD_sliced_duration_s,
    )
    for _, r in df_session[df_session.VAD_sliced_duration_s > DURATION_S].iterrows()
]
print(len(arr_file_start_end))

out: List = []
with Pool(processes=8) as pool:
    results = pool.imap_unordered(_extract_opensmile_f_duration, arr_file_start_end)
    results = tqdm(results, total=len(arr_file_start_end))
    try:
        out = list(results)
    except BaseException:
        # Stop the workers right away (also on a keyboard interrupt) and
        # re-raise, so the cell halts with the real error instead of failing
        # further down on an empty `out`.
        pool.terminate()
        raise
    finally:
        pool.close()
        pool.join()

df_gemaps_func_dur_end = _parse_concat_df(
    pd.concat([o[0] for o in out], ignore_index=True)
)
# df_compare_func_dur_end = _parse_concat_df(
#     pd.concat([o[1] for o in out], ignore_index=True)
# )

# the fixed-duration worker is not needed beyond this point
del out, _extract_opensmile_f_duration


2909


  0%|          | 0/2909 [00:00<?, ?it/s]

In [9]:
# Persist the GeMAPS functionals of the last DURATION_S seconds
df_gemaps_func_dur_end.to_parquet(
    loc_data_dir.joinpath(f"df_gemaps_{DURATION_S}s_end.parquet"),
    engine="fastparquet",
)

# df_compare_func_dur_end.to_parquet(
#     loc_data_dir / f"df_compare_{DURATION_S}s_end.parquet",
#     engine="fastparquet",
# )

## **`LLDs`**: `Whole duration`

In [4]:
# openSMILE extractors at the `LowLevelDescriptors` (sliding window) feature level
lld_gemaps: opensmile.Smile = opensmile.Smile(
    feature_level=opensmile.FeatureLevel.LowLevelDescriptors,
    feature_set=opensmile.FeatureSet.GeMAPSv01b,
)

# The ComParE LLDs are used to calculate frequency-based features on `F0final_sma`
lld_compare: opensmile.Smile = opensmile.Smile(
    feature_level=opensmile.FeatureLevel.LowLevelDescriptors,
    feature_set=opensmile.FeatureSet.ComParE_2016,
)

In [5]:
def _extract_opensmile_f_whole_duration(file) -> Tuple[pd.DataFrame, ...]:
    """Extract the low-level descriptors of one file over its whole duration.

    No start / end is passed on, so the complete signal is processed. Returns
    a 2-tuple: the GeMAPS LLD dataframe followed by the ComParE LLD dataframe.
    """
    df_gemaps = _extract_parse_smile_duration(lld_gemaps, arr_path=file)
    df_compare = _extract_parse_smile_duration(lld_compare, arr_path=file)
    return df_gemaps, df_compare


# The list of files to extract the features from
arr_files: List[Path] = list(
    (interim_speech_data_dir / "full_dur_16khz").glob("**/*.npy")
)


out: List = []
with Pool(processes=8) as pool:
    results = pool.imap_unordered(_extract_opensmile_f_whole_duration, arr_files)
    results = tqdm(results, total=len(arr_files))
    try:
        out = list(results)
    except BaseException:
        # Stop the workers right away (also on a keyboard interrupt) and
        # re-raise, so the cell halts with the real error instead of failing
        # further down on an unusable `out`.
        pool.terminate()
        raise
    finally:
        pool.close()
        pool.join()

# Concatenate the per-file GeMAPS LLDs and shrink the memory footprint:
# categorical strings and float32 second offsets instead of timedeltas.
df_gemaps_lld = _parse_concat_df(
    pd.concat([o[0] for o in out], ignore_index=True)
).assign(
    file=lambda D: D["file"].astype("str").astype("category"),
    time_str=lambda D: D["time_str"].astype("str").astype("category"),
    end=lambda D: D["end"].dt.total_seconds().astype(np.float32),
    start=lambda D: D["start"].dt.total_seconds().astype(np.float32),
)

# Same post-processing for the ComParE LLDs
df_compare_lld = _parse_concat_df(
    pd.concat([o[1] for o in out], ignore_index=True)
).assign(
    file=lambda D: D["file"].astype("str").astype("category"),
    time_str=lambda D: D["time_str"].astype("str").astype("category"),
    end=lambda D: D["end"].dt.total_seconds().astype(np.float32),
    start=lambda D: D["start"].dt.total_seconds().astype(np.float32),
)

# free the intermediate objects
del out, arr_files, results, pool


  0%|          | 0/3233 [00:00<?, ?it/s]

In [9]:
# Persist the whole-duration GeMAPS LLDs (plain string: the file name has no placeholders)
df_gemaps_lld.to_parquet(
    loc_data_dir / "df_gemaps_lld_full_dur.parquet",
    engine="fastparquet",
)

In [18]:
# The ComParE LLD columns that are retained; the commented-out entries are
# deliberately left out of the selection.
comp_cols = [
    "F0final_sma",
    "voicingFinalUnclipped_sma",
    "jitterLocal_sma",
    "jitterDDP_sma",
    "shimmerLocal_sma",
    "logHNR_sma",
    "audspec_lengthL1norm_sma",
    "audspecRasta_lengthL1norm_sma",
    "pcm_RMSenergy_sma",
    "pcm_zcr_sma",
    "audSpec_Rfilt_sma[0]",
    "pcm_fftMag_spectralFlux_sma",
    "pcm_fftMag_spectralEntropy_sma",
    "pcm_fftMag_spectralVariance_sma",
    # "pcm_fftMag_spectralSkewness_sma",
    # "pcm_fftMag_spectralKurtosis_sma",
    # "pcm_fftMag_spectralSlope_sma",
    # "pcm_fftMag_psySharpness_sma",
    "pcm_fftMag_spectralHarmonicity_sma",
    "mfcc_sma[1]",
]

# Summarise the metadata columns together with the retained feature columns
df_compare_lld[
    ["file", "start", "end", "pic_name", "time_str", "DB", "ID", *comp_cols]
].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17004953 entries, 0 to 17004952
Data columns (total 23 columns):
 #   Column                              Dtype   
---  ------                              -----   
 0   file                                category
 1   start                               float32 
 2   end                                 float32 
 3   pic_name                            category
 4   time_str                            category
 5   DB                                  category
 6   ID                                  category
 7   F0final_sma                         float32 
 8   voicingFinalUnclipped_sma           float32 
 9   jitterLocal_sma                     float32 
 10  jitterDDP_sma                       float32 
 11  shimmerLocal_sma                    float32 
 12  logHNR_sma                          float32 
 13  audspec_lengthL1norm_sma            float32 
 14  audspecRasta_lengthL1norm_sma       float32 
 15  pcm_RMSenergy_sma             

In [19]:
# Persist the metadata columns plus the retained ComParE LLD columns
df_compare_lld[
    ["file", "start", "end", "pic_name", "time_str", "DB", "ID", *comp_cols]
].to_parquet(
    loc_data_dir.joinpath("df_compare_lld_full_dur.parquet"),
    engine="fastparquet",
)


# Varia: Comparing various opensmile configs

In [15]:
# Functional-level extractors used in the comparison below
func_egemaps: opensmile.Smile = opensmile.Smile(
    feature_level=opensmile.FeatureLevel.Functionals,
    feature_set=opensmile.FeatureSet.eGeMAPSv02,
)

func_compare: opensmile.Smile = opensmile.Smile(
    feature_level=opensmile.FeatureLevel.Functionals,
    feature_set=opensmile.FeatureSet.ComParE_2016,
)

# Low-level-descriptor extractors used in the comparison below
lld_gemaps: opensmile.Smile = opensmile.Smile(
    feature_level=opensmile.FeatureLevel.LowLevelDescriptors,
    feature_set=opensmile.FeatureSet.GeMAPSv01b,
)
lld_egemaps: opensmile.Smile = opensmile.Smile(
    feature_level=opensmile.FeatureLevel.LowLevelDescriptors,
    feature_set=opensmile.FeatureSet.eGeMAPSv02,
)


In [20]:
# GeMAPS vs. eGeMAPS: feature names present in only one of the two configs
print("Functionals")
display(set(func_gemaps.feature_names) - set(func_egemaps.feature_names))
display(set(func_egemaps.feature_names) - set(func_gemaps.feature_names))
print("-" * 50)
print("LDDs")
display(set(lld_gemaps.feature_names) - set(lld_egemaps.feature_names))
display(set(lld_egemaps.feature_names) - set(lld_gemaps.feature_names))


Functionals


set()

{'F2bandwidth_sma3nz_amean',
 'F2bandwidth_sma3nz_stddevNorm',
 'F3bandwidth_sma3nz_amean',
 'F3bandwidth_sma3nz_stddevNorm',
 'equivalentSoundLevel_dBp',
 'mfcc1V_sma3nz_amean',
 'mfcc1V_sma3nz_stddevNorm',
 'mfcc1_sma3_amean',
 'mfcc1_sma3_stddevNorm',
 'mfcc2V_sma3nz_amean',
 'mfcc2V_sma3nz_stddevNorm',
 'mfcc2_sma3_amean',
 'mfcc2_sma3_stddevNorm',
 'mfcc3V_sma3nz_amean',
 'mfcc3V_sma3nz_stddevNorm',
 'mfcc3_sma3_amean',
 'mfcc3_sma3_stddevNorm',
 'mfcc4V_sma3nz_amean',
 'mfcc4V_sma3nz_stddevNorm',
 'mfcc4_sma3_amean',
 'mfcc4_sma3_stddevNorm',
 'spectralFluxUV_sma3nz_amean',
 'spectralFluxV_sma3nz_amean',
 'spectralFluxV_sma3nz_stddevNorm',
 'spectralFlux_sma3_amean',
 'spectralFlux_sma3_stddevNorm'}

--------------------------------------------------
LDDs


set()

{'F2bandwidth_sma3nz',
 'F3bandwidth_sma3nz',
 'mfcc1_sma3',
 'mfcc2_sma3',
 'mfcc3_sma3',
 'mfcc4_sma3',
 'spectralFlux_sma3'}

In [21]:
# GeMAPS vs. ComParE: functional names present in only one of the two configs
print("Functionals")
display(set(func_gemaps.feature_names) - set(func_compare.feature_names))
display(set(func_compare.feature_names) - set(func_gemaps.feature_names))
print("-" * 50)


Functionals


{'F0semitoneFrom27.5Hz_sma3nz_amean',
 'F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope',
 'F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope',
 'F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2',
 'F0semitoneFrom27.5Hz_sma3nz_percentile20.0',
 'F0semitoneFrom27.5Hz_sma3nz_percentile50.0',
 'F0semitoneFrom27.5Hz_sma3nz_percentile80.0',
 'F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope',
 'F0semitoneFrom27.5Hz_sma3nz_stddevNorm',
 'F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope',
 'F1amplitudeLogRelF0_sma3nz_amean',
 'F1amplitudeLogRelF0_sma3nz_stddevNorm',
 'F1bandwidth_sma3nz_amean',
 'F1bandwidth_sma3nz_stddevNorm',
 'F1frequency_sma3nz_amean',
 'F1frequency_sma3nz_stddevNorm',
 'F2amplitudeLogRelF0_sma3nz_amean',
 'F2amplitudeLogRelF0_sma3nz_stddevNorm',
 'F2frequency_sma3nz_amean',
 'F2frequency_sma3nz_stddevNorm',
 'F3amplitudeLogRelF0_sma3nz_amean',
 'F3amplitudeLogRelF0_sma3nz_stddevNorm',
 'F3frequency_sma3nz_amean',
 'F3frequency_sma3nz_stddevNorm',
 'HNRdBACF_sma3nz_amean',
 'HNRdBACF_sma3nz_st

{'audSpec_Rfilt_sma[19]_meanSegLen',
 'audSpec_Rfilt_sma[8]_rqmean',
 'pcm_fftMag_spectralKurtosis_sma_de_peakMeanAbs',
 'audSpec_Rfilt_sma[23]_stddev',
 'audSpec_Rfilt_sma[6]_stddevFallingSlope',
 'audSpec_Rfilt_sma_de[8]_minSegLen',
 'pcm_fftMag_spectralSkewness_sma_de_quartile1',
 'pcm_fftMag_spectralCentroid_sma_de_upleveltime25',
 'mfcc_sma_de[6]_meanFallingSlope',
 'pcm_fftMag_spectralRollOff90.0_sma_de_peakMeanRel',
 'mfcc_sma_de[13]_lpc2',
 'audSpec_Rfilt_sma_de[3]_meanPeakDist',
 'audSpec_Rfilt_sma_de[21]_risetime',
 'audSpec_Rfilt_sma[9]_iqr1-3',
 'pcm_fftMag_spectralRollOff90.0_sma_de_percentile99.0',
 'pcm_fftMag_spectralKurtosis_sma_qregc3',
 'audSpec_Rfilt_sma[5]_rqmean',
 'pcm_fftMag_fband1000-4000_sma_quartile2',
 'mfcc_sma_de[13]_peakDistStddev',
 'audSpec_Rfilt_sma_de[10]_iqr2-3',
 'mfcc_sma_de[9]_minPos',
 'audSpec_Rfilt_sma[14]_upleveltime90',
 'pcm_fftMag_spectralRollOff25.0_sma_de_lpc4',
 'pcm_fftMag_spectralSlope_sma_flatness',
 'pcm_fftMag_spectralHarmonicity_sm

--------------------------------------------------


---