# Speech feature extraction using openSMILE (GeMAPSv01b + ComParE_2016 configs)

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
from pathlib import Path
from typing import List, Optional

import pandas as pd
from IPython.display import display
from tqdm.auto import tqdm
from scipy.io import wavfile

sys.path.append("..")
# opensmile
import opensmile

from sgs_utils.path_conf import loc_data_dir, speech_data_session_dir

# Extracting features

useful links:
* [opensmile config folder](https://github.com/audeering/opensmile/tree/v3.0.0/config)
* differences between the GeMAPS versions: [CHANGES.txt](https://github.com/audeering/opensmile/blob/v3.0.0/config/gemaps/CHANGES.txt)

**Note**: `eGeMAPS` is an _extended_ version of the `GeMAPS` feature set.

`feature_level` options:
* `Functionals`: global, segment-based features (one row of features per segment)
* `LowLevelDescriptors`: sliding-window features (one row of features per window)

In [3]:
# Load `df_session_tot.parquet` from the local data directory.
# NOTE(review): `df_session` is not referenced again in the visible part of
# this notebook — confirm it is still needed.
df_session = pd.read_parquet(loc_data_dir.joinpath("df_session_tot.parquet"))

In [4]:
# define the feature extraction configs
# These module-level extractors are looked up by name inside the worker
# functions that are mapped over the wav files further down.

# GeMAPS v01b, functionals
func_gemaps = opensmile.Smile(
    feature_set=opensmile.FeatureSet.GeMAPSv01b,
    feature_level=opensmile.FeatureLevel.Functionals,
)

# GeMAPS v01b, low-level descriptors
lld_gemaps = opensmile.Smile(
    feature_set=opensmile.FeatureSet.GeMAPSv01b,
    feature_level=opensmile.FeatureLevel.LowLevelDescriptors,
)

# we will use the ComParE LLD to calculate frequency-based features on `F0final_sma`
lld_compare = opensmile.Smile(
    feature_set=opensmile.FeatureSet.ComParE_2016,
    feature_level=opensmile.FeatureLevel.LowLevelDescriptors,
)

In [5]:
from multiprocessing import Pool
import traceback
from typing import Tuple

### Whole file duration

In [None]:
def _extract_parse_smile_df(s: opensmile.Smile, f: Path) -> pd.DataFrame:
    """Run extractor `s` on wav file `f` and attach metadata parsed from its path.

    The extractor's index is flattened into regular columns and the `file`
    column is cast to `str`. Four metadata columns are then derived from the
    path, which is read as `<...>__<ID>/<DB>/<pic_name>__<time_str>.<ext>`:

    * `pic_name`: part of the file name before the first `__`
    * `time_str`: part after that `__`, up to the first `.`
    * `DB`: name of the parent directory
    * `ID`: last `__`-separated chunk of the grandparent directory name
    """
    features = s.process_file(f).reset_index(drop=False)
    features["file"] = features["file"].astype("str")

    name_parts = f.name.split("__")
    features["pic_name"] = name_parts[0]
    features["time_str"] = name_parts[1].split(".")[0]
    features["DB"] = f.parent.name
    features["ID"] = f.parent.parent.name.split("__")[-1]
    return features

def _extract_opensmile_f(file: Path) -> Tuple[pd.DataFrame, ...]:
    """Run the three module-level extractors on one wav file.

    Returns a 3-tuple of frames, in this order: GeMAPS functionals,
    GeMAPS low-level descriptors, ComParE low-level descriptors.
    """
    return tuple(
        _extract_parse_smile_df(smile, f=file)
        for smile in (func_gemaps, lld_gemaps, lld_compare)
    )

# Extract the features of every wav file in parallel.
# `out` becomes a list with one 3-tuple of frames per file, in completion order
# (imap_unordered does not preserve the input order).
out = None
wav_files = list(speech_data_session_dir.glob("*/*/*.wav"))
with Pool(processes=8) as pool:
    results = tqdm(
        pool.imap_unordered(_extract_opensmile_f, wav_files),
        total=len(wav_files),
    )
    try:
        out = list(results)
    except BaseException:
        # Stop the workers right away (also on a keyboard interrupt) and let
        # the error surface here: continuing with `out = None` would only fail
        # later with an unrelated TypeError when the results are concatenated.
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        pool.join()

def _parse_concat_df(df_conc: pd.DataFrame) -> pd.DataFrame:
    df_conc["DB"] = df_conc["DB"].astype("category")
    df_conc["pic_name"] = df_conc["pic_name"].astype("category")
    df_conc["ID"] = df_conc["ID"].astype("category")
    return df_conc


# Stack the per-file results of each extractor into one frame per feature set.
# Tuple slots: 0 = GeMAPS functionals, 1 = GeMAPS LLDs, 2 = ComParE LLDs.
df_gemaps_func = _parse_concat_df(
    pd.concat([per_file[0] for per_file in out], ignore_index=True)
)
df_gemaps_lld = _parse_concat_df(
    pd.concat([per_file[1] for per_file in out], ignore_index=True)
)
df_compare_lld = _parse_concat_df(
    pd.concat([per_file[2] for per_file in out], ignore_index=True)
)

# Drop the raw results and the cell-local helpers from the namespace.
del out, _extract_opensmile_f, _parse_concat_df, _extract_parse_smile_df

### **Fixed** file duration

In [6]:
# Length (in seconds) of the fixed analysis window: passed as `duration_s` to
# the extraction workers and embedded in the name of the exported parquet file.
DURATION_S = 15

In [7]:
def _extract_parse_smile_duration(
    s: opensmile.Smile, f: Path, duration_s: float
) -> pd.DataFrame:
    sr, wav_arr = wavfile.read(f)

    # we extract features until the penultimate second of the utterance
    t_end = (wav_arr.shape[0] / sr) - 1
    t_start = max(0, t_end - duration_s)

    df_feat = s.process_file(
        file=str(f.absolute()),
        start=t_start,
        end=t_end,
    )

    df_feat = df_feat.reset_index(drop=False)
    df_feat["file"] = df_feat['file'].astype(str)

    # df_feat["fileName"] = f.name
    df_feat["pic_name"] = f.name.split("__")[0]
    df_feat["time_str"] = f.name.split("__")[1].split(".")[0]
    df_feat["DB"] = f.parent.name
    df_feat["ID"] = f.parent.parent.name.split("__")[-1]
    return df_feat


def _extract_opensmile_f_duration(file_duration) -> Tuple[pd.DataFrame, ...]:
    """Run the three module-level extractors on a fixed-length window of one file.

    `file_duration` is a `(file, duration)` pair; it is packed into a single
    argument so the function can be mapped over a list of such pairs.

    Returns a 3-tuple of frames, in this order: GeMAPS functionals,
    GeMAPS low-level descriptors, ComParE low-level descriptors.
    """
    file, duration = file_duration
    return tuple(
        _extract_parse_smile_duration(smile, f=file, duration_s=duration)
        for smile in (func_gemaps, lld_gemaps, lld_compare)
    )


# Extract the fixed-duration features of every wav file in parallel.
# `out` becomes a list with one 3-tuple of frames per file, in completion order
# (imap_unordered does not preserve the input order).
out = None
wav_files = [
    (wf, DURATION_S) for wf in speech_data_session_dir.glob("*/*/*.wav")
]
with Pool(processes=8) as pool:
    results = tqdm(
        pool.imap_unordered(_extract_opensmile_f_duration, wav_files),
        total=len(wav_files),
    )
    try:
        out = list(results)
    except BaseException:
        # Stop the workers right away (also on a keyboard interrupt) and let
        # the error surface here: continuing with `out = None` would only fail
        # later with an unrelated TypeError when the results are concatenated.
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        pool.join()


def _parse_concat_df(df_conc: pd.DataFrame) -> pd.DataFrame:
    df_conc["DB"] = df_conc["DB"].astype("category")
    df_conc["pic_name"] = df_conc["pic_name"].astype("category")
    df_conc["ID"] = df_conc["ID"].astype("category")
    return df_conc


# Stack the per-file results of each extractor into one frame per feature set.
# Tuple slots: 0 = GeMAPS functionals, 1 = GeMAPS LLDs, 2 = ComParE LLDs.
df_gemaps_func_dur = _parse_concat_df(
    pd.concat([per_file[0] for per_file in out], ignore_index=True)
)
df_gemaps_lld_dur = _parse_concat_df(
    pd.concat([per_file[1] for per_file in out], ignore_index=True)
)
df_compare_lld_dur = _parse_concat_df(
    pd.concat([per_file[2] for per_file in out], ignore_index=True)
)

# Drop the raw results and the cell-local helpers from the namespace.
del out, _extract_opensmile_f_duration, _parse_concat_df, _extract_parse_smile_duration




  0%|          | 0/3440 [00:00<?, ?it/s]



## Extract the `F0` range from the LLDs

In [8]:
import numpy as np
from tsflex.features import FuncWrapper
from tsflex.features.utils import make_robust

In [9]:
def quantile_nz(a: np.ndarray, q: List[float]) -> List[Optional[float]]:
    """Compute quantiles over the strictly positive entries of `a`.

    Entries that are zero, negative or NaN are ignored (`a > 0` is false for
    all of them). A multi-dimensional `a` is flattened by the boolean mask.

    Args:
        a: Input values.
        q: Quantile levels in [0, 1]. This was previously declared as
            `q=List[float]`, which made the typing object a default value
            rather than an annotation; `q` has always had to be passed.

    Returns:
        The result of `np.quantile` for the levels in `q` (an array, in the
        order of `q`), or a list of `None` of the same length when `a` has no
        positive entry.
    """
    a_nz = a[a > 0]
    if len(a_nz):
        return np.quantile(a_nz, q=q)
    return [None] * len(q)


def nonzero_count(a: np.ndarray) -> int:
    """Return the number of strictly positive entries in `a` as a Python `int`.

    The count is taken over all elements, whatever the shape of `a`. The
    builtin `sum` used before iterated over the first axis in Python and, for
    a 2-D input such as an (n, 1) column, returned an array instead of the
    annotated `int`.
    """
    return int(np.count_nonzero(np.asarray(a) > 0))


def return_func_series_list(a: np.ndarray, f_list: "List[FuncWrapper]") -> pd.Series:
    """Apply every function in `f_list` to `a` and collect the outputs in one Series.

    Each function is called as `f(a)`; its result is labelled with
    `f.output_names`. The pieces are concatenated in a single `pd.concat` call
    (instead of growing a Series inside the loop) and sorted by label.

    Args:
        a: Values handed unchanged to every function.
        f_list: Callables exposing an `output_names` attribute.

    Returns:
        A Series indexed by output name, sorted by index; an empty float64
        Series when `f_list` is empty.
    """
    # The empty float64 seed keeps the empty-`f_list` case working.
    parts = [pd.Series(dtype="float64")]
    parts.extend(pd.Series(data=f(a), index=f.output_names) for f in f_list)
    return pd.concat(parts).sort_index()


# Quantile levels: an (upper, lower) pair for every tail probability, plus the median.
qs = [
    level
    for q in (0, 0.01, 0.02, 0.03, 0.05, 0.1, 0.15, 0.2)
    for level in (1 - q, q)
] + [0.5]
display(str(qs))

'[1, 0, 0.99, 0.01, 0.98, 0.02, 0.97, 0.03, 0.95, 0.05, 0.9, 0.1, 0.85, 0.15, 0.8, 0.2, 0.5]'

### GeMAPS LLD

#### Whole duration

In [None]:
# List the GeMAPS LLD columns whose name contains "F0".
df_gemaps_lld.filter(like="F0").columns
# Naming legend (author's notes):
# logRelF0-H1-H2_sma3nz -> log frequency difference between the harmonics
# sma -> moving-average window
# nz  -> non-zero

In [None]:
# Signal to summarise per file, and the functions applied to it
s_name = "F0semitoneFrom27.5Hz_sma3nz"

f_gemaps_lld_funcs: List[FuncWrapper] = [
    # quantiles over the strictly positive frames
    make_robust(
        FuncWrapper(quantile_nz, output_names=[f"{s_name}_q={q}" for q in qs], q=qs),
        min_nb_samples=3,
        passthrough_nans=False,
    ),
    # number of strictly positive frames
    make_robust(FuncWrapper(nonzero_count, output_names=[s_name + "_nzcount"])),
]

# One output row per file: apply all functions to that file's frames
df_gemaps_lld_F0 = (
    df_gemaps_lld.groupby(by=["file"])[[s_name]]
    .apply(lambda grp: return_func_series_list(grp.values, f_list=f_gemaps_lld_funcs))
    .reset_index()
)
display(df_gemaps_lld_F0)

#### Fixed duration

In [10]:
# Signal to summarise per file, and the functions applied to it
s_name = "F0semitoneFrom27.5Hz_sma3nz"

f_gemaps_lld_funcs: List[FuncWrapper] = [
    # quantiles over the strictly positive frames
    make_robust(
        FuncWrapper(quantile_nz, output_names=[f"{s_name}_q={q}" for q in qs], q=qs),
        min_nb_samples=3,
        passthrough_nans=False,
    ),
    # number of strictly positive frames
    make_robust(FuncWrapper(nonzero_count, output_names=[s_name + "_nzcount"])),
]

# One output row per file: apply all functions to that file's frames
df_gemaps_lld_F0_dur = (
    df_gemaps_lld_dur.groupby(by=["file"])[[s_name]]
    .apply(lambda grp: return_func_series_list(grp.values, f_list=f_gemaps_lld_funcs))
    .reset_index()
)
display(df_gemaps_lld_F0_dur)

Unnamed: 0,file,F0semitoneFrom27.5Hz_sma3nz_nzcount,F0semitoneFrom27.5Hz_sma3nz_q=0,F0semitoneFrom27.5Hz_sma3nz_q=0.01,F0semitoneFrom27.5Hz_sma3nz_q=0.02,F0semitoneFrom27.5Hz_sma3nz_q=0.03,F0semitoneFrom27.5Hz_sma3nz_q=0.05,F0semitoneFrom27.5Hz_sma3nz_q=0.1,F0semitoneFrom27.5Hz_sma3nz_q=0.15,F0semitoneFrom27.5Hz_sma3nz_q=0.2,F0semitoneFrom27.5Hz_sma3nz_q=0.5,F0semitoneFrom27.5Hz_sma3nz_q=0.8,F0semitoneFrom27.5Hz_sma3nz_q=0.85,F0semitoneFrom27.5Hz_sma3nz_q=0.9,F0semitoneFrom27.5Hz_sma3nz_q=0.95,F0semitoneFrom27.5Hz_sma3nz_q=0.97,F0semitoneFrom27.5Hz_sma3nz_q=0.98,F0semitoneFrom27.5Hz_sma3nz_q=0.99,F0semitoneFrom27.5Hz_sma3nz_q=1
0,/media/speech_webapp_merged/backup/2020-11-25_...,0.0,,,,,,,,,,,,,,,,,
1,/media/speech_webapp_merged/backup/2020-11-25_...,46.0,19.313572,19.334630,19.355689,19.386500,19.453890,19.963919,22.835855,23.492348,24.177898,25.293577,25.564772,25.775878,25.960395,25.970414,25.978542,25.992061,26.005579
2,/media/speech_webapp_merged/backup/2020-11-25_...,0.0,,,,,,,,,,,,,,,,,
3,/media/speech_webapp_merged/backup/2020-11-25_...,0.0,,,,,,,,,,,,,,,,,
4,/media/speech_webapp_merged/backup/2020-11-25_...,13.0,18.216820,18.261429,18.306037,18.350646,18.439864,18.633672,18.769006,18.881284,19.783018,26.095477,27.633506,28.400121,28.685352,28.703167,28.712075,28.720983,28.729891
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3435,/media/speech_webapp_merged/backup/2022-07-29_...,525.0,12.111890,13.325765,14.097616,15.764285,17.097023,17.760195,19.265684,20.329132,31.001776,35.056116,35.562463,36.620591,37.576349,38.889307,39.313846,39.500855,59.869705
3436,/media/speech_webapp_merged/backup/2022-07-29_...,454.0,12.122270,12.328709,12.565002,12.851870,13.138999,16.619678,16.980999,17.301708,30.335692,33.014997,34.022290,35.801359,37.544811,61.093627,61.491940,61.853252,62.090675
3437,/media/speech_webapp_merged/backup/2022-07-29_...,491.0,12.099364,12.526840,12.922506,13.069245,15.476135,16.882950,17.974790,19.255781,30.717026,34.006207,34.631210,35.820171,38.155293,38.973419,39.279740,39.621155,60.109604
3438,/media/speech_webapp_merged/backup/2022-07-29_...,504.0,12.042550,12.310602,12.436391,12.527343,12.898826,16.023402,16.838290,17.499307,30.053848,34.070846,35.092982,35.892273,36.452871,36.661359,37.243609,37.556779,61.561722


### ComParE LLD

#### Whole duration

In [None]:
df_compare_lld.filter(like="F0").columns

In [None]:
# Signal to summarise per file, and the functions applied to it
s_name = "F0final_sma"

f_compare_lld_funcs: List[FuncWrapper] = [
    # quantiles over the strictly positive frames
    make_robust(
        FuncWrapper(quantile_nz, output_names=[f"{s_name}_q={q}" for q in qs], q=qs),
        min_nb_samples=3,
        passthrough_nans=False,
    ),
    # number of strictly positive frames
    make_robust(FuncWrapper(nonzero_count, output_names=[s_name + "_nzcount"])),
]

# One output row per file: apply all functions to that file's frames
df_compare_lld_F0 = (
    df_compare_lld.groupby(by=["file"])[[s_name]]
    .apply(lambda grp: return_func_series_list(grp.values, f_list=f_compare_lld_funcs))
    .reset_index()
)
display(df_compare_lld_F0)

In [None]:
# Free the whole-duration LLD frames now that their per-file F0 summaries exist.
del df_compare_lld, df_gemaps_lld

#### Fixed duration

In [11]:
# Signal to summarise per file, and the functions applied to it
s_name = "F0final_sma"

f_compare_lld_funcs: List[FuncWrapper] = [
    # quantiles over the strictly positive frames
    make_robust(
        FuncWrapper(quantile_nz, output_names=[f"{s_name}_q={q}" for q in qs], q=qs),
        min_nb_samples=3,
        passthrough_nans=False,
    ),
    # number of strictly positive frames
    make_robust(FuncWrapper(nonzero_count, output_names=[s_name + "_nzcount"])),
]

# One output row per file: apply all functions to that file's frames
df_compare_lld_F0_dur = (
    df_compare_lld_dur.groupby(by=["file"])[[s_name]]
    .apply(lambda grp: return_func_series_list(grp.values, f_list=f_compare_lld_funcs))
    .reset_index()
)
display(df_compare_lld_F0_dur)

Unnamed: 0,file,F0final_sma_nzcount,F0final_sma_q=0,F0final_sma_q=0.01,F0final_sma_q=0.02,F0final_sma_q=0.03,F0final_sma_q=0.05,F0final_sma_q=0.1,F0final_sma_q=0.15,F0final_sma_q=0.2,F0final_sma_q=0.5,F0final_sma_q=0.8,F0final_sma_q=0.85,F0final_sma_q=0.9,F0final_sma_q=0.95,F0final_sma_q=0.97,F0final_sma_q=0.98,F0final_sma_q=0.99,F0final_sma_q=1
0,/media/speech_webapp_merged/backup/2020-11-25_...,0.0,,,,,,,,,,,,,,,,,
1,/media/speech_webapp_merged/backup/2020-11-25_...,47.0,65.373688,70.865292,76.356897,79.820491,83.981661,84.725740,96.262971,106.407587,111.115967,118.459956,120.355673,121.832703,123.195856,123.259091,123.342804,123.604660,123.866516
2,/media/speech_webapp_merged/backup/2020-11-25_...,0.0,,,,,,,,,,,,,,,,,
3,/media/speech_webapp_merged/backup/2020-11-25_...,0.0,,,,,,,,,,,,,,,,,
4,/media/speech_webapp_merged/backup/2020-11-25_...,16.0,59.417526,59.750108,60.082690,60.415272,61.080436,63.266615,66.230623,70.227020,81.926796,113.977684,129.486975,139.303108,144.104046,144.289527,144.382268,144.475008,144.567749
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3435,/media/speech_webapp_merged/backup/2022-07-29_...,537.0,52.411278,53.815871,54.473767,55.182697,58.448596,74.618364,79.244200,86.137927,161.384293,207.541605,212.280011,226.131650,235.083456,250.921192,261.007480,266.620685,273.261505
3436,/media/speech_webapp_merged/backup/2022-07-29_...,484.0,52.487247,53.489115,53.905257,54.274060,54.706801,56.499628,60.450850,72.390379,150.863167,183.178098,185.703445,198.102008,221.243517,231.749359,238.248474,245.759638,253.532104
3437,/media/speech_webapp_merged/backup/2022-07-29_...,513.0,52.773777,53.723000,54.336057,54.840268,56.111498,65.511383,72.927167,78.666809,159.579422,192.413712,201.826782,212.936194,247.267242,254.604106,262.980389,268.844015,273.304169
3438,/media/speech_webapp_merged/backup/2022-07-29_...,512.0,53.005619,53.948271,54.624933,55.173694,56.416730,62.703044,72.016611,74.807172,155.291420,196.385416,208.469205,217.509985,225.339508,228.183186,235.059670,237.844198,480.455841


In [None]:
# df_compare_lld_F0_dur['F0final_sma_nzcount'].value_counts()

# Join into one big dataframe

## Whole duration

In [None]:
# Join the GeMAPS functionals with both per-file F0 summaries on the file path.
df_feat_tot = pd.merge(
    pd.merge(df_gemaps_func, df_gemaps_lld_F0, on=["file"]),
    df_compare_lld_F0,
    on=["file"],
)

display(df_feat_tot)

df_feat_tot.to_parquet(
    loc_data_dir.joinpath("df_speech_feat_tot.parquet"),
    engine="fastparquet",
)

## Fixed duration

In [12]:
# Join the GeMAPS functionals with both per-file F0 summaries on the file path.
df_feat_tot_dur = pd.merge(
    pd.merge(df_gemaps_func_dur, df_gemaps_lld_F0_dur, on=["file"]),
    df_compare_lld_F0_dur,
    on=["file"],
)

display(df_feat_tot_dur)

# NOTE(review): the whole-duration export passes engine='fastparquet' explicitly,
# while this one relies on pandas' default engine — confirm this is intended.
df_feat_tot_dur.to_parquet(
    loc_data_dir.joinpath(f"df_speech_feat_tot_{DURATION_S}s.parquet")
)

Unnamed: 0,file,start,end,F0semitoneFrom27.5Hz_sma3nz_amean,F0semitoneFrom27.5Hz_sma3nz_stddevNorm,F0semitoneFrom27.5Hz_sma3nz_percentile20.0,F0semitoneFrom27.5Hz_sma3nz_percentile50.0,F0semitoneFrom27.5Hz_sma3nz_percentile80.0,F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2,F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope,...,F0final_sma_q=0.2,F0final_sma_q=0.5,F0final_sma_q=0.8,F0final_sma_q=0.85,F0final_sma_q=0.9,F0final_sma_q=0.95,F0final_sma_q=0.97,F0final_sma_q=0.98,F0final_sma_q=0.99,F0final_sma_q=1
0,/media/speech_webapp_merged/backup/2020-11-25_...,0 days 00:00:00,-1 days +23:59:59.278639456,,,,,,,,...,,,,,,,,,,
1,/media/speech_webapp_merged/backup/2020-11-25_...,0 days 00:00:00,-1 days +23:59:59.371519274,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,,,,,,,,,,
2,/media/speech_webapp_merged/backup/2020-11-25_...,0 days 00:00:00,-1 days +23:59:59.371519274,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,,,,,,,,,,
3,/media/speech_webapp_merged/backup/2020-11-25_...,0 days 00:00:00,-1 days +23:59:59.325079365,,,,,,,,...,,,,,,,,,,
4,/media/speech_webapp_merged/backup/2020-11-25_...,0 days 00:00:00,-1 days +23:59:59.371519274,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3435,/media/speech_webapp_merged/backup/2022-07-29_...,0 days 00:00:00,0 days 00:00:13.933333333,28.138924,0.262838,19.160347,31.739861,33.588272,14.427925,196.351608,...,80.793561,171.155106,189.764621,195.777077,204.598672,211.460495,222.457440,236.238565,265.332869,272.520050
3436,/media/speech_webapp_merged/backup/2022-07-29_...,0 days 00:00:00,0 days 00:00:13.336000,27.315117,0.283532,18.290337,30.543329,33.412140,15.121803,193.817444,...,78.028232,158.235748,188.038403,198.451437,204.818040,216.968382,229.354390,244.414448,270.704076,287.092896
3437,/media/speech_webapp_merged/backup/2022-07-29_...,0 days 00:00:01.365333333,0 days 00:00:16.365333333,30.677828,0.252255,24.055370,31.893747,34.709621,10.654251,179.117371,...,93.011707,171.572083,201.461716,208.563187,229.595581,263.805341,273.710258,279.181036,288.200612,341.743896
3438,/media/speech_webapp_merged/backup/2022-07-29_...,0 days 00:00:00,0 days 00:00:14.274666667,29.246935,0.234704,22.871061,31.227327,32.679203,9.808142,211.052551,...,89.955653,166.530640,180.777460,184.894058,194.980646,215.270633,218.074289,219.546282,221.904251,499.169189


---

In [None]:
# Show each file path reduced to its last three "/"-separated components.
df_feat_tot['file'].map(lambda x: '/'.join(x.split('/')[-3:]))

In [None]:
# Show the file path next to the metadata columns that were parsed from it.
df_feat_tot[['file', 'DB', 'pic_name', 'time_str', 'ID']]