In [1]:
%load_ext autoreload
%autoreload 2

import sys
from pathlib import Path
from typing import List, Optional

import pandas as pd
from IPython.display import display
from tqdm.auto import tqdm
from scipy.io import wavfile
import numpy as np

sys.path.append("..")
from sgs_utils.path_conf import loc_data_dir, speech_data_session_dir
from sgs_utils.data_filtering import get_valid_mask
from sgs_utils.dataframes import groupby_consecutive

In [2]:
# Read the table with all recorded sessions from the local data directory.
session_parquet_path = loc_data_dir.joinpath("df_session_tot.parquet")
df_session = pd.read_parquet(session_parquet_path)

In [3]:
# Keep only the sessions selected by the mask that `get_valid_mask` returns.
valid_session_mask = get_valid_mask(df_session)
df_session_v = df_session[valid_session_mask]

In [59]:
# Switch for the VAD cell below: when False the voice-activity probabilities are
# (re)computed with pyannote and written to 'df_vad_fixed_dur.parquet'; when True
# that parquet file is loaded instead.
extracted_VADs = False

In [60]:
if not extracted_VADs:
    # Imported lazily: pyannote is only needed when the VAD output is (re)computed,
    # not when the cached parquet is loaded in the else-branch below.
    from pyannote.audio import Pipeline

    # Recordings shorter than this many seconds are skipped.
    MIN_WAV_DURATION_S = 17
    # Length (s) of the retained excerpt, the margin (s) trimmed from the end of the
    # recording, and the earliest time (s) at which the excerpt may start.
    EXCERPT_DURATION_S = 15
    END_MARGIN_S = 1
    START_MARGIN_S = 1

    # load the pipeline
    pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection")

    # the audio files are processed via pipeline._segmentation
    df_list = []
    # One entry per session that could not be processed, so failures are reported
    # after the loop instead of being dropped silently.
    failed_sessions = []
    for _, r in tqdm(df_session_v.iterrows(), total=len(df_session_v)):
        try:
            if r.wav_duration_s < MIN_WAV_DURATION_S:
                continue

            # First wav file matching this session (None when nothing matches)
            wav_file = next(
                iter(
                    speech_data_session_dir.glob(
                        f"*{r.ID}/{r.DB}/{r.pic_name}*{r.time_str}*.wav"
                    )
                ),
                None,
            )
            if wav_file is None:
                failed_sessions.append(
                    {
                        "ID": r.get("ID"),
                        "DB": r.get("DB"),
                        "pic_name": r.get("pic_name"),
                        "time_str": r.get("time_str"),
                        "reason": "no matching wav file",
                    }
                )
                continue

            out = pipeline._segmentation(wav_file)
            probas = out.data.ravel()
            sliding_window = out.sliding_window
            n_frames = probas.shape[0]

            # Time stamp each probability with the end of its window; the trailing
            # [:n_frames] guards against np.arange emitting one extra element due
            # to floating point rounding of `stop`.
            t_first = sliding_window.start + sliding_window.duration
            s = pd.Series(
                probas,
                index=np.arange(
                    start=t_first,
                    step=sliding_window.step,
                    stop=t_first + sliding_window.step * n_frames,
                    dtype="float64",
                )[:n_frames],
                name="voice_proba",
            )

            # slice the series to only retain 15 second data until the penultimate second
            # TODO(review): the original note here was cut off ("this is the same
            # method as with the ...") - restore the intended cross-reference.
            # `.loc` makes the label-based (time in seconds) slicing explicit.
            t_end = s.index[-1]
            s = s.loc[
                max(START_MARGIN_S, t_end - EXCERPT_DURATION_S - END_MARGIN_S) : t_end
                - END_MARGIN_S
            ]

            s.index.name = "time_s"
            s = s.reset_index(drop=False)

            # Metadata is parsed back from the file path:
            # <...__ID>/<DB>/<pic_name>__<time_str>.<ext>
            s["pic_name"] = wav_file.name.split("__")[0]
            s["time_str"] = wav_file.name.split("__")[1].split(".")[0]
            s["DB"] = wav_file.parent.name
            s['sw_duration'] = sliding_window.duration
            s['sw_step'] = sliding_window.step
            s["ID"] = wav_file.parent.parent.name.split("__")[-1]

            df_list.append(s)
        except Exception as e:
            # Best effort: a single broken recording must not abort the whole run,
            # but the failure is recorded so that it can be inspected afterwards.
            failed_sessions.append(
                {
                    "ID": r.get("ID"),
                    "DB": r.get("DB"),
                    "pic_name": r.get("pic_name"),
                    "time_str": r.get("time_str"),
                    "reason": f"{type(e).__name__}: {e}",
                }
            )

    if failed_sessions:
        print(f"VAD extraction failed for {len(failed_sessions)} session(s):")
        display(pd.DataFrame(failed_sessions))

    if not df_list:
        # pd.concat would raise a less informative ValueError on an empty list
        raise ValueError(
            "No VAD output was extracted: every session was skipped or failed."
        )

    # NOTE: the per-recording frames are concatenated without resetting the index,
    # so row labels repeat across recordings.
    df_vad = pd.concat(df_list)
    df_vad['pic_name'] = df_vad['pic_name'].astype('category')
    df_vad['DB'] = df_vad['DB'].astype('category')
    df_vad['ID'] = df_vad['ID'].astype('category')
    df_vad['time_s'] = df_vad['time_s'].astype('float32')

    df_vad.to_parquet(loc_data_dir.joinpath('df_vad_fixed_dur.parquet'))
else:
    # Re-use the VAD output that a previous run stored on disk
    df_vad = pd.read_parquet(loc_data_dir.joinpath('df_vad_fixed_dur.parquet'))


  0%|          | 0/3070 [00:00<?, ?it/s]

In [62]:
# Count the VAD samples per (time_str, ID) recording.
samples_per_recording = df_vad.groupby(["time_str", "ID"]).size()
# Drop combinations without any sample
# (presumably unobserved pairs emitted by the groupby - TODO confirm).
has_samples = samples_per_recording > 0
samples_per_recording = samples_per_recording[has_samples]
# Show how many recordings share each sample count.
display(samples_per_recording.value_counts())
# Only the key columns are needed downstream.
ids = samples_per_recording.reset_index()[["time_str", "ID"]]

889    3015
dtype: int64

In [93]:
# One feature record (a dict) is collected per (ID, time_str) recording.
feat_dict = []

for _, rec in tqdm(ids.iterrows(), total=len(ids)):
    # VAD frames that belong to the current recording
    rec_mask = (df_vad.ID == rec.ID) & (df_vad.time_str == rec.time_str)
    df_vad_rec = df_vad[rec_mask]

    # Runs of consecutive frames on the same side of the 0.5 probability threshold
    runs = groupby_consecutive(df_vad_rec["voice_proba"] > 0.5)
    # NOTE(review): the run length is scaled by the window *duration* (sw_duration),
    # not by the window step (sw_step) - confirm this is intended.
    runs["duration_s"] = (runs["end"] - runs["start"]) * df_vad_rec.sw_duration.iloc[0]

    # Durations of the speech runs and of the silence runs
    speak_durations = runs[runs["voice_proba"] == True]["duration_s"]
    silence_durations = runs[runs["voice_proba"] == False]["duration_s"]
    n_speaks = len(speak_durations)
    n_silences = len(silence_durations)

    # std is only computed with at least two runs; the silence max/mean fall back
    # to 0 when the recording has no silence run.
    record = {
        "ID": rec.ID,
        "time_str": rec.time_str,
        "mean_voice_proba": df_vad_rec["voice_proba"].mean(),
        "speak_max_s": speak_durations.max(),
        "speak_mean_s": speak_durations.mean(),
        "speak_std_s": speak_durations.std() if n_speaks > 1 else 0,
        "n_silences": n_silences,
        "silence_max_s": silence_durations.max() if n_silences > 0 else 0,
        "silence_mean_s": silence_durations.mean() if n_silences > 0 else 0,
        "silence_std_s": silence_durations.std() if n_silences > 1 else 0,
    }
    feat_dict.append(record)

  0%|          | 0/3015 [00:00<?, ?it/s]

In [97]:
# Turn the per-recording records into a table; missing values are replaced by 0.
df_feat_raw = pd.DataFrame(feat_dict)
df_feat = df_feat_raw.fillna(0)

In [99]:
# Persist the VAD feature table next to the other local data files.
vad_feat_path = loc_data_dir.joinpath('df_vad_feat.parquet')
df_feat.to_parquet(vad_feat_path)