In [2]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules (e.g. the local GSSP_utils package) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [14]:
import sys

import pandas as pd
from IPython.display import display, Audio

sys.path.append("..")

from GSSP_utils.path_conf import cgn_root_dir, cgn_ort_path, loc_data_dir
from GSSP_utils.cgn import listen_to_audio
import numpy as np
import torch
import torchaudio
from tqdm.auto import tqdm
from speechbrain.pretrained import EncoderClassifier
from uuid import uuid4

pd.options.display.max_rows = 80
pd.options.display.max_columns = None

import opensmile
from multiprocess import Pool
import traceback
from pathlib import Path
from typing import Tuple, List

# openSMILE extractor configured for the GeMAPSv01b feature set at the
# "Functionals" feature level.
# Kept at module level: `_extract_opensmile_f_duration` reads it as a global
# instead of receiving it as an argument.
func_gemaps = opensmile.Smile(
    feature_set=opensmile.FeatureSet.GeMAPSv01b,
    feature_level=opensmile.FeatureLevel.Functionals,
)


In [15]:
# Read the merged orthographic + recording + speaker metadata table.
df_cgn_metadata = pd.read_parquet(
    loc_data_dir / "df_cgn_ort_rec_speaker.parquet"
)

# Tag every row with a fresh random UUID string. It is used further down as the
# join key between the extracted features and this metadata table.
df_cgn_metadata["uuid"] = [str(uuid4()) for _ in df_cgn_metadata.index]

We will only use the components of interest.

In [19]:
# Values of the `component` column that are kept for the analysis.
valid_components = ["b", "o"]

In [20]:
# Overview table: number of segments longer than 19 s within the selected
# components, per (text_type, dop, domain, sex, country) combination.
(
    df_cgn_metadata.loc[
        df_cgn_metadata["duration_s"].gt(19)
        & df_cgn_metadata["component"].isin(valid_components)
    ]
    .groupby(["text_type", "dop", "domain", "sex", "country"])
    .size()
    .rename("#segments")
    .to_frame()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,#segments
text_type,dop,domain,sex,country,Unnamed: 5_level_1
ttb,unscripted,private,female,Flanders,771
ttb,unscripted,private,female,Netherlands,191
ttb,unscripted,private,male,Flanders,576
ttb,unscripted,private,male,Netherlands,176
tto,scripted,private,female,Flanders,826
tto,scripted,private,female,Netherlands,58
tto,scripted,private,male,Flanders,673
tto,scripted,private,male,Netherlands,86


## Listen to the audio

In [22]:
# listen_to_audio(
#     df_cgn_metadata[
#         (df_cgn_metadata.duration_s > 19)
#         & (df_cgn_metadata.component.isin(valid_components))
#         # & (df_cgn_metadata.dop == 'unscripted')
#     ]
#     .sample(1)
#     .iloc[0],
#     margin_s=-2,
# )


In [24]:
# df_cgn_metadata[df_cgn_metadata.duration_s > 15].groupby(
#     [
#         "text_type",
#         "dop",
#         "mode",
#         "domain",
#         "country"
#     ]
# ).size().rename("#segments").to_frame()

---

## Extract opensmile features

In [25]:
def _extract_parse_smile_duration(
    s: opensmile.Smile, uuid_str, wav_path: Path, start_s: float, end_s: float
) -> pd.DataFrame:
    """Extract openSMILE features for one time window of a wav file.

    Parameters
    ----------
    s
        The configured openSMILE extractor whose ``process_signal`` is called.
    uuid_str
        Identifier of the segment; written to the ``uuid`` column of the result
        so the features can be joined back to their metadata row.
    wav_path
        Path of the wav file to load.
    start_s, end_s
        Start and end of the window, forwarded as ``start`` / ``end`` to
        ``process_signal``.

    Returns
    -------
    pd.DataFrame
        The output of ``process_signal`` with its index moved into regular
        columns, ``file`` cast to ``str`` and an added ``uuid`` column.
    """
    # Use the sample rate stored in the file instead of assuming 16 kHz: with a
    # wrong rate the start/end offsets would select the wrong part of the signal.
    wav_tensor, sample_rate = torchaudio.load(wav_path)
    # NOTE(review): ravel() flattens all channels into a single 1-D signal, which
    # is only meaningful for mono recordings -- confirm the inputs are mono.
    signal = wav_tensor.numpy().ravel()

    df_feat = s.process_signal(
        signal=signal,
        sampling_rate=sample_rate,
        file=str(wav_path),
        start=start_s,
        end=end_s,
    )

    # Turn the index levels (which include `file`) into ordinary columns so they
    # survive concatenation with `ignore_index=True` and serialisation.
    df_feat = df_feat.reset_index(drop=False)
    df_feat["file"] = df_feat["file"].astype(str)
    df_feat["uuid"] = uuid_str
    return df_feat


def _extract_opensmile_f_duration(file_start_end_uuid) -> Tuple[pd.DataFrame, ...]:
    """Compute the utterance-level GeMAPS functionals for one work item.

    Takes a single tuple argument so it can be mapped directly over a list of
    work items.

    Parameters
    ----------
    file_start_end_uuid
        A ``(wav_path, start, end, uuid)`` tuple describing the window to process.

    Returns
    -------
    Tuple[pd.DataFrame, ...]
        A one-element tuple holding the feature frame produced with the
        module-level ``func_gemaps`` extractor.
    """
    wav_path, window_start, window_end, segment_uuid = file_start_end_uuid
    gemaps_functionals = _extract_parse_smile_duration(
        func_gemaps,
        uuid_str=segment_uuid,
        wav_path=wav_path,
        start_s=window_start,
        end_s=window_end,
    )
    # Wrapped in a tuple: callers index the result by position.
    return (gemaps_functionals,)


In [26]:
# Window length and per-side margin, both in seconds.
# These names were not defined anywhere in the notebook, so a fresh
# "Restart & Run All" failed here with a NameError.
# NOTE(review): the values are inferred -- 15 s matches the output file name
# ("df_gemaps_cgn_15s_end.parquet") and 15 + 2 * 2 = 19 s matches the duration
# threshold of the overview table earlier in the notebook. Please confirm.
min_duration_s = 15
margin_s = 2

# Keep only segments that still contain a full window after trimming the margin
# from both ends, restricted to the components of interest.
mask = (df_cgn_metadata.duration_s > (min_duration_s + margin_s * 2)) & (
    df_cgn_metadata.component.isin(valid_components)
)

# The recording-name column is the same for every row, so resolve it once
# instead of once per row.
rec_name_col = next(
    (c for c in ("rec_name", "recordingID") if c in df_cgn_metadata.columns), None
)
assert rec_name_col is not None, "expected a 'rec_name' or 'recordingID' column"

# Many segments share one recording; cache the (slow) glob per recording so the
# file system is searched only once for each wav file.
wav_path_cache = {}

wav_file_start_end_uuid: List[Tuple[Path, float, float, str]] = []
for _, row in tqdm(df_cgn_metadata[mask].iterrows(), total=mask.sum()):
    recording_name = row[rec_name_col]
    assert recording_name is not None

    cache_key = (row.component, recording_name)
    if cache_key not in wav_path_cache:
        matches = list(
            cgn_root_dir.glob(f"cdroms/comp-{row.component}/*/{recording_name}.wav")
        )
        assert len(matches) == 1, (
            f"expected exactly one wav for {recording_name}, found {len(matches)}"
        )
        wav_path_cache[cache_key] = matches[0]
    wav_path = wav_path_cache[cache_key]

    # Use the LAST `min_duration_s` seconds of the margin-trimmed segment:
    # the window ends `margin_s` before the segment stop time.
    t_end = row.t_stop - margin_s
    t_start = t_end - min_duration_s

    wav_file_start_end_uuid.append((wav_path, t_start, t_end, row.uuid))


  0%|          | 0/3359 [00:00<?, ?it/s]

In [27]:
# Extract the features of all windows in parallel. `imap_unordered` yields
# results as they finish; every frame carries its own `uuid`, so the order of
# the results does not matter.
out: List = []
with Pool(processes=8) as pool:
    results = pool.imap_unordered(_extract_opensmile_f_duration, wav_file_start_end_uuid)
    results = tqdm(results, total=len(wav_file_start_end_uuid))
    try:
        out = list(results)
    except BaseException:
        # Covers worker errors as well as a manual kernel interrupt: stop the
        # workers right away and re-raise. Previously the error was only printed
        # and execution continued, so the `pd.concat` below failed on an empty
        # list with a message unrelated to the real cause.
        pool.terminate()
        raise
    finally:
        pool.close()
        pool.join()

# Each result is a tuple of frames; element 0 holds the GeMAPS functionals.
# Note: despite the `_start` suffix in the name, the windows are taken from the
# END of each segment.
df_gemaps_func_dur_start = pd.concat([o[0] for o in out], ignore_index=True)


  0%|          | 0/3359 [00:00<?, ?it/s]

In [None]:
# Attach the segment metadata to every feature row via the shared "uuid" key.
# A left join keeps all feature rows.
df_gemaps_func_dur_start_m = pd.merge(
    df_gemaps_func_dur_start,
    df_cgn_metadata,
    how="left",
    on="uuid",
)


In [None]:
# Persist the merged feature + metadata table in the local data directory.
df_gemaps_func_dur_start_m.to_parquet(
    loc_data_dir / "df_gemaps_cgn_15s_end.parquet",
    engine="fastparquet",
)
