Skip to content

Commit

Permalink
Merge pull request #15 from sensein/dev
Browse files Browse the repository at this point in the history
Adding praat and opensmile feats extraction
  • Loading branch information
fabiocat93 committed May 13, 2024
2 parents 892d29a + f69e4fb commit 6f5e71e
Show file tree
Hide file tree
Showing 8 changed files with 644 additions and 84 deletions.
474 changes: 390 additions & 84 deletions poetry.lock

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ pydra = "^0.23"
pydantic = "^2.7.1"
accelerate = "^0.29.3"
huggingface-hub = "^0.23.0"
praat-parselmouth = "^0.4.3"
iso-639 = {git = "https://github.com/noumar/iso639.git", tag = "0.4.5"}
opensmile = "^2.5.0"

[tool.poetry.group.dev]
optional = true
Expand Down
31 changes: 31 additions & 0 deletions scripts/exp2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""This script is used to test the audio tasks."""
from senselab.audio.tasks.features_extraction.opensmile import extract_feats_from_dataset
from senselab.audio.tasks.features_extraction.praat_parselmouth import (
get_hf_dataset_durations,
get_hf_dataset_f0_descriptors,
get_hf_dataset_harmonicity_descriptors,
get_hf_dataset_jitter_descriptors,
get_hf_dataset_shimmer_descriptors,
)
from senselab.utils.tasks.input_output import read_files_from_disk

def main() -> None:
    """Run every Praat and openSMILE extractor on one audio file and print the results.

    The audio file defaults to the test recording stored in the repository,
    located relative to this script instead of through a machine-specific
    absolute path. A different file can be used by passing its path as the
    first command-line argument.
    """
    # Function-scope imports keep this block self-contained.
    import sys
    from pathlib import Path

    # This script lives in <repo>/scripts/, so parents[1] is the repository root.
    repo_root = Path(__file__).resolve().parents[1]
    default_audio = repo_root / "src" / "tests" / "data_for_testing" / "audio_48khz_mono_16bits.wav"
    audio_path = Path(sys.argv[1]) if len(sys.argv) > 1 else default_audio

    dataset = read_files_from_disk([str(audio_path)])

    print(dataset)

    duration_dataset = get_hf_dataset_durations(dataset)
    f0_dataset = get_hf_dataset_f0_descriptors(dataset, f0min=100, f0max=500)
    harmonicity_dataset = get_hf_dataset_harmonicity_descriptors(dataset, f0min=100)
    jitter_dataset = get_hf_dataset_jitter_descriptors(dataset, f0min=100, f0max=500)
    shimmer_dataset = get_hf_dataset_shimmer_descriptors(dataset, f0min=100, f0max=500)

    print(duration_dataset)
    print(f0_dataset)
    print(harmonicity_dataset)
    print(jitter_dataset)
    print(shimmer_dataset)

    opensmile_feats = extract_feats_from_dataset(
        dataset,
        audio_column="audio",
        feature_set="eGeMAPSv02",
        feature_level="Functionals",
    )

    print(opensmile_feats)


if __name__ == "__main__":
    main()

1 change: 1 addition & 0 deletions src/senselab/audio/tasks/features_extraction/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""This module provides the implementation of the Senselab audio features extraction tasks."""
44 changes: 44 additions & 0 deletions src/senselab/audio/tasks/features_extraction/opensmile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""This module contains functions for extracting features from audios using openSMILE."""

from typing import Any, Dict

import opensmile

from senselab.utils.tasks.input_output import _from_dict_to_hf_dataset, _from_hf_dataset_to_dict


def extract_feats_from_dataset(
    dataset: Dict[str, Any],
    audio_column: str = 'audio',
    feature_set: str = "eGeMAPSv02",
    feature_level: str = "Functionals",
) -> Dict[str, Any]:
    """Extract openSMILE features for every audio entry of a dataset.

    Low-level descriptors are extracted on 20ms windows with a hop of 10ms.
    Functional descriptors are extracted on the entire audio signal.

    Args:
        dataset: Dictionary form of the dataset; it is converted with
            `_from_dict_to_hf_dataset` before processing.
        audio_column: Name of the column holding the audio. Each entry is read
            through its 'array' and 'sampling_rate' keys.
        feature_set: Member name looked up in `opensmile.FeatureSet`.
        feature_level: Member name looked up in `opensmile.FeatureLevel`.

    Returns:
        The extracted features, without the audio column, converted back with
        `_from_hf_dataset_to_dict`.
    """

    def _row_to_features(sample: Dict[str, Any], smile: opensmile.Smile, audio_column: str) -> Dict[str, Any]:
        """Run the openSMILE extractor on one row and return its features as column -> list."""
        audio = sample[audio_column]
        frame = smile.process_signal(audio['array'], audio['sampling_rate'])
        return frame.to_dict("list")

    # Keep only the audio column; every other column is irrelevant to the extraction.
    audio_only = _from_dict_to_hf_dataset(dataset)
    audio_only = audio_only.remove_columns(
        [name for name in audio_only.column_names if name != audio_column]
    )

    # The extractor is built once and shared by all rows.
    extractor = opensmile.Smile(
        feature_set=opensmile.FeatureSet[feature_set],
        feature_level=opensmile.FeatureLevel[feature_level],
    )
    extracted = audio_only.map(
        _row_to_features,
        fn_kwargs={"smile": extractor, "audio_column": audio_column},
    )
    return _from_hf_dataset_to_dict(extracted.remove_columns([audio_column]))
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
"""This module defines a pydra API for the openSMILE features extraction task."""
import pydra

from senselab.audio.tasks.features_extraction.opensmile import extract_feats_from_dataset

# `extract_feats_from_dataset` wrapped with `pydra.mark.task`.
extract_feats_from_dataset_pt = pydra.mark.task(extract_feats_from_dataset)


151 changes: 151 additions & 0 deletions src/senselab/audio/tasks/features_extraction/praat_parselmouth.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
"""This module contains functions for extracting features from audio files using Praat and Parselmouth."""
from typing import Any, Dict

import parselmouth
from datasets import Dataset
from parselmouth.praat import call

from senselab.utils.tasks.input_output import _from_dict_to_hf_dataset, _from_hf_dataset_to_dict


def get_hf_dataset_durations(dataset: Dict[str, Any], audio_column: str = 'audio') -> Dict[str, float]:
    """Compute the total duration of every audio entry in the dataset.

    Args:
        dataset: Dictionary form of the dataset; it is converted with
            `_from_dict_to_hf_dataset` before processing.
        audio_column: Name of the column holding the audio. Each entry is read
            through its 'array' and 'sampling_rate' keys.

    Returns:
        The result of `_from_hf_dataset_to_dict` on a dataset whose only
        column is 'duration_seconds'.
    """

    def _row_duration(row: Dataset) -> Dict[str, float]:
        """Ask Praat for the total duration of the audio stored in one row."""
        audio = row[audio_column]
        sound = parselmouth.Sound(audio['array'], sampling_frequency=audio['sampling_rate'])
        return {'duration_seconds': call(sound, "Get total duration")}

    # Keep only the audio column before mapping.
    audio_only = _from_dict_to_hf_dataset(dataset)
    extra_columns = [name for name in audio_only.column_names if name != audio_column]
    audio_only = audio_only.remove_columns(extra_columns)

    with_durations = audio_only.map(_row_duration)
    return _from_hf_dataset_to_dict(with_durations.remove_columns([audio_column]))


def get_hf_dataset_f0_descriptors(
    dataset: Dict[str, Any],
    f0min: float,
    f0max: float,
    audio_column: str = 'audio',
    unit: str = 'Hertz',
) -> Dict[str, float]:
    """Compute mean and standard deviation of the fundamental frequency per audio entry.

    Args:
        dataset: Dictionary form of the dataset; it is converted with
            `_from_dict_to_hf_dataset` before processing.
        f0min: Pitch floor passed to Praat's "To Pitch".
        f0max: Pitch ceiling passed to Praat's "To Pitch".
        audio_column: Name of the column holding the audio. Each entry is read
            through its 'array' and 'sampling_rate' keys.
        unit: Unit passed to the Praat pitch queries; it is also embedded in
            the output column names.

    Returns:
        The result of `_from_hf_dataset_to_dict` on a dataset with the columns
        'f0_mean_<unit>' and 'f0_std_dev_<unit>'.
    """

    def _row_f0_stats(row: Dataset) -> Dict[str, float]:
        """Build a Praat Pitch object for one row and query its statistics."""
        audio = row[audio_column]
        sound = parselmouth.Sound(audio['array'], sampling_frequency=audio['sampling_rate'])
        # First numeric argument is the time step (0.0 lets Praat choose it).
        pitch = call(sound, "To Pitch", 0.0, f0min, f0max)
        # A (0, 0) time range queries the whole signal.
        mean_f0 = call(pitch, "Get mean", 0, 0, unit)
        std_f0 = call(pitch, "Get standard deviation", 0, 0, unit)
        return {f'f0_mean_{unit}': mean_f0, f'f0_std_dev_{unit}': std_f0}

    # Keep only the audio column before mapping.
    audio_only = _from_dict_to_hf_dataset(dataset)
    extra_columns = [name for name in audio_only.column_names if name != audio_column]
    audio_only = audio_only.remove_columns(extra_columns)

    with_f0 = audio_only.map(_row_f0_stats)
    return _from_hf_dataset_to_dict(with_f0.remove_columns([audio_column]))


def get_hf_dataset_harmonicity_descriptors(
    dataset: Dict[str, Any],
    f0min: float,
    audio_column: str = 'audio',
) -> Dict[str, float]:
    """Compute mean and standard deviation of the harmonicity per audio entry.

    Args:
        dataset: Dictionary form of the dataset; it is converted with
            `_from_dict_to_hf_dataset` before processing.
        f0min: Minimum pitch passed to Praat's "To Harmonicity (cc)".
        audio_column: Name of the column holding the audio. Each entry is read
            through its 'array' and 'sampling_rate' keys.

    Returns:
        The result of `_from_hf_dataset_to_dict` on a dataset with the columns
        'harmonicity_mean' and 'harmonicity_std_dev'.
    """

    def _row_harmonicity_stats(row: Dataset) -> Dict[str, float]:
        """Build a Praat Harmonicity object for one row and query its statistics."""
        audio = row[audio_column]
        sound = parselmouth.Sound(audio['array'], sampling_frequency=audio['sampling_rate'])
        # Praat arguments: time step, minimum pitch, silence threshold, periods per window.
        harmonicity = call(sound, "To Harmonicity (cc)", 0.01, f0min, 0.1, 1.0)
        # A (0, 0) time range queries the whole signal.
        mean_hnr = call(harmonicity, "Get mean", 0, 0)
        std_hnr = call(harmonicity, "Get standard deviation", 0, 0)
        return {'harmonicity_mean': mean_hnr, 'harmonicity_std_dev': std_hnr}

    # Keep only the audio column before mapping.
    audio_only = _from_dict_to_hf_dataset(dataset)
    extra_columns = [name for name in audio_only.column_names if name != audio_column]
    audio_only = audio_only.remove_columns(extra_columns)

    with_harmonicity = audio_only.map(_row_harmonicity_stats)
    return _from_hf_dataset_to_dict(with_harmonicity.remove_columns([audio_column]))

def get_hf_dataset_jitter_descriptors(
    dataset: Dict[str, Any],
    f0min: float,
    f0max: float,
    audio_column: str = 'audio',
) -> Dict[str, float]:
    """Compute the Praat jitter measures for every audio entry in the dataset.

    Args:
        dataset: Dictionary form of the dataset; it is converted with
            `_from_dict_to_hf_dataset` before processing.
        f0min: Lower pitch bound passed to "To PointProcess (periodic, cc)".
        f0max: Upper pitch bound passed to "To PointProcess (periodic, cc)".
        audio_column: Name of the column holding the audio. Each entry is read
            through its 'array' and 'sampling_rate' keys.

    Returns:
        The result of `_from_hf_dataset_to_dict` on a dataset with the columns
        'local_jitter', 'localabsolute_jitter', 'rap_jitter', 'ppq5_jitter'
        and 'ddp_jitter'.
    """
    # Output column -> name of the measure inside the Praat "Get jitter (...)" command.
    jitter_measures = {
        'local_jitter': "local",
        'localabsolute_jitter': "local, absolute",
        'rap_jitter': "rap",
        'ppq5_jitter': "ppq5",
        'ddp_jitter': "ddp",
    }

    def _row_jitter(row: Dataset) -> Dict[str, float]:
        """Derive a PointProcess from one row's audio and query each jitter measure."""
        audio = row[audio_column]
        sound = parselmouth.Sound(audio['array'], sampling_frequency=audio['sampling_rate'])
        pulses = call(sound, "To PointProcess (periodic, cc)", f0min, f0max)
        # Praat arguments: time range (0, 0 = whole signal), shortest period,
        # longest period, maximum period factor.
        return {
            column: call(pulses, f"Get jitter ({measure})", 0, 0, 0.0001, 0.02, 1.3)
            for column, measure in jitter_measures.items()
        }

    # Keep only the audio column before mapping.
    audio_only = _from_dict_to_hf_dataset(dataset)
    extra_columns = [name for name in audio_only.column_names if name != audio_column]
    audio_only = audio_only.remove_columns(extra_columns)

    with_jitter = audio_only.map(_row_jitter)
    return _from_hf_dataset_to_dict(with_jitter.remove_columns([audio_column]))

def get_hf_dataset_shimmer_descriptors(
    dataset: Dict[str, Any],
    f0min: float,
    f0max: float,
    audio_column: str = 'audio',
) -> Dict[str, float]:
    """Compute the Praat shimmer measures for every audio entry in the dataset.

    Args:
        dataset: Dictionary form of the dataset; it is converted with
            `_from_dict_to_hf_dataset` before processing.
        f0min: Lower pitch bound passed to "To PointProcess (periodic, cc)".
        f0max: Upper pitch bound passed to "To PointProcess (periodic, cc)".
        audio_column: Name of the column holding the audio. Each entry is read
            through its 'array' and 'sampling_rate' keys.

    Returns:
        The result of `_from_hf_dataset_to_dict` on a dataset with the columns
        'local_shimmer', 'localDB_shimmer', 'apq3_shimmer', 'apq5_shimmer',
        'apq11_shimmer' and 'dda_shimmer'.
    """
    # Output column -> name of the measure inside the Praat "Get shimmer (...)" command.
    shimmer_measures = {
        'local_shimmer': "local",
        'localDB_shimmer': "local_dB",
        'apq3_shimmer': "apq3",
        'apq5_shimmer': "apq5",
        'apq11_shimmer': "apq11",
        'dda_shimmer': "dda",
    }

    def _row_shimmer(row: Dataset) -> Dict[str, float]:
        """Derive a PointProcess from one row's audio and query each shimmer measure."""
        audio = row[audio_column]
        sound = parselmouth.Sound(audio['array'], sampling_frequency=audio['sampling_rate'])
        pulses = call(sound, "To PointProcess (periodic, cc)", f0min, f0max)
        # Shimmer is queried on the Sound and PointProcess together.
        # Praat arguments: time range (0, 0 = whole signal), shortest period,
        # longest period, maximum period factor, maximum amplitude factor.
        return {
            column: call([sound, pulses], f"Get shimmer ({measure})", 0, 0, 0.0001, 0.02, 1.3, 1.6)
            for column, measure in shimmer_measures.items()
        }

    # Keep only the audio column before mapping.
    audio_only = _from_dict_to_hf_dataset(dataset)
    extra_columns = [name for name in audio_only.column_names if name != audio_column]
    audio_only = audio_only.remove_columns(extra_columns)

    with_shimmer = audio_only.map(_row_shimmer)
    return _from_hf_dataset_to_dict(with_shimmer.remove_columns([audio_column]))
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
"""This module defines a pydra API for the praat_parselmouth features extraction task."""
import pydra

from senselab.audio.tasks.features_extraction.praat_parselmouth import (
get_hf_dataset_durations,
get_hf_dataset_f0_descriptors,
get_hf_dataset_harmonicity_descriptors,
get_hf_dataset_jitter_descriptors,
get_hf_dataset_shimmer_descriptors,
)

# Each `*_pt` name below is the extraction function of the same name wrapped
# with `pydra.mark.task`.
get_hf_dataset_durations_pt = pydra.mark.task(get_hf_dataset_durations)
get_hf_dataset_f0_descriptors_pt = pydra.mark.task(get_hf_dataset_f0_descriptors)
get_hf_dataset_harmonicity_descriptors_pt = pydra.mark.task(get_hf_dataset_harmonicity_descriptors)
get_hf_dataset_jitter_descriptors_pt = pydra.mark.task(get_hf_dataset_jitter_descriptors)
get_hf_dataset_shimmer_descriptors_pt = pydra.mark.task(get_hf_dataset_shimmer_descriptors)

0 comments on commit 6f5e71e

Please sign in to comment.