generated from sensein/python-package-template
-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #15 from sensein/dev
Adding praat and opensmile feats extraction
- Loading branch information
Showing
8 changed files
with
644 additions
and
84 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
"""This script is used to test the audio tasks.""" | ||
from senselab.audio.tasks.features_extraction.opensmile import extract_feats_from_dataset | ||
from senselab.audio.tasks.features_extraction.praat_parselmouth import ( | ||
get_hf_dataset_durations, | ||
get_hf_dataset_f0_descriptors, | ||
get_hf_dataset_harmonicity_descriptors, | ||
get_hf_dataset_jitter_descriptors, | ||
get_hf_dataset_shimmer_descriptors, | ||
) | ||
from senselab.utils.tasks.input_output import read_files_from_disk | ||
|
||
import os

# Audio file processed by this script. The default is the location used when
# the script was written; set the SENSELAB_TEST_AUDIO environment variable to
# point at another file so the script can run on any machine.
DEFAULT_AUDIO_PATH = "/Users/fabiocat/Documents/git/pp/senselab/src/tests/data_for_testing/audio_48khz_mono_16bits.wav"
audio_path = os.environ.get("SENSELAB_TEST_AUDIO", DEFAULT_AUDIO_PATH)

dataset = read_files_from_disk([audio_path])

print(dataset)

# Praat/Parselmouth descriptors, all computed before anything is printed.
duration_dataset = get_hf_dataset_durations(dataset)
f0_dataset = get_hf_dataset_f0_descriptors(dataset, f0min=100, f0max=500)
harmonicity_dataset = get_hf_dataset_harmonicity_descriptors(dataset, f0min=100)
jitter_dataset = get_hf_dataset_jitter_descriptors(dataset, f0min=100, f0max=500)
shimmer_dataset = get_hf_dataset_shimmer_descriptors(dataset, f0min=100, f0max=500)

print(duration_dataset)
print(f0_dataset)
print(harmonicity_dataset)
print(jitter_dataset)
print(shimmer_dataset)

# openSMILE functionals computed on the same dataset.
opensmile_feats = extract_feats_from_dataset(dataset, audio_column="audio", feature_set="eGeMAPSv02", feature_level="Functionals")

print(opensmile_feats)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
"""This module provides the implementation of the Senselab audio features extraction tasks.""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
"""This module contains functions for extracting features from audios using openSMILE.""" | ||
|
||
from typing import Any, Dict | ||
|
||
import opensmile | ||
|
||
from senselab.utils.tasks.input_output import _from_dict_to_hf_dataset, _from_hf_dataset_to_dict | ||
|
||
|
||
def extract_feats_from_dataset(dataset: Dict[str, Any], audio_column: str = 'audio', feature_set: str = "eGeMAPSv02", feature_level: str = "Functionals") -> Dict[str, Any]:
    """Extract openSMILE features from every audio sample of a dataset.

    Low-level descriptors are extracted on 20ms windows with a hop of 10ms.
    Functional descriptors are extracted on the entire audio signal.

    Args:
        dataset: Dataset in dictionary form; it is converted with
            `_from_dict_to_hf_dataset` before processing.
        audio_column: Name of the column whose entries provide the `array`
            and `sampling_rate` of each audio.
        feature_set: Member name looked up in `opensmile.FeatureSet`.
        feature_level: Member name looked up in `opensmile.FeatureLevel`.

    Returns:
        The extracted features converted back with `_from_hf_dataset_to_dict`.
        The audio column, and every other input column, is dropped.
    """
    def _features_for_sample(sample: Dict[str, Any], smile: opensmile.Smile, audio_column: str) -> Dict[str, Any]:
        """Run the openSMILE extractor on one sample and return `to_dict("list")` of the result."""
        audio = sample[audio_column]
        features_frame = smile.process_signal(audio['array'], audio['sampling_rate'])
        return features_frame.to_dict("list")

    # Keep only the audio column so the output holds nothing but features.
    audio_only = _from_dict_to_hf_dataset(dataset)
    audio_only = audio_only.remove_columns(
        [name for name in audio_only.column_names if name != audio_column]
    )

    # A single extractor is built once and shared by all rows.
    extractor = opensmile.Smile(
        feature_set=opensmile.FeatureSet[feature_set],
        feature_level=opensmile.FeatureLevel[feature_level],
    )
    with_features = audio_only.map(
        _features_for_sample,
        fn_kwargs={"smile": extractor, "audio_column": audio_column},
    )
    return _from_hf_dataset_to_dict(with_features.remove_columns([audio_column]))
8 changes: 8 additions & 0 deletions
8
src/senselab/audio/tasks/features_extraction/opensmile_pydra.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
"""This module defines a pydra API for the openSMILE features extraction task.""" | ||
import pydra | ||
|
||
from senselab.audio.tasks.features_extraction.opensmile import extract_feats_from_dataset | ||
|
||
# Wrap the openSMILE feature extraction function with `pydra.mark.task`.
extract_feats_from_dataset_pt = pydra.mark.task(extract_feats_from_dataset) | ||
|
||
|
151 changes: 151 additions & 0 deletions
151
src/senselab/audio/tasks/features_extraction/praat_parselmouth.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,151 @@ | ||
"""This module contains functions for extracting features from audio files using Praat and Parselmouth.""" | ||
from typing import Any, Dict | ||
|
||
import parselmouth | ||
from datasets import Dataset | ||
from parselmouth.praat import call | ||
|
||
from senselab.utils.tasks.input_output import _from_dict_to_hf_dataset, _from_hf_dataset_to_dict | ||
|
||
|
||
def get_hf_dataset_durations(dataset: Dict[str, Any], audio_column: str = 'audio') -> Dict[str, float]:
    """Compute the total duration of every audio in the dataset with Praat.

    Args:
        dataset: Dataset in dictionary form; it is converted with
            `_from_dict_to_hf_dataset` before processing.
        audio_column: Name of the column whose entries provide the `array`
            and `sampling_rate` of each audio.

    Returns:
        The result of `_from_hf_dataset_to_dict` on a dataset holding only
        the `duration_seconds` column.
    """
    def _row_duration(row: Dict[str, Any]) -> Dict[str, float]:
        """Build a Praat sound from one row and query its total duration."""
        audio = row[audio_column]
        sound = parselmouth.Sound(audio['array'], sampling_frequency=audio['sampling_rate'])
        return {'duration_seconds': call(sound, "Get total duration")}

    # Keep only the audio column so the output holds nothing but the descriptor.
    audio_only = _from_dict_to_hf_dataset(dataset)
    extra_columns = [name for name in audio_only.column_names if name != audio_column]
    audio_only = audio_only.remove_columns(extra_columns)

    with_durations = audio_only.map(_row_duration)
    return _from_hf_dataset_to_dict(with_durations.remove_columns([audio_column]))
|
||
|
||
def get_hf_dataset_f0_descriptors(dataset: Dict[str, Any], f0min: float, f0max: float, audio_column: str = 'audio', unit: str = 'Hertz') -> Dict[str, float]:
    """Compute mean and standard deviation of the fundamental frequency of every audio.

    Args:
        dataset: Dataset in dictionary form; it is converted with
            `_from_dict_to_hf_dataset` before processing.
        f0min: Lower pitch bound passed to Praat's "To Pitch".
        f0max: Upper pitch bound passed to Praat's "To Pitch".
        audio_column: Name of the column whose entries provide the `array`
            and `sampling_rate` of each audio.
        unit: Unit passed to the Praat pitch queries; it is also used as the
            suffix of the output column names.

    Returns:
        The result of `_from_hf_dataset_to_dict` on a dataset holding the
        `f0_mean_<unit>` and `f0_std_dev_<unit>` columns.
    """
    def _row_f0(row: Dict[str, Any]) -> Dict[str, float]:
        """Build a Praat pitch object from one row and query its statistics."""
        audio = row[audio_column]
        sound = parselmouth.Sound(audio['array'], sampling_frequency=audio['sampling_rate'])

        # NOTE(review): the leading 0.0 and the `0, 0` pairs are presumably
        # Praat's "automatic time step" and "whole time range" - confirm
        # against the Praat manual.
        pitch = call(sound, "To Pitch", 0.0, f0min, f0max)
        mean_f0 = call(pitch, "Get mean", 0, 0, unit)
        std_dev_f0 = call(pitch, "Get standard deviation", 0, 0, unit)
        return {
            f'f0_mean_{unit}': mean_f0,
            f'f0_std_dev_{unit}': std_dev_f0,
        }

    # Keep only the audio column so the output holds nothing but descriptors.
    audio_only = _from_dict_to_hf_dataset(dataset)
    extra_columns = [name for name in audio_only.column_names if name != audio_column]
    audio_only = audio_only.remove_columns(extra_columns)

    with_f0 = audio_only.map(_row_f0)
    return _from_hf_dataset_to_dict(with_f0.remove_columns([audio_column]))
|
||
|
||
def get_hf_dataset_harmonicity_descriptors(dataset: Dict[str, Any], f0min: float, audio_column: str = 'audio') -> Dict[str, float]:
    """Compute mean and standard deviation of the harmonicity of every audio.

    Args:
        dataset: Dataset in dictionary form; it is converted with
            `_from_dict_to_hf_dataset` before processing.
        f0min: Minimum pitch passed to Praat's "To Harmonicity (cc)".
        audio_column: Name of the column whose entries provide the `array`
            and `sampling_rate` of each audio.

    Returns:
        The result of `_from_hf_dataset_to_dict` on a dataset holding the
        `harmonicity_mean` and `harmonicity_std_dev` columns.
    """
    def _row_harmonicity(row: Dict[str, Any]) -> Dict[str, float]:
        """Build a Praat harmonicity object from one row and query its statistics."""
        audio = row[audio_column]
        sound = parselmouth.Sound(audio['array'], sampling_frequency=audio['sampling_rate'])

        # NOTE(review): 0.01, 0.1 and 1.0 are presumably Praat's time step,
        # silence threshold and periods per window - confirm against the
        # Praat manual.
        harmonicity = call(sound, "To Harmonicity (cc)", 0.01, f0min, 0.1, 1.0)
        mean_hnr = call(harmonicity, "Get mean", 0, 0)
        std_dev_hnr = call(harmonicity, "Get standard deviation", 0, 0)
        return {
            'harmonicity_mean': mean_hnr,
            'harmonicity_std_dev': std_dev_hnr,
        }

    # Keep only the audio column so the output holds nothing but descriptors.
    audio_only = _from_dict_to_hf_dataset(dataset)
    extra_columns = [name for name in audio_only.column_names if name != audio_column]
    audio_only = audio_only.remove_columns(extra_columns)

    with_harmonicity = audio_only.map(_row_harmonicity)
    return _from_hf_dataset_to_dict(with_harmonicity.remove_columns([audio_column]))
|
||
def get_hf_dataset_jitter_descriptors(dataset: Dict[str, Any], f0min: float, f0max: float, audio_column: str = 'audio') -> Dict[str, float]:
    """Compute the Praat jitter measures of every audio in the dataset.

    Args:
        dataset: Dataset in dictionary form; it is converted with
            `_from_dict_to_hf_dataset` before processing.
        f0min: Lower pitch bound passed to "To PointProcess (periodic, cc)".
        f0max: Upper pitch bound passed to "To PointProcess (periodic, cc)".
        audio_column: Name of the column whose entries provide the `array`
            and `sampling_rate` of each audio.

    Returns:
        The result of `_from_hf_dataset_to_dict` on a dataset holding the
        `local_jitter`, `localabsolute_jitter`, `rap_jitter`, `ppq5_jitter`
        and `ddp_jitter` columns.
    """
    # Output column name paired with the Praat jitter variant it queries,
    # in the order the columns are produced.
    jitter_measures = (
        ('local_jitter', "local"),
        ('localabsolute_jitter', "local, absolute"),
        ('rap_jitter', "rap"),
        ('ppq5_jitter', "ppq5"),
        ('ddp_jitter', "ddp"),
    )

    def _row_jitter(row: Dict[str, Any]) -> Dict[str, float]:
        """Build a Praat point process from one row and query each jitter variant."""
        audio = row[audio_column]
        sound = parselmouth.Sound(audio['array'], sampling_frequency=audio['sampling_rate'])
        point_process = call(sound, "To PointProcess (periodic, cc)", f0min, f0max)

        # NOTE(review): the trailing numbers are presumably Praat's time
        # range, period limits and maximum period factor - confirm against
        # the Praat manual.
        return {
            column: call(point_process, f"Get jitter ({variant})", 0, 0, 0.0001, 0.02, 1.3)
            for column, variant in jitter_measures
        }

    # Keep only the audio column so the output holds nothing but descriptors.
    audio_only = _from_dict_to_hf_dataset(dataset)
    extra_columns = [name for name in audio_only.column_names if name != audio_column]
    audio_only = audio_only.remove_columns(extra_columns)

    with_jitter = audio_only.map(_row_jitter)
    return _from_hf_dataset_to_dict(with_jitter.remove_columns([audio_column]))
|
||
def get_hf_dataset_shimmer_descriptors(dataset: Dict[str, Any], f0min: float, f0max: float, audio_column: str = 'audio') -> Dict[str, float]:
    """Compute the Praat shimmer measures of every audio in the dataset.

    Args:
        dataset: Dataset in dictionary form; it is converted with
            `_from_dict_to_hf_dataset` before processing.
        f0min: Lower pitch bound passed to "To PointProcess (periodic, cc)".
        f0max: Upper pitch bound passed to "To PointProcess (periodic, cc)".
        audio_column: Name of the column whose entries provide the `array`
            and `sampling_rate` of each audio.

    Returns:
        The result of `_from_hf_dataset_to_dict` on a dataset holding the
        `local_shimmer`, `localDB_shimmer`, `apq3_shimmer`, `apq5_shimmer`,
        `apq11_shimmer` and `dda_shimmer` columns.
    """
    # Output column name paired with the Praat shimmer variant it queries,
    # in the order the columns are produced.
    shimmer_measures = (
        ('local_shimmer', "local"),
        ('localDB_shimmer', "local_dB"),
        ('apq3_shimmer', "apq3"),
        ('apq5_shimmer', "apq5"),
        ('apq11_shimmer', "apq11"),
        ('dda_shimmer', "dda"),
    )

    def _row_shimmer(row: Dict[str, Any]) -> Dict[str, float]:
        """Build a Praat point process from one row and query each shimmer variant."""
        audio = row[audio_column]
        sound = parselmouth.Sound(audio['array'], sampling_frequency=audio['sampling_rate'])
        point_process = call(sound, "To PointProcess (periodic, cc)", f0min, f0max)

        # Shimmer queries take the sound and its point process together.
        # NOTE(review): the trailing numbers are presumably Praat's time
        # range, period limits, maximum period factor and maximum amplitude
        # factor - confirm against the Praat manual.
        return {
            column: call([sound, point_process], f"Get shimmer ({variant})", 0, 0, 0.0001, 0.02, 1.3, 1.6)
            for column, variant in shimmer_measures
        }

    # Keep only the audio column so the output holds nothing but descriptors.
    audio_only = _from_dict_to_hf_dataset(dataset)
    extra_columns = [name for name in audio_only.column_names if name != audio_column]
    audio_only = audio_only.remove_columns(extra_columns)

    with_shimmer = audio_only.map(_row_shimmer)
    return _from_hf_dataset_to_dict(with_shimmer.remove_columns([audio_column]))
16 changes: 16 additions & 0 deletions
16
src/senselab/audio/tasks/features_extraction/praat_parselmouth_pydra.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
"""This module defines a pydra API for the praat_parselmouth features extraction task.""" | ||
import pydra | ||
|
||
from senselab.audio.tasks.features_extraction.praat_parselmouth import ( | ||
get_hf_dataset_durations, | ||
get_hf_dataset_f0_descriptors, | ||
get_hf_dataset_harmonicity_descriptors, | ||
get_hf_dataset_jitter_descriptors, | ||
get_hf_dataset_shimmer_descriptors, | ||
) | ||
|
||
# Wrap each Praat/Parselmouth descriptor function with `pydra.mark.task`;
# the `_pt` suffix distinguishes the wrapped object from the plain function.
get_hf_dataset_durations_pt = pydra.mark.task(get_hf_dataset_durations) | ||
get_hf_dataset_f0_descriptors_pt = pydra.mark.task(get_hf_dataset_f0_descriptors) | ||
get_hf_dataset_harmonicity_descriptors_pt = pydra.mark.task(get_hf_dataset_harmonicity_descriptors) | ||
get_hf_dataset_jitter_descriptors_pt = pydra.mark.task(get_hf_dataset_jitter_descriptors) | ||
get_hf_dataset_shimmer_descriptors_pt = pydra.mark.task(get_hf_dataset_shimmer_descriptors) |