feat: add support for generic multi-label segmentation

Co-authored-by: Hervé BREDIN <hbredin@users.noreply.github.com>
pyannote · Jul 1, 2022 · 1dac64a · 1dac64a
1 parent 26cddd9
commit 1dac64a
Show file tree

Hide file tree

Showing 7 changed files with 425 additions and 42 deletions.
diff --git a/pyannote/audio/pipelines/__init__.py b/pyannote/audio/pipelines/__init__.py
@@ -20,9 +20,10 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-from .segmentation import SpeakerSegmentation
+from .multilabel import MultiLabelSegmentation
 from .overlapped_speech_detection import OverlappedSpeechDetection
 from .resegmentation import Resegmentation
+from .segmentation import SpeakerSegmentation
 from .speaker_diarization import SpeakerDiarization
 from .voice_activity_detection import VoiceActivityDetection
 
@@ -32,4 +33,5 @@
     "SpeakerSegmentation",
     "SpeakerDiarization",
     "Resegmentation",
+    "MultiLabelSegmentation",
 ]
diff --git a/pyannote/audio/pipelines/multilabel.py b/pyannote/audio/pipelines/multilabel.py
@@ -0,0 +1,213 @@
+# The MIT License (MIT)
+#
+# Copyright (c) 2022- CNRS
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# AUTHORS
+# Hadrien TITEUX - https://github.com/hadware
+# Hervé BREDIN - http://herve.niderb.fr
+
+
+from typing import Callable, Optional, Union
+
+from pyannote.core import Annotation, SlidingWindowFeature
+from pyannote.metrics.identification import IdentificationErrorRate
+from pyannote.pipeline.parameter import ParamDict, Uniform
+
+from pyannote.audio import Inference
+from pyannote.audio.core.io import AudioFile
+from pyannote.audio.core.pipeline import Pipeline
+from pyannote.audio.utils.metric import MacroAverageFMeasure
+
+from ..utils.signal import Binarize
+from .utils import PipelineModel, get_devices, get_model
+
+
+class MultiLabelSegmentation(Pipeline):
+    """Generic multi-label segmentation
+
+    Parameters
+    ----------
+    segmentation : Model, str, or dict
+        Pretrained multi-label segmentation model.
+        See pyannote.audio.pipelines.utils.get_model for supported format.
+    fscore : bool, optional
+        Optimize for average (precision/recall) fscore, over all classes.
+        Defaults to optimizing identification error rate.
+    share_min_duration : bool, optional
+        If True, `min_duration_on` and `min_duration_off` are shared among labels.
+    inference_kwargs : dict, optional
+        Keywords arguments passed to Inference.
+
+    Hyper-parameters
+    ----------------
+    Each {label} of the segmentation model is assigned four hyper-parameters:
+    onset, offset : float
+        Onset/offset detection thresholds
+    min_duration_on : float
+        Remove {label} regions shorter than that many seconds.
+        Shared between labels if `share_min_duration` is `True`.
+    min_duration_off : float
+        Fill non-{label} regions shorter than that many seconds.
+        Shared between labels if `share_min_duration` is `True`.
+
+    Raises
+    ------
+    ValueError
+        If no `segmentation` model is provided.
+    """
+
+    # key under which raw segmentation scores are cached in `file` while training
+    CACHED_SEGMENTATION = "cache/segmentation"
+
+    def __init__(
+        self,
+        segmentation: PipelineModel = None,
+        fscore: bool = False,
+        share_min_duration: bool = False,
+        **inference_kwargs,
+    ):
+
+        super().__init__()
+
+        if segmentation is None:
+            raise ValueError(
+                "MultiLabelSegmentation pipeline must be provided with a `segmentation` model."
+            )
+
+        self.segmentation = segmentation
+        self.fscore = fscore
+        self.share_min_duration = share_min_duration
+
+        # load model and send it to GPU (when available and not already on GPU)
+        model = get_model(segmentation)
+        if model.device.type == "cpu":
+            (segmentation_device,) = get_devices(needs=1)
+            model.to(segmentation_device)
+
+        self._classes = model.specifications.classes
+        self._segmentation = Inference(model, **inference_kwargs)
+
+        # hyper-parameters used for hysteresis thresholding and postprocessing:
+        # minimum durations are either shared by all labels (pipeline-level
+        # parameters) or tuned separately for each label (next to its thresholds)
+        if self.share_min_duration:
+            self.min_duration_on = Uniform(0.0, 2.0)
+            self.min_duration_off = Uniform(0.0, 2.0)
+
+        thresholds = {}
+        for label in self._classes:
+            hparams = {"onset": Uniform(0.0, 1.0), "offset": Uniform(0.0, 1.0)}
+            if not self.share_min_duration:
+                hparams["min_duration_on"] = Uniform(0.0, 2.0)
+                hparams["min_duration_off"] = Uniform(0.0, 2.0)
+            thresholds[label] = ParamDict(**hparams)
+        self.thresholds = ParamDict(**thresholds)
+
+    # needed by pyannote.audio Prodigy recipes
+    def classes(self):
+        """Return the classes of the segmentation model"""
+        return self._classes
+
+    def initialize(self):
+        """Initialize pipeline with current set of parameters"""
+        binarize = {}
+        for label in self._classes:
+            thresholds = self.thresholds[label]
+            if self.share_min_duration:
+                min_duration_on = self.min_duration_on
+                min_duration_off = self.min_duration_off
+            else:
+                min_duration_on = thresholds["min_duration_on"]
+                min_duration_off = thresholds["min_duration_off"]
+            binarize[label] = Binarize(
+                onset=thresholds["onset"],
+                offset=thresholds["offset"],
+                min_duration_on=min_duration_on,
+                min_duration_off=min_duration_off,
+            )
+
+        self._binarize = binarize
+
+    def apply(self, file: AudioFile, hook: Optional[Callable] = None) -> Annotation:
+        """Apply multi-label detection
+
+        Parameters
+        ----------
+        file : AudioFile
+            Processed file.
+        hook : callable, optional
+            Hook called after each major step of the pipeline with the following
+            signature: hook("step_name", step_artefact, file=file)
+
+        Returns
+        -------
+        detection : Annotation
+            Detected regions.
+        """
+
+        # setup hook (e.g. for debugging purposes)
+        hook = self.setup_hook(file, hook=hook)
+
+        # apply segmentation model (only if needed): while training, raw scores
+        # are cached in `file` and reused on later calls with the same `file`.
+        # NOTE(review): the per-label slicing below assumes classes are on the
+        # second axis, i.e. (num_frames, num_classes) scores -- confirm that
+        # `inference_kwargs` cannot make Inference return unaggregated chunks.
+        if self.training and self.CACHED_SEGMENTATION in file:
+            segmentations = file[self.CACHED_SEGMENTATION]
+        else:
+            segmentations = self._segmentation(file)
+            if self.training:
+                file[self.CACHED_SEGMENTATION] = segmentations
+
+        hook("segmentation", segmentations)
+
+        # apply hysteresis thresholding on each class separately
+        detection = Annotation(uri=file["uri"])
+
+        for i, label in enumerate(self._classes):
+            # extract raw segmentation of current label
+            label_segmentation = SlidingWindowFeature(
+                segmentations.data[:, i : i + 1], segmentations.sliding_window
+            )
+            # obtain hard segments
+            label_annotation: Annotation = self._binarize[label](label_segmentation)
+
+            # add them to the pool of labels
+            detection.update(
+                label_annotation.rename_labels(
+                    dict.fromkeys(label_annotation.labels(), label), copy=False
+                )
+            )
+
+        return detection
+
+    def get_metric(self) -> Union[MacroAverageFMeasure, IdentificationErrorRate]:
+        """Return new instance of identification metric"""
+
+        if self.fscore:
+            return MacroAverageFMeasure(classes=self._classes)
+
+        return IdentificationErrorRate()
+
+    def get_direction(self):
+        """Return optimization direction matching `get_metric`"""
+        return "maximize" if self.fscore else "minimize"
diff --git a/pyannote/audio/tasks/__init__.py b/pyannote/audio/tasks/__init__.py
@@ -25,18 +25,15 @@
     OverlappedSpeechDetection,
 )
 
-from .segmentation.speaker_tracking import SpeakerTracking  # isort:skip
-
+from .segmentation.multilabel import MultiLabelSegmentation   # isort:skip
 from .segmentation.segmentation import Segmentation  # isort:skip
-
 from .embedding.arcface import SupervisedRepresentationLearningWithArcFace  # isort:skip
-
 SpeakerEmbedding = SupervisedRepresentationLearningWithArcFace
 
 __all__ = [
     "Segmentation",
     "VoiceActivityDetection",
     "OverlappedSpeechDetection",
-    "SpeakerTracking",
+    "MultiLabelSegmentation",
     "SpeakerEmbedding",
 ]
diff --git a/...io/tasks/segmentation/speaker_tracking.py → ...te/audio/tasks/segmentation/multilabel.py b/...io/tasks/segmentation/speaker_tracking.py → ...te/audio/tasks/segmentation/multilabel.py
@@ -20,7 +20,8 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-from typing import Dict, Optional, Sequence, Text, Tuple, Union
+import warnings
+from typing import Dict, Optional, Sequence, Text, Tuple, Union, List
 
 import numpy as np
 import torch
@@ -32,20 +33,21 @@
 from pyannote.audio.tasks.segmentation.mixins import SegmentationTaskMixin
 
 
-class SpeakerTracking(SegmentationTaskMixin, Task):
-    """Speaker tracking
+class MultiLabelSegmentation(SegmentationTaskMixin, Task):
+    """Generic multi-label segmentation
 
-    Speaker tracking is the process of determining if and when a (previously
-    enrolled) person's voice can be heard in a given audio recording.
+    Multi-label segmentation is the process of detecting temporal intervals 
+    when a specific audio class is active.
 
-    Here, it is addressed with the same approach as voice activity detection,
-    except {"non-speech", "speech"} classes are replaced by {"speaker1", ...,
-    "speaker_N"} where N is the number of speakers in the training set.
+    Example use cases include speaker tracking, gender (male/female)
+    classification, or audio event detection.
 
     Parameters
     ----------
     protocol : Protocol
         pyannote.database protocol
+    classes : List[str], optional
+        List of classes. Defaults to the list of classes available in the training set.
     duration : float, optional
         Chunks duration. Defaults to 2s.
     warm_up : float or (float, float), optional
@@ -78,19 +80,19 @@ class SpeakerTracking(SegmentationTaskMixin, Task):
     """
 
     def __init__(
-        self,
-        protocol: Protocol,
-        duration: float = 2.0,
-        warm_up: Union[float, Tuple[float, float]] = 0.0,
-        balance: Text = None,
-        weight: Text = None,
-        batch_size: int = 32,
-        num_workers: int = None,
-        pin_memory: bool = False,
-        augmentation: BaseWaveformTransform = None,
-        metric: Union[Metric, Sequence[Metric], Dict[str, Metric]] = None,
+            self,
+            protocol: Protocol,
+            classes: Optional[List[str]] = None,
+            duration: float = 2.0,
+            warm_up: Union[float, Tuple[float, float]] = 0.0,
+            balance: Text = None,
+            weight: Text = None,
+            batch_size: int = 32,
+            num_workers: int = None,
+            pin_memory: bool = False,
+            augmentation: BaseWaveformTransform = None,
+            metric: Union[Metric, Sequence[Metric], Dict[str, Metric]] = None,
     ):
-
         super().__init__(
             protocol,
             duration=duration,
@@ -104,20 +106,29 @@ def __init__(
 
         self.balance = balance
         self.weight = weight
+        self.classes = classes
 
-        # for speaker tracking, task specification depends
-        # on the data: we do not know in advance which
-        # speakers should be tracked. therefore, we postpone
-        # the definition of specifications.
+        # task specification depends on the data: unless `classes` is provided, we
+        # do not know in advance which classes should be detected. therefore, we
+        # postpone the definition of specifications to setup()
 
     def setup(self, stage: Optional[str] = None):
 
         super().setup(stage=stage)
 
+        classes_from_training_set = sorted(self._train_metadata["annotation"])
+        if self.classes is None:
+            classes = classes_from_training_set
+        else:
+            if set(classes_from_training_set) != set(self.classes):
+                warnings.warn(
+                    f"Mismatch between classes passed to the task ({self.classes}) "
+                    f"and those of the training set ({classes_from_training_set})."
+                )
+            classes = self.classes
+
         self.specifications = Specifications(
-            # one class per speaker
-            classes=sorted(self._train_metadata["annotation"]),
-            # multiple speakers can be active at once
+            classes=classes,
             problem=Problem.MULTI_LABEL_CLASSIFICATION,
             resolution=Resolution.FRAME,
             duration=self.duration,