In [1]:
import torch

In [2]:
# Print the name of every device PyTorch exposes through its CUDA API,
# or a single notice when no such device is available.
if not torch.cuda.is_available():
    print("CUDA is not available.")
else:
    num_gpus = torch.cuda.device_count()
    # map() is lazy, so each name is looked up right before it is printed.
    for i, gpu_name in enumerate(map(torch.cuda.get_device_name, range(num_gpus))):
        print(f"GPU {i}: {gpu_name}")

GPU 0: AMD Radeon RX 9070 XT
GPU 1: AMD Ryzen 7 7800X3D 8-Core Processor


In [3]:
# Device used for every model in this notebook. The first CUDA device is
# preferred; fall back to the CPU when none is available so that the later
# `.to(DEVICE)` / processor constructors do not fail on CPU-only machines.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

In [4]:
# --- Input ------------------------------------------------------------------
# Directory the input recording is picked from.
DATASET_DIR = "../dataset/audio"

# --- Output directories (one per speaker, hold the per-segment WAV files) ---
SPEAKER1_DIR = "./output/speaker1"
SPEAKER2_DIR = "./output/speaker2"

# --- Output files -----------------------------------------------------------
# Stereo copy of the input plus one mono file per channel.
ORIG_FILE = "./output/original.wav"
SPEAKER1_FILE = "./output/speaker1.wav"
SPEAKER2_FILE = "./output/speaker2.wav"
# Per-speaker metrics dumps.
SPEAKER1_METRICS_FILE = "./output/speaker1.json"
SPEAKER2_METRICS_FILE = "./output/speaker2.json"

# --- Segment filtering ------------------------------------------------------
# Segments shorter than this many seconds are dropped.
MIN_SEGMENT_LENGTH_SEC = 0.1
# A segment containing a stop phrase is dropped when its stripped text is
# fewer than this many characters longer than the longest matching phrase.
STOP_PHRASE_LENGTH_DELTA = 5
# Phrases looked up case-insensitively as substrings of a segment's text.
STOP_PHRASES = [
    "ДИНАМИЧНАЯ МУЗЫКА",
    "Продолжение следует.",
    "Продолжение следует...",
]

In [5]:
# Processor registries keyed by processor name. Both start empty here and are
# populated by a later cell; re-running this cell resets them.
PER_SEGMENT_PROCESSORS = {}
PER_CHANNEL_PROCESSORS = {}

In [6]:
import os

# Create both per-speaker segment directories (and any missing parents);
# already-existing directories are left as they are.
for _speaker_dir in (SPEAKER1_DIR, SPEAKER2_DIR):
    os.makedirs(_speaker_dir, exist_ok=True)

In [7]:
import random

# Pick one recording at random from DATASET_DIR (regular files only).
# os.listdir returns entries in arbitrary order, so sort them: with a seeded
# `random` the pick then becomes reproducible across machines.
candidate_files = sorted(
    f for f in os.listdir(DATASET_DIR)
    if os.path.isfile(os.path.join(DATASET_DIR, f))
)
if not candidate_files:
    # random.choice on an empty sequence raises an IndexError that does not
    # say which directory was empty; fail with an explicit message instead.
    raise FileNotFoundError(f"No files found in dataset directory {DATASET_DIR!r}")

input_file = f"{DATASET_DIR}/" + random.choice(candidate_files)
input_file

'../dataset/mix_13013_13136__2025_10_01__10_04_11_690.mp3'

In [8]:
import librosa
import soundfile as sf

# Decode the recording at its native sample rate, keeping channels separate.
y, sr = librosa.load(input_file, sr=None, mono=False)

# Only a (2, n_samples) array is accepted: one channel per speaker.
is_stereo = y.ndim == 2 and y.shape[0] == 2
if not is_stereo:
    raise ValueError("Input audio file must be stereo.")

# Channel 0 -> speaker 1, channel 1 -> speaker 2, each as a mono 16-bit WAV.
for _channel_file, _channel_audio in zip((SPEAKER1_FILE, SPEAKER2_FILE), y):
    sf.write(_channel_file, _channel_audio, sr, subtype="PCM_16")

# Stereo copy of the input; transposed so that samples come first.
sf.write(ORIG_FILE, y.T, sr, subtype="PCM_16")

In [9]:
import whisper
# Load the "large" Whisper checkpoint, then move the model to DEVICE.
whispermodel = whisper.load_model("large")
whispermodel = whispermodel.to(DEVICE)

In [10]:
from typing import Generator, Tuple

def filter_out(segment_data: dict) -> bool:
    """Decide whether a transcription segment should be discarded.

    A segment is discarded when either of the following holds:

    * it lasts less than ``MIN_SEGMENT_LENGTH_SEC`` seconds;
    * its text contains at least one entry of ``STOP_PHRASES``
      (case-insensitive substring match) and the stripped text is fewer than
      ``STOP_PHRASE_LENGTH_DELTA`` characters longer than the longest matching
      phrase, i.e. the segment is essentially just the stop phrase.

    Args:
        segment_data: Mapping with ``"start"``, ``"end"`` and ``"text"`` keys.

    Returns:
        ``True`` if the segment must be skipped, ``False`` if it is kept.
    """
    # Too short to be worth keeping.
    duration = segment_data["end"] - segment_data["start"]
    if duration < MIN_SEGMENT_LENGTH_SEC:
        return True

    text = segment_data["text"]
    lowered_text = text.lower()
    matched_lengths = [
        len(phrase) for phrase in STOP_PHRASES if phrase.lower() in lowered_text
    ]
    if not matched_lengths:
        return False

    # Drop the segment only when little remains besides the stop phrase.
    return max(matched_lengths) + STOP_PHRASE_LENGTH_DELTA > len(text.strip())

def segmentize(source: str, segments_dir: str) -> Generator[Tuple[dict, str], None, None]:
    """Transcribe ``source`` and write one WAV file per retained segment.

    The file is transcribed by ``whispermodel`` (Russian, word timestamps
    enabled). Every segment not rejected by ``filter_out`` is cut from the
    audio by its start/end time and saved as 16-bit PCM to
    ``<segments_dir>/<segment id>.wav``. ``segments_dir`` is not created here.

    Args:
        source: Path of the audio file to transcribe and cut.
        segments_dir: Directory that receives the per-segment WAV files.

    Yields:
        ``(segment_data, path)`` pairs: the transcription segment dict and the
        path of the WAV file written for it, in transcription order.
    """
    transcription = whispermodel.transcribe(source, word_timestamps=True, language='ru')
    y, sr = librosa.load(source, sr=None, mono=False)
    # Loaded with mono=False, so a 2-D array means (channels, samples);
    # the check does not change between segments, so evaluate it once.
    multichannel = y.ndim == 2

    for segment_data in transcription['segments']:
        if filter_out(segment_data):
            continue

        # Convert segment boundaries from seconds to sample indices.
        start_sample = int(float(segment_data['start']) * sr)
        end_sample = int(float(segment_data['end']) * sr)

        if multichannel:
            # Slice along the sample axis and transpose to samples-first for writing.
            segment_audio = y[:, start_sample:end_sample].T
        else:
            segment_audio = y[start_sample:end_sample]

        path = f"{segments_dir}/{segment_data['id']}.wav"
        sf.write(path, segment_audio, sr, subtype="PCM_16")
        yield (segment_data, path)

In [11]:
def show(vals: tuple[dict, str]) -> dict:
    """Return a compact view of one ``segmentize`` result for display.

    Args:
        vals: A ``(segment_data, path)`` pair; only the first element is read.

    Returns:
        A dict with just the ``"id"``, ``"start"``, ``"end"`` and ``"text"``
        entries of ``segment_data``, in that order.
    """
    segment_data = vals[0]
    return {key: segment_data[key] for key in ("id", "start", "end", "text")}
# Preview speaker 1's retained segments; this transcribes the whole channel
# file and writes its segment WAVs as a side effect.
[show(item) for item in segmentize(SPEAKER1_FILE, SPEAKER1_DIR)]

[{'id': 0,
  'start': np.float64(0.820000000000001),
  'end': np.float64(4.24),
  'text': ' Спасибо за ожидание. Компания «Автомир» в Аверно-Дмитровском шоссе.'},
 {'id': 1,
  'start': np.float64(4.32),
  'end': np.float64(6.3),
  'text': ' Оператор Татьяна, здравствуйте, Александр.'},
 {'id': 2,
  'start': np.float64(6.32),
  'end': np.float64(8.98),
  'text': ' На линии оставайтесь, пожалуйста. Съединю вас с менеджером.'},
 {'id': 3,
  'start': np.float64(9.06),
  'end': np.float64(10.34),
  'text': ' Александр, спасибо за звонок.'},
 {'id': 5,
  'start': np.float64(60.879999999999995),
  'end': np.float64(65.18),
  'text': ' Компания «Автомир», хавал-центр Дмитров, командер отдела продаж новых автомобилей «Барин Фан».'},
 {'id': 6,
  'start': np.float64(65.18),
  'end': np.float64(68.24),
  'text': ' Приветствую вас, Александр. У вас покупка нового автомобиля «Джулион» интересует.'},
 {'id': 7,
  'start': np.float64(72.76),
  'end': np.float64(76.54),
  'text': ' Понял вас. Ну, мы о

In [12]:
from typing import Callable, Any
import pathlib
import dacite

from analysis_node.messages import MetricCollection
from analysis_node.analysis.postprocessing import WhisperMetrics
from analysis_node.analysis.processors import Processor

def collect_metrics_per_segment(
    segment_file: pathlib.Path | str,
) -> Tuple[dict[str, MetricCollection], dict[str, MetricCollection]]:
    """Run every registered processor on a single segment file.

    Args:
        segment_file: Path of the segment audio file handed to each processor.

    Returns:
        A pair of dicts keyed by processor name: the results of
        ``PER_SEGMENT_PROCESSORS`` first, then those of
        ``PER_CHANNEL_PROCESSORS``.
    """
    # NOTE(review): the per-channel processors are also run on the segment
    # file here, not on the whole channel file — confirm this is intended.
    def run_all(registry: dict[str, Processor]) -> dict[str, MetricCollection]:
        results = {}
        for name, processor in registry.items():
            results[name] = processor.process(segment_file)
        return results

    per_segment = run_all(PER_SEGMENT_PROCESSORS)
    per_channel = run_all(PER_CHANNEL_PROCESSORS)
    return per_segment, per_channel

def collect_metrics_per_channel(
    channel_file: pathlib.Path | str,
    segments_dir: pathlib.Path | str,
) -> Generator[
    Tuple[dict[str, MetricCollection], dict[str, MetricCollection], WhisperMetrics],
    None,
    None,
]:
    """Segment a channel file and lazily collect metrics for each segment.

    Args:
        channel_file: Audio file of one channel, passed to ``segmentize``.
        segments_dir: Directory ``segmentize`` writes the segment files to.

    Yields:
        For every segment produced by ``segmentize``: the per-segment metrics,
        the per-channel metrics (both from ``collect_metrics_per_segment``)
        and the segment's transcription data converted to ``WhisperMetrics``.
    """
    segments = segmentize(channel_file, segments_dir)
    for whisper_segment, wav_path in segments:
        # Processors run first; the transcription dict is converted afterwards.
        segment_metrics, channel_metrics = collect_metrics_per_segment(wav_path)
        whisper_metrics = dacite.from_dict(
            data=whisper_segment,
            data_class=WhisperMetrics,
        )
        yield segment_metrics, channel_metrics, whisper_metrics

torchcodec is not installed correctly so built-in audio decoding will fail. Solutions are:
* use audio preloaded in-memory as a {'waveform': (channel, time) torch.Tensor, 'sample_rate': int} dictionary;
* fix torchcodec installation. Error message was:

Deliberately disabling torchcodec.
  available_backends = torchaudio.list_audio_backends()


In [13]:
from analysis_node.analysis.processors import VadEmotionProcessor
from analysis_node.analysis.processors import AgeGenderProcessor

# Register the processors under the names used as keys in the metric dicts.
PER_SEGMENT_PROCESSORS.update(emotion=VadEmotionProcessor(DEVICE))
PER_CHANNEL_PROCESSORS.update(age_gender=AgeGenderProcessor("small", DEVICE))

In [14]:
# Drain the generator so that every segment of speaker 1 is processed now,
# then display the collected (per_segment, per_channel, whisper) tuples.
raw_metrics = [*collect_metrics_per_channel(SPEAKER1_FILE, SPEAKER1_DIR)]
raw_metrics

[({'emotion': MetricCollection(provider='audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim', metrics=[Metric(name='arousal', type=<MetricType.FLOAT: 'float'>, value=0.5853163599967957, unit=None, description=None), Metric(name='dominance', type=<MetricType.FLOAT: 'float'>, value=0.595945417881012, unit=None, description=None), Metric(name='valence', type=<MetricType.FLOAT: 'float'>, value=0.38391968607902527, unit=None, description=None)], description=None)},
  {'age_gender': MetricCollection(provider='audeering/wav2vec2-large-robust-6-ft-age-gender', metrics=[Metric(name='age', type=<MetricType.INT: 'int'>, value=29, unit='years', description=None), Metric(name='female', type=<MetricType.FLOAT: 'float'>, value=0.9996389150619507, unit=None, description=None), Metric(name='male', type=<MetricType.FLOAT: 'float'>, value=0.00021282209490891546, unit=None, description=None), Metric(name='child', type=<MetricType.FLOAT: 'float'>, value=0.00014820143405813724, unit=None, description=Non