In [1]:
import torch

In [2]:
if torch.cuda.is_available():
    num_gpus = torch.cuda.device_count()
    for i in range(num_gpus):
        gpu_name = torch.cuda.get_device_name(i)
        print(f"GPU {i}: {gpu_name}")
else:
    print("CUDA is not available.")

GPU 0: AMD Radeon RX 9070 XT
GPU 1: AMD Ryzen 7 7800X3D 8-Core Processor


In [3]:
DEVICE="cuda:0"

In [4]:
DATASET_DIR = "../dataset"

ORIG_FILE = "./output/original.wav"
SPEAKER1_FILE = "./output/speaker1.wav"
SPEAKER2_FILE = "./output/speaker2.wav"
SPEAKER1_METRICS_FILE = "./output/speaker1.json"
SPEAKER2_METRICS_FILE = "./output/speaker2.json"

SPEAKER1_DIR = "./output/speaker1"
SPEAKER2_DIR = "./output/speaker2"

MIN_SEGMENT_LENGTH_SEC = 0.1
STOP_PHRASES = [
    "ДИНАМИЧНАЯ МУЗЫКА",
    "Продолжение следует.",
    "Продолжение следует...",
]
STOP_PHRASE_LENGTH_DELTA = 5

In [5]:
PER_SEGMENT_PROCESSORS = dict()
PER_CHANNEL_PROCESSORS = dict()

In [6]:
import os

os.makedirs(SPEAKER1_DIR, exist_ok=True)
os.makedirs(SPEAKER2_DIR, exist_ok=True)

In [7]:
import random

input_file = random.choice(os.listdir(DATASET_DIR))
input_file = f"{DATASET_DIR}/" + input_file
input_file

'../dataset/mix_13100_13137__2025_10_01__09_19_59_723.mp3'

In [8]:
import pydub

audio = pydub.AudioSegment.from_file(input_file)

try:
    left_channel, right_channel = audio.split_to_mono()
except ValueError as ex:
    raise ValueError("Input file must be stereo.") from ex

left_channel.export(SPEAKER1_FILE, format="wav")
right_channel.export(SPEAKER2_FILE, format="wav")
audio.export(ORIG_FILE, format="wav")

<_io.BufferedRandom name='./output/original.wav'>

In [9]:
import whisper
whispermodel = whisper.load_model("large").to(DEVICE)

In [10]:
from typing import Generator, Tuple

def filter_out(segment_data: dict) -> bool:
    # Removing short segments
    if (segment_data["end"] - segment_data["start"]) < MIN_SEGMENT_LENGTH_SEC:
        return True

    # Removing segments that contain stop_phrases
    matching_stop_phrases = [s for s in STOP_PHRASES if s.lower() in segment_data["text"].lower()]
    if matching_stop_phrases:
        # Remove the segment if it contains only a stop phrase
        if max(map(len, matching_stop_phrases)) + STOP_PHRASE_LENGTH_DELTA > len(segment_data["text"].strip()):
            return True
            
    return False

def segmentize(source: str, segments_dir: str) -> Generator[Tuple[dict, str]]:
    transcription = whispermodel.transcribe(source, word_timestamps=True, language='ru')
    audio = pydub.AudioSegment.from_file(source)
    for segment_data in transcription['segments']:
        if filter_out(segment_data):
            continue
        start = float(segment_data['start']) * 1000
        end = float(segment_data['end']) * 1000
        path = f"{segments_dir}/{segment_data['id']}.wav"
        audio[start:end].export(path, format="wav")
        yield (segment_data, path)

In [11]:
def show(vals: dict) -> dict:
    keys = ["id", "start", "end", "text"]
    return {k: vals[0][k] for k in keys}
list(map(show, segmentize(SPEAKER1_FILE, SPEAKER1_DIR)))

[{'id': 0,
  'start': np.float64(18.220000000000002),
  'end': np.float64(21.02),
  'text': ' Доброе утро!'},
 {'id': 1,
  'start': np.float64(30.0),
  'end': np.float64(32.82),
  'text': ' Уже какой-то выбор сделал в пользу какого-то бренда?'},
 {'id': 2,
  'start': np.float64(33.96),
  'end': np.float64(36.92),
  'text': ' Угу, супер, с мобилем познакомились, понимаете, с Drive прошли?'},
 {'id': 3,
  'start': np.float64(40.8),
  'end': np.float64(43.1),
  'text': ' Угу, с комплектацией определились, правильно?'},
 {'id': 4,
  'start': np.float64(46.58),
  'end': np.float64(47.48),
  'text': ' Супер, отлично.'},
 {'id': 5,
  'start': np.float64(48.72),
  'end': np.float64(51.76),
  'text': ' Вы рассматриваете как покупку, в течение какого времени?'},
 {'id': 6,
  'start': np.float64(54.56),
  'end': np.float64(55.16),
  'text': ' Угу.'},
 {'id': 7,
  'start': np.float64(57.18),
  'end': np.float64(59.7),
  'text': ' Так, вы когда тест-драйв проходили,'},
 {'id': 8,
  'start': np.floa

In [12]:
from typing import Callable, Any
import pathlib
import dacite

from analysis_node.messages import MetricCollection
from analysis_node.analysis.postprocessing import WhisperMetrics
from analysis_node.analysis.processors import Processor

def collect_metrics_per_segment(
    segment_file: pathlib.Path | str,
) -> Tuple[dict[str, MetricCollection], dict[str, MetricCollection]]:
    def process(processors: dict[str, Processor]) -> dict[str, MetricCollection]:
        return {
            proc_name: processor.process(segment_file)
            for proc_name, processor in processors.items()
        }

    return (
        process(PER_SEGMENT_PROCESSORS),
        process(PER_CHANNEL_PROCESSORS),
    )

def collect_metrics_per_channel(
    channel_file: pathlib.Path | str,
    segments_dir: pathlib.Path | str,
) -> Generator[
    Tuple[dict[str, MetricCollection], dict[str, MetricCollection], WhisperMetrics],
    None,
    None,
]:
    for segment_data, segment_path in segmentize(channel_file, segments_dir):
        per_segment, per_channel = collect_metrics_per_segment(segment_path)
        whisper_data = dacite.from_dict(
            data_class=WhisperMetrics,
            data=segment_data,
        )
        yield per_segment, per_channel, whisper_data

In [13]:
from analysis_node.analysis.processors import VadEmotionProcessor
from analysis_node.analysis.processors import AgeGenderProcessor

PER_SEGMENT_PROCESSORS["emotion"] = VadEmotionProcessor(DEVICE)
PER_CHANNEL_PROCESSORS["age_gender"] = AgeGenderProcessor("small", DEVICE)

In [14]:
raw_metrics = list(collect_metrics_per_channel(SPEAKER1_FILE, SPEAKER1_DIR))
raw_metrics

[({'emotion': MetricCollection(provider='audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim', metrics=[Metric(name='arousal', type=<MetricType.FLOAT: 'float'>, value=np.float32(0.6565948), unit=None, description=None), Metric(name='dominance', type=<MetricType.FLOAT: 'float'>, value=np.float32(0.62044114), unit=None, description=None), Metric(name='valence', type=<MetricType.FLOAT: 'float'>, value=np.float32(0.5084065), unit=None, description=None)], description=None)},
  {'age_gender': MetricCollection(provider='audeering/wav2vec2-large-robust-6-ft-age-gender', metrics=[Metric(name='age', type=<MetricType.INT: 'int'>, value=np.float32(11.01968), unit='years', description=None), Metric(name='female', type=<MetricType.FLOAT: 'float'>, value=np.float32(0.34786567), unit=None, description=None), Metric(name='male', type=<MetricType.FLOAT: 'float'>, value=np.float32(0.0071976567), unit=None, description=None), Metric(name='child', type=<MetricType.FLOAT: 'float'>, value=np.float32(0.644