In [1]:
from analysis_node.analysis.processors import *

torchcodec is not installed correctly so built-in audio decoding will fail. Solutions are:
* use audio preloaded in-memory as a {'waveform': (channel, time) torch.Tensor, 'sample_rate': int} dictionary;
* fix torchcodec installation. Error message was:

Deliberately disabling torchcodec.
  available_backends = torchaudio.list_audio_backends()


In [2]:
import torch

In [3]:
if torch.cuda.is_available():
    num_gpus = torch.cuda.device_count()
    for i in range(num_gpus):
        gpu_name = torch.cuda.get_device_name(i)
        print(f"GPU {i}: {gpu_name}")
else:
    print("CUDA is not available.")

GPU 0: AMD Radeon RX 9070 XT
GPU 1: AMD Ryzen 7 7800X3D 8-Core Processor


In [4]:
config = {
    "server": {"name": "my_server"},
    "kafka": {
        "bootstrap_servers": "127.0.0.1:9092",
        "group_id": "analysis_nodes",
        "topics": {
            "incoming": "analysis_requests",
            "outgoing": "metrics_output",
        },
    },
    "models": {
        "device": "cuda:0",
        "whisper": {"model": "large", "lang": "en"},
        "wav2vec2_age_gender": {"num_layers": 24},
    },
    "preprocessing": {
        "min_segment_length_sec": 0.5,
        "min_segment_distance_sec": 1,
        "no_speech_threshold": 1,
        "stop_phrases": [
            "ДИНАМИЧНАЯ МУЗЫКА",
            "Продолжение следует.",
            "Продолжение следует...",
        ],
        "stop_phrase_length_delta": 5,
    },
    "reporting": {"progress_delta": 5},
    "logging": {
        "version": 1,
        "disable_existing_loggers": False,
        "formatters": {
            "standard": {
                "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
            }
        },
        "handlers": {
            "console": {
                "class": "logging.StreamHandler",
                "formatter": "standard",
                "level": "INFO",
            }
        },
        "root": {"handlers": ["console"], "level": "DEBUG"},
    },
}


from analysis_node.config import prepare_config

CONFIG = prepare_config(config)

In [5]:
DATASET_DIR = "../dataset/audio"

OUTPUT_DATASET_PATH = "../dataset/normal.csv"

EXCLUDE_METRICS = ["age_gender"]

In [6]:
from analysis_node.analysis.pipeline import AnalysisPipeline

pipeline = AnalysisPipeline(CONFIG)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…



In [7]:
from pathlib import Path

def get_all_files(directory: str, max_size_mb: float) -> list[str]:
    path = Path(directory)
    if not path.exists() or not path.is_dir():
        raise ValueError(f"{directory} is not a valid directory.")

    max_size_bytes = max_size_mb * 1024 * 1024
    
    all_files = [
        str(file_path)
        for file_path in path.rglob('*')
        if file_path.is_file() and file_path.stat().st_size < max_size_bytes
    ]
    
    return all_files

sources = get_all_files(DATASET_DIR, 3)
sources

['../dataset/mix_13136_16e011__2025_10_01__09_38_42_163.mp3',
 '../dataset/mix_13044_16e010__2025_10_01__09_15_23_187.mp3',
 '../dataset/mix_13053_13135__2025_10_01__09_27_24_800.mp3',
 '../dataset/mix_13112_16e011__2025_10_01__09_24_59_537.mp3',
 '../dataset/mix_13015_16e021__2025_10_01__09_50_43_070.mp3',
 '../dataset/mix_13136_16e029__2025_10_01__09_44_26_680.mp3',
 '../dataset/mix_13142_16e021__2025_10_01__09_53_28_810.mp3',
 '../dataset/mix_13102_16e029__2025_10_01__09_42_41_923.mp3',
 '../dataset/mix_13100_16e004__2025_10_01__09_18_05_197.mp3',
 '../dataset/mix_13099_16e008__2025_10_01__09_16_46_787.mp3',
 '../dataset/mix_13098_13135__2025_10_01__09_38_00_730.mp3',
 '../dataset/mix_13013_16e011__2025_10_01__10_01_29_140.mp3',
 '../dataset/mix_13109_16e010__2025_10_01__09_19_07_507.mp3',
 '../dataset/mix_13044_13102__2025_10_01__09_16_19_207.mp3',
 '../dataset/mix_13099_13136__2025_10_01__09_19_20_820.mp3',
 '../dataset/mix_13102_16e025__2025_10_01__10_00_41_300.mp3',
 '../dataset

In [8]:
import librosa
import tempfile
from analysis_node.analysis.preprocessing import split_audio as split

def split_audio(audio_file: str) -> list[tempfile._TemporaryFileWrapper]:
    y, sr = librosa.load(audio_file, sr=None, mono=False)

    if y.ndim != 2 or y.shape[0] != 2:
        channel_files = list(pipeline.diarizer.process(y, sr))
    else:
        channel_files = split(y, sr)

    return channel_files

In [9]:
from collections import defaultdict

def to_rows(channel_file, exclude_metrics: list[str] = list()) -> dict[str, list]:
    """
    Returns:
    - A columns dict
    """

    output = defaultdict(list)
    
    with channel_file:
        for (
            segment_metrics,
            channel_metrics,
            _
        ) in pipeline._collect_metrics_per_channel(channel_file.name):
            metrics = segment_metrics | channel_metrics
            filtered_metrics = {k: v for k, v in metrics.items() if k not in exclude_metrics}
            flat_metrics = [metric for collection in filtered_metrics.values() for metric in collection.metrics]
            simple_metrics = {m.name: m.value for m in flat_metrics}
            for k, v in simple_metrics.items():
                output[k].append(v)
    return output
            

sample_split = split_audio(sources[0])
to_rows(sample_split[0], EXCLUDE_METRICS)

defaultdict(list,
            {'arousal': [0.4981192648410797,
              0.4388979971408844,
              0.5822452306747437],
             'dominance': [0.554597795009613,
              0.4290090501308441,
              0.5580669641494751],
             'valence': [0.5832354426383972,
              0.38446569442749023,
              0.39135095477104187],
             'neutral': [3.238715410232544,
              2.055671215057373,
              1.5450992584228516],
             'angry': [-2.190589189529419,
              0.013483338057994843,
              -0.2612634301185608],
             'positive': [2.4738056659698486,
              0.19944994151592255,
              0.8034318089485168],
             'sad': [-1.3184692859649658,
              -0.14838550984859467,
              -0.8820557594299316],
             'other': [-3.8666059970855713,
              -3.07854962348938,
              -1.9023798704147339],
             'f0_mean': [258.1654542027255,
              324.66176

In [10]:
sample_rows = list()
for source in sources[:1]:
    for channel_file in split_audio(source):
        sample_rows.append(to_rows(channel_file, EXCLUDE_METRICS))
sample_rows

[defaultdict(list,
             {'arousal': [0.4981192648410797,
               0.4388979971408844,
               0.5822452306747437],
              'dominance': [0.554597795009613,
               0.4290090501308441,
               0.5580669641494751],
              'valence': [0.5832354426383972,
               0.38446569442749023,
               0.39135095477104187],
              'neutral': [3.238715410232544,
               2.055671215057373,
               1.5450992584228516],
              'angry': [-2.190589189529419,
               0.013483338057994843,
               -0.2612634301185608],
              'positive': [2.4738056659698486,
               0.19944994151592255,
               0.8034318089485168],
              'sad': [-1.3184692859649658,
               -0.14838550984859467,
               -0.8820557594299316],
              'other': [-3.8666059970855713,
               -3.07854962348938,
               -1.9023798704147339],
              'f0_mean': [258.165454202725

In [11]:
from itertools import chain

def join_defaultdicts_flat(dds: list[defaultdict]) -> dict:
    if not dds:
        return {}
    
    keys = set(dds[0].keys())
    for dd in dds[1:]:
        if set(dd.keys()) != keys:
            raise ValueError("All defaultdicts must have the same keys.")
    
    joined = {}
    for key in keys:
        joined[key] = list(chain.from_iterable(dd[key] for dd in dds))
    
    return joined

join_defaultdicts_flat(sample_rows)

{'articulation_rate': [5.7894736842105265,
  4.651162790697675,
  5.813953488372093,
  5.862068965517241,
  5.769230769230769,
  5.882352941176471,
  5.208333333333334],
 'neutral': [3.238715410232544,
  2.055671215057373,
  1.5450992584228516,
  3.5948359966278076,
  1.1269172430038452,
  2.820155620574951,
  3.1339542865753174],
 'intensity_std': [8.549604228693232,
  13.868459563526983,
  12.035211534835687,
  19.4242565711698,
  13.03055867822894,
  12.57001118662274,
  11.407267017836444],
 'formant_f1_mean': [322.0524991996592,
  299.4242235091865,
  316.35924182875095,
  316.8804094540115,
  375.5529803233288,
  361.93195057597654,
  371.4100193294459],
 'dominance': [0.554597795009613,
  0.4290090501308441,
  0.5580669641494751,
  0.6446192264556885,
  0.6130695939064026,
  0.6097265481948853,
  0.5983362793922424],
 'formant_f1_std': [76.10131877446508,
  61.952173802351105,
  50.17940116970165,
  120.59280146032658,
  99.10332309955072,
  98.63995350696229,
  88.5424220788583

In [12]:
from tqdm import tqdm

dataset_rows = list()
for source in tqdm(sources, desc="Processing sources"):
    for channel_file in split_audio(source):
        dataset_rows.append(to_rows(channel_file, EXCLUDE_METRICS))
dataset_dict = join_defaultdicts_flat(dataset_rows)

Processing sources:  94%|█████████████████████████████████████████████████████████████████████████████████████████      | 45/48 [26:18<02:47, 55.92s/it]

Output()

  y, sr = librosa.load(audio_file, sr=None, mono=False)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Processing sources:  96%|███████████████████████████████████████████████████████████████████████████████████████████    | 46/48 [27:26<01:59, 59.52s/it]

Output()

  y, sr = librosa.load(audio_file, sr=None, mono=False)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Processing sources:  98%|█████████████████████████████████████████████████████████████████████████████████████████████  | 47/48 [28:04<00:53, 53.00s/it]

Output()

  y, sr = librosa.load(audio_file, sr=None, mono=False)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Processing sources: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [29:28<00:00, 36.85s/it]


In [15]:
import pandas as pd

df = pd.DataFrame.from_dict(dataset_dict)
df.head()

Unnamed: 0,articulation_rate,neutral,intensity_std,formant_f1_mean,dominance,formant_f1_std,talk_time_mean,pauses_num,talk_time_std,pauses_total_duration,...,zcr_std,voiced_ratio,sad,f0_std,spectral_entropy_std,zcr_mean,jitter,spectral_centroid_std,hnr_std,valence
0,5.789474,3.238715,8.549604,322.052499,0.554598,76.101319,0.0,0,0.0,0.0,...,930.682312,0.997361,-1.318469,67.318468,0.094397,1307.124011,11.046585,382.812392,29.666108,0.583235
1,4.651163,2.055671,13.86846,299.424224,0.429009,61.952174,0.0,0,0.0,0.0,...,759.930555,1.0,-0.148386,121.328617,0.103551,1336.309524,17.526502,283.896589,20.761109,0.384466
2,5.813953,1.545099,12.035212,316.359242,0.558067,50.179401,0.0,0,0.0,0.0,...,587.751329,1.0,-0.882056,58.374491,0.092568,1340.47619,12.439938,222.598806,5.7336,0.391351
3,5.862069,3.594836,19.424257,316.880409,0.644619,120.592801,1.4,1,0.64,0.28,...,832.108355,0.990506,-3.255889,116.716242,0.163133,1043.934599,16.577902,371.97858,8.896209,0.525628
4,5.769231,1.126917,13.030559,375.55298,0.61307,99.103323,0.0,0,0.0,0.0,...,594.957956,0.986188,-1.953436,106.443649,0.084982,1488.35175,22.227037,266.217842,36.435997,0.508493


In [20]:
df.to_csv(OUTPUT_DATASET_PATH, index=False)