In [None]:
import librosa
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [None]:
%pip install nemo_toolkit

In [None]:
# `wget` is used by extract_speakers to download the NeMo inference config.
%pip install wget

In [None]:
from nemo.collections.asr.models import ClusteringDiarizer

In [None]:
from omegaconf import OmegaConf

In [None]:
import json
import os
import librosa
import wget

In [None]:
from librosa import core
from pathlib import Path

In [None]:
def parse_nemo_output(path: str):
    """Read an RTTM-style text file and return its rows as segment dicts.

    Lines whose stripped length is at most one character are ignored. Each
    remaining line is split on whitespace and must contain at least eight
    fields: the 4th is the start time, the 5th the duration and the 8th the
    speaker label.

    Args:
        path: Location of the text file to parse.

    Returns:
        A list of ``{"speaker", "start", "end"}`` dicts in file order, where
        ``speaker`` is the capitalized label and ``end`` is start + duration.

    Raises:
        ValueError: If a line has fewer than eight fields or a time field is
            not a valid float.
    """
    segments = []
    for raw_line in Path(path).read_text().splitlines():
        record = raw_line.strip()
        if len(record) <= 1:
            # Blank (or single-character) lines carry no segment.
            continue
        _, _, _, onset, length, _, _, label, *_ = record.split()
        onset = float(onset)
        length = float(length)
        segments.append(
            {"speaker": label.capitalize(), "start": onset, "end": onset + length}
        )
    return segments

# Main function for extracting speakers
def extract_speakers(path: str, combine: bool = True) -> list:
    """Run NeMo clustering diarization on one audio file and return its segments.

    Side effects: writes ``manifest.json`` and a ``nemo-output`` directory next
    to the audio file, and downloads the NeMo inference config into that
    directory the first time it is needed.

    Args:
        path: Path to the audio file to diarize.
        combine: Currently unused; kept so existing callers keep working.

    Returns:
        The list produced by ``parse_nemo_output`` for the predicted RTTM file.
    """
    audio_path = Path(path)

    # Manifest entry describing the single file to diarize.
    meta = {
        "audio_filepath": path,
        "duration": None,
        "label": "infer",
        "text": "-",
        "rttm_filepath": None,
        "uem_filepath": None,
    }

    # Write the manifest as a single JSON line.
    manifest = audio_path.parent / "manifest.json"
    manifest.write_text(json.dumps(meta) + "\n")

    # Output directory for everything NeMo produces.
    output_dir = audio_path.parent / "nemo-output"
    output_dir.mkdir(exist_ok=True)

    # Download the inference config once and reuse it afterwards; downloading
    # into the directory on every call makes wget pile up "name (1).yaml",
    # "name (2).yaml", ... copies.
    config_url = "https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/speaker_tasks/diarization/conf/inference/diar_infer_general.yaml"
    config_path = output_dir / config_url.rsplit("/", 1)[-1]
    if not config_path.exists():
        wget.download(config_url, str(config_path))
    config = OmegaConf.load(str(config_path))

    # Pretrained models to use.
    pretrained_vad = "vad_multilingual_marblenet"
    pretrained_speaker_model = "titanet_large"

    config.num_workers = 1
    config.diarizer.manifest_filepath = str(manifest)
    # Store a plain string (like manifest_filepath above): not every OmegaConf
    # version accepts pathlib.Path values.
    config.diarizer.out_dir = str(output_dir)

    config.diarizer.speaker_embeddings.model_path = pretrained_speaker_model
    config.diarizer.speaker_embeddings.parameters.window_length_in_sec = [1.5, 1.25, 1.0, 0.75, 0.5]
    config.diarizer.speaker_embeddings.parameters.shift_length_in_sec = [0.75, 0.625, 0.5, 0.375, 0.1]
    config.diarizer.speaker_embeddings.parameters.multiscale_weights = [1, 1, 1, 1, 1]
    config.diarizer.oracle_vad = False
    config.diarizer.clustering.parameters.oracle_num_speakers = False

    # Use the NeMo VAD model.
    config.diarizer.vad.model_path = pretrained_vad
    config.diarizer.vad.parameters.onset = 0.8
    config.diarizer.vad.parameters.offset = 0.6
    config.diarizer.vad.parameters.pad_offset = -0.05

    # Workaround for multi-channel audio: down-mix 2-D input before resampling.
    # NOTE(review): averaging over axis 1 assumes a (samples, channels) layout —
    # confirm against the NeMo/librosa versions in use.
    original_resample = core.resample

    def resample(y, *args, **kwargs):
        if y.ndim == 2:
            y = y.mean(axis=1)
        return original_resample(y, *args, **kwargs)

    core.resample = resample
    try:
        # Run diarization while the patch is active.
        sd_model = ClusteringDiarizer(cfg=config)
        sd_model.diarize()
    finally:
        # Always undo the monkeypatch so repeated calls do not stack wrappers
        # and other code in the session sees the real librosa function.
        core.resample = original_resample

    # Parse the predicted RTTM for this audio file.
    rttm = output_dir / "pred_rttms" / audio_path.with_suffix(".rttm").name
    parsed = parse_nemo_output(str(rttm))

    return parsed

In [None]:
# Audio file to diarize.
# NOTE(review): absolute, machine-specific path — adjust for your environment.
path = '/home/jupyter/datasphere/project/rodion_dir/test_dir/case_5_2.wav'

In [None]:
# Run diarization on the file at `path`; `result` holds the list returned by extract_speakers.
result = extract_speakers(path)