In [1]:
import json
import os

In [8]:
# Upgrade pip itself before installing the notebook's dependencies.
# The %pip magic (rather than !pip) installs into the environment of the running kernel.
%pip install --upgrade pip


Collecting pip
  Downloading pip-25.1.1-py3-none-any.whl (1.8 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 21.2.4
    Uninstalling pip-21.2.4:
      Successfully uninstalled pip-21.2.4
Successfully installed pip-25.1.1


## Pyannote VAD

In [None]:
from pyannote.audio import Pipeline
from pydub import AudioSegment

# The Hugging Face access token is read from the HF_TOKEN environment variable so
# it never has to be pasted into (and saved with) the notebook. If the variable is
# unset, None is passed through unchanged.
hf_token = os.environ.get("HF_TOKEN")
pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=hf_token)

manifest_path = "manifest.jsonl"
output_dir = "segmented_audio"
os.makedirs(output_dir, exist_ok=True)


def split_words_evenly(words, num_chunks):
    """Split ``words`` into ``num_chunks`` consecutive text chunks of near-equal size.

    Every word lands in exactly one chunk: when the word count is not divisible
    by ``num_chunks`` the first ``len(words) % num_chunks`` chunks get one extra
    word. When there are fewer words than chunks, the trailing chunks are empty
    strings.

    Args:
        words: list of word strings, in transcript order.
        num_chunks: number of chunks to produce (must be >= 1).

    Returns:
        List of ``num_chunks`` space-joined strings.
    """
    base, extra = divmod(len(words), num_chunks)
    chunks = []
    cursor = 0
    for k in range(num_chunks):
        size = base + (1 if k < extra else 0)
        chunks.append(" ".join(words[cursor:cursor + size]))
        cursor += size
    return chunks


new_manifest = []

with open(manifest_path, "r", encoding="utf-8") as f:
    # Skip blank lines so a stray empty line does not abort the whole run in json.loads.
    lines = [raw for raw in f if raw.strip()]

total_files = len(lines)
print(f"Total files to process: {total_files}")

for idx, line in enumerate(lines):
    item = json.loads(line)
    audio_path = item["audio_filepath"].replace("\\", "/")
    text = item["text"]
    # Base name (without extension) shared by every segment file cut from this recording.
    stem = os.path.splitext(os.path.basename(audio_path))[0]

    print(f"\n[{idx+1}/{total_files}] Processing: {audio_path}")

    try:
        audio = AudioSegment.from_file(audio_path)
    except Exception as e:
        print(f"  [ERROR] Could not load audio: {e}")
        continue

    # Rows are collected per file and only committed once the whole file has been
    # processed, so a failure half-way through does not leave a partial transcript
    # in the manifest. (WAV files already written for that file stay on disk.)
    file_entries = []
    try:
        vad_result = pipeline(audio_path)
        segments = vad_result.get_timeline().support()
        num_segments = len(segments)
        print(f"  Detected {num_segments} speech segments")

        if num_segments == 0:
            print("  [WARN] No speech segments found, saving entire audio as one segment")
            segment_path = os.path.join(output_dir, f"{stem}_segment0.wav")
            audio.export(segment_path, format="wav")
            file_entries.append({
                "audio_filepath": segment_path,
                "text": text
            })
        else:
            # The transcript has no timing information, so words are spread
            # evenly over the detected segments (a heuristic, not an alignment).
            text_chunks = split_words_evenly(text.split(), num_segments)

            for i, (segment, text_chunk) in enumerate(zip(segments, text_chunks)):
                # Segment boundaries are multiplied by 1000 to index the pydub audio in milliseconds.
                start_ms = int(segment.start * 1000)
                end_ms = int(segment.end * 1000)

                segment_path = os.path.join(output_dir, f"{stem}_segment{i}.wav")
                audio[start_ms:end_ms].export(segment_path, format="wav")

                file_entries.append({
                    "audio_filepath": segment_path,
                    "text": text_chunk
                })
    except Exception as e:
        print(f"  [ERROR] VAD processing failed: {e}")
        continue

    new_manifest.extend(file_entries)

with open("manifest_segmented.jsonl", "w", encoding="utf-8") as f_out:
    for entry in new_manifest:
        f_out.write(json.dumps(entry, ensure_ascii=False) + "\n")

print(f"\nCreated segmented dataset with {len(new_manifest)} samples.")


## Silero VAD

In [4]:
%pip install torchaudio pydub
%pip install git+https://github.com/snakers4/silero-vad.git


Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\Admin\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


Collecting git+https://github.com/snakers4/silero-vad.git
  Cloning https://github.com/snakers4/silero-vad.git to c:\users\admin\appdata\local\temp\pip-req-build-l98z9uhc
  Resolved https://github.com/snakers4/silero-vad.git to commit 94811cbe1207ec24bc0f5370b895364b8934936f
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
    Preparing wheel metadata: started
    Preparing wheel metadata: finished with status 'done'
Building wheels for collected packages: silero-vad
  Building wheel for silero-vad (PEP 517): started
  Building wheel for silero-vad (PEP 517): finished with status 'done'
  Created wheel for silero-vad: filename=silero_vad-5.1.2-py3-none-any.whl size=6114613 sha256=a2eb9545cfde8244645482275646dcd175e43476773d5c09323e4399a6868cb2
  Stored in directory: C:\Users\Admin\AppData\Local\Temp\pip-ephem-wheel-cache-

  Running command git clone -q https://github.com/snakers4/silero-vad.git 'C:\Users\Admin\AppData\Local\Temp\pip-req-build-l98z9uhc'
You should consider upgrading via the 'c:\Users\Admin\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [None]:
import os
import json
import torch
from pydub import AudioSegment
from transformers import WhisperProcessor

# Sampling rate the VAD is run at. The speech timestamps are divided by
# SAMPLES_PER_MS below to obtain millisecond offsets for slicing the pydub audio.
SAMPLING_RATE = 16000
SAMPLES_PER_MS = SAMPLING_RATE // 1000

# Load Silero VAD
model, utils = torch.hub.load('snakers4/silero-vad', model='silero_vad', force_reload=False)
(get_speech_timestamps, _, read_audio, _, _) = utils

# Load Whisper tokenizer (used only to check transcript length in tokens)
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3", language="russian", task="transcribe")
tokenizer = processor.tokenizer

# Parameters
manifest_path = "manifest.jsonl"
output_dir = "segmented_audio_silero_v2"
output_manifest = "manifest_silero_v2.jsonl"
max_audio_len_sec = 30
max_tokens = 448
chunk_overlap_ms = 1000  # overlap between consecutive pieces when a long segment is split


def plan_pieces(vad_segments, max_len_ms, overlap_ms):
    """Turn VAD segments into the list of audio pieces to export.

    Segments no longer than ``max_len_ms`` become one piece. Longer segments are
    cut into windows of ``max_len_ms`` that start every ``max_len_ms - overlap_ms``
    milliseconds; splitting stops as soon as a window reaches the end of the
    segment, so no window lies entirely inside the previous one.

    Args:
        vad_segments: list of dicts with integer "start"/"end" sample offsets.
        max_len_ms: maximum piece length in milliseconds.
        overlap_ms: overlap between consecutive windows in milliseconds.

    Returns:
        List of ``(start_ms, end_ms, name_suffix, weight_ms)`` tuples, where
        ``weight_ms`` is the amount of audio the piece adds that earlier pieces
        of the same segment did not cover (weights of one segment sum to its
        duration). The weight drives the proportional text split.

    Raises:
        ValueError: if ``overlap_ms`` is not smaller than ``max_len_ms``.
    """
    stride_ms = max_len_ms - overlap_ms
    if stride_ms <= 0:
        raise ValueError("chunk overlap must be smaller than the maximum piece length")

    pieces = []
    for i, seg in enumerate(vad_segments):
        start_ms = seg["start"] // SAMPLES_PER_MS
        end_ms = seg["end"] // SAMPLES_PER_MS
        duration_ms = end_ms - start_ms

        if duration_ms <= max_len_ms:
            pieces.append((start_ms, end_ms, f"_segment{i}", duration_ms))
            continue

        print(f"  [LONG] Segment {i} is {duration_ms / 1000.0:.1f}s → splitting")
        for j in range(start_ms, end_ms, stride_ms):
            part_end = min(end_ms, j + max_len_ms)
            reaches_end = part_end == end_ms
            weight_ms = (end_ms - j) if reaches_end else stride_ms
            pieces.append((j, part_end, f"_segment{i}_{j}", weight_ms))
            if reaches_end:
                break
    return pieces


def allocate_words(words, weights):
    """Split ``words`` into consecutive text chunks proportional to ``weights``.

    All weights must share one unit. Each chunk except the last ends at the
    proportional cumulative boundary but takes at least one word while words
    remain; the last chunk takes everything left, so no words are dropped.

    Args:
        words: list of word strings, in transcript order.
        weights: one non-negative number per chunk.

    Returns:
        List of ``len(weights)`` space-joined strings (possibly empty strings
        once the words run out).
    """
    total = sum(weights) or 1  # avoid division by zero if every piece is empty
    chunks = []
    cursor = 0
    covered = 0
    last = len(weights) - 1
    for k, weight in enumerate(weights):
        covered += weight
        if k == last:
            end = len(words)
        else:
            target = round(covered / total * len(words))
            end = min(len(words), max(cursor + 1, target))
        chunks.append(" ".join(words[cursor:end]))
        cursor = end
    return chunks


os.makedirs(output_dir, exist_ok=True)
new_manifest = []

# Load the manifest
with open(manifest_path, "r", encoding="utf-8") as f:
    lines = f.readlines()

print(f"Total files to process: {len(lines)}")

for idx, line in enumerate(lines):
    record = json.loads(line)
    audio_path = record["audio_filepath"].replace("\\", "/")
    full_text = record["text"].strip()
    words = full_text.split()
    # Base name (without extension) shared by every file cut from this recording.
    stem = os.path.splitext(os.path.basename(audio_path))[0]

    print(f"\n[{idx+1}/{len(lines)}] Processing: {audio_path}")

    try:
        waveform = read_audio(audio_path, sampling_rate=SAMPLING_RATE)
        vad_segments = get_speech_timestamps(waveform, model, sampling_rate=SAMPLING_RATE)
        audio = AudioSegment.from_file(audio_path)
    except Exception as e:
        print(f"  [ERROR] Audio load failed: {e}")
        continue

    if len(vad_segments) == 0:
        print("  [WARN] No speech found, saving whole audio")
        segment_path = os.path.join(output_dir, f"{stem}_segment0.wav")
        audio.export(segment_path, format="wav")
        new_manifest.append({"audio_filepath": segment_path, "text": full_text})
        continue

    pieces = plan_pieces(vad_segments, max_audio_len_sec * 1000, chunk_overlap_ms)
    # The transcript has no timing information, so text is assigned to pieces in
    # proportion to their duration (a heuristic, not an alignment).
    text_chunks = allocate_words(words, [weight_ms for _, _, _, weight_ms in pieces])

    for (start_ms, end_ms, suffix, _), text_chunk in zip(pieces, text_chunks):
        # Tokenize and check the length before writing anything, so rejected
        # pieces do not leave WAV files on disk that the manifest never references.
        if len(tokenizer(text_chunk)["input_ids"]) > max_tokens:
            print(f"  [SKIP] {stem}{suffix}: transcript exceeds {max_tokens} tokens")
            continue

        segment_path = os.path.join(output_dir, f"{stem}{suffix}.wav")
        audio[start_ms:end_ms].export(segment_path, format="wav")
        new_manifest.append({"audio_filepath": segment_path, "text": text_chunk})

# Save the manifest
with open(output_manifest, "w", encoding="utf-8") as f_out:
    for entry in new_manifest:
        f_out.write(json.dumps(entry, ensure_ascii=False) + "\n")

print(f"\n✅ Done: {len(new_manifest)} segments saved to {output_manifest}")


Using cache found in C:\Users\Admin/.cache\torch\hub\snakers4_silero-vad_master


Total files to process: 5190

[1/5190] Processing: C:/Users/Admin/stuff/DSS/annot/aud (1).wav

[2/5190] Processing: C:/Users/Admin/stuff/DSS/annot/aud (10).wav

[3/5190] Processing: C:/Users/Admin/stuff/DSS/annot/aud (100).wav

[4/5190] Processing: C:/Users/Admin/stuff/DSS/annot/aud (101).wav

[5/5190] Processing: C:/Users/Admin/stuff/DSS/annot/aud (102).wav

[6/5190] Processing: C:/Users/Admin/stuff/DSS/annot/aud (103).wav

[7/5190] Processing: C:/Users/Admin/stuff/DSS/annot/aud (104).wav

[8/5190] Processing: C:/Users/Admin/stuff/DSS/annot/aud (105).wav

[9/5190] Processing: C:/Users/Admin/stuff/DSS/annot/aud (106).wav

[10/5190] Processing: C:/Users/Admin/stuff/DSS/annot/aud (107).wav

[11/5190] Processing: C:/Users/Admin/stuff/DSS/annot/aud (108).wav

[12/5190] Processing: C:/Users/Admin/stuff/DSS/annot/aud (109).wav

[13/5190] Processing: C:/Users/Admin/stuff/DSS/annot/aud (11).wav

[14/5190] Processing: C:/Users/Admin/stuff/DSS/annot/aud (110).wav

[15/5190] Processing: C:/Users/