© 2025, Stefan Webb. Some Rights Reserved.

Except where otherwise noted, this work is licensed under a
[Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)](https://creativecommons.org/licenses/by-sa/4.0/deed.en)

In [6]:
from pathlib import Path
from pprint import pprint

import torch

import open_vad
from open_vad import SileroVAD
from open_vad.utils import get_speech_timestamps, read_audio, save_audio

## Load Models

In [8]:
# Load the precompiled TorchScript model and put it in evaluation mode.
jit_model = torch.jit.load("../models/silero_vad.jit").eval()
# Bare expression so the cell still displays the module structure.
jit_model

RecursiveScriptModule(
  original_name=VADRNNJITMerge
  (_model): RecursiveScriptModule(
    original_name=VADRNNJIT
    (stft): RecursiveScriptModule(
      original_name=STFT
      (padding): RecursiveScriptModule(original_name=ReflectionPad1d)
    )
    (encoder): RecursiveScriptModule(
      original_name=Sequential
      (0): RecursiveScriptModule(
        original_name=SileroVadBlock
        (se): RecursiveScriptModule(original_name=Identity)
        (activation): RecursiveScriptModule(original_name=ReLU)
        (reparam_conv): RecursiveScriptModule(original_name=Conv1d)
      )
      (1): RecursiveScriptModule(
        original_name=SileroVadBlock
        (se): RecursiveScriptModule(original_name=Identity)
        (activation): RecursiveScriptModule(original_name=ReLU)
        (reparam_conv): RecursiveScriptModule(original_name=Conv1d)
      )
      (2): RecursiveScriptModule(
        original_name=SileroVadBlock
        (se): RecursiveScriptModule(original_name=Identity)
     

In [9]:
# Build our own PyTorch implementation and copy the precompiled model's
# weights into it.
model = SileroVAD()
model.eval()

# Drop every entry whose key starts with "_model_8k" and strip the leading
# "_model." wrapper prefix from the remaining keys before loading them.
state_dict = {
    name.removeprefix("_model."): tensor
    for name, tensor in jit_model.state_dict().items()
    if not name.startswith('_model_8k')
}
model.load_state_dict(state_dict)

<All keys matched successfully>

# Basic Realtime Voice Activity Detection (VAD)
Implements a simple test of running Silero VAD: offline inference on a sample recording first, followed by real-time inference from the console.

## Test offline inference

In [3]:
# Fetch the sample recording only when it is not already on disk, so that
# re-running the notebook does not download it again or require network access.
if not Path('en_example.wav').exists():
    torch.hub.download_url_to_file('https://models.silero.ai/vad_models/en.wav', 'en_example.wav')

100%|██████████| 1.83M/1.83M [00:01<00:00, 1.69MB/s]


In [10]:
# Sampling rate shared by the audio loader and the VAD calls below.
sampling_rate = 16_000  # Hz
wav = read_audio(
    'en_example.wav',
    sampling_rate=sampling_rate,
)

In [11]:
# Tensor shape and duration in seconds. Dividing by `sampling_rate` instead of
# a repeated literal keeps the duration correct if the rate is ever changed.
wav.shape, wav.numel() / sampling_rate

(torch.Size([960000]), 60.0)

In [13]:
# Detect speech segments over the whole recording with the TorchScript model.
# NOTE(review): start/end look like sample indices (the last segment ends at
# 960000, which equals wav.numel()) - confirm against get_speech_timestamps.
speech_timestamps = get_speech_timestamps(
    wav,
    jit_model,
    sampling_rate=sampling_rate,
)
pprint(speech_timestamps)

[{'end': 33248, 'start': 32},
 {'end': 77792, 'start': 42528},
 {'end': 109536, 'start': 79392},
 {'end': 214496, 'start': 149024},
 {'end': 243168, 'start': 216608},
 {'end': 253408, 'start': 245280},
 {'end': 286688, 'start': 260640},
 {'end': 313824, 'start': 293920},
 {'end': 602080, 'start': 325152},
 {'end': 622048, 'start': 607264},
 {'end': 693216, 'start': 638496},
 {'end': 713184, 'start': 697888},
 {'end': 749536, 'start': 720416},
 {'end': 799200, 'start': 781344},
 {'end': 855008, 'start': 817184},
 {'end': 960000, 'start': 856608}]


In [None]:
# TODO: Debug the following, suspect a simple reshape is required
# Run the same detection with the re-implemented PyTorch `model`; the
# TorchScript `jit_model` is already covered by the earlier cell. The result is
# kept under its own name so `speech_timestamps` from the TorchScript run is
# not overwritten and the two can be compared.
model_speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=sampling_rate)
pprint(model_speech_timestamps)

## Test online inference

In [1]:
# TODO: Stream audio from the microphone with PyAudio