<a href="https://colab.research.google.com/github/research-clone/notebook_tutorials/blob/main/silero-vad.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install Dependencies

In [1]:
#@title Install and Import Dependencies

# this assumes that you have a relevant version of PyTorch installed
!pip install -q torchaudio

SAMPLING_RATE = 16000

import torch
torch.set_num_threads(1)

from IPython.display import Audio
from pprint import pprint
# download example
torch.hub.download_url_to_file('https://models.silero.ai/vad_models/en.wav', 'en_example.wav')

100%|██████████| 1.83M/1.83M [00:00<00:00, 1.94MB/s]


In [2]:
USE_ONNX = False # change this to True if you want to test onnx model
if USE_ONNX:
    !pip install -q onnxruntime

model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad',
                              force_reload=True,
                              onnx=USE_ONNX)

(get_speech_timestamps,
 save_audio,
 read_audio,
 VADIterator,
 collect_chunks) = utils

Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to /root/.cache/torch/hub/master.zip


In [7]:
# Render an audio player for the example recording downloaded earlier.
Audio('en_example.wav')

## Full Audio

**Speech timestamps from full audio**

In [12]:
wav = read_audio('en_example.wav', sampling_rate=SAMPLING_RATE)
# Get speech timestamps from the full audio file.
speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=SAMPLING_RATE)

# Divide each start/end value by the sampling rate to express it in seconds.
# SAMPLING_RATE is used instead of a hard-coded 16000 so the conversion stays
# in sync with the rate the audio was actually read at.
new_timestamps = [
    {'start': ts['start'] / SAMPLING_RATE, 'end': ts['end'] / SAMPLING_RATE}
    for ts in speech_timestamps
]
pprint(new_timestamps)

[{'end': 1.95, 'start': 0.098},
 {'end': 4.606, 'start': 2.658},
 {'end': 6.782, 'start': 4.962},
 {'end': 10.238, 'start': 9.314},
 {'end': 11.358, 'start': 10.434},
 {'end': 13.246, 'start': 11.458},
 {'end': 14.238, 'start': 13.538},
 {'end': 15.07, 'start': 14.37},
 {'end': 15.806, 'start': 15.33},
 {'end': 17.854, 'start': 16.29},
 {'end': 18.814, 'start': 18.402},
 {'end': 19.486, 'start': 18.978},
 {'end': 26.27, 'start': 20.354},
 {'end': 28.446, 'start': 26.402},
 {'end': 30.686, 'start': 28.674},
 {'end': 32.51, 'start': 30.818},
 {'end': 35.422, 'start': 32.738},
 {'end': 37.566, 'start': 35.778},
 {'end': 38.814, 'start': 37.954},
 {'end': 41.822, 'start': 39.906},
 {'end': 43.23, 'start': 41.986},
 {'end': 44.542, 'start': 43.618},
 {'end': 46.782, 'start': 45.058},
 {'end': 49.918, 'start': 48.866},
 {'end': 53.374, 'start': 51.106},
 {'end': 54.078, 'start': 53.538},
 {'end': 56.478, 'start': 54.498},
 {'end': 57.31, 'start': 56.642},
 {'end': 59.518, 'start': 57.506}]


In [4]:
# Gather every detected speech chunk into one waveform, write it to disk,
# then display a player for the resulting file.
speech_only = collect_chunks(speech_timestamps, wav)
save_audio('only_speech.wav', speech_only, sampling_rate=SAMPLING_RATE)
Audio('only_speech.wav')

## Stream imitation example

In [5]:
## using VADIterator class

# Pass the sampling rate explicitly so the iterator agrees with read_audio below
# instead of relying on its default.
vad_iterator = VADIterator(model, sampling_rate=SAMPLING_RATE)
wav = read_audio('en_example.wav', sampling_rate=SAMPLING_RATE)

# NOTE(review): the chunk sizes the model accepts depend on the silero-vad
# release that torch.hub pulls -- confirm 1536 is still supported.
window_size_samples = 1536 # number of samples in a single audio chunk
for i in range(0, len(wav), window_size_samples):
    chunk = wav[i: i + window_size_samples]
    if len(chunk) < window_size_samples:
        # The trailing partial chunk is not fed to the iterator.
        break
    speech_dict = vad_iterator(chunk, return_seconds=True)
    if speech_dict:
        print(speech_dict, end=' ')
vad_iterator.reset_states() # reset model states after each audio

{'start': 0.3} {'end': 2.0} {'start': 2.8} {'end': 4.7} {'start': 5.1} {'end': 6.8} {'start': 9.4} {'end': 13.4} {'start': 13.6} {'end': 15.2} {'start': 15.4} {'end': 15.9} {'start': 16.4} {'end': 18.0} {'start': 18.5} {'end': 19.6} {'start': 20.4} {'end': 28.5} {'start': 28.8} {'end': 32.6} {'start': 32.8} {'end': 35.5} {'start': 35.9} {'end': 37.7} {'start': 38.1} {'end': 38.9} {'start': 40.0} {'end': 43.3} {'start': 43.7} {'end': 44.7} {'start': 45.2} {'end': 46.9} {'start': 48.9} {'end': 50.0} {'start': 51.2} {'end': 54.2} {'start': 54.6} {'end': 59.6} {'start': 60.0} 

In [6]:
## just probabilities

wav = read_audio('en_example.wav', sampling_rate=SAMPLING_RATE)
speech_probs = []
# NOTE(review): the chunk sizes the model accepts depend on the silero-vad
# release that torch.hub pulls -- confirm 1536 is still supported.
window_size_samples = 1536
for i in range(0, len(wav), window_size_samples):
    chunk = wav[i: i + window_size_samples]
    if len(chunk) < window_size_samples:
        # The trailing partial chunk is not scored.
        break
    # One speech probability per fixed-size chunk.
    speech_prob = model(chunk, SAMPLING_RATE).item()
    speech_probs.append(speech_prob)
# This cell drives `model` directly, so reset the model itself rather than
# reaching for the `vad_iterator` object created in a different cell.
model.reset_states() # reset model states after each audio

print(speech_probs[:10]) # first 10 chunks predicts

[0.06508486717939377, 0.43142661452293396, 0.9363492131233215, 0.9912925362586975, 0.9910984635353088, 0.7554672360420227, 0.9901331067085266, 0.9961254000663757, 0.9948359131813049, 0.9947713017463684]
