In [1]:
import torch
import torchaudio
from IPython.display import Audio
from pprint import pprint

In [2]:
# Sampling rate handed to the audio helpers and detectors in the cells below.
SAMPLING_RATE = 16000

# Report which torchaudio I/O backend is active.
audio_backend = str(torchaudio.get_audio_backend())
pprint(audio_backend)

'soundfile'


In [3]:
# Download the example recordings: (remote file name, local file name).
EXAMPLES_URL = 'https://models.silero.ai/vad_models/'
for remote_name, local_name in (
    ('ru.wav', 'ru_example.wav'),
    ('ru_num.wav', 'ru_number_example.wav'),
    ('ru.wav', 'ru_lang_example.wav'),  # same remote file, saved under a second name
):
    torch.hub.download_url_to_file(EXAMPLES_URL + remote_name, local_name)

  0%|          | 0.00/1.83M [00:00<?, ?B/s]

  0%|          | 0.00/210k [00:00<?, ?B/s]

  0%|          | 0.00/1.83M [00:00<?, ?B/s]

# 1. Voice detector

### Prepare full file
Run the model over the whole file to get the speech timestamps, then collect the speech segments, leaving out noise and pauses.

In [4]:
# Fetch the voice activity detector and its helper functions from torch.hub.
# force_reload=True makes torch.hub download the repository again on each run.
model, utils = torch.hub.load(
    repo_or_dir='snakers4/silero-vad',
    model='silero_vad',
    force_reload=True,
)
get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks = utils

# Read the example recording at the configured sampling rate.
wav = read_audio('ru_example.wav', sampling_rate=SAMPLING_RATE)

# Run the detector over the complete recording and show the detected segments.
speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=SAMPLING_RATE)
pprint(speech_timestamps)

# Last expression of the cell: an audio player for the original file.
Audio('ru_example.wav')

Downloading: "https://github.com/snakers4/silero-vad/archive/master.zip" to C:\Users\bitzh/.cache\torch\hub\master.zip


[{'end': 34272, 'start': 0},
 {'end': 174048, 'start': 89088},
 {'end': 260064, 'start': 199680},
 {'end': 326112, 'start': 267264},
 {'end': 386016, 'start': 353280},
 {'end': 510432, 'start': 479232},
 {'end': 591840, 'start': 517632},
 {'end': 679392, 'start': 628224},
 {'end': 745440, 'start': 685056},
 {'end': 766944, 'start': 752640},
 {'end': 806880, 'start': 772608},
 {'end': 854496, 'start': 817152},
 {'end': 877536, 'start': 864768},
 {'end': 909792, 'start': 887808},
 {'end': 960000, 'start': 936960}]


In [5]:
# merge all speech chunks to one audio
# Join the detected speech segments and write them out as a single file.
speech_only = collect_chunks(speech_timestamps, wav)
save_audio('only_speech.wav', speech_only, sampling_rate=SAMPLING_RATE)
Audio('only_speech.wav')

### Detect in stream
The start and end times of each speech segment are printed as they are detected. (You can also collect them in a list.)

In [6]:
## using VADIterator class

vad_iterator = VADIterator(model)

window_size_samples = 1536  # number of samples in a single audio chunk

# The range stops at the last position where a complete window still fits,
# so a shorter trailing window is never passed to the iterator.
last_full_start = len(wav) - window_size_samples
for start in range(0, last_full_start + 1, window_size_samples):
    window = wav[start:start + window_size_samples]
    speech_event = vad_iterator(window, return_seconds=True)
    # Only non-empty results are printed (the start/end events).
    if speech_event:
        print(speech_event, end=' ')
vad_iterator.reset_states()  # reset model states after each audio

{'start': 0.1} {'end': 2.2} {'start': 5.6} {'end': 11.0} {'start': 12.5} {'end': 16.4} {'start': 16.8} {'end': 20.5} {'start': 22.1} {'end': 24.2} {'start': 30.0} {'end': 32.0} {'start': 32.4} {'end': 37.1} {'start': 39.3} {'end': 42.6} {'start': 42.9} {'end': 46.7} {'start': 47.1} {'end': 48.0} {'start': 48.4} {'end': 50.5} {'start': 51.1} {'end': 53.5} {'start': 54.1} {'end': 54.9} {'start': 55.6} {'end': 57.0} {'start': 58.6} 

In [7]:
# Raw per-window speech probabilities, obtained by calling the model directly.
window_size_samples = 1536  # number of samples in a single audio chunk

# This cell calls the model itself, so the states are reset on the model
# rather than through a VADIterator object created in a different cell.
# Resetting before the loop makes the result independent of earlier calls.
model.reset_states()
speech_probs = []
for i in range(0, len(wav), window_size_samples):
    chunk = wav[i: i + window_size_samples]
    if len(chunk) < window_size_samples:
        break  # a trailing partial window is not scored
    speech_probs.append(model(chunk, SAMPLING_RATE).item())
model.reset_states()  # reset model states after each audio

print(speech_probs[:10])  # predictions for the first 10 chunks

[0.9485929012298584, 0.9933443069458008, 0.998516857624054, 0.9993320107460022, 0.9993133544921875, 0.999509334564209, 0.9997748732566833, 0.9995781779289246, 0.9996708631515503, 0.9997628331184387]


# 2. Number detector

In [8]:
# Load the number detector from torch.hub.
# `model`, `utils` and the helper names are rebound here, replacing whatever
# these names referred to before this cell ran.
model, utils = torch.hub.load(
    repo_or_dir='snakers4/silero-vad',
    model='silero_number_detector',
    force_reload=True,
)
get_number_ts, save_audio, read_audio, collect_chunks, drop_chunks = utils

Downloading: "https://github.com/snakers4/silero-vad/archive/master.zip" to C:\Users\bitzh/.cache\torch\hub\master.zip


  0%|          | 0.00/2.74M [00:00<?, ?B/s]

In [9]:
# Read the number example at the configured sampling rate (rebinds `wav`).
wav = read_audio('ru_number_example.wav', sampling_rate=SAMPLING_RATE)

# get number timestamps from full audio file
# NOTE: the conversion cell that follows treats these 'start'/'end' values as
# milliseconds and turns them into sample offsets.
number_timestamps = get_number_ts(wav, model)
pprint(number_timestamps)
# Last expression of the cell: an audio player for the input file.
Audio('ru_number_example.wav')

[{'end': 240, 'start': 80},
 {'end': 1760, 'start': 400},
 {'end': 2160, 'start': 2000},
 {'end': 3600, 'start': 2240},
 {'end': 5120, 'start': 3840},
 {'end': 5280, 'start': 5200},
 {'end': 6480, 'start': 5440}]


  return forward_call(*input, **kwargs)


In [10]:
# Convert the millisecond timestamps to sample offsets.
# The values are rebuilt from a fresh detector call instead of being scaled in
# place, so running this cell more than once cannot scale them repeatedly.
# `number_timestamps` is still rebound to the sample-based values, because the
# cells that follow index the audio with it.
number_timestamps = [
    {
        'start': int(timestamp['start'] * SAMPLING_RATE / 1000),
        'end': int(timestamp['end'] * SAMPLING_RATE / 1000),
    }
    for timestamp in get_number_ts(wav, model)
]

# merge all number chunks to one audio
save_audio('only_numbers.wav',
           collect_chunks(number_timestamps, wav), SAMPLING_RATE)
Audio('only_numbers.wav')

In [11]:
# drop all number chunks from audio
save_audio('no_numbers.wav',
           drop_chunks(number_timestamps, wav), SAMPLING_RATE) 
Audio('no_numbers.wav')

# 3. Language detector

In [12]:
# Load the language detector from torch.hub.
# `model`, `utils` and `read_audio` are rebound by this cell.
model, utils = torch.hub.load(
    repo_or_dir='snakers4/silero-vad',
    model='silero_lang_detector',
    force_reload=True,
)
get_language, read_audio = utils

# Read the example recording and print the language reported for it.
wav = read_audio('ru_lang_example.wav', sampling_rate=SAMPLING_RATE)
lang = get_language(wav, model)
print(lang)

# Last expression of the cell: an audio player for the input file.
Audio('ru_lang_example.wav')

Downloading: "https://github.com/snakers4/silero-vad/archive/master.zip" to C:\Users\bitzh/.cache\torch\hub\master.zip


  0%|          | 0.00/2.74M [00:00<?, ?B/s]

ru
