# PyTorch Examples

## VAD

### Install Dependencies

In [1]:
#@title Install and Import Dependencies

# this assumes that you have a relevant version of PyTorch installed
!pip install -q torchaudio soundfile

import glob
import torch
torch.set_num_threads(1)

from IPython.display import Audio
from pprint import pprint

model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad',
                              force_reload=True)

(get_speech_ts,
 get_speech_ts_adaptive,
 save_audio,
 read_audio,
 state_generator,
 single_audio_stream,
 collect_chunks) = utils

files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'

[?25l[K     |▏                               | 10kB 16.5MB/s eta 0:00:01[K     |▍                               | 20kB 20.7MB/s eta 0:00:01[K     |▌                               | 30kB 23.6MB/s eta 0:00:01[K     |▊                               | 40kB 24.5MB/s eta 0:00:01[K     |▉                               | 51kB 25.9MB/s eta 0:00:01[K     |█                               | 61kB 23.6MB/s eta 0:00:01[K     |█▏                              | 71kB 19.5MB/s eta 0:00:01[K     |█▍                              | 81kB 20.3MB/s eta 0:00:01[K     |█▌                              | 92kB 18.4MB/s eta 0:00:01[K     |█▊                              | 102kB 17.6MB/s eta 0:00:01[K     |█▉                              | 112kB 17.6MB/s eta 0:00:01[K     |██                              | 122kB 17.6MB/s eta 0:00:01[K     |██▏                             | 133kB 17.6MB/s eta 0:00:01[K     |██▍                             | 143kB 17.6MB/s eta 0:00:01[K     |██▌          

Downloading: "https://github.com/snakers4/silero-vad/archive/master.zip" to /root/.cache/torch/hub/master.zip


### Full Audio

**Classic way of getting speech chunks; you may need to select the thresholds yourself**

In [2]:
# Load the example English recording.
wav = read_audio(f'{files_dir}/en.wav')
# Classic detection over the whole file; the result is pretty-printed below.
speech_timestamps = get_speech_ts(wav, model, num_steps=4)
pprint(speech_timestamps)

  result = self.forward(*input, **kwargs)


[{'end': 35000, 'start': 0},
 {'end': 112000, 'start': 35000},
 {'end': 124000, 'start': 112000},
 {'end': 320000, 'start': 143000},
 {'end': 628000, 'start': 319000},
 {'end': 752000, 'start': 632000},
 {'end': 801000, 'start': 775000},
 {'end': 960000, 'start': 811000}]


In [None]:
# Join every detected speech chunk into one clip, write it out, and embed a player.
save_audio(
    'only_speech.wav',
    collect_chunks(speech_timestamps, wav),
    16000,
)
Audio('only_speech.wav')

**Experimental adaptive method: the algorithm selects the thresholds itself (see the README for more information)**

In [3]:
# Load the example English recording.
wav = read_audio(f'{files_dir}/en.wav')
# Adaptive detection over the whole file; the result is pretty-printed below.
speech_timestamps = get_speech_ts_adaptive(
    wav,
    model,
    step=500,
    num_samples_per_window=4000,
)
pprint(speech_timestamps)

[{'end': 35000, 'start': 0},
 {'end': 112000, 'start': 35500},
 {'end': 246000, 'start': 142500},
 {'end': 288500, 'start': 251500},
 {'end': 315500, 'start': 289500},
 {'end': 603500, 'start': 318000},
 {'end': 623000, 'start': 606500},
 {'end': 713000, 'start': 631000},
 {'end': 728500, 'start': 712000},
 {'end': 748500, 'start': 726500},
 {'end': 798500, 'start': 775000},
 {'end': 899500, 'start': 811000},
 {'end': 914000, 'start': 897000},
 {'end': 962000, 'start': 913000}]


In [None]:
# Join every detected speech chunk into one clip, write it out, and embed a player.
save_audio(
    'only_speech.wav',
    collect_chunks(speech_timestamps, wav),
    16000,
)
Audio('only_speech.wav')

### Single Audio Stream

**Classic way of getting speech chunks; you may need to select the thresholds yourself**

In [4]:
# Here `wav` is the path to the file, not a loaded waveform.
wav = f'{files_dir}/en.wav'

# Iterate over the stream and print only the non-empty batches.
for batch in single_audio_stream(model, wav):
    if not batch:
        continue
    print(batch)

[{4000: 'start'}]
[{39000: 'end'}]
[{43000: 'start'}]
[{115500: 'end'}]
[{121500: 'start'}]
[{127500: 'end'}]
[{150500: 'start'}]
[{291000: 'end'}]
[{295000: 'start'}]
[{322000: 'end'}]
[{326500: 'start'}]
[{631500: 'end'}]
[{640500: 'start'}]
[{755000: 'end'}]
[{782500: 'start'}]
[{804500: 'end'}]
[{818500: 'start'}]


**Experimental adaptive method: the algorithm selects the thresholds itself (see the README for more information)**

In [5]:
# Here `wav` is the path to the file, not a loaded waveform.
wav = f'{files_dir}/en.wav'

# Same streaming loop as the classic variant, but with the adaptive iterator.
for batch in single_audio_stream(model, wav, iterator_type='adaptive'):
    if not batch:
        continue
    print(batch)

[{2000: 'start'}]
[{40000: 'end'}]
[{44000: 'start'}]
[{115500: 'end'}]
[{151000: 'start'}]
[{251000: 'end'}]
[{260000: 'start'}]
[{291500: 'end'}]
[{298000: 'start'}]
[{320500: 'end'}]
[{326500: 'start'}]
[{612500: 'end'}]
[{615000: 'start'}]
[{628000: 'end'}]
[{639500: 'start'}]
[{718500: 'end'}]
[{720500: 'start'}]
[{755500: 'end'}]
[{783500: 'start'}]
[{805000: 'end'}]
[{819500: 'start'}]
[{902000: 'end'}]
[{905500: 'start'}]
[{921000: 'start'}]


### Multiple Audio Streams

In [None]:
# Gather every .wav file in the examples directory.
audios_for_stream = glob.glob(f'{files_dir}/*.wav')
# Displayed as the cell result.
len(audios_for_stream)  # total 4 audios

In [None]:
# Process the files with audios_in_stream=2 and print only non-empty batches.
for batch in state_generator(model, audios_for_stream, audios_in_stream=2):  # 2 audio stream
    if not batch:
        continue
    pprint(batch)

## Number detector

### Install Dependencies

In [None]:
#@title Install and Import Dependencies

# this assumes that you have a relevant version of PyTorch installed
!pip install -q torchaudio soundfile

import glob
import torch
torch.set_num_threads(1)

from IPython.display import Audio
from pprint import pprint

model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_number_detector',
                              force_reload=True)

(get_number_ts,
 save_audio,
 read_audio,
 collect_chunks,
 drop_chunks) = utils

files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'

### Full audio

In [None]:
# Load the example recording used for number detection.
wav = read_audio(f'{files_dir}/en_num.wav')
# Detect numbers over the whole file and pretty-print the timestamps.
number_timestamps = get_number_ts(wav, model)
pprint(number_timestamps)

In [None]:
# Sampling rate used for the ms -> sample-index conversion and for saving audio below.
sample_rate = 16000
# convert ms in timestamps to samples
# NOTE: each entry of `number_timestamps` is rewritten in place, so running this
# cell a second time would scale the already-converted values again. Re-run the
# cell that computes `number_timestamps` before re-running this one.
for timestamp in number_timestamps:
    timestamp['start'] = int(timestamp['start'] * sample_rate / 1000)
    timestamp['end'] = int(timestamp['end'] * sample_rate / 1000)

In [None]:
# Join every detected number chunk into one clip, write it out, and embed a player.
save_audio(
    'only_numbers.wav',
    collect_chunks(number_timestamps, wav),
    sample_rate,
)
Audio('only_numbers.wav')

In [None]:
# Remove every detected number chunk from the audio, write the rest out, and embed a player.
save_audio(
    'no_numbers.wav',
    drop_chunks(number_timestamps, wav),
    sample_rate,
)
Audio('no_numbers.wav')

## Language detector

### Install Dependencies

In [None]:
#@title Install and Import Dependencies

# this assumes that you have a relevant version of PyTorch installed
!pip install -q torchaudio soundfile

import glob
import torch
torch.set_num_threads(1)

from IPython.display import Audio
from pprint import pprint

model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_lang_detector',
                              force_reload=True)

(get_language,
 read_audio) = utils

files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'

### Full audio

In [None]:
# Load the example English recording and print the language reported for it.
wav = read_audio(f'{files_dir}/en.wav')
lang = get_language(
    wav,
    model,
)
print(lang)

# ONNX Example

## VAD

### Install Dependencies

In [6]:
#@title Install and Import Dependencies

# this assumes that you have a relevant version of PyTorch installed
!pip install -q torchaudio soundfile onnxruntime

import glob
import onnxruntime
from pprint import pprint

from IPython.display import Audio

_, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad',
                              force_reload=True)

(get_speech_ts,
 get_speech_ts_adaptive,
 save_audio,
 read_audio,
 state_generator,
 single_audio_stream,
 collect_speeches) = utils

files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'

def init_onnx_model(model_path: str):
    """Create and return an ONNX Runtime inference session for ``model_path``."""
    session = onnxruntime.InferenceSession(model_path)
    return session

def validate_onnx(model, inputs):
    """Run ``model`` on ``inputs`` and return its first output as a torch tensor.

    ``inputs`` is moved to CPU and converted to a NumPy array, then passed to
    ``model.run`` under the feed name ``'input'``. Every returned output is
    wrapped with ``torch.Tensor``; only the first one is returned.
    """
    with torch.no_grad():
        feed = {'input': inputs.cpu().numpy()}
        tensors = list(map(torch.Tensor, model.run(None, feed)))
    return tensors[0]

[K     |████████████████████████████████| 4.1MB 19.5MB/s 
[?25h

Downloading: "https://github.com/snakers4/silero-vad/archive/master.zip" to /root/.cache/torch/hub/master.zip


### Full Audio

**Classic way of getting speech chunks; you may need to select the thresholds yourself**

In [7]:
# Open the ONNX VAD model and load the example English recording.
model = init_onnx_model(f'{files_dir}/model.onnx')
wav = read_audio(f'{files_dir}/en.wav')

# Classic detection over the whole file, with validate_onnx passed as the run function.
speech_timestamps = get_speech_ts(
    wav,
    model,
    num_steps=4,
    run_function=validate_onnx,
)
pprint(speech_timestamps)

[{'end': 33000, 'start': 0},
 {'end': 112000, 'start': 35000},
 {'end': 287000, 'start': 143000},
 {'end': 317000, 'start': 287000},
 {'end': 623000, 'start': 319000},
 {'end': 752000, 'start': 632000},
 {'end': 801000, 'start': 775000},
 {'end': 960000, 'start': 811000}]


In [None]:
# Join every detected speech chunk into one clip, write it out, and embed a player.
save_audio(
    'only_speech.wav',
    collect_chunks(speech_timestamps, wav),
    16000,
)
Audio('only_speech.wav')

**Experimental adaptive method: the algorithm selects the thresholds itself (see the README for more information)**

In [8]:
# Open the ONNX VAD model and load the example English recording.
model = init_onnx_model(f'{files_dir}/model.onnx')
wav = read_audio(f'{files_dir}/en.wav')

# Adaptive detection over the whole file, with validate_onnx passed as the run function.
speech_timestamps = get_speech_ts_adaptive(
    wav,
    model,
    run_function=validate_onnx,
)
pprint(speech_timestamps)

[{'end': 35000, 'start': 0},
 {'end': 112500, 'start': 34500},
 {'end': 245000, 'start': 140000},
 {'end': 286500, 'start': 251500},
 {'end': 315000, 'start': 285000},
 {'end': 527500, 'start': 316500},
 {'end': 603500, 'start': 524500},
 {'end': 623500, 'start': 606500},
 {'end': 713000, 'start': 629500},
 {'end': 738500, 'start': 711500},
 {'end': 751000, 'start': 735000},
 {'end': 797500, 'start': 772500},
 {'end': 883000, 'start': 809000},
 {'end': 914500, 'start': 897000},
 {'end': 962000, 'start': 911500}]


In [None]:
# Join every detected speech chunk into one clip, write it out, and embed a player.
save_audio(
    'only_speech.wav',
    collect_chunks(speech_timestamps, wav),
    16000,
)
Audio('only_speech.wav')

NameError: name 'save_audio' is not defined

### Single Audio Stream

**Classic way of getting speech chunks; you may need to select the thresholds yourself**

In [9]:
# Fresh ONNX session for the streaming example.
model = init_onnx_model(
    f'{files_dir}/model.onnx'
)
# Here `wav` is the path to the file, not a loaded waveform.
wav = f'{files_dir}/en.wav'

In [10]:
# Iterate over the stream with the ONNX model and print only the non-empty batches.
for batch in single_audio_stream(model, wav, run_function=validate_onnx):
    if not batch:
        continue
    pprint(batch)

[{4000: 'start'}]
[{37000: 'end'}]
[{43000: 'start'}]
[{115500: 'end'}]
[{150500: 'start'}]
[{291000: 'end'}]
[{294500: 'start'}]
[{321000: 'end'}]
[{326500: 'start'}]
[{627000: 'end'}]
[{639000: 'start'}]
[{718000: 'end'}]
[{721000: 'start'}]
[{755500: 'end'}]
[{783000: 'start'}]
[{804000: 'end'}]
[{818500: 'start'}]


**Experimental adaptive method: the algorithm selects the thresholds itself (see the README for more information)**

In [11]:
# Fresh ONNX session for the adaptive streaming example.
model = init_onnx_model(
    f'{files_dir}/model.onnx'
)
# Here `wav` is the path to the file, not a loaded waveform.
wav = f'{files_dir}/en.wav'

In [12]:
# Same streaming loop as the classic ONNX variant, but with the adaptive iterator.
for batch in single_audio_stream(
    model,
    wav,
    iterator_type='adaptive',
    run_function=validate_onnx,
):
    if not batch:
        continue
    pprint(batch)

[{0: 'start'}]
[{38000: 'end'}]
[{43000: 'start'}]
[{115000: 'end'}]
[{148500: 'start'}]
[{250500: 'end'}]
[{260000: 'start'}]
[{292000: 'end'}]
[{293500: 'start'}]
[{320000: 'end'}]
[{325000: 'start'}]
[{548000: 'end'}]
[{547500: 'start'}]
[{613000: 'end'}]
[{615000: 'start'}]
[{626500: 'end'}]
[{638000: 'start'}]
[{697500: 'start'}]
[{718000: 'end'}]
[{720000: 'start'}]
[{756000: 'end'}]
[{781000: 'start'}]
[{804500: 'end'}]
[{817500: 'start'}]
[{872000: 'end'}]
[{871000: 'start'}]
[{902000: 'end'}]
[{905500: 'start'}]
[{920500: 'end'}]
[{920000: 'start'}]


### Multiple Audio Streams

In [None]:
# Fresh ONNX session for the multi-stream example.
model = init_onnx_model(f'{files_dir}/model.onnx')
# Gather every .wav file in the examples directory and print how many were found.
audios_for_stream = glob.glob(f'{files_dir}/*.wav')
pprint(
    len(audios_for_stream)
)  # total 4 audios

In [None]:
# Process the files with audios_in_stream=2 and print only non-empty batches.
for batch in state_generator(
    model,
    audios_for_stream,
    audios_in_stream=2,  # 2 audio stream
    run_function=validate_onnx,
):
    if not batch:
        continue
    pprint(batch)

## Number detector

### Install Dependencies

In [None]:
#@title Install and Import Dependencies

# this assumes that you have a relevant version of PyTorch installed
!pip install -q torchaudio soundfile onnxruntime

import glob
import torch
import onnxruntime
from pprint import pprint

from IPython.display import Audio

_, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_number_detector',
                              force_reload=True)

(get_number_ts,
 save_audio,
 read_audio,
 collect_chunks,
 drop_chunks) = utils

files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'

def init_onnx_model(model_path: str):
    """Create and return an ONNX Runtime inference session for ``model_path``."""
    session = onnxruntime.InferenceSession(model_path)
    return session

def validate_onnx(model, inputs):
    """Run ``model`` on ``inputs`` and return all of its outputs as torch tensors.

    ``inputs`` is moved to CPU and converted to a NumPy array, then passed to
    ``model.run`` under the feed name ``'input'``. The returned list holds one
    ``torch.Tensor`` per model output, in the order the model produced them.
    """
    with torch.no_grad():
        feed = {'input': inputs.cpu().numpy()}
        tensors = list(map(torch.Tensor, model.run(None, feed)))
    return tensors

### Full Audio

In [None]:
# Open the ONNX number-detector model and load the example recording.
model = init_onnx_model(f'{files_dir}/number_detector.onnx')
wav = read_audio(f'{files_dir}/en_num.wav')

# Detect numbers over the whole file, with validate_onnx passed as the run function.
number_timestamps = get_number_ts(
    wav,
    model,
    run_function=validate_onnx,
)
pprint(number_timestamps)

In [None]:
# Sampling rate used for the ms -> sample-index conversion below.
sample_rate = 16000
# convert ms in timestamps to samples
# NOTE: each entry of `number_timestamps` is rewritten in place, so running this
# cell a second time would scale the already-converted values again. Re-run the
# cell that computes `number_timestamps` before re-running this one.
for timestamp in number_timestamps:
    timestamp['start'] = int(timestamp['start'] * sample_rate / 1000)
    timestamp['end'] = int(timestamp['end'] * sample_rate / 1000)

In [None]:
# merge all number chunks to one audio
# Uses `sample_rate` from the conversion cell above rather than a second
# hard-coded 16000, so the saved file always matches the rate the timestamps
# were converted with.
save_audio('only_numbers.wav',
           collect_chunks(number_timestamps, wav), sample_rate)
Audio('only_numbers.wav')

In [None]:
# drop all number chunks from audio
# Uses `sample_rate` from the conversion cell above rather than a second
# hard-coded 16000, so the saved file always matches the rate the timestamps
# were converted with.
save_audio('no_numbers.wav',
           drop_chunks(number_timestamps, wav), sample_rate)
Audio('no_numbers.wav')

## Language detector

### Install Dependencies

In [None]:
#@title Install and Import Dependencies

# this assumes that you have a relevant version of PyTorch installed
!pip install -q torchaudio soundfile onnxruntime

import glob
import torch
import onnxruntime
from pprint import pprint

from IPython.display import Audio

_, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_lang_detector',
                              force_reload=True)

(get_language,
 read_audio) = utils

files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'

def init_onnx_model(model_path: str):
    """Create and return an ONNX Runtime inference session for ``model_path``."""
    session = onnxruntime.InferenceSession(model_path)
    return session

def validate_onnx(model, inputs):
    """Run ``model`` on ``inputs`` and return all of its outputs as torch tensors.

    ``inputs`` is moved to CPU and converted to a NumPy array, then passed to
    ``model.run`` under the feed name ``'input'``. The returned list holds one
    ``torch.Tensor`` per model output, in the order the model produced them.
    """
    with torch.no_grad():
        feed = {'input': inputs.cpu().numpy()}
        tensors = list(map(torch.Tensor, model.run(None, feed)))
    return tensors

### Full Audio

In [None]:
# NOTE(review): this loads number_detector.onnx for language detection.
# Presumably the same model file serves both tasks; confirm against the
# files shipped in the hub checkout before changing the path.
model = init_onnx_model(f'{files_dir}/number_detector.onnx')
wav = read_audio(f'{files_dir}/en.wav')

# Print the language reported for the recording, with validate_onnx as the run function.
lang = get_language(
    wav,
    model,
    run_function=validate_onnx,
)
print(lang)