In [1]:
# Step one directory up and print the resulting working directory.
# NOTE(review): presumably this targets the repository root so the `src.*` imports
# in later cells resolve — confirm. The path is relative, so re-running this cell
# climbs one more level each time; run it once per kernel.
%cd ../
!pwd

/home/timur.bikbulatov/personal/aa_on_vad
/home/timur.bikbulatov/personal/aa_on_vad


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
import plotly.graph_objects as go
import numpy as np

def plot(y: list, title: str = 'Two Line Charts on One Plot'):
    """Draw every series in ``y`` as a line on one shared Plotly figure.

    Each series is plotted against its own sample index (``0 .. len - 1``)
    and labelled ``arg # <position>`` (1-based) in the legend.

    Args:
        y: Sequence of 1-D series; each element must support ``len()``.
        title: Figure title. The default is the historical title and is kept
            for backward compatibility, even though any number of series can
            be drawn.
    """
    # Every entry is a CSS named colour. The palette is reused cyclically, so
    # passing more series than colours no longer raises IndexError.
    colors = [
        'Blue',
        'Orange',
        'Green',
        'Red',
        'Purple',
        'Magenta',
        'Cyan',
        'Brown',
        'Pink',
        'Lime',
        'Yellow',
        'Teal',
        'Olive',
        'Navy',
        'Maroon',
        'Coral',
        'Gold',
        'Indigo',
        'Turquoise',
        'Lavender',
        'Aquamarine',  # was 'Mint', which is not a CSS named colour
        'Silver',
    ]

    traces = [
        go.Scatter(
            x=np.arange(len(series)),
            y=series,
            mode='lines',
            name=f'arg # {position + 1}',
            line=dict(color=colors[position % len(colors)]),
        )
        for position, series in enumerate(y)
    ]

    # Combine all traces into one figure
    fig = go.Figure(data=traces)

    # Set the layout
    fig.update_layout(
        title=title,
        xaxis_title='X-axis',
        yaxis_title='Y-axis',
        showlegend=True
    )

    # Display the plot
    fig.show()

In [3]:
from src.evaluation.evaluator import VADEvaluator

In [4]:
from src.datasets.mixed_vad_datset import get_datset

In [5]:
import torch.nn.functional as F
import torch


@torch.no_grad()
def get_vad_mask(
    audio: torch.Tensor,
    model,
    threshold: float = 0.5,
    sample_rate: int = 16000,
    window_size_samples: int = 512
) -> torch.Tensor:
    """
    Convert window-level VAD model predictions into a per-sample binary mask.

    The audio is cut into consecutive, non-overlapping windows of
    ``window_size_samples``. The last window is zero-padded on the right
    before being passed to the model. Every sample in a window gets ``1.0``
    when the model's output for that window is ``>= threshold``, else ``0.0``.

    Args:
        audio: Mono audio. Singleton dimensions are squeezed away, so shapes
            such as ``(T,)``, ``(1, T)`` or ``(1, 1, T)`` are accepted.
            Non-tensor input is converted with ``torch.tensor``.
        model: Callable invoked as ``model(chunk, sample_rate)`` that returns
            a single-element tensor. If it has a ``reset_states`` method, that
            method is called once before the first window.
        threshold: Speech probability threshold (inclusive).
        sample_rate: Audio sampling rate. Rates that are a multiple of 16000
            (and larger) are reduced to 16000 by keeping every ``step``-th
            sample.
        window_size_samples: Window size used for processing.
            NOTE(review): the model may only accept specific window sizes per
            sample rate — confirm against the model's documentation.

    Returns:
        1-D tensor of zeros and ones, created with ``torch.zeros`` defaults
        (it is not moved to the device of ``audio``). Its length equals the
        length of the audio *after* the optional reduction to 16 kHz, so it
        matches the input length only when no reduction took place.

    Raises:
        ValueError: If the audio still has more than one dimension after
            squeezing (e.g. multi-channel or batched input).
    """
    # Ensure audio is a 1D tensor
    if not torch.is_tensor(audio):
        audio = torch.tensor(audio)
    audio = audio.squeeze()
    if audio.dim() == 0:
        # A one-sample input squeezes to a scalar; len() would fail on it.
        audio = audio.reshape(1)
    elif audio.dim() != 1:
        raise ValueError(
            f"get_vad_mask expects mono audio, got shape {tuple(audio.shape)} after squeeze"
        )

    # Handle sample_rate: plain decimation down to 16 kHz
    if sample_rate > 16000 and (sample_rate % 16000 == 0):
        step = sample_rate // 16000
        sample_rate = 16000
        audio = audio[::step]

    # Reset model states so earlier calls do not influence this one
    if hasattr(model, 'reset_states'):
        model.reset_states()

    # Initialize mask
    audio_length = len(audio)
    mask = torch.zeros(audio_length)

    # Process audio in windows
    for start_idx in range(0, audio_length, window_size_samples):
        end_idx = min(start_idx + window_size_samples, audio_length)
        chunk = audio[start_idx:end_idx]

        # Pad last chunk if needed so the model always sees a full window
        missing = window_size_samples - len(chunk)
        if missing > 0:
            chunk = F.pad(chunk, (0, missing))

        # Get prediction
        speech_prob = model(chunk, sample_rate).item()

        # Fill mask for this window (only the un-padded part)
        mask[start_idx:end_idx] = float(speech_prob >= threshold)

    return mask

In [6]:
from torch.utils.data import DataLoader
from tqdm import tqdm

def validate_silero_vad(model, dataset, device, threshold: float = 0.5, sample_rate: int = 16000):
    """
    Validate a Silero VAD model on a dataset using the VADEvaluator.

    Samples are processed one at a time (the DataLoader batch size is fixed
    at 1) because ``get_vad_mask`` squeezes its input to a single 1-D
    waveform.

    Args:
        model: Silero VAD model instance
        dataset: Dataset whose items are dicts with a ``'sample'`` entry
            (audio) and a ``'mask'`` entry (reference mask)
        device: torch device the audio and reference mask are moved to
        threshold: Speech probability threshold forwarded to ``get_vad_mask``
        sample_rate: Sampling rate forwarded to ``get_vad_mask``

    Returns:
        Whatever ``VADEvaluator.compute()`` returns after all samples have
        been accumulated.
    """
    dataloader = DataLoader(dataset, batch_size=1, shuffle=False)
    # NOTE(review): the evaluator's own threshold stays at 0.5 because the
    # predicted mask is already binary — confirm against VADEvaluator.
    evaluator = VADEvaluator(threshold=0.5)
    model.eval()
    with torch.no_grad():
        for batch in tqdm(dataloader):
            # Move data to device
            wavs = batch['sample'].to(device)
            masks_true = batch['mask'].to(device)

            # Predict the per-sample mask for the single audio in this batch
            mask_pred = get_vad_mask(
                wavs,
                model,
                threshold=threshold,
                sample_rate=sample_rate)

            # Update evaluator
            evaluator.update(mask_pred, masks_true)

    # Compute and return metrics
    return evaluator.compute()

In [7]:
import torch

# Limit PyTorch to a single thread for CPU intra-op parallelism.
torch.set_num_threads(1)

# Load the Silero VAD model through torch.hub. `utils` is returned alongside it
# but is not used in the cells visible here.
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad')
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)



Using cache found in /home/timur.bikbulatov/.cache/torch/hub/snakers4_silero-vad_master


In [8]:
# Create the dataset used for evaluation via the project loader (mode='test').
# NOTE(review): the effect of `erase_silence=True` is defined in
# src.datasets.mixed_vad_datset and is not visible here — confirm.
dataset = get_datset(mode='test', erase_silence=True)

In [9]:
# Run the VAD on the first dataset item to eyeball one prediction.
sample = dataset[0]
wav = sample['sample']
maskT = sample['mask']  # mask stored with the sample (presumably the ground truth)
# Use the same `device` the model was moved to instead of a hard-coded
# 'cuda:0', so this cell also runs on CPU-only machines.
mask = get_vad_mask(wav.to(device), model)

  audio = torch.tensor(self.audios[idx], dtype=torch.float32)


In [None]:
# Overlay the waveform, the predicted mask and the mask stored with the sample.
plot([wav, mask, maskT])

In [11]:
# Run validation over the whole dataset and keep the metrics it returns.
metrics = validate_silero_vad(model, dataset, device)


To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).

100%|██████████| 262/262 [00:12<00:00, 21.79it/s]


In [12]:
# Print every metric on its own line with four decimal places.
# The `:.4f` format requires each value to be numeric.
print("Validation metrics:")
for metric_name, value in metrics.items():
    print(f"{metric_name}: {value:.4f}")

Validation metrics:
precision: 0.8470
recall: 0.9164
f1_score: 0.8803
accuracy: 0.8978
false_positive_rate: 0.1151
true_positive_rate: 0.9164
