In [None]:
%%writefile voiceprint_local.py


import os
import time
import json
from dataclasses import dataclass
from pathlib import Path

import numpy as np
import sounddevice as sd
import soundfile as sf
import librosa
import torch
import torchaudio
from speechbrain.inference.speaker import EncoderClassifier


# -----------------------------
# Configuration
# -----------------------------
@dataclass
class Config:
    """Tunable settings for recording, preprocessing and speaker matching."""

    # --- Audio capture ---
    # Sample rate in Hz used for recording, loading and the encoder input.
    sr: int = 16_000
    # Number of input channels requested from the recording device.
    channels: int = 1

    # --- Session lengths (usage not visible in this part of the file;
    # presumably takes per enrollment and seconds per recording) ---
    enroll_takes: int = 3
    enroll_seconds: int = 4
    verify_seconds: int = 4

    # --- Preprocessing quality gates (read by preprocess_wav) ---
    # `top_db` passed to librosa.effects.trim for silence trimming.
    trim_top_db: int = 25
    # Shortest acceptable clip, in seconds, after trimming.
    min_duration_sec: float = 1.2
    # Peak-amplitude threshold for the "too quiet" check.
    min_peak: float = 0.02
    # Largest tolerated fraction of samples whose magnitude exceeds 0.98.
    max_clip_ratio: float = 0.01

    # --- Matching / storage ---
    # NOTE(review): presumably the cosine-similarity acceptance threshold;
    # its use is not visible here -- confirm against the verify code.
    threshold: float = 0.75
    # Root directory under which per-user data is created.
    db_dir: str = "voice_db"


# Module-wide settings instance (all defaults) read by the recording,
# preprocessing, encoder and enrollment code below.
CFG = Config()


# -----------------------------
# Utilities
# -----------------------------
def ensure_dir(path):
    """Create directory `path`, including missing parents.

    Does nothing if the directory already exists. Returns None.
    """
    target = Path(path)
    target.mkdir(parents=True, exist_ok=True)


def cosine_sim(a, b):
    """Return the cosine similarity of vectors `a` and `b` as a Python float.

    A small epsilon (1e-12) is added to the product of the norms so that a
    zero-norm input yields 0.0 rather than a division by zero.
    """
    dot = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return float(dot / (norm_product + 1e-12))


def record_wav(path, seconds):
    """Record `seconds` of audio from the input device and write it to `path`.

    Uses the sample rate and channel count from CFG and captures float32
    samples. Prints a "Recording..." notice before capture starts.
    """
    print("Recording...")
    n_frames = int(seconds * CFG.sr)
    capture = sd.rec(
        n_frames,
        samplerate=CFG.sr,
        channels=CFG.channels,
        dtype="float32",
    )
    # Block until the capture started by sd.rec has finished.
    sd.wait()
    sf.write(path, capture, CFG.sr)


def preprocess_wav(inp, out):
    """Load, trim, validate and peak-normalise a recording, then save it.

    Args:
        inp: Path of the audio file to read (resampled to CFG.sr, mono).
        out: Path the cleaned audio is written to.

    Raises:
        ValueError: "Audio too short" if fewer than CFG.min_duration_sec
            seconds remain after silence trimming; "Audio too quiet" if the
            recorded peak amplitude is below CFG.min_peak; "Clipping
            detected" if more than CFG.max_clip_ratio of the normalised
            samples exceed 0.98 in magnitude.
    """
    y, sr = librosa.load(inp, sr=CFG.sr, mono=True)
    y, _ = librosa.effects.trim(y, top_db=CFG.trim_top_db)

    if len(y) < int(CFG.min_duration_sec * sr):
        raise ValueError("Audio too short")

    # The loudness gate must look at the signal as recorded: after peak
    # normalisation the maximum is ~1.0 for any non-silent input, so a check
    # placed after normalize() could never reject a quiet take.
    peak = float(np.max(np.abs(y))) if len(y) else 0.0
    if peak < CFG.min_peak:
        raise ValueError("Audio too quiet")

    y = librosa.util.normalize(y)

    # Clipping is judged relative to the (normalised) peak: a large share of
    # samples sitting near the peak indicates flat-topped waveforms.
    if np.mean(np.abs(y) > 0.98) > CFG.max_clip_ratio:
        raise ValueError("Clipping detected")

    sf.write(out, y, sr)


# -----------------------------
# Speaker Encoder (ECAPA)
# -----------------------------
class SpeakerEncoder:
    """Produces unit-length speaker embeddings with the pretrained
    SpeechBrain model "speechbrain/spkrec-ecapa-voxceleb"."""

    def __init__(self):
        """Pick CUDA when available (else CPU) and load the model there."""
        if torch.cuda.is_available():
            self.device = "cuda"
        else:
            self.device = "cpu"
        self.model = EncoderClassifier.from_hparams(
            source="speechbrain/spkrec-ecapa-voxceleb",
            run_opts={"device": self.device},
        )

    def embed(self, wav):
        """Return the L2-normalised embedding of audio file `wav` as a NumPy array.

        The audio is resampled to CFG.sr when its sample rate differs and
        averaged down to a single channel when it has more than one.
        """
        waveform, file_sr = torchaudio.load(wav)
        if file_sr != CFG.sr:
            waveform = torchaudio.functional.resample(waveform, file_sr, CFG.sr)

        # Treats axis 0 as the channel axis (assumes channels-first output
        # from torchaudio.load -- TODO confirm).
        n_channels = waveform.shape[0]
        if n_channels > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        # Inference only: no gradients needed.
        with torch.no_grad():
            raw = self.model.encode_batch(waveform.to(self.device))
        vec = raw.squeeze().cpu().numpy()

        # Epsilon guards against dividing by a zero norm.
        scale = np.linalg.norm(vec) + 1e-12
        return vec / scale


# -----------------------------
# Enrollment
# -----------------------------
def enroll(user):
    ensure_dir(CFG.db_dir)
    user_path = Path(CFG.db_dir) / user
    ensure_dir(user_path


In [None]:
voiceprint_local.py
