In [1]:
# torch_bootstrap.py
import os
os.environ["OMP_NUM_THREADS"] = "8"
os.environ["OPENBLAS_NUM_THREADS"] = "8"
os.environ["MKL_NUM_THREADS"] = "8"
os.environ["VECLIB_MAXIMUM_THREADS"] = "8"
os.environ["NUMEXPR_NUM_THREADS"] = "8"

import torch
torch.set_num_threads(8)
torch.set_num_interop_threads(1)
torch.set_grad_enabled(False)


torch.autograd.grad_mode.set_grad_enabled(mode=False)

In [65]:
# %%
import sounddevice as sd
import numpy as np
from faster_whisper import WhisperModel
import time
from collections import deque
import threading
import json
import io

os.environ["LLAMA_CPP_LOG_LEVEL"] = "ERROR"  # Only show errors
from llama_cpp import Llama

try:
    from pocket_tts import TTSModel
    POCKET_AVAILABLE = True
except ImportError:
    POCKET_AVAILABLE = False
    print("‚ö†Ô∏è pocket-tts not installed. Install with: uv add pocket-tts")

# =============================
# CONFIGURATION
# =============================
TRANSCRIPTION_MODE = "vosk"  # Options: "vosk" (fast partials) or "whisper"
PROJECT_ROOT = r"D:\Work\Projects\AI\interactive-chat-ai"
TTS_MODE = "pocket"  # Options: "pocket" (neural) or "powershell" (system)
POCKET_VOICE = "alba"  # Options: alba, marius, javert, jean, fantine, cosette, 

# =============================
# GLOBAL INTERRUPTION CONTROL
# =============================
human_interrupt_event = threading.Event()   # Human started speaking
ai_speaking_event = threading.Event()       # AI currently speaking
processing_lock = threading.Lock()           # Guards LLM/Whisper sections
human_speaking_now = threading.Event()
ai_interrupt_latched = False
spoken_sentences = []
tts_playback_lock = threading.Lock()


# Carry-over buffer when human interrupts mid-processing
pending_user_text = deque()  # list[str]

# =============================
# EPHEMERAL CONVERSATION MEMORY
# =============================
MAX_MEMORY_TURNS = 6  # 3 user + 3 assistant turns (tune later)

conversation_memory = deque(maxlen=MAX_MEMORY_TURNS)

In [56]:
# =============================
# LOAD MODELS (ALWAYS LOAD WHISPER FOR FINAL TRANSCRIPTION)
# =============================
print("Loading Silero VAD...")
vad_model, _ = torch.hub.load(repo_or_dir="snakers4/silero-vad", model="silero_vad", force_reload=False)

print("Loading Whisper (for final transcription)...")
whisper = WhisperModel(
    r"D:\Work\Projects\AI\interactive-chat-ai\models\whisper\distil-small.en",
    device="cpu",
    compute_type="int8",
    local_files_only=True,  # Force local, no hub download
    cpu_threads=8
)

# Load Vosk only if needed
vosk_model = None
vosk_rec = None
if TRANSCRIPTION_MODE == "vosk":
    from vosk import Model, KaldiRecognizer
    print("Loading Vosk...")
    vosk_model = Model("models/vosk-model-small-en-us-0.15")
    vosk_rec = KaldiRecognizer(vosk_model, 16000)
    vosk_rec.SetWords(True)

print(f"ASR mode: {TRANSCRIPTION_MODE}")



Loading Silero VAD...


Using cache found in C:\Users\PC/.cache\torch\hub\snakers4_silero-vad_master


Loading Whisper (for final transcription)...
Loading Vosk...
ASR mode: vosk


In [57]:
# GLOBALS (replace old _llm_model/_llm_tokenizer)
_llama_model = None
GGUF_MODEL_PATH = os.path.join(PROJECT_ROOT, "models", "llm" ,"qwen2.5-3b-instruct-q5_k_m.gguf")  # Adjust path as needed

def get_llm():
    global _llama_model
    if _llama_model is None:
        print("‚è≥ Loading Qwen2.5-3B (Q5_K_M GGUF) on CPU...")
        try:
            _llama_model = Llama(
                model_path=GGUF_MODEL_PATH,
                n_ctx=2048,
                n_threads=8,
                n_threads_batch=8,
                n_batch=512,
                n_gqa=1,  # ‚ö†Ô∏è CRITICAL FOR QWEN
                verbose=False,  # Enable loading logs
                use_mmap=True,
                use_mlock=False,
                rope_freq_base=1000000.0
            )
            print("‚úÖ Qwen2.5-3B loaded successfully")
        except Exception as e:
            print(f"‚ùå FAILED to load GGUF model: {e}")
            print(f"   Check if file exists: {os.path.exists(GGUF_MODEL_PATH)}")
            raise  # Force crash to see error
    return _llama_model

# Add after defining GGUF_MODEL_PATH
print(f"üîç Checking model path: {GGUF_MODEL_PATH}")
print(f"   Exists? {os.path.exists(GGUF_MODEL_PATH)}")
print(f"   Size: {os.path.getsize(GGUF_MODEL_PATH) / (1024**3):.2f} GB")

üîç Checking model path: D:\Work\Projects\AI\interactive-chat-ai\models\llm\qwen2.5-3b-instruct-q5_k_m.gguf
   Exists? True
   Size: 2.27 GB


In [58]:
# =============================
# AUDIO SETUP
# =============================
SAMPLE_RATE = 16000
audio_buffer = []
VOSK_MIN_SAMPLES = 3200  # 0.2 sec @ 16kHz

def audio_callback(indata, frames, time, status):
    audio_buffer.append(indata.copy())

stream = sd.InputStream(samplerate=SAMPLE_RATE, channels=1, callback=audio_callback)


# =============================
# ASR WORKER (STREAMING PARTIALS)
# =============================
asr_audio = deque()      # For streaming partials (trimmed)
turn_audio = deque()     # For final transcription (full turn)
asr_lock = threading.Lock()
turn_audio_lock = threading.Lock()
current_partial_text = ""
vosk_reset_requested = False

def float32_to_int16(audio):
    audio = np.clip(audio, -1.0, 1.0)
    return (audio * 32767).astype(np.int16)

def asr_worker():
    global current_partial_text, vosk_reset_requested
    WHISPER_WINDOW_SEC = 1.2

    while True:
        time.sleep(0.05 if TRANSCRIPTION_MODE == "vosk" else 0.7)

        if TRANSCRIPTION_MODE == "whisper":
            with asr_lock:
                if not asr_audio:
                    continue
                now = time.time()
                recent = [frame for frame, t in asr_audio if now - t <= WHISPER_WINDOW_SEC]
            if not recent:
                continue
            audio_np = np.concatenate(recent)
            segments, _ = whisper.transcribe(
                audio_np, language="en", vad_filter=False, beam_size=1, temperature=0.0
            )
            text = " ".join(seg.text for seg in segments).strip()
            if text and text != current_partial_text:
                current_partial_text = text
                print("üìù Partial:", text)

        else:  # Vosk mode
            if vosk_reset_requested:
                vosk_rec.Reset()
                vosk_reset_requested = False
                current_partial_text = ""
            with asr_lock:
                if not asr_audio:
                    continue
                frame, _ = asr_audio.popleft()
                if len(frame) < VOSK_MIN_SAMPLES:
                    continue
                pcm16 = float32_to_int16(frame)
            try:
                if vosk_rec.AcceptWaveform(pcm16.tobytes()):
                    res = json.loads(vosk_rec.Result())
                    text = res.get("text", "").strip()
                    if text:
                        print("üìù Final:", text)
                        current_partial_text = ""
                else:
                    res = json.loads(vosk_rec.PartialResult())
                    partial = res.get("partial", "").strip()
                    if partial and partial != current_partial_text:
                        current_partial_text = partial
                        print("üìù Partial:", partial)
            except Exception:
                continue

threading.Thread(target=asr_worker, daemon=True).start()
print("ASR worker started")

# =============================
# TURN-TAKING RULES
# =============================
TRAILING_CONJUNCTIONS = {"and","or","but","because","so","that","which","who","when","if","though","while"}
OPEN_ENDED_PREFIXES = ("i think","i guess","i'm not sure","the thing is","it depends")
QUESTION_LEADINS = ("do you think","would you say","is it possible","can you")
SELF_REPAIR_MARKERS = ("i mean","actually","sorry","no wait")
FILLER_ENDINGS = ("uh","um","like","you know","kind of")

def lexical_bias(text: str) -> float:
    if not text: return 0.0
    t = text.lower().strip()
    words = t.split()
    score = 0.0
    if words[-1] in TRAILING_CONJUNCTIONS: score -= 1.0
    if any(t.startswith(p) for p in OPEN_ENDED_PREFIXES): score -= 0.6
    if any(t.startswith(q) for q in QUESTION_LEADINS): score -= 0.5
    if any(m in t[-20:] for m in SELF_REPAIR_MARKERS): score -= 0.4
    if words[-1] in FILLER_ENDINGS: score -= 0.7
    return score

def energy_decay_score(energy_history):
    if len(energy_history) < 5: return 0.0
    x = np.arange(len(energy_history))
    y = np.array(energy_history)
    slope = np.polyfit(x, y, 1)[0]
    return 0.8 if slope < -0.00015 else 0.0

ASR worker started


In [59]:
# =============================
# POCKET TTS LOADING
# =============================
_pocket_model = None
_pocket_voice_state = None
_pocket_sample_rate = 24000  # Pocket TTS default

def get_pocket_tts():
    """Lazy load Pocket TTS model"""
    global _pocket_model, _pocket_voice_state, _pocket_sample_rate
    if _pocket_model is None:
        if not POCKET_AVAILABLE:
            raise ImportError("pocket-tts not installed")
        print(f"‚è≥ Loading Pocket TTS (voice: {POCKET_VOICE})...")
        _pocket_model = TTSModel.load_model()
        _pocket_voice_state = _pocket_model.get_state_for_audio_prompt(POCKET_VOICE)
        _pocket_sample_rate = _pocket_model.sample_rate
        print("‚úÖ Pocket TTS loaded!")
    return _pocket_model, _pocket_voice_state, _pocket_sample_rate

In [None]:
import subprocess

def speak(text):
    if not text or not text.strip():
        return

    with tts_playback_lock:
        ai_speaking_event.set()

        try:
            if TTS_MODE == "pocket":
                model, voice_state, sr = get_pocket_tts()
                audio = model.generate_audio(voice_state, text)
                audio_np = audio.numpy() if hasattr(audio, 'numpy') else np.array(audio)

                sd.play(audio_np, sr)
                sd.wait()  # üîí HARD BLOCK until playback ends

            else:
                speak_powershell(text)

        finally:
            ai_speaking_event.clear()



def speak_powershell(text):
    """Original Windows PowerShell TTS (fallback)"""
    safe_text = text.replace('"', '""').replace('\n', ' ').replace('\r', '')
    cmd = f'Add-Type -AssemblyName System.Speech; $s=New-Object System.Speech.Synthesis.SpeechSynthesizer; $s.Speak("{safe_text}")'
    try:
        subprocess.run(["powershell", "-Command", cmd],
                       stdout=subprocess.DEVNULL,
                       stderr=subprocess.DEVNULL,
                       timeout=10)
    except Exception as e:
        print(f"üîä Speech error: {e}")

In [61]:
# =============================
# WINDOWS-RELIABLE TTS (POWER SHELL)
# =============================
import queue
import threading

response_queue = queue.Queue()

def tts_main_loop():
    while True:
        try:
            text = response_queue.get(timeout=0.1)

            # If human spoke, drop all pending AI speech
            if human_interrupt_event.is_set():
                with response_queue.mutex:
                    response_queue.queue.clear()
                continue

            speak(text)

        except queue.Empty:
            pass


# Started earlier in your code:
threading.Thread(target=tts_main_loop, daemon=False).start()

In [62]:
# At the TOP of your notebook (before main loop), make sure these exist:
from dataclasses import dataclass, field
from typing import List

@dataclass
class TurnTiming:
    turn_id: int = 0
    speech_end_time: float = 0.0
    audio_capture_duration_ms: float = 0.0
    whisper_transcribe_ms: float = 0.0
    whisper_rtf: float = 0.0
    llm_tokenize_ms: float = 0.0
    llm_generate_ms: float = 0.0
    llm_tokens_per_sec: float = 0.0
    text_process_ms: float = 0.0
    tts_generate_ms: float = 0.0
    tts_playback_ms: float = 0.0
    total_latency_ms: float = 0.0
    total_audio_duration_sec: float = 0.0
    
    def print_report(self):
        print(f"\n{'='*60}")
        print(f"üìä TURN #{self.turn_id} TIMING AUDIT")
        print(f"{'='*60}")
        print(f"üéôÔ∏è  User audio duration:     {self.total_audio_duration_sec:.2f}s")
        print(f"‚è±Ô∏è  Speech end ‚Üí Response:   {self.total_latency_ms:.0f}ms total")
        print(f"{'‚îÄ'*40}")
        print(f"1. Audio buffer capture:     {self.audio_capture_duration_ms:.1f}ms")
        print(f"2. Whisper transcription:    {self.whisper_transcribe_ms:.1f}ms (RTF: {self.whisper_rtf:.2f}x)")
        print(f"3. LLM tokenization:         {self.llm_tokenize_ms:.1f}ms")
        print(f"4. LLM generation:           {self.llm_generate_ms:.1f}ms ({self.llm_tokens_per_sec:.1f} tok/s)")
        print(f"5. Text processing:          {self.text_process_ms:.1f}ms")
        if self.tts_generate_ms > 0:
            print(f"6. TTS generation:           {self.tts_generate_ms:.1f}ms")
            print(f"7. Audio playback:           {self.tts_playback_ms:.1f}ms")
        print(f"{'='*60}\n")

turn_counter = 0
timing_history: List[TurnTiming] = []

In [63]:
def generate_response(frames, timing: TurnTiming):
    global turn_counter
    with processing_lock:
        timing.speech_end_time = time.perf_counter()
    
    # Skip only if human is actively speaking NOW
        if human_speaking_now.is_set():
            print("‚ö†Ô∏è Skipping generation ‚Äî human speaking")
            return
    
    try:
        # Stage 1: Audio Capture
        t0 = time.perf_counter()
        if not frames:
            print("‚ö†Ô∏è No audio captured ‚Äî skipping response")
            return
        
        full_audio = np.concatenate([frame for frame, _ in frames])
        timing.total_audio_duration_sec = full_audio.shape[0] / 16000.0
        timing.audio_capture_duration_ms = (time.perf_counter() - t0) * 1000
        print(f"üîä Captured {len(frames)} frames ({timing.total_audio_duration_sec:.2f}s) in {timing.audio_capture_duration_ms:.1f}ms")
        
        # Stage 2: Whisper Transcription
        t1 = time.perf_counter()
        segments, info = whisper.transcribe(
            full_audio,
            language="en",
            beam_size=5,
            temperature=0.0,
            condition_on_previous_text=False
        )
        user_text = " ".join(seg.text for seg in segments).strip()
        if human_interrupt_event.is_set():
            print("üß† Interrupted during transcription, buffering text")
            if user_text:
                pending_user_text.append(user_text)
            return
        timing.whisper_transcribe_ms = (time.perf_counter() - t1) * 1000
        timing.whisper_rtf = timing.whisper_transcribe_ms / (timing.total_audio_duration_sec * 1000)
        
        if not user_text:
            print("‚ö†Ô∏è Empty transcription ‚Äî skipping response")
            return
        
        # ---- MERGE CARRY-OVER TEXT ----
        if pending_user_text:
            carry = " ".join(pending_user_text)
            user_text = carry + " " + user_text
            pending_user_text.clear()


        conversation_memory.append({
            "role": "user",
            "content": user_text
        })
        print(f"üí¨ User: '{user_text}' (Whisper: {timing.whisper_transcribe_ms:.1f}ms, RTF: {timing.whisper_rtf:.2f}x)")
        
        # Stage 3: LLM Generation (STREAMING)
        llm_model = get_llm()
        t3 = time.perf_counter()
        
        SYSTEM_PROMPT = (
            "You are a real-time conversational assistant. "
            "Use the conversation history to maintain context and answer follow-up questions. "
            "If the user refers to something mentioned earlier, use that information. "
            "Keep responses concise (1‚Äì2 sentences) and natural. "
            "Do not mention being an AI."
        )
        messages = (
            [{"role": "system", "content": SYSTEM_PROMPT}]
            + list(conversation_memory)
        )
        stream = llm_model.create_chat_completion(
            messages=messages,
            max_tokens=40,
            temperature=0.0,
            stream=True
        )
        
        full_response_text = ""  # Track complete response for timing
        sentence_buffer = ""
        
        for chunk in stream:
            if human_interrupt_event.is_set():
                print("üõë LLM interrupted by human")
                pending_user_text.append(user_text)
                return
            if "choices" in chunk and len(chunk["choices"]) > 0:
                delta = chunk["choices"][0].get("delta", {})
                if "content" in delta:
                    token = delta["content"]
                    full_response_text += token
                    sentence_buffer += token
                    
                    # Speak complete sentences immediately
                    if token in ".!?":
                        sentence = sentence_buffer.strip()
                        if sentence:
                            response_queue.put(sentence)
                        sentence_buffer = ""
        
        # Handle any remaining text
        if sentence_buffer.strip():
            response_queue.put(sentence_buffer.strip())
            full_response_text += sentence_buffer
        
        # Calculate timing metrics
        gen_time = time.perf_counter() - t3
        timing.llm_generate_ms = gen_time * 1000
        output_tokens = len(full_response_text.split())
        timing.llm_tokens_per_sec = output_tokens / gen_time if gen_time > 0 else 0
        if full_response_text.strip():
            if spoken_sentences:
                conversation_memory.append({
                    "role": "assistant",
                    "content": " ".join(spoken_sentences)
                })
        print(f"ü§ñ LLM: {timing.llm_generate_ms:.1f}ms ({timing.llm_tokens_per_sec:.1f} tok/s)")
        
        # Final timing report (no TTS timing since it's handled by queue)
        timing.total_latency_ms = (time.perf_counter() - timing.speech_end_time) * 1000
        timing.print_report()
        timing_history.append(timing)
        turn_counter += 1
        
        # Running statistics
        if len(timing_history) > 1:
            avg_latency = sum(t.total_latency_ms for t in timing_history) / len(timing_history)
            print(f"üìà Running average latency: {avg_latency:.0f}ms over {len(timing_history)} turns")
            
    except Exception as e:
        print(f"‚ùå Error: {e}")
        import traceback
        traceback.print_exc()
        response_queue.put("Sorry, I couldn't process that.")

In [None]:
# =============================
# MAIN LOOP
# =============================
import tempfile
import os
import wave
import time
import re

# CONFIG
VAD_MIN_SAMPLES = 512
PAUSE_MS = 600
END_MS = 1200
SAFETY_TIMEOUT_MS = 2500
ENERGY_FLOOR = 0.015
WHISPER_WINDOW_SEC = 3.0
CONFIDENCE_THRESHOLD = 1.2

# STATE
state = "IDLE"
last_voice_time = None
last_ai_interrupted = False
vad_buffer = np.zeros(0, dtype=np.float32)
energy_history = deque(maxlen=15)
pause_history = deque(maxlen=5)
micro_spike_times = deque(maxlen=5)

stream.start()
print("üéôÔ∏è Real-time conversation test started")

try:
    while True:
        if not audio_buffer:
            time.sleep(0.01)
            continue

        # ---- COLLECT AUDIO CHUNK ----
        chunk = audio_buffer.pop(0).astype(np.float32).flatten()
        vad_buffer = np.concatenate([vad_buffer, chunk])

        if len(vad_buffer) < VAD_MIN_SAMPLES:
            continue

        frame = vad_buffer[:VAD_MIN_SAMPLES]
        vad_buffer = vad_buffer[VAD_MIN_SAMPLES:]
        if len(frame) < VAD_MIN_SAMPLES:
            continue

        now = time.time()
        rms = np.sqrt(np.mean(frame ** 2))
        energy_history.append(rms)

        # ---- VAD ----
        with torch.no_grad():
            vad_confidence = vad_model(torch.from_numpy(frame).unsqueeze(0), 16000).item()
        speech_started = vad_confidence > 0.5
        sustained = sum(e > ENERGY_FLOOR for e in energy_history) >= 3

        if speech_started or sustained:
            human_speaking_now.set()
        else:
            human_speaking_now.clear()

        # ---- HUMAN INTERRUPTION DETECTION (edge-triggered) ----
        if speech_started and not human_interrupt_event.is_set():
            human_interrupt_event.set()

            if ai_speaking_event.is_set() and not ai_interrupt_latched:
                ai_interrupt_latched = True
                sd.stop()  # üî• HARD STOP CURRENT PLAYBACK
                print("üõë Human interrupted AI")

        # ---- MICRO-SPIKE DETECTION ----
        if state == "PAUSING" and rms > ENERGY_FLOOR:
            micro_spike_times.append(now)

        # ---- STATE MACHINE ----
        if not speech_started and state == "IDLE":
            human_interrupt_event.clear()

        if state == "IDLE":
            if speech_started or sustained:
                state = "SPEAKING"
                last_voice_time = now
                print("üü¢ Speech started")

        elif state == "SPEAKING":
            if speech_started or sustained:
                last_voice_time = now
            else:
                elapsed = (now - last_voice_time) * 1000
                if elapsed >= PAUSE_MS:
                    state = "PAUSING"
                    print(f"üü° Pause {int(elapsed)} ms")

        elif state == "PAUSING":
            elapsed = (now - last_voice_time) * 1000

            # SAFETY TIMEOUT
            if elapsed > SAFETY_TIMEOUT_MS:
                print(f"üî¥ SAFETY TIMEOUT: Force-ending turn after {elapsed:.0f}ms")
                state = "IDLE"
                last_voice_time = None
                energy_history.clear()
                pause_history.clear()
                micro_spike_times.clear()
                last_ai_interrupted = False
                with turn_audio_lock:
                    turn_audio.clear()
                current_partial_text = ""
                if TRANSCRIPTION_MODE == "vosk":
                    vosk_reset_requested = True
                continue

            # RESUME SPEECH?
            if speech_started or sustained:
                state = "SPEAKING"
                last_voice_time = now
                print("üü¢ Speech resumed")
            else:
                # CALCULATE CONFIDENCE
                confidence = 0.0
                if elapsed > END_MS:
                    confidence += 1.0
                if len(energy_history) >= 8:
                    recent_energies = list(energy_history)[-8:]
                    if max(recent_energies) < ENERGY_FLOOR * 1.8:
                        confidence += 0.7
                if elapsed < 1000:
                    recent_spikes = [t for t in micro_spike_times if now - t < 0.6]
                    if len(recent_spikes) >= 2:
                        confidence -= 0.5
                if elapsed < 900 and current_partial_text:
                    confidence += lexical_bias(current_partial_text) * 0.6
                if last_ai_interrupted:
                    confidence -= 0.5

                # END TURN?
                if confidence >= CONFIDENCE_THRESHOLD:
                    print(f"üî¥ Turn ended (confidence={confidence:.2f}, silence={elapsed:.0f}ms)")

                    human_interrupt_event.clear()

                    # CAPTURE FULL TURN AUDIO
                    with turn_audio_lock:
                        turn_frames = list(turn_audio)
                        turn_audio.clear()

                    # RESET STATE
                    state = "IDLE"
                    last_voice_time = None
                    energy_history.clear()
                    pause_history.clear()
                    micro_spike_times.clear()
                    last_ai_interrupted = False
                    current_partial_text = ""
                    if TRANSCRIPTION_MODE == "vosk":
                        vosk_reset_requested = True

                    # FIX: Create timing object and pass it to the thread
                    timing = TurnTiming(turn_id=turn_counter)
                    threading.Thread(
                        target=generate_response, 
                        args=(turn_frames, timing),  # <-- Pass both arguments!
                        daemon=True
                    ).start()
                    

                   


        # ---- BUFFER AUDIO FOR STREAMING AND FINAL TRANSCRIPTION ----
        if state in ("SPEAKING", "PAUSING"):
            # For final transcription (never trimmed until turn ends)
            with turn_audio_lock:
                turn_audio.append((frame.copy(), now))
            # For streaming partials
            with asr_lock:
                asr_audio.append((frame.copy(), now))
                if TRANSCRIPTION_MODE == "whisper":
                    cutoff = now - WHISPER_WINDOW_SEC
                    while asr_audio and asr_audio[0][1] < cutoff:
                        asr_audio.popleft()
            # Vosk internal buffer
            if TRANSCRIPTION_MODE == "vosk":
                if not hasattr(asr_worker, "vosk_buf"):
                    asr_worker.vosk_buf = np.zeros(0, dtype=np.float32)
                asr_worker.vosk_buf = np.concatenate([asr_worker.vosk_buf, frame])
                while len(asr_worker.vosk_buf) >= VOSK_MIN_SAMPLES:
                    chunk_to_send = asr_worker.vosk_buf[:VOSK_MIN_SAMPLES]
                    asr_worker.vosk_buf = asr_worker.vosk_buf[VOSK_MIN_SAMPLES:]

except KeyboardInterrupt:
    stream.stop()
    print("\nüõë Test stopped")

üéôÔ∏è Real-time conversation test started
üü¢ Speech started
üü° Pause 600 ms
üî¥ Turn ended (confidence=1.70, silence=1231ms)
üîä Captured 82 frames (2.62s) in 0.1ms
üí¨ User: 'Hey, how's it going?' (Whisper: 1057.8ms, RTF: 0.40x)
‚è≥ Loading Qwen2.5-3B (Q5_K_M GGUF) on CPU...


llama_context: n_ctx_per_seq (2048) < n_ctx_train (32768) -- the full capacity of the model will not be utilized


‚úÖ Qwen2.5-3B loaded successfully
üó£Ô∏è Speaking: 'Hey, going well, thanks!'
‚è≥ Loading Pocket TTS (voice: alba)...
‚è≥ Loading Pocket TTS (voice: alba)...
ü§ñ LLM: 2021.4ms (3.5 tok/s)

üìä TURN #0 TIMING AUDIT
üéôÔ∏è  User audio duration:     2.62s
‚è±Ô∏è  Speech end ‚Üí Response:   3451ms total
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
1. Audio buffer capture:     0.1ms
2. Whisper transcription:    1057.8ms (RTF: 0.40x)
3. LLM tokenization:         0.0ms
4. LLM generation:           2021.4ms (3.5 tok/s)
5. Text processing:          0.0ms

‚úÖ Pocket TTS loaded!
‚úÖ Pocket TTS loaded!
üü¢ Speech started
üü° Pause 620 ms
üî¥ Turn ended (confidence=1.70, silence=1248ms)
üîä Captured 67 frames (2.14s) in 0.1ms
üí¨ User: 'I'm doing well.' (Whisper: 1098.7ms, RTF: 0.51x)
üó£Ô∏è Speaking: 'How can I assist you today?'
ü§ñ LLM: 863.6ms (10.4 tok/s)

üìä TURN #1 TIMING AUDIT
üéôÔ∏è  User audio dura

In [None]:
# %% =============================
# BENCHMARK SUMMARY TOOL
# =============================
def print_benchmark_summary():
    """Call this manually after a session to see aggregate stats"""
    if not timing_history:
        print("No timing data recorded yet")
        return
    
    print(f"\n{'='*70}")
    print(f"üìä SESSION BENCHMARK SUMMARY ({len(timing_history)} turns)")
    print(f"{'='*70}")
    
    stages = [
        ("Audio Capture", "audio_capture_duration_ms"),
        ("Whisper Transcribe", "whisper_transcribe_ms"),
        ("LLM Tokenization", "llm_tokenize_ms"),
        ("LLM Generation", "llm_generate_ms"),
        ("Text Processing", "text_process_ms"),
        ("TTS Generation", "tts_generate_ms"),
        ("Total Latency", "total_latency_ms")
    ]
    
    for name, attr in stages:
        values = [getattr(t, attr) for t in timing_history if getattr(t, attr) > 0]
        if values:
            avg = sum(values) / len(values)
            mn, mx = min(values), max(values)
            print(f"{name:20s}: {avg:6.1f}ms avg [{mn:6.1f} - {mx:6.1f}]")
    
    # RTF analysis
    rtfs = [t.whisper_rtf for t in timing_history if t.whisper_rtf > 0]
    if rtfs:
        print(f"\nWhisper RTF: {sum(rtfs)/len(rtfs):.2f}x (lower is better, <1.0 = real-time)")
    
    print(f"{'='*70}\n")

# Run this anytime to see stats:
print_benchmark_summary()



üìä SESSION BENCHMARK SUMMARY (10 turns)
Audio Capture       :    0.1ms avg [   0.1 -    0.2]
Whisper Transcribe  : 1056.2ms avg [1010.0 - 1124.3]
LLM Generation      : 1416.3ms avg [ 980.9 - 1781.0]
Total Latency       : 2509.9ms avg [2026.1 - 2842.3]

Whisper RTF: 0.40x (lower is better, <1.0 = real-time)

