From 9e508dcf7f31a053ee960e9f2f48dc75d3dc39d3 Mon Sep 17 00:00:00 2001
From: quantumaikr <hi@quantumai.kr>
Date: Fri, 10 Apr 2026 13:57:01 +0900
Subject: [PATCH] feat(wasm): model selector (Qwen3 0.6B / Llama 3.2 1B) +
 real-time streaming
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the single SmolLM2-135M demo button with a two-card model
selector:

  - Qwen3 0.6B Q4_K_M (~378 MB) — recommended default. Much better
    quality than 135M, multilingual, reasonable download size.
  - Llama 3.2 1B Q4_K_M (~770 MB) — "higher quality" option for
    users willing to wait.

Each model has its own chat template (ChatML for Qwen, Llama 3
format for Llama) and IndexedDB cache key, so switching models
doesn't evict the other from cache.

Real-time streaming:
  - Add wasm_generate_async() in quant_wasm.c which calls
    emscripten_sleep(0) after each token, yielding control back
    to the browser event loop for DOM repaint.
  - Build with -sASYNCIFY + ASYNCIFY_IMPORTS=["emscripten_sleep"].
  - JS generate() now awaits _wasm_generate_async() with fallback
    to sync _wasm_generate() for non-ASYNCIFY builds.
  - Live tok/s counter updates during generation.

Also adds Qwen3-0.6B to the Python model registry.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 bindings/python/quantcpp/__init__.py |   5 +
 wasm/build.sh                        |   5 +-
 wasm/index.html                      | 209 +++++++++++++++++++--------
 wasm/quant_wasm.c                    |  81 +++++++++--
 4 files changed, 230 insertions(+), 70 deletions(-)

diff --git a/bindings/python/quantcpp/__init__.py b/bindings/python/quantcpp/__init__.py
index 2794b69..bea7137 100644
--- a/bindings/python/quantcpp/__init__.py
+++ b/bindings/python/quantcpp/__init__.py
@@ -53,6 +53,11 @@
         "smollm2-135m-instruct-q8_0.gguf",
         135,
     ),
+    "Qwen3-0.6B": (
+        "unsloth/Qwen3-0.6B-GGUF",
+        "Qwen3-0.6B-Q4_K_M.gguf",
+        378,
+    ),
     "Llama-3.2-1B": (
         "hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF",
         "llama-3.2-1b-instruct-q4_k_m.gguf",
diff --git a/wasm/build.sh b/wasm/build.sh
index cb6c11f..5b79866 100755
--- a/wasm/build.sh
+++ b/wasm/build.sh
@@ -32,7 +32,7 @@ emcc "$SCRIPT_DIR/quant_wasm.c" \
     -s ALLOW_MEMORY_GROWTH=1 \
     -s MAXIMUM_MEMORY=4GB \
     -s INITIAL_MEMORY=256MB \
-    -s EXPORTED_FUNCTIONS='["_main","_wasm_load_model","_wasm_generate","_wasm_model_info","_wasm_is_ready","_malloc","_free"]' \
+    -s EXPORTED_FUNCTIONS='["_main","_wasm_load_model","_wasm_generate","_wasm_generate_async","_wasm_model_info","_wasm_is_ready","_malloc","_free"]' \
     -s EXPORTED_RUNTIME_METHODS='["UTF8ToString","allocateUTF8","FS"]' \
     -s FORCE_FILESYSTEM=1 \
     -s MODULARIZE=0 \
@@ -40,6 +40,9 @@ emcc "$SCRIPT_DIR/quant_wasm.c" \
     -s NO_EXIT_RUNTIME=1 \
     -s ASSERTIONS=0 \
     -s STACK_SIZE=1MB \
+    -s ASYNCIFY \
+    -s 'ASYNCIFY_IMPORTS=["emscripten_sleep"]' \
+    -s ASYNCIFY_STACK_SIZE=65536 \
     -lm \
     -DNDEBUG \
     -D__EMSCRIPTEN__ \
diff --git a/wasm/index.html b/wasm/index.html
index 6a85128..42d42ad 100644
--- a/wasm/index.html
+++ b/wasm/index.html
@@ -43,14 +43,33 @@
 .dropzone.loaded { border-color: #2a5a3a; background: #0d1a14; padding: 16px; }
 .dropzone.loaded h2 { font-size: 14px; color: #6ee7b7; }
 
+/* Model selector */
+.model-cards {
+    display: flex; gap: 12px; margin-bottom: 16px; justify-content: center; flex-wrap: wrap;
+}
+.model-card {
+    padding: 14px 20px; border: 1px solid #333; border-radius: 10px;
+    cursor: pointer; transition: all 0.2s; text-align: left; min-width: 220px;
+    background: #111;
+}
+.model-card:hover { border-color: #6ee7b7; background: #0d1f17; }
+.model-card.recommended { border-color: #059669; }
+.model-card .name { font-weight: 600; font-size: 14px; margin-bottom: 4px; }
+.model-card .meta { font-size: 12px; color: #888; }
+.model-card .tag {
+    display: inline-block; font-size: 10px; padding: 1px 6px; border-radius: 6px;
+    background: #1a3a2a; color: #6ee7b7; margin-top: 6px;
+}
+.model-card .tag.blue { background: #1a2a3a; color: #7bb8f0; }
+
 /* Chat */
 .chat { flex: 1; overflow-y: auto; margin-bottom: 16px; }
-.message { padding: 12px 16px; margin-bottom: 8px; border-radius: 8px; font-size: 14px; line-height: 1.6; }
+.message { padding: 12px 16px; margin-bottom: 8px; border-radius: 8px; font-size: 14px; line-height: 1.6; white-space: pre-wrap; word-wrap: break-word; }
 .message.user { background: #1a1a2e; border: 1px solid #2a2a4e; }
 .message.assistant { background: #111; border: 1px solid #222; }
 .message.assistant .cursor { animation: blink 1s step-end infinite; }
 @keyframes blink { 50% { opacity: 0; } }
-.message.system { color: #666; font-size: 12px; text-align: center; }
+.message.system { color: #666; font-size: 12px; text-align: center; white-space: normal; }
 .message code { background: #1a1a1a; padding: 1px 4px; border-radius: 3px; font-size: 13px; }
 .message pre { background: #1a1a1a; padding: 12px; border-radius: 6px; overflow-x: auto; margin: 8px 0; }
 .message pre code { background: none; padding: 0; }
@@ -92,21 +111,32 @@
 <div class="header">
     <h1>quant<span>.cpp</span></h1>
     <span class="badge">WASM</span>
-    <span class="badge" id="kvBadge" style="display:none">7x Context</span>
+    <span class="badge" id="kvBadge" style="display:none">3x Context</span>
     <a class="github" href="https://github.com/quantumaikr/quant.cpp" target="_blank">GitHub ↗</a>
 </div>
 
 <div class="main">
     <div class="dropzone" id="dropzone">
-        <h2>LLM in Your Browser — 189 KB</h2>
+        <h2>LLM in Your Browser</h2>
         <p style="margin-bottom:16px; color:#6ee7b7; font-size:15px">No install. No API key. No server. Just click.</p>
-        <button id="demoBtn" onclick="loadDemoModel()" style="
-            padding: 12px 32px; font-size: 16px; font-weight: 600;
-            background: #059669; color: white; border: none; border-radius: 8px;
-            cursor: pointer; margin-bottom: 12px;
-        ">▶ Try with SmolLM2-135M (~135 MB download)</button>
+
+        <div class="model-cards" id="modelCards">
+            <div class="model-card recommended" onclick="loadDemoModel('qwen3-0.6b')">
+                <div class="name">Qwen3 0.6B</div>
+                <div class="meta">~378 MB download &middot; Q4_K_M</div>
+                <span class="tag">Recommended</span>
+                <div class="meta" style="margin-top:4px">Fast, multilingual, good for demo</div>
+            </div>
+            <div class="model-card" onclick="loadDemoModel('llama-3.2-1b')">
+                <div class="name">Llama 3.2 1B</div>
+                <div class="meta">~770 MB download &middot; Q4_K_M</div>
+                <span class="tag blue">Higher quality</span>
+                <div class="meta" style="margin-top:4px">Better reasoning, longer wait</div>
+            </div>
+        </div>
+
         <p style="color:#555; font-size:13px">Or <a href="#" onclick="document.getElementById('fileInput').click(); return false" style="color:#6ee7b7">drop your own GGUF</a> file.</p>
-        <p style="margin-top:8px; color:#333; font-size:12px">Runs entirely in your browser. Nothing uploaded to any server.</p>
+        <p style="margin-top:8px; color:#333; font-size:12px">Runs entirely in your browser. Nothing uploaded.</p>
         <input type="file" id="fileInput" accept=".gguf" style="display:none">
     </div>
 
@@ -135,15 +165,36 @@ <h2>LLM in Your Browser — 189 KB</h2>
 let modelLoaded = false;
 let generating = false;
 
+// ---- Model registry ----
+const MODELS = {
+    'qwen3-0.6b': {
+        url: 'https://huggingface.co/unsloth/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q4_K_M.gguf',
+        name: 'Qwen3-0.6B Q4_K_M',
+        size: '~378 MB',
+        cacheKey: 'qwen3-0.6b-q4km',
+        chatTemplate: (text) => `<|im_start|>user\n${text}<|im_end|>\n<|im_start|>assistant\n`,
+    },
+    'llama-3.2-1b': {
+        url: 'https://huggingface.co/hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF/resolve/main/llama-3.2-1b-instruct-q4_k_m.gguf',
+        name: 'Llama-3.2-1B-Instruct Q4_K_M',
+        size: '~770 MB',
+        cacheKey: 'llama-3.2-1b-q4km',
+        chatTemplate: (text) => `<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n${text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n`,
+    },
+};
+let activeModelId = null;
+
 // ---- IndexedDB model cache ----
 const DB_NAME = 'quantcpp_cache';
 const DB_STORE = 'models';
-const DEMO_KEY = 'smollm2-135m';
 
 function openDB() {
     return new Promise((resolve, reject) => {
-        const req = indexedDB.open(DB_NAME, 1);
-        req.onupgradeneeded = () => req.result.createObjectStore(DB_STORE);
+        const req = indexedDB.open(DB_NAME, 2);
+        req.onupgradeneeded = () => {
+            if (!req.result.objectStoreNames.contains(DB_STORE))
+                req.result.createObjectStore(DB_STORE);
+        };
         req.onsuccess = () => resolve(req.result);
         req.onerror = () => reject(req.error);
     });
@@ -199,27 +250,28 @@ <h2>LLM in Your Browser — 189 KB</h2>
 }
 
 // Demo model — cache-first, download only if not in IndexedDB
-async function loadDemoModel() {
-    const url = 'https://huggingface.co/Felladrin/gguf-Q8_0-SmolLM2-135M-Instruct/resolve/main/smollm2-135m-instruct-q8_0.gguf';
-    const btn = document.getElementById('demoBtn');
-    btn.disabled = true;
+async function loadDemoModel(modelId) {
+    const model = MODELS[modelId];
+    if (!model) return;
+
+    activeModelId = modelId;
+    const cards = document.querySelectorAll('.model-card');
+    cards.forEach(c => c.style.pointerEvents = 'none');
 
     try {
         // 1. Try cache first
         showLoading('Checking local cache...');
-        const cached = await getCachedModel(DEMO_KEY);
+        const cached = await getCachedModel(model.cacheKey);
         if (cached) {
-            btn.textContent = 'Loading from cache...';
-            showLoading('Loading cached model...');
-            loadModelFromBytes(new Uint8Array(cached), 'smollm2-135m (cached)');
+            showLoading(`Loading cached ${model.name}...`);
+            loadModelFromBytes(new Uint8Array(cached), `${model.name} (cached)`);
             return;
         }
 
         // 2. Download from HuggingFace
-        btn.textContent = 'Downloading...';
-        showLoading('Downloading SmolLM2-135M (~135 MB)...');
+        showLoading(`Downloading ${model.name} (${model.size})...`);
 
-        const response = await fetch(url);
+        const response = await fetch(model.url);
         if (!response.ok) throw new Error(`HTTP ${response.status}`);
 
         const total = parseInt(response.headers.get('content-length') || '0');
@@ -237,7 +289,7 @@ <h2>LLM in Your Browser — 189 KB</h2>
                 const mb = (received / 1048576).toFixed(0);
                 const totalMb = (total / 1048576).toFixed(0);
                 document.getElementById('loadingText').textContent =
-                    `Downloading... ${pct}% (${mb}/${totalMb} MB)`;
+                    `Downloading ${model.name}... ${pct}% (${mb}/${totalMb} MB)`;
             }
         }
 
@@ -247,26 +299,33 @@ <h2>LLM in Your Browser — 189 KB</h2>
 
         // 3. Cache for next time
         showLoading('Caching model for instant reload...');
-        await cacheModel(DEMO_KEY, arrayBuffer).catch(() => {});
+        await cacheModel(model.cacheKey, arrayBuffer).catch(() => {});
 
         showLoading('Loading model into WASM...');
-        loadModelFromBytes(data, 'smollm2-135m-instruct-q8_0.gguf');
+        loadModelFromBytes(data, model.name);
     } catch (err) {
         hideLoading();
-        btn.disabled = false;
-        btn.textContent = '▶ Try with SmolLM2-135M (~135 MB download)';
+        cards.forEach(c => c.style.pointerEvents = '');
+        activeModelId = null;
         alert('Download failed: ' + err.message + '\n\nTry dropping a local GGUF file instead.');
     }
 }
 
-// Auto-load cached model on page load
+// Auto-detect cached models on page load and show badges
 window.addEventListener('load', async () => {
     try {
-        const cached = await getCachedModel(DEMO_KEY);
-        if (cached) {
-            const btn = document.getElementById('demoBtn');
-            btn.textContent = '▶ Load cached SmolLM2-135M (instant)';
-            btn.style.background = '#047857';
+        const db = await openDB();
+        db.onversionchange = () => db.close();  // never block a later schema upgrade
+        const isCached = (key) => new Promise((ok, fail) => {
+            // count() tests for the key without reading the model bytes into memory
+            const req = db.transaction(DB_STORE).objectStore(DB_STORE).count(key);
+            req.onsuccess = () => ok(req.result > 0);
+            req.onerror = () => fail(req.error);
+        });
+        for (const [id, model] of Object.entries(MODELS)) {
+            if (!(await isCached(model.cacheKey))) continue;
+            const meta = document.querySelector(`.model-card[onclick*="'${id}'"] .meta`);
+            if (meta) { meta.textContent = 'Cached — instant load'; meta.style.color = '#6ee7b7'; }
         }
     } catch(e) {}
 });
@@ -275,7 +334,11 @@ <h2>LLM in Your Browser — 189 KB</h2>
     const chat = document.getElementById('chat');
     const div = document.createElement('div');
     div.className = `message ${role}`;
-    div.innerHTML = formatText(text);
+    if (role === 'assistant') {
+        div.textContent = '';
+    } else {
+        div.innerHTML = formatText(text);
+    }
     chat.appendChild(div);
     chat.scrollTop = chat.scrollHeight;
     return div;
@@ -290,7 +353,6 @@ <h2>LLM in Your Browser — 189 KB</h2>
 }
 
 function loadModelFromBytes(bytes, name) {
-    // Shared model loading from Uint8Array (used by both file drop and demo download)
     try {
         Module.FS.writeFile('/model.gguf', bytes);
         showLoading('Initializing model...');
@@ -318,6 +380,7 @@ <h2>LLM in Your Browser — 189 KB</h2>
 async function loadModel(file) {
     showLoading(`Loading ${file.name} (${(file.size/1024/1024).toFixed(0)} MB)...`);
     addMessage('system', `Loading ${file.name}...`);
+    activeModelId = null; // custom model — use generic template
     try {
         const buffer = await file.arrayBuffer();
         const bytes = new Uint8Array(buffer);
@@ -328,6 +391,14 @@ <h2>LLM in Your Browser — 189 KB</h2>
     hideLoading();
 }
 
+// Wrap user text in the active demo model's chat template; when no registry
+// entry is active (e.g. a dropped GGUF) use a generic ChatML wrapper instead.
+function getChatPrompt(text) {
+    const entry = activeModelId ? MODELS[activeModelId] : null;
+    if (entry) return entry.chatTemplate(text);
+    return `<|im_start|>user\n${text}<|im_end|>\n<|im_start|>assistant\n`;
+}
+
 async function generate() {
     if (!modelLoaded || generating) return;
     const input = document.getElementById('prompt');
@@ -337,45 +408,71 @@ <h2>LLM in Your Browser — 189 KB</h2>
     input.value = '';
     generating = true;
     document.getElementById('sendBtn').disabled = true;
+    input.disabled = true;
 
     addMessage('user', text);
-    const assistantDiv = addMessage('assistant', '<span class="cursor">▌</span>');
+    const assistantDiv = addMessage('assistant', '');
     let output = '';
+    let tokenCount = 0;
+    const startTime = performance.now();
 
-    // Set callbacks
+    // Set streaming token callback
     Module.onToken = (token) => {
         output += token;
-        assistantDiv.innerHTML = formatText(output) + '<span class="cursor">▌</span>';
-        document.getElementById('chat').scrollTop = document.getElementById('chat').scrollHeight;
+        tokenCount++;
+        // Update the assistant message with raw text + blinking cursor
+        assistantDiv.textContent = output;
+        const cursor = document.createElement('span');
+        cursor.className = 'cursor';
+        cursor.textContent = '▌';
+        assistantDiv.appendChild(cursor);
+        // Auto-scroll
+        const chat = document.getElementById('chat');
+        chat.scrollTop = chat.scrollHeight;
+        // Live stats
+        const elapsed = (performance.now() - startTime) / 1000;
+        if (elapsed > 0.1) {
+            document.getElementById('statTokens').textContent = `${tokenCount} tokens`;
+            document.getElementById('statSpeed').textContent = `${(tokenCount / elapsed).toFixed(1)} tok/s`;
+        }
     };
+
+    // A direct call to an ASYNCIFY export returns at its first unwind (the
+    // first emscripten_sleep), not when generation ends, so completion is
+    // signalled by onDone resolving this promise.
+    let finish;
+    const finished = new Promise((resolve) => { finish = resolve; });
     Module.onDone = (nTokens, elapsedMs) => {
+        // Final render with markdown formatting
         assistantDiv.innerHTML = formatText(output);
-        const tps = (nTokens / (elapsedMs / 1000)).toFixed(1);
+        const tps = nTokens > 0 ? (nTokens / (elapsedMs / 1000)).toFixed(1) : '0';
         document.getElementById('statTokens').textContent = `${nTokens} tokens`;
         document.getElementById('statSpeed').textContent = `${tps} tok/s`;
-        generating = false;
-        document.getElementById('sendBtn').disabled = false;
-        document.getElementById('prompt').focus();
-    };
-    Module.onStatus = (msg) => {
-        addMessage('system', msg);
+        finish();
     };
 
-    // Wrap with ChatML template (instruct models need this to generate)
-    const chatPrompt = `<|im_start|>user\n${text}<|im_end|>\n<|im_start|>assistant\n`;
+    const chatPrompt = getChatPrompt(text);
 
-    // Run generation asynchronously so the UI doesn't freeze
-    setTimeout(() => {
-        const promptPtr = Module.allocateUTF8(chatPrompt);
-        Module._wasm_generate(promptPtr, 0.7, 256);
-        Module._free(promptPtr);
-
-        if (!output) {
-            assistantDiv.innerHTML = '<em style="color:#666">No output generated. Try a longer prompt.</em>';
-        }
-        generating = false;
-        document.getElementById('sendBtn').disabled = false;
-    }, 50);  // yield to browser for one frame to show the spinner
+    const promptPtr = Module.allocateUTF8(chatPrompt);
+    try {
+        // Prefer the ASYNCIFY entry point; fall back to the blocking export,
+        // which fires onDone before it returns.
+        const run = Module._wasm_generate_async || Module._wasm_generate;
+        const rc = run(promptPtr, 0.7, 256);
+        // -1 before any token means the call was rejected up front, so
+        // onDone will never fire; release the wait ourselves.
+        if (rc === -1 && tokenCount === 0) finish();
+        await finished;
+    } finally {
+        Module._free(promptPtr);
+        if (!output) {
+            assistantDiv.innerHTML = '<em style="color:#666">No output generated. Try a longer prompt.</em>';
+        }
+        generating = false;
+        document.getElementById('sendBtn').disabled = false;
+        input.disabled = false;
+        input.focus();
+    }
 }
 </script>
 
@@ -389,7 +486,7 @@ <h2>LLM in Your Browser — 189 KB</h2>
     printErr: function(text) { console.warn(text); },
     onRuntimeInitialized: function() {
         console.log('quant.cpp WASM ready');
-        addMessage('system', 'Runtime ready. Drop a GGUF model file to begin.');
+        addMessage('system', 'Runtime ready. Choose a model or drop your own GGUF file.');
     }
 };
 </script>
diff --git a/wasm/quant_wasm.c b/wasm/quant_wasm.c
index a0cc34f..e8aa02b 100644
--- a/wasm/quant_wasm.c
+++ b/wasm/quant_wasm.c
@@ -3,6 +3,10 @@
  *
  * Compiled with Emscripten: emcc quant_wasm.c -o quant.js
  * Uses the single-header quant.h for zero-dependency builds.
+ *
+ * Build with -sASYNCIFY to enable wasm_generate_async(), which
+ * yields to the browser event loop between tokens for real-time
+ * streaming output.
  */
 
 #define QUANT_IMPLEMENTATION
@@ -33,8 +37,23 @@ EM_JS(void, js_on_status, (const char* msg), {
     if (Module.onStatus) Module.onStatus(UTF8ToString(msg));
 });
 
-/* Token callback for streaming */
-static void on_token(const char* text, void* ud) {
+/* Streaming token callback: forwards the token to JS, mirrors it into g_output, then yields. */
+static void on_token_streaming(const char* text, void* ud) {
+    (void)ud;
+    js_on_token(text);
+    int len = (int)strlen(text);
+    if (g_output_pos + len < (int)sizeof(g_output) - 1) {  /* a token that would not fit is not buffered */
+        memcpy(g_output + g_output_pos, text, len);
+        g_output_pos += len;
+        g_output[g_output_pos] = '\0';
+    }
+    /* Yield to the browser event loop so the DOM can repaint with the new token.
+     * emscripten_sleep(0) requires -sASYNCIFY; nested 0 ms timers may be clamped to ~4 ms. */
+    emscripten_sleep(0);
+}
+
+/* Non-yielding callback (fallback for non-ASYNCIFY builds) */
+static void on_token_sync(const char* text, void* ud) {
     (void)ud;
     js_on_token(text);
     int len = (int)strlen(text);
@@ -82,9 +101,9 @@ int wasm_load_model(const char* path) {
     return 0;
 }
 
-/* Generate response */
+/* Async generate — yields to browser between tokens (requires -sASYNCIFY) */
 EMSCRIPTEN_KEEPALIVE
-int wasm_generate(const char* prompt, float temperature, int max_tokens) {
+int wasm_generate_async(const char* prompt, float temperature, int max_tokens) {
     if (!g_model || !g_ctx) {
         js_on_status("Error: no model loaded");
         return -1;
@@ -98,7 +117,6 @@ int wasm_generate(const char* prompt, float temperature, int max_tokens) {
     g_output_pos = 0;
     g_output[0] = '\0';
 
-    /* Reconfigure if needed */
     quant_config cfg = {
         .temperature = temperature,
         .top_p = 0.9f,
@@ -107,15 +125,58 @@ int wasm_generate(const char* prompt, float temperature, int max_tokens) {
         .kv_compress = 1,
     };
 
-    /* Free old context and create new one for fresh generation */
     if (g_ctx) quant_free_ctx(g_ctx);
     g_ctx = quant_new(g_model, &cfg);
 
     double t0 = emscripten_get_now();
 
-    /* Streaming generation via per-token callback */
-    int n_tokens = quant_generate(g_ctx, prompt, on_token, NULL);
+    /* Streaming generation — on_token_streaming calls emscripten_sleep(0)
+     * which yields back to the browser event loop after each token. */
+    int n_tokens = quant_generate(g_ctx, prompt, on_token_streaming, NULL);
+
+    double elapsed = emscripten_get_now() - t0;
+
+    if (n_tokens > 0) {
+        js_on_done(n_tokens, elapsed);
+    } else {
+        js_on_done(0, elapsed);
+        if (g_output_pos == 0)
+            js_on_status("No output \xe2\x80\x94 try a different prompt");
+    }
+
+    g_generating = 0;
+    return 0;
+}
+
+/* Sync generate — does NOT yield to browser (fallback) */
+EMSCRIPTEN_KEEPALIVE
+int wasm_generate(const char* prompt, float temperature, int max_tokens) {
+    if (!g_model || !g_ctx) {
+        js_on_status("Error: no model loaded");
+        return -1;
+    }
+    if (g_generating) {
+        js_on_status("Error: generation in progress");
+        return -1;
+    }
+
+    g_generating = 1;
+    g_output_pos = 0;
+    g_output[0] = '\0';
 
+    quant_config cfg = {
+        .temperature = temperature,
+        .top_p = 0.9f,
+        .max_tokens = max_tokens > 0 ? max_tokens : 256,
+        .n_threads = 1,
+        .kv_compress = 1,
+    };
+
+    if (g_ctx) quant_free_ctx(g_ctx);
+    g_ctx = quant_new(g_model, &cfg);
+
+    double t0 = emscripten_get_now();
+    int n_tokens = quant_generate(g_ctx, prompt, on_token_sync, NULL);
     double elapsed = emscripten_get_now() - t0;
 
     if (n_tokens > 0) {
@@ -123,7 +184,7 @@ int wasm_generate(const char* prompt, float temperature, int max_tokens) {
     } else {
         js_on_done(0, elapsed);
         if (g_output_pos == 0)
-            js_on_status("No output — try a different prompt");
+            js_on_status("No output \xe2\x80\x94 try a different prompt");
     }
 
     g_generating = 0;
@@ -149,6 +210,6 @@ int wasm_is_ready(void) {
 }
 
 int main() {
-    js_on_status("quant.cpp WASM runtime ready. Drop a GGUF model to start.");
+    js_on_status("quant.cpp WASM runtime ready. Choose a model to start.");
     return 0;
 }