From 9e508dcf7f31a053ee960e9f2f48dc75d3dc39d3 Mon Sep 17 00:00:00 2001 From: quantumaikr Date: Fri, 10 Apr 2026 13:57:01 +0900 Subject: [PATCH] feat(wasm): model selector (Qwen3 0.6B / Llama 3.2 1B) + real-time streaming MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the single SmolLM2-135M demo button with a two-card model selector: - Qwen3 0.6B Q4_K_M (~378 MB) — recommended default. Much better quality than 135M, multilingual, reasonable download size. - Llama 3.2 1B Q4_K_M (~770 MB) — "higher quality" option for users willing to wait. Each model has its own chat template (ChatML for Qwen, Llama 3 format for Llama) and IndexedDB cache key, so switching models doesn't evict the other from cache. Real-time streaming: - Add wasm_generate_async() in quant_wasm.c which calls emscripten_sleep(0) after each token, yielding control back to the browser event loop for DOM repaint. - Build with -sASYNCIFY + ASYNCIFY_IMPORTS=["emscripten_sleep"]. - JS generate() now awaits _wasm_generate_async() with fallback to sync _wasm_generate() for non-ASYNCIFY builds. - Live tok/s counter updates during generation. Also adds Qwen3-0.6B to the Python model registry. Co-Authored-By: Claude Opus 4.6 (1M context) --- bindings/python/quantcpp/__init__.py | 5 + wasm/build.sh | 5 +- wasm/index.html | 209 +++++++++++++++++++-------- wasm/quant_wasm.c | 81 +++++++++-- 4 files changed, 230 insertions(+), 70 deletions(-) diff --git a/bindings/python/quantcpp/__init__.py b/bindings/python/quantcpp/__init__.py index 2794b69..bea7137 100644 --- a/bindings/python/quantcpp/__init__.py +++ b/bindings/python/quantcpp/__init__.py @@ -53,6 +53,11 @@ "smollm2-135m-instruct-q8_0.gguf", 135, ), + "Qwen3-0.6B": ( + "unsloth/Qwen3-0.6B-GGUF", + "Qwen3-0.6B-Q4_K_M.gguf", + 378, + ), "Llama-3.2-1B": ( "hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF", "llama-3.2-1b-instruct-q4_k_m.gguf", diff --git a/wasm/build.sh b/wasm/build.sh index cb6c11f..5b79866 100755 --- a/wasm/build.sh +++ b/wasm/build.sh @@ -32,7 +32,7 @@ emcc "$SCRIPT_DIR/quant_wasm.c" \ -s ALLOW_MEMORY_GROWTH=1 \ -s MAXIMUM_MEMORY=4GB \ -s INITIAL_MEMORY=256MB \ - -s EXPORTED_FUNCTIONS='["_main","_wasm_load_model","_wasm_generate","_wasm_model_info","_wasm_is_ready","_malloc","_free"]' \ + -s EXPORTED_FUNCTIONS='["_main","_wasm_load_model","_wasm_generate","_wasm_generate_async","_wasm_model_info","_wasm_is_ready","_malloc","_free"]' \ -s EXPORTED_RUNTIME_METHODS='["UTF8ToString","allocateUTF8","FS"]' \ -s FORCE_FILESYSTEM=1 \ -s MODULARIZE=0 \ @@ -40,6 +40,9 @@ emcc "$SCRIPT_DIR/quant_wasm.c" \ -s NO_EXIT_RUNTIME=1 \ -s ASSERTIONS=0 \ -s STACK_SIZE=1MB \ + -s ASYNCIFY \ + -s 'ASYNCIFY_IMPORTS=["emscripten_sleep"]' \ + -s ASYNCIFY_STACK_SIZE=65536 \ -lm \ -DNDEBUG \ -D__EMSCRIPTEN__ \ diff --git a/wasm/index.html b/wasm/index.html index 6a85128..42d42ad 100644 --- a/wasm/index.html +++ b/wasm/index.html @@ -43,14 +43,33 @@ .dropzone.loaded { border-color: #2a5a3a; background: #0d1a14; padding: 16px; } .dropzone.loaded h2 { font-size: 14px; color: #6ee7b7; } +/* Model selector */ +.model-cards { + display: flex; gap: 12px; margin-bottom: 16px; justify-content: center; flex-wrap: wrap; +} +.model-card { + padding: 14px 20px; border: 1px solid #333; border-radius: 10px; + cursor: pointer; transition: all 0.2s; text-align: left; min-width: 220px; + background: #111; +} +.model-card:hover { border-color: #6ee7b7; background: #0d1f17; } +.model-card.recommended { border-color: #059669; } +.model-card .name { font-weight: 600; font-size: 14px; margin-bottom: 4px; } +.model-card .meta { font-size: 12px; color: #888; } +.model-card .tag { + display: inline-block; font-size: 10px; padding: 1px 6px; border-radius: 6px; + background: #1a3a2a; color: #6ee7b7; margin-top: 6px; +} +.model-card .tag.blue { background: #1a2a3a; color: #7bb8f0; } + /* Chat */ .chat { flex: 1; overflow-y: auto; margin-bottom: 16px; } -.message { padding: 12px 16px; margin-bottom: 8px; border-radius: 8px; font-size: 14px; line-height: 1.6; } +.message { padding: 12px 16px; margin-bottom: 8px; border-radius: 8px; font-size: 14px; line-height: 1.6; white-space: pre-wrap; word-wrap: break-word; } .message.user { background: #1a1a2e; border: 1px solid #2a2a4e; } .message.assistant { background: #111; border: 1px solid #222; } .message.assistant .cursor { animation: blink 1s step-end infinite; } @keyframes blink { 50% { opacity: 0; } } -.message.system { color: #666; font-size: 12px; text-align: center; } +.message.system { color: #666; font-size: 12px; text-align: center; white-space: normal; } .message code { background: #1a1a1a; padding: 1px 4px; border-radius: 3px; font-size: 13px; } .message pre { background: #1a1a1a; padding: 12px; border-radius: 6px; overflow-x: auto; margin: 8px 0; } .message pre code { background: none; padding: 0; } @@ -92,21 +111,32 @@

quant.cpp

WASM - + GitHub ↗
-

LLM in Your Browser — 189 KB

+

LLM in Your Browser

No install. No API key. No server. Just click.

- + +
+ +
+
Llama 3.2 1B
+
~770 MB download · Q4_K_M
+ Higher quality +
Better reasoning, longer wait
+
+
+

Or drop your own GGUF file.

-

Runs entirely in your browser. Nothing uploaded to any server.

+

Runs entirely in your browser. Nothing uploaded.

@@ -135,15 +165,36 @@

LLM in Your Browser — 189 KB

let modelLoaded = false; let generating = false; +// ---- Model registry ---- +const MODELS = { + 'qwen3-0.6b': { + url: 'https://huggingface.co/unsloth/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q4_K_M.gguf', + name: 'Qwen3-0.6B Q4_K_M', + size: '~378 MB', + cacheKey: 'qwen3-0.6b-q4km', + chatTemplate: (text) => `<|im_start|>user\n${text}<|im_end|>\n<|im_start|>assistant\n`, + }, + 'llama-3.2-1b': { + url: 'https://huggingface.co/hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF/resolve/main/llama-3.2-1b-instruct-q4_k_m.gguf', + name: 'Llama-3.2-1B-Instruct Q4_K_M', + size: '~770 MB', + cacheKey: 'llama-3.2-1b-q4km', + chatTemplate: (text) => `<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n${text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n`, + }, +}; +let activeModelId = null; + // ---- IndexedDB model cache ---- const DB_NAME = 'quantcpp_cache'; const DB_STORE = 'models'; -const DEMO_KEY = 'smollm2-135m'; function openDB() { return new Promise((resolve, reject) => { - const req = indexedDB.open(DB_NAME, 1); - req.onupgradeneeded = () => req.result.createObjectStore(DB_STORE); + const req = indexedDB.open(DB_NAME, 2); + req.onupgradeneeded = () => { + if (!req.result.objectStoreNames.contains(DB_STORE)) + req.result.createObjectStore(DB_STORE); + }; req.onsuccess = () => resolve(req.result); req.onerror = () => reject(req.error); }); @@ -199,27 +250,28 @@

LLM in Your Browser — 189 KB

} // Demo model — cache-first, download only if not in IndexedDB -async function loadDemoModel() { - const url = 'https://huggingface.co/Felladrin/gguf-Q8_0-SmolLM2-135M-Instruct/resolve/main/smollm2-135m-instruct-q8_0.gguf'; - const btn = document.getElementById('demoBtn'); - btn.disabled = true; +async function loadDemoModel(modelId) { + const model = MODELS[modelId]; + if (!model) return; + + activeModelId = modelId; + const cards = document.querySelectorAll('.model-card'); + cards.forEach(c => c.style.pointerEvents = 'none'); try { // 1. Try cache first showLoading('Checking local cache...'); - const cached = await getCachedModel(DEMO_KEY); + const cached = await getCachedModel(model.cacheKey); if (cached) { - btn.textContent = 'Loading from cache...'; - showLoading('Loading cached model...'); - loadModelFromBytes(new Uint8Array(cached), 'smollm2-135m (cached)'); + showLoading(`Loading cached ${model.name}...`); + loadModelFromBytes(new Uint8Array(cached), `${model.name} (cached)`); return; } // 2. Download from HuggingFace - btn.textContent = 'Downloading...'; - showLoading('Downloading SmolLM2-135M (~135 MB)...'); + showLoading(`Downloading ${model.name} (${model.size})...`); - const response = await fetch(url); + const response = await fetch(model.url); if (!response.ok) throw new Error(`HTTP ${response.status}`); const total = parseInt(response.headers.get('content-length') || '0'); @@ -237,7 +289,7 @@

LLM in Your Browser — 189 KB

const mb = (received / 1048576).toFixed(0); const totalMb = (total / 1048576).toFixed(0); document.getElementById('loadingText').textContent = - `Downloading... ${pct}% (${mb}/${totalMb} MB)`; + `Downloading ${model.name}... ${pct}% (${mb}/${totalMb} MB)`; } } @@ -247,26 +299,33 @@

LLM in Your Browser — 189 KB

// 3. Cache for next time showLoading('Caching model for instant reload...'); - await cacheModel(DEMO_KEY, arrayBuffer).catch(() => {}); + await cacheModel(model.cacheKey, arrayBuffer).catch(() => {}); showLoading('Loading model into WASM...'); - loadModelFromBytes(data, 'smollm2-135m-instruct-q8_0.gguf'); + loadModelFromBytes(data, model.name); } catch (err) { hideLoading(); - btn.disabled = false; - btn.textContent = '▶ Try with SmolLM2-135M (~135 MB download)'; + cards.forEach(c => c.style.pointerEvents = ''); + activeModelId = null; alert('Download failed: ' + err.message + '\n\nTry dropping a local GGUF file instead.'); } } -// Auto-load cached model on page load +// Auto-detect cached models on page load and show badges window.addEventListener('load', async () => { try { - const cached = await getCachedModel(DEMO_KEY); - if (cached) { - const btn = document.getElementById('demoBtn'); - btn.textContent = '▶ Load cached SmolLM2-135M (instant)'; - btn.style.background = '#047857'; + for (const [id, model] of Object.entries(MODELS)) { + const cached = await getCachedModel(model.cacheKey); + if (cached) { + const cards = document.querySelectorAll('.model-card'); + cards.forEach(card => { + if (card.querySelector('.name').textContent.toLowerCase().includes(id.split('-')[0])) { + const meta = card.querySelector('.meta'); + meta.textContent = 'Cached — instant load'; + meta.style.color = '#6ee7b7'; + } + }); + } } } catch(e) {} }); @@ -275,7 +334,11 @@

LLM in Your Browser — 189 KB

const chat = document.getElementById('chat'); const div = document.createElement('div'); div.className = `message ${role}`; - div.innerHTML = formatText(text); + if (role === 'assistant') { + div.textContent = ''; + } else { + div.innerHTML = formatText(text); + } chat.appendChild(div); chat.scrollTop = chat.scrollHeight; return div; @@ -290,7 +353,6 @@

LLM in Your Browser — 189 KB

} function loadModelFromBytes(bytes, name) { - // Shared model loading from Uint8Array (used by both file drop and demo download) try { Module.FS.writeFile('/model.gguf', bytes); showLoading('Initializing model...'); @@ -318,6 +380,7 @@

LLM in Your Browser — 189 KB

async function loadModel(file) { showLoading(`Loading ${file.name} (${(file.size/1024/1024).toFixed(0)} MB)...`); addMessage('system', `Loading ${file.name}...`); + activeModelId = null; // custom model — use generic template try { const buffer = await file.arrayBuffer(); const bytes = new Uint8Array(buffer); @@ -328,6 +391,14 @@

LLM in Your Browser — 189 KB

hideLoading(); } +function getChatPrompt(text) { + if (activeModelId && MODELS[activeModelId]) { + return MODELS[activeModelId].chatTemplate(text); + } + // Generic ChatML fallback for custom GGUF + return `<|im_start|>user\n${text}<|im_end|>\n<|im_start|>assistant\n`; +} + async function generate() { if (!modelLoaded || generating) return; const input = document.getElementById('prompt'); @@ -337,45 +408,65 @@

LLM in Your Browser — 189 KB

input.value = ''; generating = true; document.getElementById('sendBtn').disabled = true; + input.disabled = true; addMessage('user', text); - const assistantDiv = addMessage('assistant', ''); + const assistantDiv = addMessage('assistant', ''); let output = ''; + let tokenCount = 0; + const startTime = performance.now(); - // Set callbacks + // Set streaming token callback Module.onToken = (token) => { output += token; - assistantDiv.innerHTML = formatText(output) + ''; - document.getElementById('chat').scrollTop = document.getElementById('chat').scrollHeight; + tokenCount++; + // Update the assistant message with raw text + blinking cursor + assistantDiv.textContent = output; + const cursor = document.createElement('span'); + cursor.className = 'cursor'; + cursor.textContent = '▌'; + assistantDiv.appendChild(cursor); + // Auto-scroll + const chat = document.getElementById('chat'); + chat.scrollTop = chat.scrollHeight; + // Live stats + const elapsed = (performance.now() - startTime) / 1000; + if (elapsed > 0.1) { + document.getElementById('statTokens').textContent = `${tokenCount} tokens`; + document.getElementById('statSpeed').textContent = `${(tokenCount / elapsed).toFixed(1)} tok/s`; + } }; + Module.onDone = (nTokens, elapsedMs) => { + // Final render with markdown formatting assistantDiv.innerHTML = formatText(output); - const tps = (nTokens / (elapsedMs / 1000)).toFixed(1); + const tps = nTokens > 0 ? (nTokens / (elapsedMs / 1000)).toFixed(1) : '0'; document.getElementById('statTokens').textContent = `${nTokens} tokens`; document.getElementById('statSpeed').textContent = `${tps} tok/s`; generating = false; document.getElementById('sendBtn').disabled = false; - document.getElementById('prompt').focus(); - }; - Module.onStatus = (msg) => { - addMessage('system', msg); + input.disabled = false; + input.focus(); }; - // Wrap with ChatML template (instruct models need this to generate) - const chatPrompt = `<|im_start|>user\n${text}<|im_end|>\n<|im_start|>assistant\n`; + const chatPrompt = getChatPrompt(text); - // Run generation asynchronously so the UI doesn't freeze - setTimeout(() => { - const promptPtr = Module.allocateUTF8(chatPrompt); + // Use ASYNCIFY: _wasm_generate_async yields to browser between tokens + const promptPtr = Module.allocateUTF8(chatPrompt); + try { + await Module._wasm_generate_async(promptPtr, 0.7, 256); + } catch(e) { + // Fallback for non-ASYNCIFY builds Module._wasm_generate(promptPtr, 0.7, 256); - Module._free(promptPtr); + } + Module._free(promptPtr); - if (!output) { - assistantDiv.innerHTML = 'No output generated. Try a longer prompt.'; - } - generating = false; - document.getElementById('sendBtn').disabled = false; - }, 50); // yield to browser for one frame to show the spinner + if (!output) { + assistantDiv.innerHTML = 'No output generated. Try a longer prompt.'; + } + generating = false; + document.getElementById('sendBtn').disabled = false; + input.disabled = false; } @@ -389,7 +480,7 @@

LLM in Your Browser — 189 KB

printErr: function(text) { console.warn(text); }, onRuntimeInitialized: function() { console.log('quant.cpp WASM ready'); - addMessage('system', 'Runtime ready. Drop a GGUF model file to begin.'); + addMessage('system', 'Runtime ready. Choose a model or drop your own GGUF file.'); } }; diff --git a/wasm/quant_wasm.c b/wasm/quant_wasm.c index a0cc34f..e8aa02b 100644 --- a/wasm/quant_wasm.c +++ b/wasm/quant_wasm.c @@ -3,6 +3,10 @@ * * Compiled with Emscripten: emcc quant_wasm.c -o quant.js * Uses the single-header quant.h for zero-dependency builds. + * + * Build with -sASYNCIFY to enable wasm_generate_async(), which + * yields to the browser event loop between tokens for real-time + * streaming output. */ #define QUANT_IMPLEMENTATION @@ -33,8 +37,23 @@ EM_JS(void, js_on_status, (const char* msg), { if (Module.onStatus) Module.onStatus(UTF8ToString(msg)); }); -/* Token callback for streaming */ -static void on_token(const char* text, void* ud) { +/* Token callback for streaming — calls JS then yields to browser */ +static void on_token_streaming(const char* text, void* ud) { + (void)ud; + js_on_token(text); + int len = (int)strlen(text); + if (g_output_pos + len < (int)sizeof(g_output) - 1) { + memcpy(g_output + g_output_pos, text, len); + g_output_pos += len; + g_output[g_output_pos] = '\0'; + } + /* Yield to browser event loop so DOM can repaint with the new token. + * emscripten_sleep(0) requires -sASYNCIFY but costs ~0 ms real time. */ + emscripten_sleep(0); +} + +/* Non-yielding callback (fallback for non-ASYNCIFY builds) */ +static void on_token_sync(const char* text, void* ud) { (void)ud; js_on_token(text); int len = (int)strlen(text); @@ -82,9 +101,9 @@ int wasm_load_model(const char* path) { return 0; } -/* Generate response */ +/* Async generate — yields to browser between tokens (requires -sASYNCIFY) */ EMSCRIPTEN_KEEPALIVE -int wasm_generate(const char* prompt, float temperature, int max_tokens) { +int wasm_generate_async(const char* prompt, float temperature, int max_tokens) { if (!g_model || !g_ctx) { js_on_status("Error: no model loaded"); return -1; @@ -98,7 +117,6 @@ int wasm_generate(const char* prompt, float temperature, int max_tokens) { g_output_pos = 0; g_output[0] = '\0'; - /* Reconfigure if needed */ quant_config cfg = { .temperature = temperature, .top_p = 0.9f, @@ -107,15 +125,58 @@ int wasm_generate(const char* prompt, float temperature, int max_tokens) { .kv_compress = 1, }; - /* Free old context and create new one for fresh generation */ if (g_ctx) quant_free_ctx(g_ctx); g_ctx = quant_new(g_model, &cfg); double t0 = emscripten_get_now(); - /* Streaming generation via per-token callback */ - int n_tokens = quant_generate(g_ctx, prompt, on_token, NULL); + /* Streaming generation — on_token_streaming calls emscripten_sleep(0) + * which yields back to the browser event loop after each token. */ + int n_tokens = quant_generate(g_ctx, prompt, on_token_streaming, NULL); + + double elapsed = emscripten_get_now() - t0; + + if (n_tokens > 0) { + js_on_done(n_tokens, elapsed); + } else { + js_on_done(0, elapsed); + if (g_output_pos == 0) + js_on_status("No output \xe2\x80\x94 try a different prompt"); + } + + g_generating = 0; + return 0; +} + +/* Sync generate — does NOT yield to browser (fallback) */ +EMSCRIPTEN_KEEPALIVE +int wasm_generate(const char* prompt, float temperature, int max_tokens) { + if (!g_model || !g_ctx) { + js_on_status("Error: no model loaded"); + return -1; + } + if (g_generating) { + js_on_status("Error: generation in progress"); + return -1; + } + + g_generating = 1; + g_output_pos = 0; + g_output[0] = '\0'; + quant_config cfg = { + .temperature = temperature, + .top_p = 0.9f, + .max_tokens = max_tokens > 0 ? max_tokens : 256, + .n_threads = 1, + .kv_compress = 1, + }; + + if (g_ctx) quant_free_ctx(g_ctx); + g_ctx = quant_new(g_model, &cfg); + + double t0 = emscripten_get_now(); + int n_tokens = quant_generate(g_ctx, prompt, on_token_sync, NULL); double elapsed = emscripten_get_now() - t0; if (n_tokens > 0) { @@ -123,7 +184,7 @@ int wasm_generate(const char* prompt, float temperature, int max_tokens) { } else { js_on_done(0, elapsed); if (g_output_pos == 0) - js_on_status("No output — try a different prompt"); + js_on_status("No output \xe2\x80\x94 try a different prompt"); } g_generating = 0; @@ -149,6 +210,6 @@ int wasm_is_ready(void) { } int main() { - js_on_status("quant.cpp WASM runtime ready. Drop a GGUF model to start."); + js_on_status("quant.cpp WASM runtime ready. Choose a model to start."); return 0; }