diff --git a/README.md b/README.md index 07284bb..7eae1de 100644 --- a/README.md +++ b/README.md @@ -33,20 +33,29 @@ --- -## Quick Start (30 seconds) +## Quick Start +**Terminal (one command):** ```bash pip install quantcpp +quantcpp "What is gravity?" ``` +**Python (3 lines):** ```python from quantcpp import Model - -m = Model.from_pretrained("Llama-3.2-1B") # auto-downloads ~750 MB, cached +m = Model.from_pretrained("Llama-3.2-1B") print(m.ask("What is gravity?")) ``` -No API key. No GPU. No configuration. [Try it in your browser →](https://quantumaikr.github.io/quant.cpp/) +**Interactive chat:** +```bash +quantcpp +# You: What is gravity? +# AI: Gravity is a fundamental force... +``` + +Downloads Llama-3.2-1B (~750 MB) on first use, cached locally. No API key, no GPU. [Try in browser →](https://quantumaikr.github.io/quant.cpp/) --- diff --git a/bindings/python/pyproject.toml b/bindings/python/pyproject.toml index deaff6d..52138ce 100644 --- a/bindings/python/pyproject.toml +++ b/bindings/python/pyproject.toml @@ -43,6 +43,9 @@ Source = "https://github.com/quantumaikr/quant.cpp" Issues = "https://github.com/quantumaikr/quant.cpp/issues" Changelog = "https://github.com/quantumaikr/quant.cpp/blob/main/CHANGELOG.md" +[project.scripts] +quantcpp = "quantcpp.cli:main" + [project.optional-dependencies] dev = ["pytest>=7.0", "build", "twine"] diff --git a/bindings/python/quantcpp/__init__.py b/bindings/python/quantcpp/__init__.py index eee6196..8613998 100644 --- a/bindings/python/quantcpp/__init__.py +++ b/bindings/python/quantcpp/__init__.py @@ -1,18 +1,14 @@ """ -quantcpp -- The SQLite of LLMs. Single-header C inference in Python. +quantcpp -- Compress AI's memory 3x. It gets faster. 
-Quick start (3 lines): +Quick start: from quantcpp import Model - m = Model.from_pretrained("SmolLM2-135M") + m = Model.from_pretrained("Llama-3.2-1B") print(m.ask("What is gravity?")) -Full control: - - m = Model("path/to/model.gguf", temperature=0.7, max_tokens=256) - for token in m.generate("Once upon a time"): - print(token, end="", flush=True) - m.close() +Note: SmolLM2-135M downloads faster but produces low-quality output. +Use Llama-3.2-1B (~750 MB, one-time download) for good results. """ try: diff --git a/bindings/python/quantcpp/cli.py b/bindings/python/quantcpp/cli.py new file mode 100644 index 0000000..14cf1e8 --- /dev/null +++ b/bindings/python/quantcpp/cli.py @@ -0,0 +1,64 @@ +""" +quantcpp CLI — chat with a local LLM in your terminal. + +Usage: + quantcpp # auto-downloads Llama-3.2-1B, starts chat + quantcpp "What is gravity?" # one-shot question + quantcpp --model SmolLM2-135M # use a smaller model (faster download) + quantcpp --model path/to/file.gguf # use your own GGUF file +""" + +import sys +import os + + +def main(): + import argparse + parser = argparse.ArgumentParser( + prog="quantcpp", + description="Chat with a local LLM. 
No API key, no GPU, no server.", + ) + parser.add_argument("prompt", nargs="*", help="Question to ask (omit for interactive chat)") + parser.add_argument("--model", "-m", default="Llama-3.2-1B", + help="Model name or path to .gguf file (default: Llama-3.2-1B)") + parser.add_argument("--max-tokens", "-n", type=int, default=256) + parser.add_argument("--temperature", "-t", type=float, default=0.7) + args = parser.parse_args() + + from quantcpp import Model + + # Load model + model_path = args.model + if os.path.isfile(model_path): + print(f"Loading {model_path}...", file=sys.stderr) + m = Model(model_path, max_tokens=args.max_tokens, temperature=args.temperature) + else: + print(f"Downloading {model_path}...", file=sys.stderr) + m = Model.from_pretrained(model_path, max_tokens=args.max_tokens, + temperature=args.temperature) + + # One-shot or interactive + if args.prompt: + question = " ".join(args.prompt) + for tok in m.generate(question): + print(tok, end="", flush=True) + print() + else: + print("quantcpp — type your message, Ctrl+C to exit", file=sys.stderr) + try: + while True: + question = input("\nYou: ") + if not question.strip(): + continue + print("AI: ", end="", flush=True) + for tok in m.generate(question): + print(tok, end="", flush=True) + print() + except (KeyboardInterrupt, EOFError): + print("\nBye!", file=sys.stderr) + + m.close() + + +if __name__ == "__main__": + main() diff --git a/wasm/build.sh b/wasm/build.sh index 9c04a2d..76fec05 100755 --- a/wasm/build.sh +++ b/wasm/build.sh @@ -1,44 +1,38 @@ #!/bin/bash -# Build quant.cpp WASM demo (multi-threaded + SIMD) +# Build quant.cpp WASM demo (multi-threaded + SIMD, no ASYNCIFY) # Requires: Emscripten SDK (emcc) # -# Usage: cd wasm && bash build.sh -# Then: python3 -m http.server 8080 -# Open: http://localhost:8080 -# -# Multi-threading requires Cross-Origin-Isolation headers. -# coi-serviceworker.js injects them on GitHub Pages / static hosts. 
+# Architecture: inference runs in a Web Worker (inference-worker.js) +# so the main thread stays responsive. No ASYNCIFY needed — the worker +# blocks on quant_generate() while postMessage streams tokens. set -e SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" PROJECT_DIR="$(dirname "$SCRIPT_DIR")" -echo "=== Building quant.cpp WASM (pthreads + SIMD) ===" +echo "=== Building quant.cpp WASM (pthreads + SIMD, no ASYNCIFY) ===" -# Check emcc if ! command -v emcc &>/dev/null; then - echo "Error: emcc not found. Install Emscripten:" - echo " brew install emscripten" - echo " # or: git clone https://github.com/emscripten-core/emsdk && ./emsdk install latest && ./emsdk activate latest" + echo "Error: emcc not found. Install Emscripten SDK." exit 1 fi echo "emcc version: $(emcc --version | head -1)" -# Build with pthreads + SIMD128 + ASYNCIFY emcc "$SCRIPT_DIR/quant_wasm.c" \ -I"$PROJECT_DIR" \ -o "$SCRIPT_DIR/quant.js" \ -O3 \ -msimd128 \ + -mrelaxed-simd \ -flto \ -pthread \ -s WASM=1 \ - -s ALLOW_MEMORY_GROWTH=1 \ + -s INITIAL_MEMORY=1GB \ -s MAXIMUM_MEMORY=4GB \ - -s INITIAL_MEMORY=256MB \ - -s EXPORTED_FUNCTIONS='["_main","_wasm_load_model","_wasm_generate","_wasm_generate_async","_wasm_model_info","_wasm_is_ready","_malloc","_free"]' \ + -s ALLOW_MEMORY_GROWTH=0 \ + -s EXPORTED_FUNCTIONS='["_main","_wasm_load_model","_wasm_generate","_wasm_model_info","_wasm_is_ready","_malloc","_free"]' \ -s EXPORTED_RUNTIME_METHODS='["UTF8ToString","allocateUTF8","FS"]' \ -s FORCE_FILESYSTEM=1 \ -s MODULARIZE=0 \ @@ -46,9 +40,6 @@ emcc "$SCRIPT_DIR/quant_wasm.c" \ -s NO_EXIT_RUNTIME=1 \ -s ASSERTIONS=0 \ -s STACK_SIZE=1MB \ - -s ASYNCIFY \ - -s 'ASYNCIFY_IMPORTS=["emscripten_sleep"]' \ - -s ASYNCIFY_STACK_SIZE=65536 \ -s PTHREAD_POOL_SIZE=4 \ -s PTHREAD_POOL_SIZE_STRICT=0 \ -lm \ @@ -59,14 +50,9 @@ emcc "$SCRIPT_DIR/quant_wasm.c" \ echo "" echo "=== Build complete ===" -echo "Files:" -for f in quant.js quant.wasm quant.worker.js; do +for f in quant.js quant.wasm; do [ -f 
"$SCRIPT_DIR/$f" ] && echo " $f ($(du -h "$SCRIPT_DIR/$f" | cut -f1))" done echo "" -echo "To serve locally:" -echo " cd $SCRIPT_DIR && python3 -m http.server 8080" -echo " Open http://localhost:8080" -echo "" -echo "Note: Multi-threading requires Cross-Origin-Isolation." -echo "coi-serviceworker.js handles this automatically on GitHub Pages." +echo " inference-worker.js — Web Worker wrapper (no ASYNCIFY overhead)" +echo " coi-serviceworker.js — COOP/COEP header injection for pthreads" diff --git a/wasm/index.html b/wasm/index.html index 716d0ee..597a2d4 100644 --- a/wasm/index.html +++ b/wasm/index.html @@ -356,28 +356,10 @@
KV compression active — 3x longer context
`; - document.getElementById('kvBadge').style.display = ''; - document.getElementById('prompt').disabled = false; - document.getElementById('sendBtn').disabled = false; - document.getElementById('prompt').focus(); - addMessage('system', `Model loaded! ${name} (${(bytes.length/1048576).toFixed(0)} MB). Ask anything.`); - } else { - addMessage('system', 'Failed to load model.'); - } - } catch(e) { - addMessage('system', `Error: ${e.message}`); - } - hideLoading(); + showLoading('Loading model into WASM...'); + // Copy the model bytes into a standalone ArrayBuffer (slice() copies), then hand it to the worker as a transferable — the postMessage transfer itself is zero-copy + const buffer = bytes.buffer.slice(bytes.byteOffset, bytes.byteOffset + bytes.byteLength); + worker.postMessage({ type: 'load', bytes: buffer, name: name }, [buffer]); } async function loadModel(file) { @@ -402,8 +384,81 @@
KV compression active — 3x longer context
`; + document.getElementById('kvBadge').style.display = ''; + document.getElementById('prompt').disabled = false; + document.getElementById('sendBtn').disabled = false; + document.getElementById('prompt').focus(); + hideLoading(); + } + else if (msg.type === 'token' && pendingAssistantDiv) { + pendingOutput += msg.text; + pendingTokenCount++; + pendingAssistantDiv.textContent = pendingOutput; + const cursor = document.createElement('span'); + cursor.className = 'cursor'; + cursor.textContent = '▌'; + pendingAssistantDiv.appendChild(cursor); + const chat = document.getElementById('chat'); + chat.scrollTop = chat.scrollHeight; + const elapsed = (performance.now() - pendingStartTime) / 1000; + if (elapsed > 0.1) { + document.getElementById('statTokens').textContent = `${pendingTokenCount} tokens`; + document.getElementById('statSpeed').textContent = `${(pendingTokenCount / elapsed).toFixed(1)} tok/s`; + } + } + else if (msg.type === 'done') { + if (pendingAssistantDiv) { + if (pendingOutput) { + pendingAssistantDiv.innerHTML = formatText(pendingOutput); + } else { + pendingAssistantDiv.innerHTML = 'No output generated. Try a longer prompt.'; + } + const elapsed = (performance.now() - pendingStartTime) / 1000; + const tps = pendingTokenCount > 0 ? (pendingTokenCount / elapsed).toFixed(1) : '0'; + document.getElementById('statTokens').textContent = `${pendingTokenCount} tokens`; + document.getElementById('statSpeed').textContent = `${tps} tok/s`; + } + generating = false; + document.getElementById('sendBtn').disabled = false; + document.getElementById('prompt').disabled = false; + document.getElementById('prompt').focus(); + pendingAssistantDiv = null; + } + }; +} + +function generate() { + if (!modelLoaded || generating || !worker) return; const input = document.getElementById('prompt'); const text = input.value.trim(); if (!text) return; @@ -414,84 +469,19 @@