Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 13 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,20 +33,29 @@

---

## Quick Start (30 seconds)
## Quick Start

**Terminal (one command):**
```bash
pip install quantcpp
quantcpp "What is gravity?"
```

**Python (3 lines):**
```python
from quantcpp import Model

m = Model.from_pretrained("Llama-3.2-1B") # auto-downloads ~750 MB, cached
m = Model.from_pretrained("Llama-3.2-1B")
print(m.ask("What is gravity?"))
```

No API key. No GPU. No configuration. [Try it in your browser →](https://quantumaikr.github.io/quant.cpp/)
**Interactive chat:**
```bash
quantcpp
# You: What is gravity?
# AI: Gravity is a fundamental force...
```

Downloads Llama-3.2-1B (~750 MB) on first use, cached locally. No API key, no GPU. [Try in browser →](https://quantumaikr.github.io/quant.cpp/)

---

Expand Down
3 changes: 3 additions & 0 deletions bindings/python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ Source = "https://github.com/quantumaikr/quant.cpp"
Issues = "https://github.com/quantumaikr/quant.cpp/issues"
Changelog = "https://github.com/quantumaikr/quant.cpp/blob/main/CHANGELOG.md"

[project.scripts]
quantcpp = "quantcpp.cli:main"

[project.optional-dependencies]
dev = ["pytest>=7.0", "build", "twine"]

Expand Down
14 changes: 5 additions & 9 deletions bindings/python/quantcpp/__init__.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,14 @@
"""
quantcpp -- The SQLite of LLMs. Single-header C inference in Python.
quantcpp -- Compress AI's memory 3x. It gets faster.

Quick start (3 lines):
Quick start:

from quantcpp import Model
m = Model.from_pretrained("SmolLM2-135M")
m = Model.from_pretrained("Llama-3.2-1B")
print(m.ask("What is gravity?"))

Full control:

m = Model("path/to/model.gguf", temperature=0.7, max_tokens=256)
for token in m.generate("Once upon a time"):
print(token, end="", flush=True)
m.close()
Note: SmolLM2-135M downloads faster but produces low-quality output.
Use Llama-3.2-1B (~750 MB, one-time download) for good results.
"""

try:
Expand Down
64 changes: 64 additions & 0 deletions bindings/python/quantcpp/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
"""
quantcpp CLI — chat with a local LLM in your terminal.

Usage:
quantcpp # auto-downloads Llama-3.2-1B, starts chat
quantcpp "What is gravity?" # one-shot question
quantcpp --model SmolLM2-135M # use a smaller model (faster download)
quantcpp --model path/to/file.gguf # use your own GGUF file
"""

import sys
import os


def main():
    """Entry point for the ``quantcpp`` console script.

    Parses command-line arguments, loads (or downloads) the requested
    model, then either answers a one-shot prompt or runs an interactive
    chat loop on stdin. Always releases the model handle on exit.
    """
    import argparse
    parser = argparse.ArgumentParser(
        prog="quantcpp",
        description="Chat with a local LLM. No API key, no GPU, no server.",
    )
    parser.add_argument("prompt", nargs="*", help="Question to ask (omit for interactive chat)")
    parser.add_argument("--model", "-m", default="Llama-3.2-1B",
                        help="Model name or path to .gguf file (default: Llama-3.2-1B)")
    parser.add_argument("--max-tokens", "-n", type=int, default=256)
    parser.add_argument("--temperature", "-t", type=float, default=0.7)
    args = parser.parse_args()

    # Imported lazily so `quantcpp --help` stays fast and still works if the
    # package's native extension fails to load.
    from quantcpp import Model

    # A local .gguf path is loaded directly; any other value is treated as a
    # model name for from_pretrained (downloads on first use, then cached).
    model_path = args.model
    if os.path.isfile(model_path):
        print(f"Loading {model_path}...", file=sys.stderr)
        m = Model(model_path, max_tokens=args.max_tokens, temperature=args.temperature)
    else:
        print(f"Downloading {model_path}...", file=sys.stderr)
        m = Model.from_pretrained(model_path, max_tokens=args.max_tokens,
                                  temperature=args.temperature)

    # try/finally guarantees m.close() runs even when generation raises an
    # unexpected exception (the original only closed on the happy path or on
    # KeyboardInterrupt/EOFError, leaking the native model handle otherwise).
    try:
        if args.prompt:
            # One-shot: rejoin argv words into a single question, stream tokens.
            question = " ".join(args.prompt)
            for tok in m.generate(question):
                print(tok, end="", flush=True)
            print()
        else:
            # Interactive loop; Ctrl+C or Ctrl+D (EOF) exits cleanly.
            print("quantcpp — type your message, Ctrl+C to exit", file=sys.stderr)
            try:
                while True:
                    question = input("\nYou: ")
                    if not question.strip():
                        continue
                    print("AI: ", end="", flush=True)
                    for tok in m.generate(question):
                        print(tok, end="", flush=True)
                    print()
            except (KeyboardInterrupt, EOFError):
                print("\nBye!", file=sys.stderr)
    finally:
        m.close()


if __name__ == "__main__":
    main()
40 changes: 13 additions & 27 deletions wasm/build.sh
Original file line number Diff line number Diff line change
@@ -1,54 +1,45 @@
#!/bin/bash
# Build quant.cpp WASM demo (multi-threaded + SIMD)
# Build quant.cpp WASM demo (multi-threaded + SIMD, no ASYNCIFY)
# Requires: Emscripten SDK (emcc)
#
# Usage: cd wasm && bash build.sh
# Then: python3 -m http.server 8080
# Open: http://localhost:8080
#
# Multi-threading requires Cross-Origin-Isolation headers.
# coi-serviceworker.js injects them on GitHub Pages / static hosts.
# Architecture: inference runs in a Web Worker (inference-worker.js)
# so the main thread stays responsive. No ASYNCIFY needed — the worker
# blocks on quant_generate() while postMessage streams tokens.

set -e

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"

echo "=== Building quant.cpp WASM (pthreads + SIMD) ==="
echo "=== Building quant.cpp WASM (pthreads + SIMD, no ASYNCIFY) ==="

# Check emcc
if ! command -v emcc &>/dev/null; then
echo "Error: emcc not found. Install Emscripten:"
echo " brew install emscripten"
echo " # or: git clone https://github.com/emscripten-core/emsdk && ./emsdk install latest && ./emsdk activate latest"
echo "Error: emcc not found. Install Emscripten SDK."
exit 1
fi

echo "emcc version: $(emcc --version | head -1)"

# Build with pthreads + SIMD128 + ASYNCIFY
emcc "$SCRIPT_DIR/quant_wasm.c" \
-I"$PROJECT_DIR" \
-o "$SCRIPT_DIR/quant.js" \
-O3 \
-msimd128 \
-mrelaxed-simd \
-flto \
-pthread \
-s WASM=1 \
-s ALLOW_MEMORY_GROWTH=1 \
-s INITIAL_MEMORY=1GB \
-s MAXIMUM_MEMORY=4GB \
-s INITIAL_MEMORY=256MB \
-s EXPORTED_FUNCTIONS='["_main","_wasm_load_model","_wasm_generate","_wasm_generate_async","_wasm_model_info","_wasm_is_ready","_malloc","_free"]' \
-s ALLOW_MEMORY_GROWTH=0 \
-s EXPORTED_FUNCTIONS='["_main","_wasm_load_model","_wasm_generate","_wasm_model_info","_wasm_is_ready","_malloc","_free"]' \
-s EXPORTED_RUNTIME_METHODS='["UTF8ToString","allocateUTF8","FS"]' \
-s FORCE_FILESYSTEM=1 \
-s MODULARIZE=0 \
-s ENVIRONMENT='web,worker' \
-s NO_EXIT_RUNTIME=1 \
-s ASSERTIONS=0 \
-s STACK_SIZE=1MB \
-s ASYNCIFY \
-s 'ASYNCIFY_IMPORTS=["emscripten_sleep"]' \
-s ASYNCIFY_STACK_SIZE=65536 \
-s PTHREAD_POOL_SIZE=4 \
-s PTHREAD_POOL_SIZE_STRICT=0 \
-lm \
Expand All @@ -59,14 +50,9 @@ emcc "$SCRIPT_DIR/quant_wasm.c" \

echo ""
echo "=== Build complete ==="
echo "Files:"
for f in quant.js quant.wasm quant.worker.js; do
for f in quant.js quant.wasm; do
[ -f "$SCRIPT_DIR/$f" ] && echo " $f ($(du -h "$SCRIPT_DIR/$f" | cut -f1))"
done
echo ""
echo "To serve locally:"
echo " cd $SCRIPT_DIR && python3 -m http.server 8080"
echo " Open http://localhost:8080"
echo ""
echo "Note: Multi-threading requires Cross-Origin-Isolation."
echo "coi-serviceworker.js handles this automatically on GitHub Pages."
echo " inference-worker.js — Web Worker wrapper (no ASYNCIFY overhead)"
echo " coi-serviceworker.js — COOP/COEP header injection for pthreads"
Loading
Loading