Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 13 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,20 +33,29 @@

---

## Quick Start (30 seconds)
## Quick Start

**Terminal (one command):**
```bash
pip install quantcpp
quantcpp "What is gravity?"
```

**Python (3 lines):**
```python
from quantcpp import Model

m = Model.from_pretrained("Llama-3.2-1B") # auto-downloads ~750 MB, cached
m = Model.from_pretrained("Llama-3.2-1B")
print(m.ask("What is gravity?"))
```

No API key. No GPU. No configuration. [Try it in your browser →](https://quantumaikr.github.io/quant.cpp/)
**Interactive chat:**
```bash
quantcpp
# You: What is gravity?
# AI: Gravity is a fundamental force...
```

Downloads Llama-3.2-1B (~750 MB) on first use, cached locally. No API key, no GPU. [Try in browser →](https://quantumaikr.github.io/quant.cpp/)

---

Expand Down
3 changes: 3 additions & 0 deletions bindings/python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ Source = "https://github.com/quantumaikr/quant.cpp"
Issues = "https://github.com/quantumaikr/quant.cpp/issues"
Changelog = "https://github.com/quantumaikr/quant.cpp/blob/main/CHANGELOG.md"

[project.scripts]
quantcpp = "quantcpp.cli:main"

[project.optional-dependencies]
dev = ["pytest>=7.0", "build", "twine"]

Expand Down
14 changes: 5 additions & 9 deletions bindings/python/quantcpp/__init__.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,14 @@
"""
quantcpp -- The SQLite of LLMs. Single-header C inference in Python.
quantcpp -- Compress AI's memory 3x. It gets faster.

Quick start (3 lines):
Quick start:

from quantcpp import Model
m = Model.from_pretrained("SmolLM2-135M")
m = Model.from_pretrained("Llama-3.2-1B")
print(m.ask("What is gravity?"))

Full control:

m = Model("path/to/model.gguf", temperature=0.7, max_tokens=256)
for token in m.generate("Once upon a time"):
print(token, end="", flush=True)
m.close()
Note: SmolLM2-135M downloads faster but produces low-quality output.
Use Llama-3.2-1B (~750 MB, one-time download) for good results.
"""

try:
Expand Down
64 changes: 64 additions & 0 deletions bindings/python/quantcpp/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
"""
quantcpp CLI — chat with a local LLM in your terminal.

Usage:
quantcpp # auto-downloads Llama-3.2-1B, starts chat
quantcpp "What is gravity?" # one-shot question
quantcpp --model SmolLM2-135M # use a smaller model (faster download)
quantcpp --model path/to/file.gguf # use your own GGUF file
"""

import sys
import os


def main():
    """Entry point for the ``quantcpp`` console script.

    Parses command-line arguments, loads (or downloads) the requested
    model, then either answers a one-shot prompt or runs an interactive
    chat loop on stdin. Always releases the model handle on exit.
    """
    import argparse
    parser = argparse.ArgumentParser(
        prog="quantcpp",
        description="Chat with a local LLM. No API key, no GPU, no server.",
    )
    parser.add_argument("prompt", nargs="*", help="Question to ask (omit for interactive chat)")
    parser.add_argument("--model", "-m", default="Llama-3.2-1B",
                        help="Model name or path to .gguf file (default: Llama-3.2-1B)")
    parser.add_argument("--max-tokens", "-n", type=int, default=256)
    parser.add_argument("--temperature", "-t", type=float, default=0.7)
    args = parser.parse_args()

    # Imported lazily so `quantcpp --help` stays fast and still works if the
    # package's native extension fails to load.
    from quantcpp import Model

    # A local .gguf path is loaded directly; any other value is treated as a
    # model name for from_pretrained (downloads on first use, then cached).
    model_path = args.model
    if os.path.isfile(model_path):
        print(f"Loading {model_path}...", file=sys.stderr)
        m = Model(model_path, max_tokens=args.max_tokens, temperature=args.temperature)
    else:
        print(f"Downloading {model_path}...", file=sys.stderr)
        m = Model.from_pretrained(model_path, max_tokens=args.max_tokens,
                                  temperature=args.temperature)

    # try/finally guarantees m.close() runs even when generation raises an
    # unexpected exception (the original only closed on the happy path or on
    # KeyboardInterrupt/EOFError, leaking the native model handle otherwise).
    try:
        if args.prompt:
            # One-shot: rejoin argv words into a single question, stream tokens.
            question = " ".join(args.prompt)
            for tok in m.generate(question):
                print(tok, end="", flush=True)
            print()
        else:
            # Interactive loop; Ctrl+C or Ctrl+D (EOF) exits cleanly.
            print("quantcpp — type your message, Ctrl+C to exit", file=sys.stderr)
            try:
                while True:
                    question = input("\nYou: ")
                    if not question.strip():
                        continue
                    print("AI: ", end="", flush=True)
                    for tok in m.generate(question):
                        print(tok, end="", flush=True)
                    print()
            except (KeyboardInterrupt, EOFError):
                print("\nBye!", file=sys.stderr)
    finally:
        m.close()


if __name__ == "__main__":
    main()
40 changes: 13 additions & 27 deletions wasm/build.sh
Original file line number Diff line number Diff line change
@@ -1,54 +1,45 @@
#!/bin/bash
# Build quant.cpp WASM demo (multi-threaded + SIMD)
# Build quant.cpp WASM demo (multi-threaded + SIMD, no ASYNCIFY)
# Requires: Emscripten SDK (emcc)
#
# Usage: cd wasm && bash build.sh
# Then: python3 -m http.server 8080
# Open: http://localhost:8080
#
# Multi-threading requires Cross-Origin-Isolation headers.
# coi-serviceworker.js injects them on GitHub Pages / static hosts.
# Architecture: inference runs in a Web Worker (inference-worker.js)
# so the main thread stays responsive. No ASYNCIFY needed — the worker
# blocks on quant_generate() while postMessage streams tokens.

set -e

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"

echo "=== Building quant.cpp WASM (pthreads + SIMD) ==="
echo "=== Building quant.cpp WASM (pthreads + SIMD, no ASYNCIFY) ==="

# Check emcc
if ! command -v emcc &>/dev/null; then
echo "Error: emcc not found. Install Emscripten:"
echo " brew install emscripten"
echo " # or: git clone https://github.com/emscripten-core/emsdk && ./emsdk install latest && ./emsdk activate latest"
echo "Error: emcc not found. Install Emscripten SDK."
exit 1
fi

echo "emcc version: $(emcc --version | head -1)"

# Build with pthreads + SIMD128 + ASYNCIFY
emcc "$SCRIPT_DIR/quant_wasm.c" \
-I"$PROJECT_DIR" \
-o "$SCRIPT_DIR/quant.js" \
-O3 \
-msimd128 \
-mrelaxed-simd \
-flto \
-pthread \
-s WASM=1 \
-s ALLOW_MEMORY_GROWTH=1 \
-s INITIAL_MEMORY=1GB \
-s MAXIMUM_MEMORY=4GB \
-s INITIAL_MEMORY=256MB \
-s EXPORTED_FUNCTIONS='["_main","_wasm_load_model","_wasm_generate","_wasm_generate_async","_wasm_model_info","_wasm_is_ready","_malloc","_free"]' \
-s ALLOW_MEMORY_GROWTH=0 \
-s EXPORTED_FUNCTIONS='["_main","_wasm_load_model","_wasm_generate","_wasm_model_info","_wasm_is_ready","_malloc","_free"]' \
-s EXPORTED_RUNTIME_METHODS='["UTF8ToString","allocateUTF8","FS"]' \
-s FORCE_FILESYSTEM=1 \
-s MODULARIZE=0 \
-s ENVIRONMENT='web,worker' \
-s NO_EXIT_RUNTIME=1 \
-s ASSERTIONS=0 \
-s STACK_SIZE=1MB \
-s ASYNCIFY \
-s 'ASYNCIFY_IMPORTS=["emscripten_sleep"]' \
-s ASYNCIFY_STACK_SIZE=65536 \
-s PTHREAD_POOL_SIZE=4 \
-s PTHREAD_POOL_SIZE_STRICT=0 \
-lm \
Expand All @@ -59,14 +50,9 @@ emcc "$SCRIPT_DIR/quant_wasm.c" \

echo ""
echo "=== Build complete ==="
echo "Files:"
for f in quant.js quant.wasm quant.worker.js; do
for f in quant.js quant.wasm; do
[ -f "$SCRIPT_DIR/$f" ] && echo " $f ($(du -h "$SCRIPT_DIR/$f" | cut -f1))"
done
echo ""
echo "To serve locally:"
echo " cd $SCRIPT_DIR && python3 -m http.server 8080"
echo " Open http://localhost:8080"
echo ""
echo "Note: Multi-threading requires Cross-Origin-Isolation."
echo "coi-serviceworker.js handles this automatically on GitHub Pages."
echo " inference-worker.js — Web Worker wrapper (no ASYNCIFY overhead)"
echo " coi-serviceworker.js — COOP/COEP header injection for pthreads"
Loading
Loading