Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 0 additions & 54 deletions quant.h
Original file line number Diff line number Diff line change
Expand Up @@ -202,8 +202,6 @@ static inline int clock_gettime(int id, struct timespec* ts) {
// Section 1: Types and Specs (from tq_types.h, tq_spec.h)
// ============================================================================



/* Cross-language static assert: works in both C11 and C++11/17 */
#ifdef __cplusplus
#define TQ_STATIC_ASSERT(cond, msg) static_assert(cond, msg)
Expand All @@ -219,8 +217,6 @@ static inline int clock_gettime(int id, struct timespec* ts) {
#define TQ_PI_2 1.5707963267948966f
#endif



/* ============================================================
* Constants
* ============================================================ */
Expand Down Expand Up @@ -398,8 +394,6 @@ typedef struct {
int enable_recompression;/* Tier 1 → Tier 2 re-compression */
} tq_progressive_config_t;



/* TurboQuant KV cache block: RHT + Lloyd-Max codebook + QJL residual
* 3-bit variant: 2-bit codebook (4 levels) + 1-bit QJL sign hash
* Block covers TQ_BK elements (128).
Expand Down Expand Up @@ -469,12 +463,6 @@ TQ_CHECK_SIZE(block_tq_turbo_kv_4b, 8 + TQ_BK * 3 / 8 + TQ_BK / 8);
TQ_CHECK_SIZE(block_tq_turbo_kv_1b, 8 + TQ_BK / 8);
TQ_CHECK_SIZE(block_tq_turbo_kv_2b, 8 + TQ_BK / 8 + TQ_BK / 8);







/* Format specification — version-aware, ONNX-inspired */

#define TQ_SPEC_VERSION 1
Expand All @@ -500,18 +488,10 @@ typedef struct {
uint8_t flags; /* TQ_FLAG_* bitmask */
} tq_format_spec_t;





// ============================================================================
// Section 2: Engine Types (from tq_engine.h)
// ============================================================================





/* ============================================================
* Model configuration
* ============================================================ */
Expand Down Expand Up @@ -1123,9 +1103,6 @@ void tq_tp_run(void* (*fn)(void*), void** args, int n_tasks);
/* Max threads supported by thread pool */
#define TQ_TP_MAX 16




// ============================================================================
// Section 3: GGUF Types (from tq_gguf.h)
// ============================================================================
Expand All @@ -1143,10 +1120,6 @@ void tq_tp_run(void* (*fn)(void*), void** args, int n_tasks);
* directly into TurboQuant inference engine.
*/





/* ============================================================
* GGUF format constants
* ============================================================ */
Expand Down Expand Up @@ -1462,24 +1435,17 @@ int tq_metal_moe_forward(
const int* up_types, /* per-expert up quant types, NULL = use weight_type */
const int* down_types); /* per-expert down quant types, NULL = use weight_type */




// ============================================================================
// Section 4: Internal API (from turboquant.h)
// ============================================================================


/**
* TurboQuant.cpp — Cross-platform KV cache compression library
*
* Public C API — single header include for all functionality.
* Zero external dependencies (libc/libm only).
*/




/* ============================================================
* Version
* ============================================================ */
Expand Down Expand Up @@ -1753,15 +1719,10 @@ void tq_progressive_free(tq_progressive_t* p);

tq_progressive_config_t tq_progressive_default_config(void);





// ============================================================================
// Section 5: quant_ctx struct definition
// ============================================================================


struct quant_ctx {
tq_model_t* model;
tq_state_t* state;
Expand All @@ -1788,7 +1749,6 @@ struct quant_ctx {
* - Random signs decorrelate channels across different blocks
*/


#ifdef __ARM_NEON
#include <arm_neon.h>
#endif
Expand Down Expand Up @@ -1902,7 +1862,6 @@ void tq_rht_inverse(float* data, int n, uint32_t seed) {
*/
/* Generic reference — no compiler-specific pragmas */


/* ---------- FP16 helpers ---------- */

static uint16_t uni_fp32_to_fp16(float v) {
Expand Down Expand Up @@ -2285,7 +2244,6 @@ void tq_uniform_3b_attention_ref(const float* query, const void* kv,
// Section 8: Type Traits (from tq_traits.c)
// ============================================================================


/* Stub implementations for excluded quantization types (polar, qjl, turbo, mixed) */
static void tq_stub_quantize(const float* src, void* dst, int n) {
(void)src; (void)dst; (void)n;
Expand Down Expand Up @@ -2583,7 +2541,6 @@ tq_type tq_type_from_name(const char* name) {
* No external dependencies — libc/libm only.
*/


#ifdef __ARM_NEON
#include <arm_neon.h>
#endif
Expand Down Expand Up @@ -2617,7 +2574,6 @@ static struct {

static int g_n_threads = 1;


static void* tp_worker(void* arg) {
int id = (int)(intptr_t)arg;
int my_gen = 0;
Expand Down Expand Up @@ -4388,8 +4344,6 @@ void tq_matmul_1bit(float* out, const float* x,
* SPDX-License-Identifier: MIT
*/



#ifdef _WIN32
#else
#endif
Expand Down Expand Up @@ -5098,8 +5052,6 @@ const tq_gguf_tensor_t* tq_gguf_find_tensor(const tq_gguf_ctx_t* ctx, const char
* Pure C11, no external dependencies.
*/



#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#define TQ_HAS_NEON 1
Expand Down Expand Up @@ -7174,7 +7126,6 @@ void tq_metal_batch_end_if_available(void) {
* Also supports the legacy llama2.c binary tokenizer format as fallback.
*/


/* Global for qsort comparator (vocab index sorting) */
static char** g_vocab_for_sort;
static int cmp_vocab_idx(const void* a, const void* b) {
Expand Down Expand Up @@ -8519,7 +8470,6 @@ const char* tq_decode(const tq_tokenizer_t* tok, int prev_token, int token) {
* Supports hybrid architectures (e.g., Qwen3.5 DeltaNet + self_attn).
*/


#ifdef _WIN32
#else
#endif
Expand Down Expand Up @@ -12934,7 +12884,6 @@ void tq_quantize_weights_1bit(tq_model_t* model) {
* -> residual add
*/


/* Unified Q2/1-bit matmul dispatch.
* When model->use_1bit_weights, Q2 fields contain sign bits + norms,
* dispatched to tq_matmul_1bit (FP32 input required).
Expand Down Expand Up @@ -15194,7 +15143,6 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
}
}


/* Increment profile token count if profiling is active */
if (s->profile_kv) {
s->profile_kv_count++;
Expand Down Expand Up @@ -15245,7 +15193,6 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
* - Full generation loop with streaming callback
*/


/* ============================================================
* Argmax sampling: return token with highest logit
* ============================================================ */
Expand Down Expand Up @@ -15673,7 +15620,6 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
return generated;
}


// ============================================================================

// ============================================================================
Expand Down
19 changes: 16 additions & 3 deletions wasm/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -174,10 +174,15 @@ <h2>Run an <span>LLM</span> in your browser</h2>
<p class="subtitle">No install. No API key. No server.</p>

<div class="model-cards" id="modelCards">
<div class="model-card recommended" id="card-llama" onclick="loadDemoModel('llama-3.2-1b')">
<div class="model-card recommended" id="card-smol" onclick="loadDemoModel('smollm2-135m')">
<div class="name">SmolLM2 135M</div>
<div class="meta" id="meta-smol">~135 MB &middot; Fast response</div>
<span class="tag">Fast</span>
</div>
<div class="model-card" id="card-llama" onclick="loadDemoModel('llama-3.2-1b')">
<div class="name">Llama 3.2 1B Instruct</div>
<div class="meta" id="meta-llama">~770 MB &middot; Verified quality</div>
<span class="tag">Recommended</span>
<div class="meta" id="meta-llama">~770 MB &middot; Better quality</div>
<span class="tag blue">Quality</span>
</div>
</div>

Expand Down Expand Up @@ -218,6 +223,14 @@ <h2>Run an <span>LLM</span> in your browser</h2>
let activeModelId = null;

const MODELS = {
'smollm2-135m': {
url: 'https://huggingface.co/Felladrin/gguf-Q8_0-SmolLM2-135M-Instruct/resolve/main/smollm2-135m-instruct-q8_0.gguf',
name: 'SmolLM2 135M',
size: 135,
cacheKey: 'smollm2-135m-q8',
chatTemplate: (t) => t, // SmolLM2 works best with plain text prompts
cardId: 'card-smol', metaId: 'meta-smol',
},
'llama-3.2-1b': {
url: 'https://huggingface.co/hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF/resolve/main/llama-3.2-1b-instruct-q4_k_m.gguf',
name: 'Llama 3.2 1B Instruct',
Expand Down
Loading