Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 85 additions & 0 deletions AI-Car-Racer/archive/dedup.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
// archive/dedup.js
// Phase 1D — F5: Content-addressed dedup.
//
// Two brains with identical flattened weights should collapse to a single
// canonical node rather than creating duplicates in the archive and lineage
// DAG. We key every brain by `hashBrain(flat)` (xxHash32 hex — see
// archive/hash.js for why xxHash and not crypto.subtle), remember the first
// id we saw for each hash, and report duplicates back to callers so they can
// increment a visible "×N" badge / stat instead of adding a new node.
//
// This module is purely in-memory state: it's rebuilt from scratch every page
// load. Persistence happens downstream (lineage DAG, IDB archive); here we
// only answer "have I seen this flat before?" for the current session.
//
// API
// maybeInsert(flat, fallbackId)
// → { inserted: true, canonicalId: hash }
// → { inserted: false, canonicalId: hash, firstSeenId: <existing id> }
// On `inserted: false` we also bump the duplicate counter the caller can
// read via `stats()` for the "% duplicates" panel.
//
// stats() → { total, duplicates, duplicateRatio }
// _debugReset() — test hook; wipes the table.

import { hashBrain } from './hash.js';

// Session-local dedup state, mutated by maybeInsert() and wiped by
// _debugReset().
//
// _table maps content hash → { firstSeenId, duplicateCount }.
// duplicateCount counts *additional* sightings past the first — so a brain
// seen three times has duplicateCount=2. total sightings = 1 + duplicateCount.
// The bindings are `let` because _debugReset() swaps in a brand-new Map
// rather than clearing the existing one.
let _table = new Map();
let _totalInserts = 0; // every maybeInsert call that passed validation (first + repeats)
let _duplicateInserts = 0; // only the repeats (hash was already in _table)

// Record one sighting of `flat`, keyed by its content hash (hashBrain).
//
// First sighting of a hash: registers it, remembering `fallbackId`
// (stringified) as `firstSeenId` — or the hash itself when `fallbackId` is
// null/undefined — and returns { inserted: true, canonicalId }.
// Repeat sighting: bumps the entry's duplicateCount and the module-wide
// duplicate counter, and returns
// { inserted: false, canonicalId, firstSeenId } pointing at the first id.
//
// Throws when `flat` is falsy or exposes no `buffer` property; rejected
// calls are not counted in the totals.
export function maybeInsert(flat, fallbackId) {
  const hasBackingBuffer = Boolean(flat) && typeof flat.buffer !== 'undefined';
  if (!hasBackingBuffer) {
    throw new Error('archive/dedup.maybeInsert: flat must be a Float32Array');
  }

  const canonicalId = hashBrain(flat);
  _totalInserts += 1;

  const entry = _table.get(canonicalId);
  if (!entry) {
    // New content: the caller's id (if any) becomes the canonical pointer.
    const firstSeenId = fallbackId != null ? String(fallbackId) : canonicalId;
    _table.set(canonicalId, { firstSeenId, duplicateCount: 0 });
    return { inserted: true, canonicalId };
  }

  // Known content: count the repeat and point back at the first sighting.
  entry.duplicateCount += 1;
  _duplicateInserts += 1;
  return { inserted: false, canonicalId, firstSeenId: entry.firstSeenId };
}

// Pure membership probe: reports whether `hash` is already registered in the
// dedup table. Touches no counters, so it is safe for "do we know this one?"
// checks that must not skew stats().
export function has(hash) {
  const known = _table.has(hash);
  return known;
}

// Fetch the table entry ({ firstSeenId, duplicateCount }) for `hash`, or
// undefined when the hash is unknown. The object handed back is the live
// entry, not a copy — treat it as read-only.
export function get(hash) {
  const entry = _table.get(hash);
  return entry;
}

// Snapshot of the dedup counters for the "% duplicates" readout.
// Returns { total, duplicates, duplicateRatio } where `total` is the number
// of counted insert attempts, `duplicates` the repeats among them, and
// duplicateRatio = duplicates / total (0 when nothing has been inserted yet).
// The ratio is per attempt, not per unique brain.
export function stats() {
  let duplicateRatio = 0;
  if (_totalInserts !== 0) {
    duplicateRatio = _duplicateInserts / _totalInserts;
  }
  return {
    total: _totalInserts,
    duplicates: _duplicateInserts,
    duplicateRatio,
  };
}

// Test hook: returns the module to its just-loaded state by zeroing both
// counters and replacing the table with a fresh, empty Map (the previous Map
// object is left untouched, merely dropped).
export function _debugReset() {
  _duplicateInserts = 0;
  _totalInserts = 0;
  _table = new Map();
}
209 changes: 209 additions & 0 deletions AI-Car-Racer/archive/exporter.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
// archive/exporter.js
// Phase 1A (F3) — build a validated ArchiveSnapshot from the live bridge
// mirrors. Deterministic-replay path only: we do NOT ship the native HNSW
// bytes (that's the upstream-patch dance flagged in
// docs/plan/ruvector-upstream-patches.md). Instead we record the insertion
// order and let the importer re-insert in the same order on the other side
// to reproduce graph connectivity byte-for-byte.
//
// API
// buildSnapshot({
// brainMirror, Map<id, {vector:Float32Array, meta}>
// trackMirror, Map<id, {vector:Float32Array, meta}>
// dynamicsMirror, Map<id, {vector:Float32Array, meta}>
// observations, Map<id, {weight:number, count:number}>
// indexKind, 'euclidean' | 'hyperbolic'
// insertionOrder, string[] — insertion order of brain ids
// consistency, optional: 'fresh' | 'eventual' | 'frozen' (default 'fresh')
// dim, optional: flat-vector dimensionality (for hnsw.params)
// }) → ArchiveSnapshot
//
// The `witness` field is a tagged digest of the canonicalized payload.
// buildSnapshot stays synchronous and tags its witness "x32:<hex>" (xxHash32
// over the UTF-8 bytes of the canonical JSON); the serialize/fromBlob path is
// already async, and piping witness through another Promise layer was a
// needless API complication. A crypto.subtle-based async variant is exported
// as `buildSnapshotAsync` for callers who prefer the web-standard digest: it
// yields "sha256:<hex>" when crypto.subtle is available and falls back to
// the "x32:" form otherwise.

import { ARCHIVE_SCHEMA_VERSION, validateSnapshot } from './snapshot.js';
import { xxHash32Bytes } from './hash.js';

// ─── witness ─────────────────────────────────────────────────────────────
// We canonicalize the payload ourselves (stable key order, Float32Array →
// plain Array, Map iteration order preserved) and then hash the JSON string.
// Strategy: prefer sha-256 via crypto.subtle when available (all modern
// browsers since ~2018), fall back to xxHash32 (non-crypto, already vendored
// in archive/hash.js) when running in an environment without subtle. The
// fallback is documented loudly — an attacker doesn't change the behaviour
// of the importer, they just need to produce a matching witness, so xxHash32
// is "good enough" for the self-check use-case the field was created for.

// Turn a brain mirror (iterable of [id, { vector, meta }]) into plain rows
// { id, flat, meta }, preserving the mirror's iteration order. Ids are
// stringified, vectors are copied into ordinary arrays so the rows survive
// JSON.stringify, and a falsy meta becomes an empty object.
function _canonicalBrainRows(mirror) {
  return [...mirror].map(([id, { vector, meta }]) => ({
    id: String(id),
    flat: Array.from(vector),
    meta: meta || {},
  }));
}

// Same shape conversion as the brain rows, but for generic vector mirrors:
// each [id, { vector, meta }] becomes { id, vec, meta } with a stringified
// id, a plain-array copy of the vector, and {} in place of a falsy meta.
// Row order follows the mirror's iteration order.
function _canonicalVecRows(mirror) {
  return [...mirror].map(([id, { vector, meta }]) => {
    const row = { id: String(id), vec: Array.from(vector), meta: meta || {} };
    return row;
  });
}

// Convert the observations map ([id, { weight, count }]) into plain rows.
// `weight` is coerced with Number() and falls back to 0 when that yields a
// falsy value (NaN, 0); `count` is truncated to a 32-bit integer via `| 0`.
// Ids are stringified and iteration order is preserved.
function _canonicalObsRows(observations) {
  return [...observations].map(([id, { weight, count }]) => {
    const numericWeight = Number(weight) || 0;
    const wholeCount = count | 0;
    return { id: String(id), weight: numericWeight, count: wholeCount };
  });
}

// Synchronous witness: xxHash32 over the UTF-8 bytes of the canonical JSON
// string, rendered as "x32:<8 lowercase hex digits>". The "x32:" tag lets a
// reader tell this digest apart from the "sha256:" one at a glance.
// Encoding uses TextEncoder when the global exists, otherwise the local
// _utf8Encode fallback.
function _witnessSync(canonicalJson) {
  let payload;
  if (typeof TextEncoder !== 'undefined') {
    payload = new TextEncoder().encode(canonicalJson);
  } else {
    payload = _utf8Encode(canonicalJson);
  }
  const digest = xxHash32Bytes(payload);
  const hex = digest.toString(16).padStart(8, '0');
  return 'x32:' + hex;
}

// Asynchronous witness. When crypto.subtle.digest is reachable, resolves to
// "sha256:<64 lowercase hex digits>" computed over the UTF-8 bytes of the
// canonical JSON; otherwise resolves to whatever _witnessSync returns.
// Encoding uses TextEncoder when the global exists, else _utf8Encode.
async function _witnessAsync(canonicalJson) {
  const subtleDigestAvailable =
    typeof crypto !== 'undefined' && crypto.subtle && crypto.subtle.digest;
  if (!subtleDigestAvailable) {
    return _witnessSync(canonicalJson);
  }

  const payload = (typeof TextEncoder !== 'undefined')
    ? new TextEncoder().encode(canonicalJson)
    : _utf8Encode(canonicalJson);
  const digest = new Uint8Array(await crypto.subtle.digest('SHA-256', payload));
  const hex = Array.from(digest, (byte) => byte.toString(16).padStart(2, '0')).join('');
  return 'sha256:' + hex;
}

// Minimal UTF-8 encoder fallback for environments without TextEncoder (very
// old Safari in strict-sandbox mode). Keeps the module usable even in the
// oldest harness.
//
// The output must be byte-identical to TextEncoder's, otherwise the same
// payload would get a different witness depending on which encoder ran:
//   - a high surrogate immediately followed by a low surrogate is combined
//     into one code point and emitted as a 4-byte sequence;
//   - any other surrogate (lone high, lone low, high followed by a non-low
//     unit) is replaced with U+FFFD, and the following unit is NOT consumed.
//
// @param {string} str  text to encode
// @returns {Uint8Array} UTF-8 bytes of `str`
function _utf8Encode(str) {
  const out = [];
  for (let i = 0; i < str.length; i++) {
    let cp = str.charCodeAt(i);

    if (cp >= 0xd800 && cp <= 0xdfff) {
      const next = i + 1 < str.length ? str.charCodeAt(i + 1) : 0;
      const isValidPair = cp < 0xdc00 && next >= 0xdc00 && next <= 0xdfff;
      if (isValidPair) {
        cp = 0x10000 + (((cp & 0x3ff) << 10) | (next & 0x3ff));
        i++; // the low surrogate has been consumed
      } else {
        cp = 0xfffd; // unpaired surrogate → replacement character
      }
    }

    if (cp < 0x80) {
      out.push(cp);
    } else if (cp < 0x800) {
      out.push(0xc0 | (cp >> 6), 0x80 | (cp & 0x3f));
    } else if (cp < 0x10000) {
      out.push(0xe0 | (cp >> 12), 0x80 | ((cp >> 6) & 0x3f), 0x80 | (cp & 0x3f));
    } else {
      out.push(
        0xf0 | (cp >> 18),
        0x80 | ((cp >> 12) & 0x3f),
        0x80 | ((cp >> 6) & 0x3f),
        0x80 | (cp & 0x3f),
      );
    }
  }
  return new Uint8Array(out);
}

// Serialize the snapshot core to its canonical JSON string. Only the known
// fields are picked, in a fixed key order, so any extra property on `core`
// (notably `witness`, which is the hash OF this string) never leaks into the
// payload. The same applies one level down for `core.hnsw`.
function _canonicalJson(core) {
  const {
    version, createdAt, consistency,
    brains, tracks, dynamics, observations,
  } = core;
  const { mode, serialized, insertionOrder, params } = core.hnsw;

  return JSON.stringify({
    version,
    createdAt,
    consistency,
    brains,
    tracks,
    dynamics,
    observations,
    hnsw: {
      mode,
      serialized, // always null in replay mode
      insertionOrder,
      params,
    },
  });
}

// Assemble the witness-less snapshot core from the caller's options. Both the
// sync and async public builders go through here so they canonicalize
// identically.
//
// Validates that `opts` is an object and that the three mirrors plus
// `observations` are Maps (throws otherwise). Defaults: indexKind
// 'euclidean', insertionOrder [], consistency 'fresh', dim null.
// hnsw.params.dim is `dim` truncated to an int when it is a positive finite
// number; otherwise the vector length of the first brain in the mirror, or 0
// for an empty mirror.
function _buildCore(opts) {
  if (!opts || typeof opts !== 'object') {
    throw new Error('buildSnapshot: missing options object');
  }
  const {
    brainMirror, trackMirror, dynamicsMirror, observations,
    indexKind = 'euclidean',
    insertionOrder = [],
    consistency = 'fresh',
    dim = null,
  } = opts;

  const mapChecks = [
    [brainMirror, 'buildSnapshot: brainMirror must be a Map'],
    [trackMirror, 'buildSnapshot: trackMirror must be a Map'],
    [dynamicsMirror, 'buildSnapshot: dynamicsMirror must be a Map'],
    [observations, 'buildSnapshot: observations must be a Map'],
  ];
  for (const [candidate, message] of mapChecks) {
    if (!(candidate instanceof Map)) throw new Error(message);
  }

  // Keep only ids that still have a mirror entry; anything else in the
  // order list (e.g. left over after a mid-session reset) is noise. A
  // non-array insertionOrder is treated as empty.
  let order = [];
  if (Array.isArray(insertionOrder)) {
    order = insertionOrder.filter((id) => brainMirror.has(id));
  }

  let resolvedDim = 0;
  if (Number.isFinite(dim) && dim > 0) {
    resolvedDim = dim | 0;
  } else if (brainMirror.size > 0) {
    // Infer dimensionality from the first brain in iteration order.
    resolvedDim = brainMirror.values().next().value.vector.length | 0;
  }

  const hnsw = {
    mode: 'replay',
    serialized: null,
    insertionOrder: order,
    params: { dim: resolvedDim, metric: 'cosine', indexKind },
  };

  return {
    version: ARCHIVE_SCHEMA_VERSION,
    createdAt: new Date().toISOString(),
    consistency,
    brains: _canonicalBrainRows(brainMirror),
    tracks: _canonicalVecRows(trackMirror),
    dynamics: _canonicalVecRows(dynamicsMirror),
    observations: _canonicalObsRows(observations),
    hnsw,
  };
}

// Public synchronous builder. Builds the core, stamps it with the
// synchronous witness (xxHash32, tagged "x32:...") computed over the
// canonical JSON of everything except the witness itself, then runs
// validateSnapshot on the result. Throws instead of returning when validation
// fails, so a malformed snapshot never leaves this module.
export function buildSnapshot(opts) {
  const snapshot = _buildCore(opts);
  snapshot.witness = _witnessSync(_canonicalJson(snapshot));

  const verdict = validateSnapshot(snapshot);
  if (verdict.ok) {
    return snapshot;
  }
  throw new Error(`buildSnapshot: produced invalid snapshot (${verdict.reason})`);
}

// Public asynchronous builder. Same pipeline as the sync one, except the
// witness comes from _witnessAsync — sha-256 via crypto.subtle when that is
// available, otherwise the synchronous fallback; the only visible difference
// is the witness-string prefix. Rejects when validateSnapshot refuses the
// result.
export async function buildSnapshotAsync(opts) {
  const snapshot = _buildCore(opts);
  const canonical = _canonicalJson(snapshot);
  snapshot.witness = await _witnessAsync(canonical);

  const verdict = validateSnapshot(snapshot);
  if (verdict.ok) {
    return snapshot;
  }
  throw new Error(`buildSnapshotAsync: produced invalid snapshot (${verdict.reason})`);
}
81 changes: 81 additions & 0 deletions AI-Car-Racer/archive/hash.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
// archive/hash.js
// Phase 0 — Foundations. xxHash32 over a flattened Float32Array brain,
// returned as an 8-char lowercase hex string. Used as the canonical brain ID
// by F3 (warm-restart bundles), F5 (content-addressed dedup), and F6
// (cross-tab — hash makes "is this the same brain?" a byte comparison).
//
// Why xxHash32 and not crypto.subtle.digest(): we need to hash on the hot
// path during GA evaluation (potentially thousands per second), and the
// crypto API is async-only. xxHash32 is non-cryptographic, and 32 bits is a
// small space: by the birthday bound (p ≈ 1 − e^(−n²/2³³)) an accidental
// collision among n brains is ~1 in 1000 at n ≈ 3×10³ but more likely than
// not (≈70%) at n = 10⁵. A hash match therefore means "almost certainly the
// same brain", not "certainly"; a collision is cheap to detect by comparing
// the underlying flat bytes whenever two hashes match.
//
// Reference impl: https://github.com/Cyan4973/xxHash/blob/dev/doc/xxhash_spec.md

// The five 32-bit prime constants used by the xxHash32 rounds below.
// `| 0` stores each one as a signed int32, so every later `+` / Math.imul
// result can be wrapped back into int32 range with another `| 0`.
const PRIME32_1 = 0x9e3779b1 | 0;
const PRIME32_2 = 0x85ebca77 | 0;
const PRIME32_3 = 0xc2b2ae3d | 0;
const PRIME32_4 = 0x27d4eb2f | 0;
const PRIME32_5 = 0x165667b1 | 0;

// Rotate the 32-bit pattern of `x` left by `r` bits; result is a signed int32.
function rotl32(x, r) {
  const shiftedOut = x >>> (32 - r); // bits that wrap around to the low end
  const shiftedUp = x << r;
  return (shiftedUp | shiftedOut) | 0;
}
// 32-bit wrapping multiply: low 32 bits of a*b as a signed int32.
function mul32(a, b) {
  const product = Math.imul(a, b);
  return product | 0;
}

// xxHash32 of a byte sequence, returned as an unsigned 32-bit integer.
//
// Layout of the computation:
//   1. inputs of 16+ bytes are consumed in 16-byte stripes through four
//      parallel accumulators, which are then merged; shorter inputs start
//      from seed + PRIME32_5 instead;
//   2. the input length is added;
//   3. remaining whole 4-byte words, then remaining single bytes, are mixed in;
//   4. a final shift/multiply avalanche scrambles the result.
// Multi-byte words are read little-endian. All intermediate math is kept in
// signed int32 via `| 0` / mul32; only the return value is made unsigned.
export function xxHash32Bytes(bytes, seed = 0) {
  const len = bytes.length;
  let pos = 0;
  let acc;

  if (len < 16) {
    acc = (seed + PRIME32_5) | 0;
  } else {
    let lane1 = (seed + PRIME32_1 + PRIME32_2) | 0;
    let lane2 = (seed + PRIME32_2) | 0;
    let lane3 = (seed + 0) | 0;
    let lane4 = (seed - PRIME32_1) | 0;

    for (; pos + 16 <= len; pos += 16) {
      const w1 = bytes[pos] | (bytes[pos + 1] << 8) | (bytes[pos + 2] << 16) | (bytes[pos + 3] << 24);
      const w2 = bytes[pos + 4] | (bytes[pos + 5] << 8) | (bytes[pos + 6] << 16) | (bytes[pos + 7] << 24);
      const w3 = bytes[pos + 8] | (bytes[pos + 9] << 8) | (bytes[pos + 10] << 16) | (bytes[pos + 11] << 24);
      const w4 = bytes[pos + 12] | (bytes[pos + 13] << 8) | (bytes[pos + 14] << 16) | (bytes[pos + 15] << 24);
      lane1 = mul32(rotl32((lane1 + mul32(w1, PRIME32_2)) | 0, 13), PRIME32_1);
      lane2 = mul32(rotl32((lane2 + mul32(w2, PRIME32_2)) | 0, 13), PRIME32_1);
      lane3 = mul32(rotl32((lane3 + mul32(w3, PRIME32_2)) | 0, 13), PRIME32_1);
      lane4 = mul32(rotl32((lane4 + mul32(w4, PRIME32_2)) | 0, 13), PRIME32_1);
    }

    acc = (rotl32(lane1, 1) + rotl32(lane2, 7) + rotl32(lane3, 12) + rotl32(lane4, 18)) | 0;
  }

  acc = (acc + len) | 0;

  // Remaining whole 32-bit words.
  for (; pos + 4 <= len; pos += 4) {
    const word = bytes[pos] | (bytes[pos + 1] << 8) | (bytes[pos + 2] << 16) | (bytes[pos + 3] << 24);
    acc = mul32(rotl32((acc + mul32(word, PRIME32_3)) | 0, 17), PRIME32_4);
  }

  // Trailing 0-3 bytes.
  for (; pos < len; pos += 1) {
    acc = mul32(rotl32((acc + mul32(bytes[pos], PRIME32_5)) | 0, 11), PRIME32_1);
  }

  // Final avalanche.
  acc ^= acc >>> 15;
  acc = mul32(acc, PRIME32_2);
  acc ^= acc >>> 13;
  acc = mul32(acc, PRIME32_3);
  acc ^= acc >>> 16;

  return acc >>> 0;
}

// Content hash of a flattened brain: xxHash32 over exactly the bytes that
// `flat` views (its byteOffset/byteLength window of the backing buffer, so
// subarrays hash only their own slice), formatted as 8 lowercase hex chars.
export function hashBrain(flat, seed = 0) {
  const { buffer, byteOffset, byteLength } = flat;
  const rawBytes = new Uint8Array(buffer, byteOffset, byteLength);
  const digest = xxHash32Bytes(rawBytes, seed);
  return digest.toString(16).padStart(8, '0');
}
Loading
Loading