Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions bun.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -920,6 +920,7 @@
"lint:fix": "bun --bun oxlint -c oxlint.config.ts --type-aware --fix ."
},
"devDependencies": {
"@brazilian-utils/brazilian-utils": "^2.3.0",
"@stll/oxlint-config": "^0.3.0",
"@stll/typescript-config": "^0.3.0",
"@types/node": "^25.9.1",
Expand All @@ -933,6 +934,7 @@
"oxfmt": "^0.52.0",
"oxlint": "^1.67.0",
"oxlint-tsgolint": "^0.23.0",
"rut.js": "^2.1.0",
"stdnum": "^1.11.14",
"tsdown": "0.22.1",
"typescript": "^5.9.3",
Expand Down
257 changes: 256 additions & 1 deletion scripts/oracle.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@
* bun run oracle:survey
*/

import {
isValidCnpj,
isValidCpf,
} from "@brazilian-utils/brazilian-utils";
Comment on lines +20 to +23
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Import tmpdir from node:os to enable cross-platform and secure temporary file creation.

Suggested change
import {
isValidCnpj,
isValidCpf,
} from "@brazilian-utils/brazilian-utils";
import {
isValidCnpj,
isValidCpf,
} from "@brazilian-utils/brazilian-utils";
import { tmpdir } from "node:os";

Copy link
Copy Markdown
Contributor Author

@jan-kubica jan-kubica Jun 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Imported in fd776d6.

CC on behalf of @jan-kubica

import fc from "fast-check";
import IBAN from "iban";
import { isValidIBAN } from "ibantools";
Expand Down Expand Up @@ -45,7 +49,10 @@ import {
norway,
} from "jsvat";
import { execSync } from "node:child_process";
import { writeFileSync } from "node:fs";
import { unlinkSync, writeFileSync } from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { validate as validateRut } from "rut.js";
Comment on lines 51 to +55
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Import unlinkSync from node:fs to clean up the temporary Python script after execution.

Suggested change
import { execSync } from "node:child_process";
import { writeFileSync } from "node:fs";
import { validate as validateRut } from "rut.js";
import { execSync } from "node:child_process";
import { writeFileSync, unlinkSync } from "node:fs";
import { validate as validateRut } from "rut.js";

Copy link
Copy Markdown
Contributor Author

@jan-kubica jan-kubica Jun 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Imported in fd776d6.

CC on behalf of @jan-kubica

import {
validateEntity as stdnumEntity,
validatePerson as stdnumPerson,
Expand Down Expand Up @@ -506,6 +513,89 @@ const CUSTOM_ARB: Record<string, fc.Arbitrary<string>> = {
),
)
.map(([p, d, c]) => `${p}00${d}${c}`),
// Indian PAN: 5 letters + 4 digits + 1 letter.
// Without a custom arb, the default digit-only
// generator never produces a format-valid PAN, so
// the oracle would compare 0/N valid samples.
"in_.pan": fc
.tuple(
fc
.array(letters(L), { minLength: 5, maxLength: 5 })
.map((c) => c.join("")),
digs(4),
letters(L),
)
.map(([p, d, s]) => `${p}${d}${s}`),
// Mexican CURP: 4 letters + YYMMDD + H|M + 2 state
// letters + 3 consonants + 1 alphanumeric + 1 digit.
// The 2nd letter must be a vowel (or X) per the
// canonical regex; chars 14-16 must be consonants.
"mx.curp": validDateParts(1900, 2099).chain(
({ year, month, day }) =>
fc
.tuple(
letters(L),
letters("AEIOUX"),
letters(L),
letters(L),
fc.constantFrom("H", "M"),
fc
.array(letters(L), {
minLength: 2,
maxLength: 2,
})
.map((c) => c.join("")),
fc
.array(letters("BCDFGHJKLMNPQRSTVWXYZ"), {
minLength: 3,
maxLength: 3,
})
.map((c) => c.join("")),
alnumStr(1, 1),
digs(1),
)
.map(([a, b, c, d2, g, st, cs, alpha, dg]) => {
const yy = p2(year % 100);
const mm = p2(month);
const dd = p2(day);
return `${a}${b}${c}${d2}${yy}${mm}${dd}${g}${st}${cs}${alpha}${dg}`;
}),
),
// Mexican RFC: persona física = 4 letters + YYMMDD
// + 3 alphanumeric (13 chars); persona moral = 3
// letters + YYMMDD + 3 alphanumeric (12 chars).
"mx.rfc": validDateParts(1900, 2099).chain(
({ year, month, day }) => {
const yy = p2(year % 100);
const mm = p2(month);
const dd = p2(day);
const date = `${yy}${mm}${dd}`;
return fc.oneof(
fc
.tuple(
fc
.array(letters(L), {
minLength: 4,
maxLength: 4,
})
.map((c) => c.join("")),
alnumStr(3, 3),
)
.map(([n, c]) => `${n}${date}${c}`),
fc
.tuple(
fc
.array(letters(L), {
minLength: 3,
maxLength: 3,
})
.map((c) => c.join("")),
alnumStr(3, 3),
)
.map(([n, c]) => `${n}${date}${c}`),
);
},
),
"za.idnr": dateDigs(13, "ymd"),
"mu.brn": fc.oneof(
fc
Expand Down Expand Up @@ -634,6 +724,13 @@ const hasPython = () =>
probe(`${PYTHON} -c "import stdnum"`);
const hasIdnumbers = () =>
probe(`${PYTHON} -c "import idnumbers"`);
// Probes whether django-localflavor can be imported
// by the configured Python interpreter. The one-liner
// calls settings.configure() before the import, so no
// Django project is required (presumably localflavor
// touches settings at import time — confirm).
const hasLocalflavor = () => {
  const script =
    "from django.conf import settings;" +
    " settings.configure(USE_I18N=False);" +
    " import localflavor";
  return probe(`${PYTHON} -c "${script}"`);
};
const hasRust = () => probe(`test -f ${RUST_BIN}`);
const hasRubyValvat = () =>
probe(`GEM_HOME=${RUBY_GEM} ruby -e "require 'valvat'"`);
Expand Down Expand Up @@ -679,6 +776,51 @@ const pyIdnBatch: SubBatch = (cls, vals) => {
.map((l) => l === "1");
};

// django-localflavor (Python): "{mod}.forms.{Field}"
//
// Validates a batch of values with a localflavor form
// field and returns one boolean per input value, in
// input order (true = field.clean() accepted it).
//
// `path` is the import path relative to the
// `localflavor` package. The module part may itself
// contain dots (e.g. "in_.forms"), so the class name is
// whatever follows the LAST dot.
//
// The values travel to Python as JSON on stdin; the
// script is written to a per-process temp file that is
// removed afterwards, even when Python fails or times
// out (in which case execSync's error propagates).
const localflavorBatch: SubBatch = (path, vals) => {
  // Nothing to validate: skip the Python round-trip.
  // Without this guard the empty stdout would split
  // into [""] and be reported as a single `false`.
  if (vals.length === 0) return [];

  const lastDot = path.lastIndexOf(".");
  const mod = path.slice(0, lastDot);
  const name = path.slice(lastDot + 1);
  const json = JSON.stringify(vals);
  // Any exception raised by clean() counts as "invalid":
  // the broad except is deliberate so one odd sample
  // cannot abort the whole batch.
  const s = `import json, sys
from django.conf import settings
if not settings.configured:
    settings.configure(USE_I18N=False)
from django.core.exceptions import ValidationError
from localflavor.${mod} import ${name}
field = ${name}()
vals = json.loads(sys.stdin.read())
for v in vals:
    try:
        field.clean(v)
        print("1")
    except (ValidationError, Exception):
        print("0")`;
  const tmp = join(
    tmpdir(),
    `_stdnum_localflavor_${String(process.pid)}.py`,
  );
  writeFileSync(tmp, s);
  try {
    // Quote the script path: the system temp directory
    // may contain spaces, and the command goes through
    // a shell.
    return execSync(`${PYTHON} "${tmp}"`, {
      input: json,
      encoding: "utf-8",
      timeout: 60_000,
    })
      .trim()
      // Tolerate CRLF line endings in Python's output.
      .split(/\r?\n/)
      .map((l) => l === "1");
  } finally {
    try {
      unlinkSync(tmp);
    } catch {
      // Best-effort cleanup; ignore if already gone.
    }
  }
};

const rustBatch: SubBatch = (fmt, vals) => {
const json = JSON.stringify(vals);
return execSync(`${RUST_BIN} ${fmt}`, {
Expand Down Expand Up @@ -887,6 +1029,43 @@ const IDNUMBERS: Record<string, string> = {
"tr.tckimlik": "TUR.PersonalID",
};

// django-localflavor oracle targets, keyed by
// validator key. Each value has the shape
// "{module}.forms.{Field}": the import path, relative
// to the `localflavor` package, of a Django form field
// whose .clean() does the validation. The fields are
// driven directly through clean() with a
// configured-but-empty Django settings object, so no
// full Django project is needed.
//
// For every entry the localflavor module name equals
// the key's country prefix (the text before the first
// dot), so only the field class is listed and the full
// path is derived from the key.
const LOCALFLAVOR: Record<string, string> = (() => {
  const fields: ReadonlyArray<readonly [string, string]> = [
    ["ar.cuit", "ARCUITField"],
    ["ar.dni", "ARDNIField"],
    ["au.abn", "AUBusinessNumberField"],
    ["au.acn", "AUCompanyNumberField"],
    ["au.tfn", "AUTaxFileNumberField"],
    ["br.cpf", "BRCPFField"],
    ["br.cnpj", "BRCNPJField"],
    ["ca.sin", "CASocialInsuranceNumberField"],
    ["cl.rut", "CLRutField"],
    ["es.dni", "ESIdentityCardNumberField"],
    ["in_.aadhaar", "INAadhaarNumberField"],
    ["in_.pan", "INPANCardNumberFormField"],
    ["mx.clabe", "MXCLABEField"],
    ["mx.curp", "MXCURPField"],
    ["mx.rfc", "MXRFCField"],
    ["us.ssn", "USSocialSecurityNumberField"],
  ];
  const table: Record<string, string> = {};
  for (const [key, field] of fields) {
    const mod = key.slice(0, key.indexOf("."));
    table[key] = `${mod}.forms.${field}`;
  }
  return table;
})();

// Per-key input shaping for localflavor. Some fields
// only accept punctuated input (for example,
// CASocialInsuranceNumberField rejects bare digits),
// so a key listed here has its samples rewritten
// before they are sent to the field. Keys without an
// entry are sent unchanged.
const LOCALFLAVOR_FORMAT: Record<
  string,
  (v: string) => string
> = {
  // 9 characters -> "XXX-XXX-XXX"; any other length is
  // passed through untouched.
  "ca.sin": (v) => {
    if (v.length !== 9) return v;
    const groups = [v.slice(0, 3), v.slice(3, 6), v.slice(6)];
    return groups.join("-");
  },
};

// valvat (Ruby): key → VAT prefix
const VALVAT: Record<string, string> = {
"at.uid": "AT",
Expand Down Expand Up @@ -1013,6 +1192,49 @@ const SURVEY_ONLY_ENTRIES = new Set([
"stdnum-js:lt.asmens",
"stdnum-js:ro.cnp",
"validate-polish:pl.pesel",
// rut.js rejects any RUT body that starts with 0
// as a stylistic policy. Our validator follows the
// checksum math only, so leading-zero bodies are
// valid for us. Useful as a probe, not a gate.
"rut.js:cl.rut",
// localflavor's BRCNPJField does not yet support
// the alphanumeric (v2) CNPJ format that Receita
// Federal began issuing in July 2026. Our
// validator does. Probe-only until upstream catches up.
"localflavor:br.cnpj",
// localflavor's INAadhaarNumberField checks only
// format ("XXXX XXXX XXXX" / no all-zero group),
// not the Verhoeff checksum required by UIDAI.
// Our validator is stricter; expect ~85% false
// positives from the oracle.
"localflavor:in_.aadhaar",
// Same leading-zero policy disagreement as rut.js.
"localflavor:cl.rut",
// ARCUITField only allows the individual/company
// prefix set {20,23,24,27,30,33,34}. AFIP also
// issues CUITs with the international prefixes
// {50,51,55}, which both our validator and
// python-stdnum accept. localflavor is the outlier
// here, so the pairing stays a probe, not a gate.
"localflavor:ar.cuit",
// python-stdnum's mx.rfc is_valid() defaults to
// validate_check_digits=False, so it accepts any
// format-valid RFC. Our validator always checks
// the SAT mod-11 check digit, producing systematic
// drift.
"python-stdnum:mx.rfc",
// localflavor's MXRFCField requires the 2nd
// character of a persona física RFC to be a vowel.
// We accept any letter, matching the SAT regex that
// python-stdnum uses.
"localflavor:mx.rfc",
// python-stdnum accepts holder-type 'K' (deprecated
// but listed in their _pan_holder_types) and rejects
// PANs whose 4-digit serial is "0000" (per the
// Income Tax Dept tutorial). Our validator excludes
// 'K' and does not reject "0000"; both differences
// are defensible per source.
"python-stdnum:in_.pan",
]);

const tierFor = (source: string, key: string): OracleMode =>
Expand Down Expand Up @@ -1067,6 +1289,39 @@ const buildOracles = (): OracleEntry[] => {
);
}

// django-localflavor
if (hasLocalflavor()) {
for (const [key, path] of Object.entries(LOCALFLAVOR)) {
const shape = LOCALFLAVOR_FORMAT[key];
safe(
`${key} (vs localflavor)`,
"localflavor",
key,
(v) =>
localflavorBatch(path, shape ? v.map(shape) : v),
Comment on lines +1300 to +1301
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Add valid generators for the new localflavor mappings

With these new entries, any mapped validator that does not declare lengths and has no CUSTOM_ARB override falls through arbFor to the default 10-digit generator. That makes mappings such as au.abn, au.acn, au.tfn, br.cpf, and us.ssn compare only invalid-length samples, so bun run oracle can report zero gate disagreements while never exercising valid values for those new localflavor comparators; add per-key arbs or lengths before treating them as gate coverage.

Useful? React with 👍 / 👎.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Addressed in e5e34d8. Two things going on:

  1. Digit-only mappings (au.abn, au.acn, au.tfn, br.cpf, ca.sin, us.ssn) are now actually covered after the rebase onto main, which picked up fix(oracle): probe validators with their real lengths #107's lengthsFromExamples fix in inferArb — these validators don't declare lengths but their examples are 11/9/9/11/9/9 chars, so the arb generates the right length and the comparators do exercise the checksum path. Re-running with ORACLE_SAMPLES=1000 shows valid-sample rates that match expectations (~1-3% for Luhn / similar weighted checks, 21/1000 for us.ssn, etc.).

  2. Alphanumeric mappings were the genuine gap — in_.pan (5 letters + 4 digits + 1 letter), mx.curp (18-char structured), and mx.rfc (12/13-char persona física/moral) were producing 0/N valid samples because the default arb is digit-only. Added per-key CUSTOM_ARB entries that respect each format's character classes (mx.curp vowel/consonant constraints, mx.rfc persona-física vs moral lengths). The new arbs immediately surfaced real semantic differences between us and the oracles, which I marked survey-only with documented reasons:

    • python-stdnum:mx.rfc — their is_valid() defaults to validate_check_digits=False; ours always verifies the SAT mod-11 check digit.
    • localflavor:mx.rfc — MXRFCField requires the 2nd char of a persona física to be a vowel; we follow the SAT regex on python-stdnum.
    • python-stdnum:in_.pan — they accept holder-type 'K' (deprecated per their own comment) and reject 0000-serial PANs; ours excludes 'K' and accepts 0000.

Gate-mode disagreement count stays 0 for the new mappings that remain in gate.

CC on behalf of @jan-kubica

);
}
}

// brazilian-utils (always available)
safe(
"br.cpf (vs brazilian-utils)",
"brazilian-utils",
"br.cpf",
(v) => v.map(isValidCpf),
);
safe(
"br.cnpj (vs brazilian-utils)",
"brazilian-utils",
"br.cnpj",
(v) => v.map((x) => isValidCnpj(x, { version: 2 })),
);

// rut.js (always available)
safe("cl.rut (vs rut.js)", "rut.js", "cl.rut", (v) =>
v.map(validateRut),
);

// jsvat (always available)
for (const [key, [cfg, pfx]] of Object.entries(JSVAT))
e.push({
Expand Down
Loading