diff --git a/app.ts b/app.ts
index 27228cd..0b67279 100644
--- a/app.ts
+++ b/app.ts
@@ -15,6 +15,7 @@ import caniuseRouter from "./routes/caniuse/index.js";
 import githubRouter from "./routes/github/index.js";
 import respecRouter from "./routes/respec/index.js";
 import w3cRouter from "./routes/w3c/index.js";
+import apiRouter from "./routes/api/index.js";
 import wellKnownRouter from "./routes/well-known/index.js";
 import docsRouter from "./routes/docs/index.js";
 
@@ -45,6 +46,7 @@ app.use("/caniuse", caniuseRouter);
 app.use("/github/:org/:repo", githubRouter);
 app.use("/respec", respecRouter);
 app.use("/w3c", w3cRouter);
+app.use("/api", apiRouter);
 app.use("/.well-known", wellKnownRouter);
 app.use("/docs", docsRouter);
 app.get("/", (_req, res) => res.redirect("/docs/"));
diff --git a/routes/api/index.ts b/routes/api/index.ts
new file mode 100644
index 0000000..23790e7
--- /dev/null
+++ b/routes/api/index.ts
@@ -0,0 +1,8 @@
+import express from "express";
+import unicode from "./unicode/index.js";
+
+const router = express.Router({ mergeParams: true });
+
+router.use("/unicode", unicode);
+
+export default router;
diff --git a/routes/api/unicode/index.ts b/routes/api/unicode/index.ts
new file mode 100644
index 0000000..8d2d143
--- /dev/null
+++ b/routes/api/unicode/index.ts
@@ -0,0 +1,21 @@
+import path from "node:path";
+
+import express from "express";
+import cors from "cors";
+
+import { env, ms } from "../../../utils/misc.js";
+
+import namesRoute from "./names.js";
+import updateRoute from "./update.js";
+
+const DATA_DIR = env("DATA_DIR");
+
+const router = express.Router({ mergeParams: true });
+
+router
+  .options("/names", cors({ methods: ["POST", "GET"], maxAge: ms("1day") }))
+  .post("/names", express.json({ limit: "2mb" }), cors(), namesRoute);
+router.post("/update", updateRoute);
+router.use("/data", express.static(path.join(DATA_DIR, "unicode")));
+
+export default router;
diff --git a/routes/api/unicode/lib/scraper.ts b/routes/api/unicode/lib/scraper.ts
new file mode 100644
--- /dev/null
+++ b/routes/api/unicode/lib/scraper.ts
@@ -0,0 +1,108 @@
+import path from "node:path";
+import { tmpdir } from "node:os";
+import { createReadStream, createWriteStream } from "node:fs";
+import { mkdir, rm } from "node:fs/promises";
+import { Readable } from "node:stream";
+import { ReadableStream } from "node:stream/web";
+import { finished } from "node:stream/promises";
+import { createInterface } from "node:readline/promises";
+
+import { env } from "../../../../utils/misc.js";
+
+const DATA_DIR = env("DATA_DIR");
+
+export const INPUT_DATA_SOURCE = `https://unicode.org/Public/UNIDATA/UnicodeData.txt`;
+const OUT_DIR_BASE = path.join(DATA_DIR, "unicode");
+const OUT_FILE_BY_CODEPOINT = path.resolve(
+  OUT_DIR_BASE,
+  "./codepoint-to-name.json",
+);
+
+const defaultOptions = { forceUpdate: false };
+type Options = typeof defaultOptions;
+
+/**
+ * Refresh the local Unicode name data from INPUT_DATA_SOURCE.
+ * Resolves to true when the data was regenerated (or forceUpdate was set),
+ * false when there was nothing to update.
+ */
+export default async function main(options: Partial<Options> = {}) {
+  options = { ...defaultOptions, ...options } as Options;
+  const hasUpdated = await updateInputSource();
+  if (!hasUpdated && !options.forceUpdate) {
+    console.log("Nothing to update");
+    return false;
+  }
+
+  return true;
+}
+
+// Download UnicodeData.txt to a temp file and convert it to a JSON array of
+// [codepoint, { name }] entries. Always resolves to true.
+async function updateInputSource() {
+  await mkdir(OUT_DIR_BASE, { recursive: true });
+
+  const namesJs = path.join(tmpdir(), "unicode-all-names.js");
+  await rm(namesJs, { force: true });
+
+  console.log(`Downloading`, INPUT_DATA_SOURCE, "to", namesJs);
+  await downloadFile(INPUT_DATA_SOURCE, namesJs);
+
+  // Remove the previous output only once the download has succeeded, so a
+  // failed download does not leave the service without any data file.
+  await rm(OUT_FILE_BY_CODEPOINT, { force: true });
+
+  console.log("Converting to JSON and writing to", OUT_FILE_BY_CODEPOINT);
+  const rl = createInterface({
+    input: createReadStream(namesJs),
+    crlfDelay: Infinity,
+  });
+  const dest = createWriteStream(OUT_FILE_BY_CODEPOINT, { flags: "a" });
+  dest.write("[\n");
+  for await (const line of rl) {
+    const parsed = parseLine(line);
+    if (!parsed) continue;
+    dest.write(JSON.stringify(parsed) + ",\n");
+  }
+  // Sentinel entry: every real entry ends with ",", so a final element is
+  // needed to keep the array valid JSON.
+  dest.write(`["null", {"name": ""}]`);
+  dest.write("\n]\n");
+  await new Promise(resolve => dest.end(resolve));
+
+  console.log("Wrote to", OUT_FILE_BY_CODEPOINT);
+  await rm(namesJs, { force: true });
+
+  return true;
+}
+
+// Parse a line based on https://www.unicode.org/Public/5.1.0/ucd/UCD.html#UnicodeData.txt
+// e.g. 0001;<control>;Cc;0;BN;;;;;N;START OF HEADING;;;;
+// -> ["0001", { name: "[control]" }]
+// Returns null for comments and for blank or malformed lines.
+function parseLine(line: string) {
+  if (line.startsWith("#")) {
+    return null; // comments
+  }
+
+  const [codepoint, rawName] = line.split(";");
+  if (!codepoint || rawName === undefined) {
+    return null; // blank line or line without a name field
+  }
+  const name = rawName.replace(/[<>]/g, s => (s === "<" ? "[" : "]"));
+  return [codepoint, { name }] as const;
+}
+
+// Stream the response body of `url` into a newly created file.
+// Rejects on a non-2xx response, or if `destination` already exists ("wx").
+async function downloadFile(url: string, destination: string) {
+  const res = await fetch(url);
+  if (!res.ok || !res.body) {
+    throw new Error(`Failed to download ${url}: HTTP ${res.status}`);
+  }
+  await mkdir(path.dirname(destination), { recursive: true });
+  const outStream = createWriteStream(destination, { flags: "wx" });
+  await finished(
+    Readable.fromWeb(res.body as ReadableStream).pipe(outStream),
+  );
+}
diff --git a/routes/api/unicode/lib/store-init.ts b/routes/api/unicode/lib/store-init.ts
new file mode 100644
index 0000000..11ea147
--- /dev/null
+++ b/routes/api/unicode/lib/store-init.ts
@@ -0,0 +1,5 @@
+import { Store } from "./store.js";
+
+export const store = new Store();
+
+export type { Store };
diff --git a/routes/api/unicode/lib/store.ts b/routes/api/unicode/lib/store.ts
new file mode 100644
--- /dev/null
+++ b/routes/api/unicode/lib/store.ts
@@ -0,0 +1,35 @@
+import path from "path";
+import { readFileSync } from "fs";
+
+import { env } from "../../../../utils/misc.js";
+import { INPUT_DATA_SOURCE } from "./scraper.js";
+
+export class Store {
+  version = -1;
+  private codepointToName: Map<string, { name: string }> = new Map();
+
+  constructor() {
+    this.fill();
+  }
+
+  /** Fill the store with its contents from the filesystem. */
+  fill() {
+    this.codepointToName = new Map(readJson("codepoint-to-name.json"));
+    this.version = Date.now();
+  }
+
+  getNameByHexCodePoint(hex: string) {
+    return this.codepointToName.get(hex) ?? null;
+  }
+
+  get dataSource() {
+    return INPUT_DATA_SOURCE;
+  }
+}
+
+function readJson(filename: string) {
+  const DATA_DIR = env("DATA_DIR");
+  const dataFile = path.resolve(DATA_DIR, `./unicode/${filename}`);
+  const text = readFileSync(dataFile, "utf8");
+  return JSON.parse(text);
+}
diff --git a/routes/api/unicode/names.ts b/routes/api/unicode/names.ts
new file mode 100644
--- /dev/null
+++ b/routes/api/unicode/names.ts
@@ -0,0 +1,77 @@
+import type { Request, Response } from "express";
+
+import { store, type Store } from "./lib/store-init.js";
+
+interface Query {
+  /** Codepoint as hex */
+  hex: string;
+}
+interface Result {
+  name: string;
+}
+
+type Options = Record<string, unknown>;
+
+interface RequestBody {
+  queries: Query[];
+  options?: Options;
+}
+// The body is client-supplied JSON, so any part of it may be missing.
+type IRequest = Request<
+  Record<string, string>,
+  unknown,
+  Partial<RequestBody> | undefined
+>;
+
+interface ResponseData {
+  data: Array<{ query: Query; result: Result | null }>;
+  metadata: { lastParsedAt: string; dataSource: string };
+}
+
+/**
+ * POST handler: look up the Unicode name for each `{ hex }` query in the
+ * request body and respond with the results plus store metadata.
+ * Responds with 400 when `queries` is present but not an array.
+ */
+export default function route(req: IRequest, res: Response) {
+  const { options = {}, queries = [] } = req.body ?? {};
+  if (!Array.isArray(queries)) {
+    res.status(400).json({ error: "`queries` must be an array" });
+    return;
+  }
+
+  const data: ResponseData["data"] = queries.map(query => ({
+    query,
+    result: search(query, store, options),
+  }));
+
+  Object.assign(res.locals, {
+    errors: getErrorCount(data),
+    queries: queries.length,
+  });
+
+  const result: ResponseData = {
+    data,
+    metadata: {
+      lastParsedAt: store.version.toString(),
+      dataSource: store.dataSource,
+    },
+  };
+  res.json(result);
+}
+
+/** Resolve one query against the store; null when there is no match. */
+function search(query: Query, store: Store, _options: Options): Result | null {
+  // Queries come from the request body: `hex` may be missing or not a string.
+  if (typeof query?.hex !== "string" || !query.hex) {
+    return null;
+  }
+  // Normalised in place, so the response echoes the hex that was looked up.
+  query.hex = query.hex.toUpperCase().padStart(4, "0");
+  return store.getNameByHexCodePoint(query.hex);
+}
+
+/** Count the queries that produced no result. */
+function getErrorCount(results: ResponseData["data"]) {
+  return results.filter(({ result }) => !result).length;
+}
diff --git a/routes/api/unicode/update.ts b/routes/api/unicode/update.ts
new file mode 100644
--- /dev/null
+++ b/routes/api/unicode/update.ts
@@ -0,0 +1,33 @@
+import path from "path";
+import { legacyDirname } from "../../../utils/misc.js";
+import { BackgroundTaskQueue } from "../../../utils/background-task-queue.js";
+import { store } from "./lib/store-init.js";
+import type { Request, Response } from "express";
+
+const workerFile = path.join(legacyDirname(import.meta), "update.worker.js");
+const taskQueue = new BackgroundTaskQueue(
+  workerFile,
+  "unicode_update",
+);
+
+/**
+ * Run the unicode update job and reload the in-memory store when the job
+ * reports new data. Always responds with the job id; the status is 500 when
+ * the job (or the reload) fails.
+ */
+export default async function route(req: Request, res: Response) {
+  const job = taskQueue.add({});
+  try {
+    const { updated } = await job.run();
+    if (updated) {
+      store.fill();
+    }
+  } catch (error) {
+    // Keep the failure visible in the logs instead of discarding it.
+    console.error("unicode update failed:", error);
+    res.status(500);
+  } finally {
+    res.locals.job = job.id;
+    res.send(job.id);
+  }
+}
diff --git a/routes/api/unicode/update.worker.ts b/routes/api/unicode/update.worker.ts
new file mode 100644
index 0000000..d159772
--- /dev/null
+++ b/routes/api/unicode/update.worker.ts
@@ -0,0 +1,8 @@
+import unicodeScraper from "./lib/scraper.js";
+
+interface Input {}
+
+export default async function unicodeUpdate(_input: Input) {
+  const updated = await unicodeScraper();
+  return { updated };
+}
diff --git a/scripts/update-data-sources.ts b/scripts/update-data-sources.ts
index e6687cb..c57754a 100644
--- a/scripts/update-data-sources.ts
+++ b/scripts/update-data-sources.ts
@@ -3,6 +3,7 @@ import { mkdir } from "fs/promises";
 import { env } from "../utils/misc.js";
 import caniuse from "../routes/caniuse/lib/scraper.js";
 import xref from "../routes/xref/lib/scraper.js";
+import unicode from "../routes/api/unicode/lib/scraper.js";
 import w3cGroupsList from "./update-w3c-groups-list.js";
 
 // ensure the data directory exists
@@ -16,6 +17,10 @@ console.group("xref");
 await xref({ forceUpdate: true });
 console.groupEnd();
 
+console.group("unicode");
+await unicode({ forceUpdate: true });
+console.groupEnd();
+
 console.group("W3C Groups List");
 await w3cGroupsList();
 console.groupEnd();