Skip to content

Commit

Permalink
fix: split
Browse files Browse the repository at this point in the history
  • Loading branch information
seanghay committed Jul 5, 2022
1 parent e87ad7e commit 5380cff
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 82 deletions.
3 changes: 3 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -47,5 +47,8 @@
"tsup": "^6.1.2",
"typescript": "^4.7.4",
"vitest": "^0.16.0"
},
"dependencies": {
"magic-string": "^0.26.2"
}
}
15 changes: 15 additions & 0 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

86 changes: 34 additions & 52 deletions src/index.ts
Original file line number Diff line number Diff line change
@@ -1,56 +1,6 @@
/** Zero-width space (U+200B): the character this module splits on to delimit words. */
const ZWSP_CHAR = "\u200b";

/**
 * Break `input` apart at every zero-width space.
 *
 * @param input - Text that may already contain zero-width-space delimiters.
 * @returns The pieces between delimiters, in order.
 */
function delimiterSplit(input: string): string[] {
  const pieces = input.split(ZWSP_CHAR);
  return pieces;
}

/**
 * Build the replacement text for one matched run.
 *
 * The run is first cut at any zero-width spaces it already contains, each
 * piece is word-segmented, and the resulting words are handed to
 * `joinToString` in their original order.
 *
 * @param value - The matched text to re-delimit.
 * @returns Whatever `joinToString` produces for the collected words.
 */
function createReplacer(value: string) {
  const words = delimiterSplit(value).flatMap((chunk) =>
    segment(chunk).map((part) => part.segment)
  );
  return joinToString(words);
}

/**
 * Rewrite every run of Khmer characters in `input` with its delimited form.
 *
 * Each maximal run of code units in U+1780–U+17FF is passed to
 * `createReplacer`, and the returned string takes the run's place. All other
 * text is copied through unchanged.
 *
 * @param input - Text that may mix Khmer and non-Khmer characters.
 * @returns `input` with each Khmer run replaced.
 */
export function processText(input: string): string {
  // A function replacer is inserted literally (no `$` pattern expansion) and
  // the engine tracks match positions, so no manual offset bookkeeping is needed.
  return input.replace(/[\u1780-\u17ff]+/g, (run) => createReplacer(run));
}
import MagicString from "magic-string";

/**
 * Split text into tokens.
 *
 * The input is first run through `processText`, then cut at every zero-width
 * space and every whitespace character.
 *
 * @param input - Text to split.
 * @returns The tokens in order; adjacent delimiters yield empty strings.
 */
export function split(input: string): string[] {
  const delimited = processText(input);
  return delimited.split(/[\u200b\s]/);
}

/**
 * Word-segment `input` with `Intl.Segmenter` using the "km" locale.
 *
 * @param input - Text to segment.
 * @returns Every segment record the segmenter yields, in order, as an array.
 */
function segment(input: string) {
  const wordSegmenter = new Intl.Segmenter("km", { granularity: "word" });
  return [...wordSegmenter.segment(input)];
}
/** Zero-width space (U+200B): inserted between segmented words and used as the split delimiter. */
const ZWSP_CHAR = "\u200b";

/**
* Join a string with zero-width space.
Expand Down Expand Up @@ -82,3 +32,35 @@ export function normalize(input: string): string {
/**
 * Remove every zero-width space (U+200B) from a string.
 *
 * @param input - Text that may contain zero-width spaces.
 * @returns `input` with all U+200B characters stripped.
 */
export function sanitize(input: string): string {
  // `+` rather than `*`: a `*` quantifier also matches the empty string at
  // every position, triggering a pointless no-op replacement per character.
  return input.replace(/\u200b+/g, "");
}


/**
 * Locate every run of Khmer characters in a string.
 *
 * A run is a maximal sequence of code units in the range U+1780–U+17FF.
 *
 * @param input - Text to scan.
 * @returns One entry per run, in order of appearance: `start` is the index of
 *   the run's first code unit, `end` is the index just past its last, and
 *   `value` is the matched text.
 */
export function tokenize(
  input: string
): Array<{ start: number; end: number; value: string }> {
  const khmerRegex = /[\u1780-\u17ff]+/g;
  const entities: Array<{ start: number; end: number; value: string }> = [];
  let result: RegExpExecArray | null;
  // With the `g` flag, exec() resumes from lastIndex and returns null once
  // no further run exists, which ends the loop.
  while ((result = khmerRegex.exec(input)) !== null) {
    const value = result[0];
    entities.push({
      start: result.index,
      end: result.index + value.length,
      value,
    });
  }
  return entities;
}

/**
 * Split text into pieces at Khmer word boundaries.
 *
 * The input is normalized and stripped of existing zero-width spaces. Each
 * Khmer run found by `tokenize` is then word-segmented with the "km" locale
 * and rewritten in place with a zero-width space between its words. The
 * rewritten text is finally cut at those zero-width spaces only, so non-Khmer
 * text and whitespace stay attached to the neighbouring piece.
 *
 * @param input - Text that may mix Khmer and non-Khmer characters.
 * @returns The pieces between inserted zero-width spaces, in order.
 */
export function split(input: string): string[] {
  const cleaned = sanitize(normalize(input));
  const khmerRuns = tokenize(cleaned);
  const editor = new MagicString(cleaned);
  const wordSegmenter = new Intl.Segmenter("km", { granularity: "word" });

  for (const { start, end, value } of khmerRuns) {
    const words: string[] = [];
    for (const piece of wordSegmenter.segment(value)) {
      words.push(piece.segment);
    }
    // Offsets refer to the original `cleaned` text; MagicString keeps them
    // valid even after earlier runs have been overwritten.
    editor.overwrite(start, end, words.join(ZWSP_CHAR));
  }

  return editor.toString().split(ZWSP_CHAR);
}
35 changes: 5 additions & 30 deletions test/segment.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@ import { joinToString, split } from "../dist";
import { normalize, sanitize } from "../src";

describe("segment", () => {

// The expected output keeps "ABC" and the surrounding spaces inside a single
// element: only the boundary between the two leading Khmer words produces a cut.
it('should ignore unknown chars', () => {
expect(split('ខ្មែរខ្មែរ ABC ខ្មែរ')).toEqual(['ខ្មែរ', 'ខ្មែរ ABC ខ្មែរ'])
});

it("should remove all zwsp from a string", () => {
expect(sanitize(`A\u200bB`)).toBe(`AB`);
expect(sanitize(`A\u200b\u200bB\u200b`)).toBe(`AB`);
Expand All @@ -21,34 +26,4 @@ describe("segment", () => {
expect(joinToString(["A"])).toBe("A");
});

// Exercises split() on three inputs. The expected arrays contain no
// whitespace and no empty strings, i.e. they assume split() cuts on
// whitespace as well as on zero-width spaces.
it("should create zero width space", () => {
// Unbroken Khmer text: cut purely at word boundaries.
expect(split("មិនមានអ្នកណាដឹងថា")).toEqual([
"មិន",
"មាន",
"អ្នកណា",
"ដឹង",
"ថា",
]);

// Khmer text with regular spaces: the spaces act as extra delimiters.
expect(split("មិនមានអ្នក ណា ដឹងថា")).toEqual([
"មិន",
"មាន",
"អ្នក",
"ណា",
"ដឹង",
"ថា",
]);

// Mixed Latin/Khmer input that already contains zero-width spaces.
expect(split("ABC មិនមាន ABC អ្នក\u200bណា\u200bដឹងថា ABC")).toEqual([
"ABC",
"មិន",
"មាន",
"ABC",
"អ្នក",
"ណា",
"ដឹង",
"ថា",
"ABC",
]);
});
});

0 comments on commit 5380cff

Please sign in to comment.