chore: initial commit

seanghay · Nov 5, 2023 · 8687412 · 8687412
commit 8687412
Show file tree

Hide file tree

Showing 8 changed files with 366 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,157 @@
+# Logs
+logs
+*.log
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+lerna-debug.log*
+.pnpm-debug.log*
+
+# Diagnostic reports (https://nodejs.org/api/report.html)
+report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
+
+# Runtime data
+pids
+*.pid
+*.seed
+*.pid.lock
+
+# Directory for instrumented libs generated by jscoverage/JSCover
+lib-cov
+
+# Coverage directory used by tools like istanbul
+coverage
+*.lcov
+
+# nyc test coverage
+.nyc_output
+
+# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
+.grunt
+
+# Bower dependency directory (https://bower.io/)
+bower_components
+
+# node-waf configuration
+.lock-wscript
+
+# Compiled binary addons (https://nodejs.org/api/addons.html)
+build/Release
+
+# Dependency directories
+node_modules/
+jspm_packages/
+
+# Snowpack dependency directory (https://snowpack.dev/)
+web_modules/
+
+# TypeScript cache
+*.tsbuildinfo
+
+# Optional npm cache directory
+.npm
+
+# Optional eslint cache
+.eslintcache
+
+# Optional stylelint cache
+.stylelintcache
+
+# Microbundle cache
+.rpt2_cache/
+.rts2_cache_cjs/
+.rts2_cache_es/
+.rts2_cache_umd/
+
+# Optional REPL history
+.node_repl_history
+
+# Output of 'npm pack'
+*.tgz
+
+# Yarn Integrity file
+.yarn-integrity
+
+# dotenv environment variable files
+.env
+.env.development.local
+.env.test.local
+.env.production.local
+.env.local
+
+# parcel-bundler cache (https://parceljs.org/)
+.cache
+.parcel-cache
+
+# Next.js build output
+.next
+out
+
+# Nuxt.js build / generate output
+.nuxt
+dist
+
+# Gatsby files
+.cache/
+# Comment in the public line in if your project uses Gatsby and not Next.js
+# https://nextjs.org/blog/next-9-1#public-directory-support
+# public
+
+# vuepress build output
+.vuepress/dist
+
+# vuepress v2.x temp and cache directory
+.temp
+.cache
+
+# Docusaurus cache and generated files
+.docusaurus
+
+# Serverless directories
+.serverless/
+
+# FuseBox cache
+.fusebox/
+
+# DynamoDB Local files
+.dynamodb/
+
+# TernJS port file
+.tern-port
+
+# Stores VSCode versions used for testing VSCode extensions
+.vscode-test
+
+# yarn v2
+.yarn/cache
+.yarn/unplugged
+.yarn/build-state.yml
+.yarn/install-state.gz
+.pnp.*
+
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+# Thumbnails
+._*
+
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk

diff --git a/.npmrc b/.npmrc
@@ -0,0 +1 @@
+package-lock=false
diff --git a/package.json b/package.json
@@ -0,0 +1,17 @@
+{
+  "name": "slugify-khmer",
+  "license": "MIT",
+  "version": "0.1.0",
+  "repository": "seanghay/slugify-khmer",
+  "description": "Slugify Khmer text into a latin form.",
+  "main": "./slugify.js",
+  "author": {
+    "name": "Seanghay Yath",
+    "email": "seanghay.dev@gmail.com",
+    "url": "https://github.com/seanghay"
+  },
+  "files": ["slugify.js", "slugify.d.ts"],
+  "dependencies": {
+    "split-khmer": "^1.0.1"
+  }
+}
diff --git a/readme.md b/readme.md
@@ -0,0 +1,24 @@
+### Slugify Khmer
+
+A simple Khmer text slugifier, built for speed rather than correctness, on top of [`split-khmer`](https://github.com/seanghay/split-khmer).
+
+### Install
+```
+npm install slugify-khmer
+```
+
+### Usage
+
+```javascript
+import { slugify } from 'slugify-khmer';
+
+slugify('មិនដឹងទេ that\'s nice') 
+// => mindoeng-te that's nice
+
+slugify('មិនដឹងទេ that\'s nice', "_") 
+// => mindoeng_te that's nice
+```
+
+### License 
+
+MIT
diff --git a/slugify.d.ts b/slugify.d.ts
@@ -0,0 +1,2 @@
+/**
+ * Lazily romanizes `text`, yielding the output one fragment at a time.
+ *
+ * NOTE(review): the implementation in slugify.js declares a single `input`
+ * parameter, so `delimiter` appears to be ignored by this function — confirm
+ * before relying on it.
+ */
+export function transform(text: string, delimiter?: string): Generator<string>;
+/**
+ * Splits `text` into words with split-khmer, romanizes each word and joins the
+ * results with `delimiter` (the implementation defaults it to "-").
+ */
+export function slugify(text: string, delimiter?: string): string;
diff --git a/slugify.js b/slugify.js
@@ -0,0 +1,159 @@
+const { split } = require('split-khmer');
+
+const firstSeries = new Set('កខចឆដឋណតថបផឝសហឡអ');
+const vowelsDefault = ['a', 'o'];
+const consonants = new Map([
+	["ក", "k"],
+	["ខ", "kh"],
+	["គ", "k"],
+	["ឃ", "kh"],
+	["ង", "ng"],
+	["ច", "ch"],
+	["ឆ", "chh"],
+	["ជ", "ch"],
+	["ឈ", "chh"],
+	["ញ", "nh"],
+	["ដ", "d"],
+	["ឋ", "th"],
+	["ឌ", "d"],
+	["ឍ", "th"],
+	["ណ", "n"],
+	["ត", "t"],
+	["ថ", "th"],
+	["ទ", "t"],
+	["ធ", "th"],
+	["ន", "n"],
+	["ប", "b"],
+	["ផ", "ph"],
+	["ព", "p"],
+	["ភ", "ph"],
+	["ម", "m"],
+	["យ", "y"],
+	["រ", "r"],
+	["ល", "l"],
+	["វ", "v"],
+	["ឝ", "sh"],
+	["ឞ", "ss"],
+	["ស", "s"],
+	["ហ", "h"],
+	["ឡ", "l"],
+	["អ", "a"],
+	//
+	['ឥ', 'e'],
+	['ឦ', 'ei'],
+	['ឧ', 'u'],
+	['ឩ', 'u'],
+	['ឩ', 'au'],
+	['ឫ', 'rue'],
+	['ឭ', 'lue'],
+	['ឭ', 'lue'],
+	['ឮ', 'lueu'],
+	['ឯ', 'ae'],
+	['ឰ', 'ai'],
+	['ឲ', 'ao'],
+	['ឱ', 'ao'],
+	['ឳ', 'au'],
+]);
+
+const vowelEntries = [
+	['◌់', ['a', 'o']],
+	['ា', ['a', 'ea']],
+	['ា់', ['a', 'ea']],
+	[' ័◌', ['a', 'oa']],
+	['ៈ', ['ak', 'eak']],
+	['័យ', ['ai', 'ey']],
+	['ិ', ['e', 'i']],
+	['ី', ['ei', 'i']],
+	['ឹ', ['oe', 'ue']],
+	['ឺ', ['eu', 'ueu']],
+	['ុ', ['o', 'u']],
+	['ូ', ['ou', 'u']],
+	['ួ', ['uo', 'uo']],
+	['ើ', ['aeu', 'eu']],
+	['ឿ', ['oea', 'oea']],
+	['ៀ', ['ie', 'ie']],
+	['េ', ['e', 'e']],
+	['ែ', ['ae', 'eae']],
+	['ៃ', ['ai', 'ey']],
+	['ោ', ['ao', 'ou']],
+	['ៅ', ['au', 'ov']],
+	['ុំ', ['om', 'um']],
+	['ំ', ['am', 'um']],
+	['ាំ', ['am', 'oam']],
+	['ាំង', ['ang', 'eang']],
+	['ះ', ['ah', 'eah']],
+	['ិះ', ['eh', 'is']],
+	['ឹះ', ['oeh', 'ueh']],
+	['ុះ', ['oh', 'uh']],
+	['េះ', ['eh', 'eh']],
+	['ើះ', ['aeuh', 'euh']],
+	['ែះ', ['aeh', 'eaeh']],
+	['ោះ', ['aoh', 'uoh']],
+].sort((a, b) => {
+	return b[0].length - a[0].length
+}).map(([k, v]) => [new RegExp(`^${k}`), v])
+
+/**
+ * Convert Khmer word into a romanization form
+ * @param {string} input 
+ * @returns {Generator<string>}
+ */
+function* transform(input) {
+	let pc = null;
+
+	const sindex = () => +!(pc != null && firstSeries.has(pc));
+
+	for (let i = 0; i < input.length; i++) {
+		let c = input[i];
+
+		if (/[\s\u200b\u200a]|[^\u1780-\u17dd]/.test(c)) {
+			vowelsMatch = null;
+			pc = null;
+			yield c;
+			continue;
+		}
+
+
+
+		if (consonants.has(c)) {
+
+			if (pc != null) {
+				if (i - 2 >= 0) {
+					if (consonants.has(input[i - 2])) {
+						yield vowelsDefault[sindex()];
+					}
+				}
+			}
+
+			pc = c;
+			yield consonants.get(c);
+			continue
+		}
+
+		for (const [pattern, values] of vowelEntries) {
+			const m = pattern.exec(input.slice(i));
+			if (!m) continue;
+			i += m[0].length - 1;
+			const r = values[sindex()];
+			yield r
+			break
+		}
+
+		pc = null;
+		// yield c;
+
+	}
+}
+
+exports.transform = transform;
+
+/**
+ * Convert Khmer text into a romaization representation
+ * @param {string} text 
+ * @returns {string}
+ */
+exports.slugify = function (text, delimiter = '-') {
+	return split(text)
+		.map(word => Array.from(transform(word)).join(""))
+		.join(delimiter)
+}
diff --git a/slugify.test.js b/slugify.test.js
@@ -0,0 +1,3 @@
+// Loads the package from the current directory (resolved through package.json "main").
+const { slugify } = require('.')
+
+// Manual smoke check: prints the slug for a mixed Khmer/English sentence; nothing is asserted.
+console.log(slugify('មិនដឹងទេ that\'s nice'))
diff --git a/slugify.test.mjs b/slugify.test.mjs
@@ -0,0 +1,3 @@
+// Imports the CommonJS build from an ES module.
+import { slugify } from './slugify.js'
+
+// Manual smoke check: prints the slug for a mixed Khmer/English sentence; nothing is asserted.
+console.log(slugify('មិនដឹងទេ that\'s nice'))