Simplify english (#237)

* wip: diff between hunspell_words and en_US.txt * ci: make it easier to add words to en_US. Adjust the publishing process to catch stale dictionaries. * Starting with en_US and will add others later.
streetsidesoftware · Oct 21, 2020 · a2db50d · a2db50d
1 parent 80a4b21
commit a2db50d
Show file tree

Hide file tree

Showing 10 changed files with 55 additions and 133,111 deletions.
diff --git a/cSpell.json b/cSpell.json
@@ -12,7 +12,9 @@
         "configstore",
         "django",
         "django's",
-        "Mutex"
+        "Mutex",
+        "shasum",
+        "xargs"
     ],
     // flagWords - list of words to be always considered incorrect
     "flagWords": [],
@@ -62,4 +64,4 @@
         "ada",
         "shellscript"
     ]
-}
+}
diff --git a/package.json b/package.json
@@ -4,9 +4,11 @@
 		"test": "npm run lint && lerna run test",
 		"lint": "eslint .",
 		"build": "lerna run --concurrency 2 --stream build --no-bail",
+		"conditional-build": "lerna run --concurrency 2 --stream conditional-build --no-bail",
+		"checksum": "lerna run checksum",
 		"clean": "rimraf \"packages/*/*.txt.gz\"",
 		"postinstall": "lerna bootstrap",
-		"pub": "lerna bootstrap && npm test && lerna publish",
+		"pub": "lerna bootstrap && npm test && npm run checksum && lerna publish",
 		"prepublishOnly": "npm test",
 		"update-packages": "lerna exec \"npm update -S && rm -rf node_modules package-lock.json && npm i\""
 	},

diff --git a/packages/en_US/README.md b/packages/en_US/README.md
@@ -43,6 +43,10 @@ Building is only necessary if you want to modify the contents of the dictionary.
 npm run build
 ```
 
+## Adding Words
+
+Please add any words to [src/additional_words.txt](./src/additional_words.txt) by making a pull request.
+
 ## Resources
 
 The Hunspell source for this dictionary can be found:

diff --git a/packages/en_US/checksum.txt b/packages/en_US/checksum.txt
@@ -0,0 +1,4 @@
+992510ad9a274f96e60f92412400acb574b2b81e ?en_US.trie.gz
+3f23e3eac82926ce3c46244aa34486193e6d7daa ?./src/additional_words.txt
+e7354aa22bb5593d71deb2852467dbb1fff15556 ?./src/en_US.txt
+93502f493fd250c11dc87625ecfa723f08dc77cb ?./src/hunspell/en_US.dic
diff --git a/packages/en_US/en_US.trie.gz b/packages/en_US/en_US.trie.gz
diff --git a/packages/en_US/package.json b/packages/en_US/package.json
@@ -7,10 +7,13 @@
     "cspell-dict-en-us-unlink": "./unlink.js"
   },
   "scripts": {
-    "compile": "cspell-tools compile-trie --merge en_US src/hunspell/en_US.dic src/en_US.txt -o .",
     "build": "npm run compile",
+    "checksum": "shasum -c checksum.txt",
+    "compile": "cat source-files.txt | xargs cspell-tools compile-trie --merge en_US -o . && npm run gen-checksum",
+    "conditional-build": "npm run --silent checksum || npm run build",
     "test-dict": "hunspell-reader words -n 10000 \"src/hunspell/en_US.dic\" | cspell -v -c ./cspell-ext.json --local=en --languageId=* stdin",
     "test-text": "cspell -v -c ./cspell-ext.json --local=en --languageId=* \"tests/*.txt\"",
+    "gen-checksum": "cat source-files.txt | xargs shasum -p en_US.trie.gz > checksum.txt",
     "test": "npm run test-dict && npm run test-text",
     "prepare": "cp ../../util/* .",
     "cspell-link": "node link.js",
@@ -43,6 +46,8 @@
   "files": [
     "en_US.trie.gz",
     "cspell-ext.json",
+    "src/hunspell/README_en_US-large.txt",
+    "!scripts",
     "*.js",
     "*.d.ts"
   ],

diff --git a/packages/en_US/scripts/diff.js b/packages/en_US/scripts/diff.js
@@ -0,0 +1,29 @@
+#!/usr/bin/env node
+
+'use strict';
+
+/**
+ * This script was used to pare down the number of duplicate words in
+ * en_US.txt and hunspell/en_US.dic.
+ *
+ * I might use it to generate a base English dictionary from both en_US and en_GB.
+ * But it will need a bit of cleaning up first.
+ */
+
+const fs = require('fs');
+const path = require('path');
+
+/**
+ * Read a word list file and return its unique, non-empty entries.
+ *
+ * The file is read synchronously as UTF-8, split on '\n', and each line is
+ * trimmed (which also strips a trailing '\r' from CRLF files). Lines that are
+ * empty after trimming are dropped.
+ *
+ * @param {string} filename - Path of the word list to read.
+ * @returns {Set<string>} The distinct trimmed lines, in file order.
+ */
+function readWords(filename) {
+    const contents = fs.readFileSync(filename, 'utf8')
+    // Trim before filtering so whitespace-only lines are discarded too.
+    return new Set(contents.split('\n').map(s => s.trim()).filter(s => !!s));
+}
+
+// Input word lists. Both paths are relative, so they are resolved against the
+// directory the script is run from, not the script's own location.
+// NOTE(review): presumably meant to be run from packages/en_US — confirm.
+const addedWordsFile = path.join('src', 'en_US.txt');
+const hunspellWordsFile = path.join('src', 'hunspell_words.txt');
+
+const added = readWords(addedWordsFile);
+const hunspell = readWords(hunspellWordsFile);
+
+// Keep only the words from en_US.txt that are absent from the hunspell list;
+// spreading the Set preserves the order in which the words were read.
+const diff = [...added].filter(w => !hunspell.has(w));
+
+// Write one word per line with a trailing newline (an empty diff yields a
+// file containing just '\n').
+fs.writeFileSync(path.join('src', 'diff.txt'), diff.join('\n') + '\n', 'utf8')
diff --git a/packages/en_US/source-files.txt b/packages/en_US/source-files.txt
@@ -0,0 +1,3 @@
+./src/additional_words.txt
+./src/en_US.txt
+./src/hunspell/en_US.dic
diff --git a/packages/en_US/src/additional_words.txt b/packages/en_US/src/additional_words.txt
@@ -0,0 +1 @@
+# Add words below