Skip to content

Commit

Permalink
Simplify english (#237)
Browse files Browse the repository at this point in the history
* wip: diff between hunspell_words and en_US.txt

* ci: make it easier to add words to en_US

Adjust the publishing process to catch stale dictionaries.
* Starting with en_US; others will be added later.
  • Loading branch information
Jason3S committed Oct 21, 2020
1 parent 80a4b21 commit a2db50d
Show file tree
Hide file tree
Showing 10 changed files with 55 additions and 133,111 deletions.
6 changes: 4 additions & 2 deletions cSpell.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
"configstore",
"django",
"django's",
"Mutex"
"Mutex",
"shasum",
"xargs"
],
// flagWords - list of words to be always considered incorrect
"flagWords": [],
Expand Down Expand Up @@ -62,4 +64,4 @@
"ada",
"shellscript"
]
}
}
4 changes: 3 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@
"test": "npm run lint && lerna run test",
"lint": "eslint .",
"build": "lerna run --concurrency 2 --stream build --no-bail",
"conditional-build": "lerna run --concurrency 2 --stream conditional-build --no-bail",
"checksum": "lerna run checksum",
"clean": "rimraf \"packages/*/*.txt.gz\"",
"postinstall": "lerna bootstrap",
"pub": "lerna bootstrap && npm test && lerna publish",
"pub": "lerna bootstrap && npm test && npm run checksum && lerna publish",
"prepublishOnly": "npm test",
"update-packages": "lerna exec \"npm update -S && rm -rf node_modules package-lock.json && npm i\""
},
Expand Down
4 changes: 4 additions & 0 deletions packages/en_US/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ Building is only necessary if you want to modify the contents of the dictionary.
npm run build
```

## Adding Words

Please add any words to [src/additional_words.txt](./src/additional_words.txt) by making a pull request.

## Resources

The Hunspell source for this dictionary can be found:
Expand Down
4 changes: 4 additions & 0 deletions packages/en_US/checksum.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
992510ad9a274f96e60f92412400acb574b2b81e ?en_US.trie.gz
3f23e3eac82926ce3c46244aa34486193e6d7daa ?./src/additional_words.txt
e7354aa22bb5593d71deb2852467dbb1fff15556 ?./src/en_US.txt
93502f493fd250c11dc87625ecfa723f08dc77cb ?./src/hunspell/en_US.dic
Binary file modified packages/en_US/en_US.trie.gz
Binary file not shown.
7 changes: 6 additions & 1 deletion packages/en_US/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,13 @@
"cspell-dict-en-us-unlink": "./unlink.js"
},
"scripts": {
"compile": "cspell-tools compile-trie --merge en_US src/hunspell/en_US.dic src/en_US.txt -o .",
"build": "npm run compile",
"checksum": "shasum -c checksum.txt",
"compile": "cat source-files.txt | xargs cspell-tools compile-trie --merge en_US -o . && npm run gen-checksum",
"conditional-build": "npm run --silent checksum || npm run build",
"test-dict": "hunspell-reader words -n 10000 \"src/hunspell/en_US.dic\" | cspell -v -c ./cspell-ext.json --local=en --languageId=* stdin",
"test-text": "cspell -v -c ./cspell-ext.json --local=en --languageId=* \"tests/*.txt\"",
"gen-checksum": "cat source-files.txt | xargs shasum -p en_US.trie.gz > checksum.txt",
"test": "npm run test-dict && npm run test-text",
"prepare": "cp ../../util/* .",
"cspell-link": "node link.js",
Expand Down Expand Up @@ -43,6 +46,8 @@
"files": [
"en_US.trie.gz",
"cspell-ext.json",
"src/hunspell/README_en_US-large.txt",
"!scripts",
"*.js",
"*.d.ts"
],
Expand Down
29 changes: 29 additions & 0 deletions packages/en_US/scripts/diff.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/usr/bin/env node

'use strict';

/**
* This script was used to pare down the number of duplicate words in
* en_US.txt and hunspell/en_US.dic.
*
* I might use it to generate a base English dictionary from both en_US and en_GB.
* But it will need a bit of cleaning up first.
*/

const fs = require('fs');
const path = require('path');

/**
 * Load a word list from disk.
 *
 * The file is read as UTF-8 and split on '\n'. Each line is trimmed and
 * lines that are empty after trimming are dropped.
 *
 * @param {string} filename - path of the word list to read.
 * @returns {Set<string>} the unique, trimmed, non-empty lines of the file.
 */
function readWords(filename) {
  const words = new Set();
  for (const line of fs.readFileSync(filename, 'utf8').split('\n')) {
    const word = line.trim();
    // Skip blank lines (including whitespace-only ones).
    if (word) {
      words.add(word);
    }
  }
  return words;
}

// Inputs and output all live under ./src, resolved against the current working directory.
const addedWordsFile = path.join('src', 'en_US.txt');
const hunspellWordsFile = path.join('src', 'hunspell_words.txt');
const diffFile = path.join('src', 'diff.txt');

const added = readWords(addedWordsFile);
const hunspell = readWords(hunspellWordsFile);

// Keep only the words from en_US.txt that are absent from the hunspell word list,
// preserving the order in which they appear in en_US.txt.
const missingFromHunspell = [];
for (const word of added) {
  if (!hunspell.has(word)) {
    missingFromHunspell.push(word);
  }
}

// One word per line, with a trailing newline.
fs.writeFileSync(diffFile, `${missingFromHunspell.join('\n')}\n`, 'utf8');
3 changes: 3 additions & 0 deletions packages/en_US/source-files.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
./src/additional_words.txt
./src/en_US.txt
./src/hunspell/en_US.dic
1 change: 1 addition & 0 deletions packages/en_US/src/additional_words.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Add words below

0 comments on commit a2db50d

Please sign in to comment.