Skip to content

Commit cc65532

Browse files
minor updates
1 parent 0cddf6e commit cc65532

File tree

1 file changed

+192
-26
lines changed

1 file changed

+192
-26
lines changed

scriptum.js

Lines changed: 192 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -4190,8 +4190,8 @@ Object.defineProperty(_Map.deDE, "monthsRev", {
41904190
["Okt", 10], ["Nov", 11], ["Dez", 12],
41914191
]);
41924192

4193-
delete this.months;
4194-
this.months = m;
4193+
delete this.monthsRev;
4194+
this.monthsRev = m;
41954195
return m;
41964196
},
41974197

@@ -4208,8 +4208,8 @@ Object.defineProperty(_Map.deDE, "weekdays", {
42084208
[1, "Mo"], [2, "Di"], [3, "Mi"], [4, "Do"], [5, "Fr"], [6, "Sa"], [0, "So"],
42094209
]);
42104210

4211-
delete this.months;
4212-
this.months = m;
4211+
delete this.weekdays;
4212+
this.weekdays = m;
42134213
return m;
42144214
},
42154215

@@ -4226,8 +4226,39 @@ Object.defineProperty(_Map.deDE, "weekdaysRev", {
42264226
["Mo", 1], ["Di", 2], ["Mi", 3], ["Do", 4], ["Fr", 5], ["Sa", 6], ["So", 0],
42274227
]);
42284228

4229-
delete this.months;
4230-
this.months = m;
4229+
delete this.weekdaysRev;
4230+
this.weekdaysRev = m;
4231+
return m;
4232+
},
4233+
4234+
configurable: true
4235+
});
4236+
4237+
4238+
Object.defineProperty(_Map.deDE, "generalAlteration", {
4239+
get() {
4240+
const m = new Map([
4241+
["a", "ä"], ["ä", "a"], ["o", "ö"], ["ö", "o"], ["u", "ü"], ["ü", "u"],
4242+
]);
4243+
4244+
delete this.generalAlteration;
4245+
this.generalAlteration = m;
4246+
return m;
4247+
},
4248+
4249+
configurable: true
4250+
});
4251+
4252+
4253+
Object.defineProperty(_Map.deDE, "nominalAlteration", {
4254+
get() {
4255+
const m = new Map([
4256+
["A", "Ä"], ["Ä", "A"], ["O", "Ö"], ["Ö", "O"], ["U", "Ü"], ["Ü", "U"],
4257+
["a", "ä"], ["ä", "a"], ["o", "ö"], ["ö", "o"], ["u", "ü"], ["ü", "u"],
4258+
]);
4259+
4260+
delete this.nominalAlteration;
4261+
this.nominalAlteration = m;
42314262
return m;
42324263
},
42334264

@@ -7117,8 +7148,98 @@ Object.defineProperty(_Set, "currencies", {
71177148
"EUR", "USD", "CNY", "JPY", "GBP", "INR", "RUB", "TRY", "CHF"
71187149
]);
71197150

7120-
delete this.months;
7121-
this.months = s;
7151+
delete this.currencies;
7152+
this.currencies = s;
7153+
return s;
7154+
},
7155+
7156+
configurable: true
7157+
});
7158+
7159+
7160+
Object.defineProperty(_Set.deDE, "nominalInterfixes", {
7161+
get() {
7162+
const s = new Set([
7163+
"e", "n", "s", "en", "er", "es", "ens",
7164+
]);
7165+
7166+
delete this.nominalInterfixes;
7167+
this.nominalInterfixes = s;
7168+
return s;
7169+
},
7170+
7171+
configurable: true
7172+
});
7173+
7174+
7175+
Object.defineProperty(_Set.deDE, "verbalInterfixes", {
7176+
get() {
7177+
const s = new Set([
7178+
"s", "n", "en", "ge", "zu",
7179+
]);
7180+
7181+
delete this.verbalInterfixes;
7182+
this.verbalInterfixes = s;
7183+
return s;
7184+
},
7185+
7186+
configurable: true
7187+
});
7188+
7189+
7190+
Object.defineProperty(_Set.deDE, "adjectivalInterfixes", {
7191+
get() {
7192+
const s = new Set([
7193+
"e", "n", "s", "en", "er", "es", "ens",
7194+
]);
7195+
7196+
delete this.adjectivalInterfixes;
7197+
this.adjectivalInterfixes = s;
7198+
return s;
7199+
},
7200+
7201+
configurable: true
7202+
});
7203+
7204+
7205+
Object.defineProperty(_Set.deDE, "numeralInterfixes", {
7206+
get() {
7207+
const s = new Set([
7208+
"und",
7209+
]);
7210+
7211+
delete this.numeralInterfixes;
7212+
this.numeralInterfixes = s;
7213+
return s;
7214+
},
7215+
7216+
configurable: true
7217+
});
7218+
7219+
7220+
Object.defineProperty(_Set.deDE, "inflectionElisions", {
7221+
get() {
7222+
const s = new Set([
7223+
"e", "en",
7224+
]);
7225+
7226+
delete this.inflectionElisions;
7227+
this.inflectionElisions = s;
7228+
return s;
7229+
},
7230+
7231+
configurable: true
7232+
});
7233+
7234+
7235+
Object.defineProperty(_Set.deDE, "compositaElisions", {
7236+
get() {
7237+
const s = new Set([
7238+
"e",
7239+
]);
7240+
7241+
delete this.compositaElisions;
7242+
this.compositaElisions = s;
71227243
return s;
71237244
},
71247245

@@ -7411,6 +7532,14 @@ S.bigram = S.splitChunk({size: 2, overlap: true});
74117532
S.trigram = S.splitChunk({size: 3, overlap: true});
74127533

74137534

7535+
S.fromNgram = ngram => {
7536+
let s = "";
7537+
for (const t of ngram) s += t[0];
7538+
s += ngram[ngram.length - 1].slice(1);
7539+
return s;
7540+
};
7541+
7542+
74147543
/* Split at character transitions:
74157544
S.splitChars("abbccc") // yields ["a", "bb", "ccc"] */
74167545

@@ -7448,14 +7577,14 @@ S.splitAscii = s => {
74487577
// retrieve similar words based on bigrams
74497578

74507579

7451-
S.Retieve = {};
7580+
S.Retrieve = {};
74527581

74537582

74547583
// Word :: Str
74557584
// Bigram :: [Str]
74567585
// Index :: Nat
7457-
// [Word] => Corpus{words: [Bigram], lookup: Map<Str, Set<Index>>}
7458-
S.Retieve.createCorpus = words => {
7586+
// [Word] => Corpus{bigrams: [Bigram], lookup: Map<Str, Set<Index>>}
7587+
S.Retrieve.createCorpus = words => {
74597588
const bigrams = words.map(S.bigram),
74607589
lookup = new Map();
74617590

@@ -7484,17 +7613,18 @@ The comparison is conducted in a case-insensitive manner.
74847613
lenDiff: sets the lower and upper bounds for the allowed length difference
74857614
by calculating the length ratio between the query and the corpus word. A bound set to null disables that side of the check.
74867615
7487-
threshold: sets the lower bound of necessary the bigram matches by calculating
7488-
the quotient of matching over total bigrams of the query word.
7616+
threshold: sets the lower bound of necessary bigram matches by calculating the
7617+
quotient of matching over total bigrams of the query word.
74897618
74907619
The result is ordered by score in descending order. Consecutive bigram matches
74917620
yield a higher score than scattered ones. */
74927621

74937622
// Nat :: Num
74947623
// Word :: Str
7495-
// Corpus{words: [Word], lookup: Map<Str, Set<Index>>}
7624+
// Bigram :: [Str]
7625+
// Corpus{bigrams: [Bigram], lookup: Map<Str, Set<Index>>}
74967626
// {corpus: Corpus, lenDiff: [Num, Num], threshold: Num} => Word => [{i: Index, score: Nat}]
7497-
S.Retieve.query = ({corpus, lenDiff = [0.75, 1.34], threshold = 0.25}) => word => {
7627+
S.Retrieve.query = ({corpus, lenDiff = [0.75, 1.34], threshold = 0.25}) => word => {
74987628
const queryBigram = S.bigram(word.toLowerCase()),
74997629
queryMetas = A.bigram(queryBigram);
75007630

@@ -7509,9 +7639,10 @@ S.Retieve.query = ({corpus, lenDiff = [0.75, 1.34], threshold = 0.25}) => word =
75097639
corpus.lookup.get(k).forEach(i => {
75107640
const ratio = queryBigram.length / corpus.bigrams[i].length;
75117641

7512-
if (ratio >= lenDiff[0] && ratio <= lenDiff[1]) {
7513-
if (m.has(i)) m.set(i, m.get(i) + 1);
7514-
else m.set(i, 1);
7642+
if ((lenDiff[0] === null || ratio >= lenDiff[0])
7643+
&& (lenDiff[1] === null || ratio <= lenDiff[1])) {
7644+
if (m.has(i)) m.set(i, m.get(i) + 1);
7645+
else m.set(i, 1);
75157646
}
75167647
});
75177648
}
@@ -7583,14 +7714,6 @@ S.Retieve.query = ({corpus, lenDiff = [0.75, 1.34], threshold = 0.25}) => word =
75837714
};
75847715

75857716

7586-
S.Retieve.toStr = bigram => {
7587-
let s = "";
7588-
for (const pair of bigram) s += pair[0];
7589-
s += bigram[bigram.length - 1] [1];
7590-
return s;
7591-
};
7592-
7593-
75947717
//█████ Diffing ███████████████████████████████████████████████████████████████
75957718

75967719

@@ -9791,6 +9914,49 @@ S.Word.parsePos = trigramDicts => word => {
97919914
};
97929915

97939916

9917+
S.Word.parseProperName = word => {
9918+
/*
9919+
* signal words:
9920+
* Herr, Frau, Hr, Fr, Dr, med, jur, rer, nat, phil, oec, ing, Ing,
9921+
Dipl, hc, Prof, Kfm, Kffr, MA, MSc, BA, BSc, Mag, PD, PhD
9922+
* Mama, Papa, Oma, Opa, Tante, Onkel, Sohn, Tochter, Bruder, Schwester,
9923+
Cousin, Cousine, Neffe, Nichte, Ehemann, Ehefrau, Schwiegermutter,
9924+
Schwiegervater, Gatte, Gattin
9925+
* preceding signal pos:
9926+
+pron
9927+
-conj
9928+
-num
9929+
-art
9930+
-adv
9931+
-inter
9932+
* properties
9933+
* two or more consecutive title case words (except at the beginning of a sentence, BOS)
9934+
* Michael Beck, C.H. Beck
9935+
* genitive "s" (Becks)
9936+
* nobility: von, von der, etc.
9937+
* suffixes:
9938+
* -mann, -er, -sen, -son, -ke, -ow, -berg, -bach, -hoff, -stein
9939+
* -burg, -stadt, -dorf, -hausen, -heim, -ingen, -au, -berg, -tal, -furt, -brück, -kirche(n)
9940+
* no plural, weird flection
9941+
* rare modification through adjectives: the big Max Mustermann
9942+
* often include non-latin letters/rare trigrams
9943+
* burstiness: word rarely appears but if it does, it regularly reappears in this local context
9944+
*/
9945+
};
9946+
9947+
9948+
S.Word.splitSentences = s => {
9949+
// TODO
9950+
// split at periods/exclamation/question mark but
9951+
// take abbreviation periods into account
9952+
// take ellipses into account
9953+
// take several exclamation/question marks into account?!?
9954+
// newlines might be considered like implicit periods
9955+
// trim redundant spaces
9956+
// encode type of sentence: declarative, interrogative, exclamatory
9957+
};
9958+
9959+
97949960
/*█████████████████████████████████████████████████████████████████████████████
97959961
███████████████████████████████████████████████████████████████████████████████
97969962
█████████████████████████████████ TRANSDUCER ██████████████████████████████████

0 commit comments

Comments
 (0)