@@ -4190,8 +4190,8 @@ Object.defineProperty(_Map.deDE, "monthsRev", {
41904190 [ "Okt" , 10 ] , [ "Nov" , 11 ] , [ "Dez" , 12 ] ,
41914191 ] ) ;
41924192
4193- delete this . months ;
4194- this . months = m ;
4193+ delete this . monthsRev ;
4194+ this . monthsRev = m ;
41954195 return m ;
41964196 } ,
41974197
@@ -4208,8 +4208,8 @@ Object.defineProperty(_Map.deDE, "weekdays", {
42084208 [ 1 , "Mo" ] , [ 2 , "Di" ] , [ 3 , "Mi" ] , [ 4 , "Do" ] , [ 5 , "Fr" ] , [ 6 , "Sa" ] , [ 0 , "So" ] ,
42094209 ] ) ;
42104210
4211- delete this . months ;
4212- this . months = m ;
4211+ delete this . weekdays ;
4212+ this . weekdays = m ;
42134213 return m ;
42144214 } ,
42154215
@@ -4226,8 +4226,39 @@ Object.defineProperty(_Map.deDE, "weekdaysRev", {
42264226 [ "Mo" , 1 ] , [ "Di" , 2 ] , [ "Mi" , 3 ] , [ "Do" , 4 ] , [ "Fr" , 5 ] , [ "Sa" , 6 ] , [ "So" , 0 ] ,
42274227 ] ) ;
42284228
4229- delete this . months ;
4230- this . months = m ;
4229+ delete this . weekdaysRev ;
4230+ this . weekdaysRev = m ;
4231+ return m ;
4232+ } ,
4233+
4234+ configurable : true
4235+ } ) ;
4236+
4237+
// Lazily built, self-replacing lookup on _Map.deDE: maps each lowercase base
// vowel (a, o, u) to its umlaut and each umlaut back to its base vowel.
Object.defineProperty(_Map.deDE, "generalAlteration", {
  // must stay configurable so the getter can delete itself on first access
  configurable: true,

  get() {
    const vowelPairs = [["a", "ä"], ["o", "ö"], ["u", "ü"]],
      alterations = new Map();

    // register both directions; insertion order is a, ä, o, ö, u, ü
    for (const [plain, umlaut] of vowelPairs) {
      alterations.set(plain, umlaut);
      alterations.set(umlaut, plain);
    }

    // memoize: drop the accessor and store the computed map in its place
    delete this.generalAlteration;
    this.generalAlteration = alterations;
    return alterations;
  }
});
4251+
4252+
4253+ Object . defineProperty ( _Map . deDE , "nominalAlteration" , {
4254+ get ( ) {
4255+ const m = new Map ( [
4256+ [ "A" , "Ä" ] , [ "Ä" , "A" ] , [ "O" , "Ö" ] , [ "Ö" , "O" ] , [ "U" , "Ü" ] , [ "Ü" , "U" ] ,
4257+ [ "a" , "ä" ] , [ "ä" , "a" ] , [ "o" , "ö" ] , [ "ö" , "o" ] , [ "u" , "ü" ] , [ "ü" , "u" ] ,
4258+ ] ) ;
4259+
4260+ delete this . nominalAlteration ;
4261+ this . nominalAlteration = m ;
42314262 return m ;
42324263 } ,
42334264
@@ -7117,8 +7148,98 @@ Object.defineProperty(_Set, "currencies", {
71177148 "EUR" , "USD" , "CNY" , "JPY" , "GBP" , "INR" , "RUB" , "TRY" , "CHF"
71187149 ] ) ;
71197150
7120- delete this . months ;
7121- this . months = s ;
7151+ delete this . currencies ;
7152+ this . currencies = s ;
7153+ return s ;
7154+ } ,
7155+
7156+ configurable : true
7157+ } ) ;
7158+
7159+
// Lazily built, self-replacing set on _Set.deDE.
// NOTE(review): presumably the linking elements used in German noun
// compounds - confirm against the callers.
Object.defineProperty(_Set.deDE, "nominalInterfixes", {
  // must stay configurable so the getter can delete itself on first access
  configurable: true,

  get() {
    const interfixes = new Set();

    for (const affix of ["e", "n", "s", "en", "er", "es", "ens"])
      interfixes.add(affix);

    // memoize: drop the accessor and store the computed set in its place
    delete this.nominalInterfixes;
    this.nominalInterfixes = interfixes;
    return interfixes;
  }
});
7173+
7174+
// Lazily built, self-replacing set on _Set.deDE.
// NOTE(review): presumably the linking elements used in German verb
// compounds/forms - confirm against the callers.
Object.defineProperty(_Set.deDE, "verbalInterfixes", {
  // must stay configurable so the getter can delete itself on first access
  configurable: true,

  get() {
    const interfixes = new Set();

    for (const affix of ["s", "n", "en", "ge", "zu"])
      interfixes.add(affix);

    // memoize: drop the accessor and store the computed set in its place
    delete this.verbalInterfixes;
    this.verbalInterfixes = interfixes;
    return interfixes;
  }
});
7188+
7189+
// Lazily built, self-replacing set on _Set.deDE.
// NOTE(review): the members are identical to those of "nominalInterfixes" -
// confirm this is intended and not a copy/paste placeholder.
Object.defineProperty(_Set.deDE, "adjectivalInterfixes", {
  // must stay configurable so the getter can delete itself on first access
  configurable: true,

  get() {
    const interfixes = new Set();

    for (const affix of ["e", "n", "s", "en", "er", "es", "ens"])
      interfixes.add(affix);

    // memoize: drop the accessor and store the computed set in its place
    delete this.adjectivalInterfixes;
    this.adjectivalInterfixes = interfixes;
    return interfixes;
  }
});
7203+
7204+
// Lazily built, self-replacing set on _Set.deDE; currently holds the single
// entry "und".
// NOTE(review): presumably the connector inside German compound numerals
// (e.g. "einundzwanzig") - confirm against the callers.
Object.defineProperty(_Set.deDE, "numeralInterfixes", {
  // must stay configurable so the getter can delete itself on first access
  configurable: true,

  get() {
    const interfixes = new Set();

    interfixes.add("und");

    // memoize: drop the accessor and store the computed set in its place
    delete this.numeralInterfixes;
    this.numeralInterfixes = interfixes;
    return interfixes;
  }
});
7218+
7219+
// Lazily built, self-replacing set on _Set.deDE.
// NOTE(review): presumably the endings that may be elided when a word is
// inflected - confirm against the callers.
Object.defineProperty(_Set.deDE, "inflectionElisions", {
  // must stay configurable so the getter can delete itself on first access
  configurable: true,

  get() {
    const elisions = new Set();

    for (const ending of ["e", "en"])
      elisions.add(ending);

    // memoize: drop the accessor and store the computed set in its place
    delete this.inflectionElisions;
    this.inflectionElisions = elisions;
    return elisions;
  }
});
7233+
7234+
7235+ Object . defineProperty ( _Set . deDE , "compositaElisions" , {
7236+ get ( ) {
7237+ const s = new Set ( [
7238+ "e" ,
7239+ ] ) ;
7240+
7241+ delete this . compositaElisions ;
7242+ this . compositaElisions = s ;
71227243 return s ;
71237244 } ,
71247245
@@ -7411,6 +7532,14 @@ S.bigram = S.splitChunk({size: 2, overlap: true});
74117532S . trigram = S . splitChunk ( { size : 3 , overlap : true } ) ;
74127533
74137534
/* Reassemble a string from a sequence of n-grams:
   S.fromNgram(["ab", "bc", "cd"]) // yields "abcd"
   S.fromNgram([]) // yields ""

Concatenates the first character of every n-gram and then appends everything
but the first character of the last n-gram. This only restores the source
string if consecutive n-grams overlap in all but one character - presumably
what the splitters created with `overlap: true` produce; TODO confirm. */

// [Str] => Str
S.fromNgram = ngram => {
  // an empty sequence has no last n-gram to take the tail from; without this
  // guard `ngram[-1]` is undefined and `.slice` throws a TypeError
  if (ngram.length === 0) return "";

  let s = "";
  for (const t of ngram) s += t[0];
  s += ngram[ngram.length - 1].slice(1);
  return s;
};
7541+
7542+
74147543/* Split at character transitions:
74157544 S.splitChars("abbccc") // yields ["a", "bb", "ccc"] */
74167545
@@ -7448,14 +7577,14 @@ S.splitAscii = s => {
74487577// retrieve similar words based on bigrams
74497578
74507579
7451- S . Retieve = { } ;
7580+ S . Retrieve = { } ;
74527581
74537582
74547583// Word :: Str
74557584// Bigram :: [Str]
74567585// Index :: Nat
7457- // [Word] => Corpus{words : [Bigram], lookup: Map<Str, Set<Index>>}
7458- S . Retieve . createCorpus = words => {
7586+ // [Word] => Corpus{bigrams : [Bigram], lookup: Map<Str, Set<Index>>}
7587+ S . Retrieve . createCorpus = words => {
74597588 const bigrams = words . map ( S . bigram ) ,
74607589 lookup = new Map ( ) ;
74617590
@@ -7484,17 +7613,18 @@ The comparison is conducted in a case-insensitive manner.
74847613lenDiff: sets the lower and upper bounds for the allowed length difference
74857614by calculating the length ratio between the query and the corpus word.
74867615
7487- threshold: sets the lower bound of necessary the bigram matches by calculating
7488- the quotient of matching over total bigrams of the query word.
7616+ threshold: sets the lower bound of necessary bigram matches by calculating the
7617+ quotient of matching over total bigrams of the query word.
74897618
74907619The result is ordered by score in descending order. Consecutive bigram matches
74917620yield a higher score than scattered ones. */
74927621
74937622// Nat :: Num
74947623// Word :: Str
7495- // Corpus{words: [Word], lookup: Map<Str, Set<Index>>}
7624+ // Bigram :: [Str]
7625+ // Corpus{bigrams: [Bigram], lookup: Map<Str, Set<Index>>}
74967626// {corpus: Corpus, lenDiff: [Num, Num], threshold: Num} => Word => [{i: Index, score: Nat}]
7497- S . Retieve . query = ( { corpus, lenDiff = [ 0.75 , 1.34 ] , threshold = 0.25 } ) => word => {
7627+ S . Retrieve . query = ( { corpus, lenDiff = [ 0.75 , 1.34 ] , threshold = 0.25 } ) => word => {
74987628 const queryBigram = S . bigram ( word . toLowerCase ( ) ) ,
74997629 queryMetas = A . bigram ( queryBigram ) ;
75007630
@@ -7509,9 +7639,10 @@ S.Retieve.query = ({corpus, lenDiff = [0.75, 1.34], threshold = 0.25}) => word =
75097639 corpus . lookup . get ( k ) . forEach ( i => {
75107640 const ratio = queryBigram . length / corpus . bigrams [ i ] . length ;
75117641
7512- if ( ratio >= lenDiff [ 0 ] && ratio <= lenDiff [ 1 ] ) {
7513- if ( m . has ( i ) ) m . set ( i , m . get ( i ) + 1 ) ;
7514- else m . set ( i , 1 ) ;
7642+ if ( ( lenDiff [ 0 ] === null || ratio >= lenDiff [ 0 ] )
7643+ && ( lenDiff [ 1 ] === null || ratio <= lenDiff [ 1 ] ) ) {
7644+ if ( m . has ( i ) ) m . set ( i , m . get ( i ) + 1 ) ;
7645+ else m . set ( i , 1 ) ;
75157646 }
75167647 } ) ;
75177648 }
@@ -7583,14 +7714,6 @@ S.Retieve.query = ({corpus, lenDiff = [0.75, 1.34], threshold = 0.25}) => word =
75837714} ;
75847715
75857716
7586- S . Retieve . toStr = bigram => {
7587- let s = "" ;
7588- for ( const pair of bigram ) s += pair [ 0 ] ;
7589- s += bigram [ bigram . length - 1 ] [ 1 ] ;
7590- return s ;
7591- } ;
7592-
7593-
75947717//█████ Diffing ███████████████████████████████████████████████████████████████
75957718
75967719
@@ -9791,6 +9914,49 @@ S.Word.parsePos = trigramDicts => word => {
97919914} ;
97929915
97939916
/* Stub: not implemented yet. The function ignores its argument and returns
undefined. The notes below collect candidate heuristics for detecting proper
names.

Signal words:
  * Herr, Frau, Hr, Fr, Dr, med, jur, rer, nat, phil, oec, ing, Ing, Dipl, hc,
    Prof, Kfm, Kffr, MA, MSc, BA, BSc, Mag, PD, PhD
  * Mama, Papa, Oma, Opa, Tante, Onkel, Sohn, Tochter, Bruder, Schwester,
    Cousin, Cousine, Neffe, Nichte, Ehemann, Ehefrau, Schwiegermutter,
    Schwiegervater, Gatte, Gattin

Part of speech of the preceding word as a signal (the +/- markers are taken
over from the original notes; presumably + counts for and - against a proper
name):
  +pron
  -conj
  -num
  -art
  -adv
  -inter

Properties:
  * two or more consecutive title case words (except for BOS):
    Michael Beck, C.H. Beck
  * genitive "s" (Becks)
  * nobility: von, von der, etc.
  * suffixes:
    * -mann, -er, -sen, -son, -ke, -ow, -berg, -bach, -hoff, -stein
    * -burg, -stadt, -dorf, -hausen, -heim, -ingen, -au, -berg, -tal, -furt,
      -brück, -kirche(n)
  * no plural, irregular inflection
  * rarely modified by adjectives: the big Max Mustermann
  * often include non-latin letters/rare trigrams
  * burstiness: the word appears rarely, but when it does it regularly
    reappears within the same local context */

S.Word.parseProperName = word => {};
9946+
9947+
/* Stub: not implemented yet. The function ignores its argument and returns
undefined.

TODO:
  * split at periods, exclamation marks and question marks, but
    * take abbreviation periods into account
    * take ellipses into account
    * take several exclamation/question marks into account ("?!?")
  * newlines might be considered implicit periods
  * trim redundant spaces
  * encode the type of sentence: expressive (declarative?), interrogative,
    exclamatory */

S.Word.splitSentences = s => {};
9958+
9959+
97949960/*█████████████████████████████████████████████████████████████████████████████
97959961███████████████████████████████████████████████████████████████████████████████
97969962█████████████████████████████████ TRANSDUCER ██████████████████████████████████
0 commit comments