Browse files

first commit

  • Loading branch information...
0 parents commit 8e829064dba0c361c8c4f77801f6fbeb66f1407a @reyesr committed Aug 26, 2012
11 .project
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Project descriptor for the "fullproof" project: no referenced projects, build spec entries or natures are declared. -->
+<projectDescription>
+ <name>fullproof</name>
+ <comment></comment>
+ <projects>
+ </projects>
+ <buildSpec>
+ </buildSpec>
+ <natures>
+ </natures>
+</projectDescription>
114 example-analyzers.html
@@ -0,0 +1,114 @@
+<!DOCTYPE html>
+<html>
+<head>
+<meta charset="UTF-8">
+<title>Javascript Search Engine : Analyzers testing</title>
+<!-- jQuery is fetched over HTTPS so the page also works when it is served from an https origin -->
+<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.8.0/jquery.min.js"></script>
+<!-- Library sources, kept in the authored order: unicode tables, then analyzers, then normalizers -->
+<script type="text/javascript" src="src/unicode/categ_letters_numbers.js"></script>
+<script type="text/javascript" src="src/unicode/normalizer_lowercase_nomark.js"></script>
+<script type="text/javascript" src="src/unicode/unicode.js"></script>
+<script type="text/javascript" src="src/analyzers.js"></script>
+<script type="text/javascript" src="src/normalizers.js"></script>
+<script type="text/javascript" src="src/normalizers/normalizers-english.js"></script>
+<script type="text/javascript" src="src/normalizers/normalizers-french.js"></script>
+<style type="text/css">
+
+/* Fixed-width column holding a label and its result box */
+div.floatbox {
+  float: left; width: 300px;
+}
+/* Grey bordered box that receives the analyzer output */
+div.resultbox {
+  border: 1px solid black; margin: 0.5em;
+  color: black; background-color: #EEEEEE;
+  overflow: hidden;
+}
+/* Input area. A stray double quote used to follow "margin:1em;" and made the
+   parser discard the "color" declaration on the next line. */
+textarea.typebox {
+  float: left;
+  width: 300px; height: 200px; margin: 1em;
+  color: black; background-color: white;
+}
+</style>
+</head>
+<body>
+<h1>Testing the Analyzers</h1>
+
+The javascript search engine is provided with a set of normalizers that can be chained to the parser.
+An analyzer is just a parser associated to a set (possibly empty) of normalizers.
+The goal of the normalization process is to reduce the set of words to index and to provide a way to
+automatically remove typos, stem, or reduce words to phonetic equivalents so that they are represented
+in the index by one single form.
+<p/>
+Each normalizer has its own strategy and "aggressiveness". For instance:
+<ul>
+ <li><tt>remove_duplicate_letters</tt> just collapses each sequence of the same letter into a single letter.</li>
+ <li><tt>to_lowercase_decomp</tt> provides a unicode-decomposed, lower-cased form of the words.</li>
+ <li><tt>to_lowercase_nomark</tt> lowercases the words and removes all the diacritical marks.</li>
+ <li><tt>english.porter_stemmer</tt> stems the words using English-based rules.</li>
+ <li><tt>french.simple_stemmer</tt> aggressively normalizes French words by stemming and phonetically simplifying them.</li>
+</ul>
+
+You can try some normalizers by typing text here:
+
+<div>
+ <textarea id="typehere" class="typebox"></textarea>
+ <div class="floatbox">Parsed<div class="resultbox" id="result_parsed" ></div></div>
+</div>
+<div style="clear:both" id="results">
+</div>
+
+<script type="text/javascript">
+
+$(document).ready(function() {
+
+ var under_test = [
+ {
+ label: "Standard parser + lowercase + remove dups",
+ parser: new fullproof.StandardAnalyzer([fullproof.normalizer.to_lowercase_nomark, fullproof.normalizer.remove_duplicate_letters])
+ },
+ {
+ label: "Standard parser + lowercase + english porter stemmer",
+ parser: new fullproof.StandardAnalyzer([fullproof.normalizer.to_lowercase_nomark,fullproof.normalizer.english.porter_stemmer])
+ },
+ {
+ label: "Standard parser + lowercase + french stemmer",
+ parser: new fullproof.StandardAnalyzer([fullproof.normalizer.to_lowercase_nomark,fullproof.normalizer.french.simple_stemmer])
+ }
+ ];
+
+ var nakedParser = {
+ parser: new fullproof.StandardAnalyzer(),
+ id: "result_parsed"
+ };
+
+ function process(obj, text) {
+ var result = [];
+ obj.parser.sendFalseWhenComplete = true;
+ obj.parser.parse(text, function(word) {
+ if (word) {
+ result.push(word);
+ } else {
+ $("#"+obj.id).html(result.join(", "));
+ }
+ });
+ }
+
+ for (var i=0; i<under_test.length; ++i) {
+ var obj = under_test[i];
+ obj.id = 'el-xxxxxxxxxxx'.replace(/[xy]/g, function(c) {return String.fromCharCode(65+parseInt(Math.random()*26)); });
+ $("#results").append(function() {
+ return "<div class='floatbox'>"+obj.label+"<div class='resultbox' id='"+obj.id+"'></div></div>";
+ });
+ }
+
+ $("#typehere").keyup(function() {
+ var value = $(this).val();
+ process(nakedParser, value);
+ for (var i=0; i<under_test.length; ++i) {
+ var o = under_test[i];
+ process(o, value);
+ }
+
+
+ });
+});</script>
+</body>
+</html>
181 example-storage.html
@@ -0,0 +1,181 @@
+<!DOCTYPE html>
+<html>
+<head>
+<meta charset="UTF-8">
+<title>Javascript Search Engine : storage testing</title>
+<!-- jQuery is fetched over HTTPS so the page also works when it is served from an https origin -->
+<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.8.0/jquery.min.js"></script>
+
+<!-- Library sources, kept in the authored order: unicode tables, stores, analyzers, normalizers -->
+<script type="text/javascript" src="src/unicode/categ_letters_numbers.js"></script>
+<script type="text/javascript" src="src/unicode/normalizer_lowercase_nomark.js"></script>
+<script type="text/javascript" src="src/unicode/unicode.js"></script>
+<script type="text/javascript" src="src/store/memory_store.js"></script>
+<script type="text/javascript" src="src/store/websql_store.js"></script>
+<script type="text/javascript" src="src/analyzers.js"></script>
+<script type="text/javascript" src="src/normalizers.js"></script>
+
+<style type="text/css">
+
+/* Fixed-width column holding a label and its result box */
+div.floatbox {
+  float: left; width: 300px;
+}
+/* Grey bordered box that receives the per-engine log */
+div.resultbox {
+  border: 1px solid black; margin: 0.5em;
+  color: black; background-color: #EEEEEE;
+  overflow: hidden;
+}
+/* Input area. A stray double quote used to follow "margin:1em;" and made the
+   parser discard the "color" declaration on the next line. */
+textarea.typebox {
+  float: left;
+  width: 300px; height: 200px; margin: 1em;
+  color: black; background-color: white;
+}
+</style>
+</head>
+<body>
+<h1>Testing storage engines</h1>
+
+Search engines need to store their data, mainly a set of indexes. On the client side, things get
+a little complicated, especially on devices with little memory or bad/old browsers. Here are the
+different storage engines supported by the library.
+<ul>
+<li><em>memory index</em>: stores all the data in memory (actually a javascript object). The advantages of
+this index are compatibility with all browsers, and obviously its speed. However, the index needs to be
+built every time the application loads, and it consumes as much memory as the database size itself.</li>
+
+<li><em>websql</em>: This is a disk-based index based on the deprecated websql api. Its main drawback is that
+it is only supported on a few browsers, and it's not developed anymore.</li>
+
+<li><em>indexeddb</em></li>
+
+</ul>
+
+You can try to store data by typing text here: every non-blank line will be indexed as a document,
+the key is the line number.
+
+<!-- Demo controls: text to index, STORE / SEARCH actions, and the result areas filled by the script below -->
+<div>
+ <textarea id="typehere" class="typebox">première ligne
+deuxième ligne
+troisième ligne
+ </textarea>
+ <div style="float:left; min-width: 10em; margin: 2em;">
+ <div><button id="action" type="button"> STORE &gt;&gt; </button></div>
+ <!-- The label is tied to the input so that assistive technologies announce it -->
+ <div style="margin-top:2em;"><label for="search">Search a word:</label><input id="search" type="text"></div>
+ <div><button id="lookup" type="button"> SEARCH &gt;&gt; </button></div>
+ </div>
+ <div class="floatbox">Analysis<div class="resultbox" id="analysis"></div></div>
+</div>
+<div style="clear:both" id="results">
+</div>
+
+<script type="text/javascript">
+
+
+console.log("TESTS");
+console.log(window);
+console.log(window.indexedDB);
+if (window.indexedDB) {
+ console.log("ok");
+}
+
+$(document).ready(function() {
+
+ var parser_for_all = new fullproof.StandardAnalyzer([fullproof.normalizer.to_lowercase_nomark, fullproof.normalizer.remove_duplicate_letters]);
+
+ var under_test = [
+ {
+ label: "memory parser",
+ engine: new fullproof.store.MemoryStore(),
+ parser: parser_for_all
+ },
+ {
+ label: "websql parser",
+ engine: (function() {
+ var e = new fullproof.store.WebSQLStore();
+ e.open("testsearch", "ind1", 1024*1024*20, function() {
+ // do something
+ console.log(arguments);
+ });
+ return e;
+ })(),
+ parser: parser_for_all
+ }
+/* {
+ label: "Standard parser + lowercase + english porter stemmer",
+ func: gluedom(net.kornr.searchengine.parser.parse, [net.kornr.searchengine.normalizer.to_lowercase_nomark,net.kornr.searchengine.normalizer.english.porter_stemmer])
+ },
+ {
+ label: "Standard parser + lowercase + french stemmer",
+ func: gluedom(net.kornr.searchengine.parser.parse, [net.kornr.searchengine.normalizer.to_lowercase_nomark,net.kornr.searchengine.normalizer.french.simple_stemmer])
+ }
+ */ ];
+
+ for (var i=0; i<under_test.length; ++i) {
+ var obj = under_test[i];
+ obj.id = 'el-xxxxxxxxxxx'.replace(/[xy]/g, function(c) {return String.fromCharCode(65+parseInt(Math.random()*26)); });
+ $("#results").append(function() {
+ return "<div class='floatbox'>"+obj.label+"<div class='resultbox' id='"+obj.id+"'></div></div>";
+ });
+ }
+
+/* var parser = function(sentence, value, word_callback) {
+ console.log("parssing " + sentence);
+ fullproof.parser.parse(sentence, function(word) {
+ if (word) {
+ word = SEARCH.normalizer.to_lowercase_nomark(word);
+ word_callback(word, value);
+ }
+ });
+ }
+ */
+ $("#action").click(function() {
+ var value = $("#typehere").val();
+
+ var lines = value.split("\n");
+ $("#analysis").html("Storing " + lines.length + " lines");
+ for (var i=0; i<under_test.length; ++i) {
+ var o = under_test[i];
+ o.engine.clear(function() {
+ var ilog = "Storing " + lines.length + " lines<p/><pre>";
+ for (var l=0; l<lines.length; ++l) {
+ if (lines[l].length) {
+
+ o.parser.parse(lines[l], function(word) {
+ ilog += ("storing word " + word +" => " + l + "\n");
+ o.engine.inject(word, l); // the line number is the value stored
+ });
+ }
+ }
+ ilog += "</pre>";
+ $("#"+o.id).html(ilog);
+ });
+ }
+ });
+
+ $("#lookup").click(function() {
+ var value = $("#search").val();
+ var split = value.split(" ");
+ if (split.length) {
+ value = split[0].trim();
+
+ for (var i=0; i<under_test.length; ++i) {
+ var o = under_test[i];
+ var ilog = "lookup of " + value + " <p/><pre>";
+ o.parser.parse(value, function(word) {
+ var set = o.engine.lookup(word, function(data) {
+ if (data) {
+ ilog += "found "+word+" on lines: " + data.join(",");
+ console.log(data);
+ } else {
+ ilog += "Not found: the database is empty or the word is not contained";
+ }
+
+ ilog += "</pre>";
+ $("#"+o.id).html(ilog);
+ });
+ });
+ }
+ }
+ });
+
+});</script>
+</body>
+</html>
90 src/analyzers.js
@@ -0,0 +1,90 @@
+var fullproof = (function(NAMESPACE) {
+
+ /**
+ * A simple private parser that relies on the unicode letter/number
+ * categories. Word boundaries are whatever is not a letter
+ * or a number.
+ */
+ var simple_parser = function(str, callback, functor) {
+ functor = functor||net.kornr.unicode.is_letter_number;
+ var current_word = "";
+ for (var i=0,max=str.length; i<max; ++i) {
+ if (functor(str.charCodeAt(i))) {
+ current_word += str[i];
+ } else {
+ if (current_word.length>0) {
+ callback(current_word);
+ current_word = "";
+ }
+ }
+ }
+ if (current_word.length>0) {
+ callback(current_word);
+ }
+ callback(false);
+ };
+
+ /**
+ * A Parser object with a parse() method. This
+ */
+ NAMESPACE.StandardAnalyzer = function() {
+
+ // Stores the normalizers... (don't store arguments, as it contains much more that the array)
+ var normalizers = [];
+ for (var i=0; i<arguments.length; ++i) {
+ if (arguments[i].constructor == Array) {
+ normalizers = normalizers.concat(arguments[i]);
+ } else {
+ normalizers.push(arguments[i]);
+ }
+ }
+
+ // Enforces new object
+ if (!(this instanceof NAMESPACE.StandardAnalyzer)) {
+ return new NAMESPACE.StandardAnalyzer(normalizers);
+ }
+
+ this.sendFalseWhenComplete = false;
+
+ /**
+ * The main method: cuts the text, and calls the normalizers on each word,
+ * than calls the callback with each non empty word.
+ */
+ this.parse = function(text, callback) {
+ var self = this;
+ simple_parser(text, function(word) {
+ if (typeof word == "string") {
+ word = word.trim();
+ if (word != "") {
+ for (var i=0; i<normalizers.length; ++i) {
+ word = normalizers[i](word);
+ }
+ }
+ if (callback) {
+ callback(word);
+ }
+ } else if (word===false && self.sendFalseWhenComplete && callback) {
+ callback(word);
+ }
+ });
+ }
+ };
+
+
+// NAMESPACE.parser.split_parse = function(str, callback, parserfunc, functor) {
+// parserfunc=parserfunc||NAMESPACE.parser.parse;
+// functor = functor||net.kornr.unicode.is_letter_number;
+//
+// var result = [];
+// parserfunc(str, function(w) {
+// if (w) {
+// result.push(w);
+// } else {
+// callback(result);
+// }
+// }, functor);
+// };
+
+ return NAMESPACE;
+
+})(fullproof||{});
50 src/normalizers.js
@@ -0,0 +1,50 @@
+var fullproof = (function(NAMESPACE) {
+
+ NAMESPACE.normalizer = NAMESPACE.normalizer||{};
+
+ //
+ // Normalizing functions take a word and return another word.
+ // If the word is cancelled by a function, it gets replaced
+ // by the boolean value false, otherwise it returns and/or
+ // sends forward the callback chain the new normalized form
+ // for the word (or the unchanged form, if the normalizer
+ // doesn't perform any transformation).
+ //
+
+ NAMESPACE.normalizer.to_lowercase_decomp = function(word, callback) {
+ word = word?net.kornr.unicode.lowercase(word):word;
+ return callback?callback(word):word;
+ };
+
+ NAMESPACE.normalizer.to_lowercase_nomark = function(word, callback) {
+ word = word?net.kornr.unicode.lowercase_nomark(word):word;
+ return callback?callback(word):word;
+ };
+
+ NAMESPACE.normalizer.remove_duplicate_letters = function(word, callback) {
+ var res = word?"":false;
+ var last = false;
+ if (word) {
+ for (var i=0,max=word.length; i<max; ++i) {
+ if (last) {
+ if (last != word[i]) {
+ res +=last;
+ }
+ }
+ last = word[i];
+ }
+ res += last?last:"";
+ }
+ return callback?callback(res):res;
+ };
+
+ NAMESPACE.normalizer.filter_in_array = function(word, array, callback) {
+ if (array[word]) {
+ return callback?callback(false):false;
+ }
+ return callback?callback(word):word;
+ };
+
+ return NAMESPACE;
+
+})(fullproof||{});
283 src/normalizers/normalizers-english.js
@@ -0,0 +1,283 @@
+var fullproof = fullproof || {};
+fullproof.normalizer = (function(NAMESPACE) {
+
+ NAMESPACE.english = NAMESPACE.english|| {};
+
+ /**
+ * Porter stemmer adapted from http://code.google.com/p/yeti-witch/source/browse/trunk/lib/porter-stemmer.js
+ * Original license header below, declared as Apache License V2 on the project site
+ */
+ /**
+ * 18 May 2008
+ * Stemming is the process for reducing inflected (or sometimes derived) words to their stem, base or root
+ * form. Porter stemming is designed for the English language.
+ *
+ * This code has been slighly adapted from Martin Porter's examples.
+ * - http://tartarus.org/~martin/PorterStemmer/
+ *
+ * Please assume any errors found in the below code are translation errors
+ * inserted by myself and not those of the original authors.
+ *
+ * @author Matt Chadburn <matt@commuterjoy.co.uk>
+ */
+ NAMESPACE.english.porter_stemmer = (function(){
+ "use strict";
+
+ var step2list = new Array();
+ step2list["ational"]="ate";
+ step2list["tional"]="tion";
+ step2list["enci"]="ence";
+ step2list["anci"]="ance";
+ step2list["izer"]="ize";
+ step2list["bli"]="ble";
+ step2list["alli"]="al";
+ step2list["entli"]="ent";
+ step2list["eli"]="e";
+ step2list["ousli"]="ous";
+ step2list["ization"]="ize";
+ step2list["ation"]="ate";
+ step2list["ator"]="ate";
+ step2list["alism"]="al";
+ step2list["iveness"]="ive";
+ step2list["fulness"]="ful";
+ step2list["ousness"]="ous";
+ step2list["aliti"]="al";
+ step2list["iviti"]="ive";
+ step2list["biliti"]="ble";
+ step2list["logi"]="log";
+
+ var step3list = new Array();
+ step3list["icate"]="ic";
+ step3list["ative"]="";
+ step3list["alize"]="al";
+ step3list["iciti"]="ic";
+ step3list["ical"]="ic";
+ step3list["ful"]="";
+ step3list["ness"]="";
+
+ var c = "[^aeiou]"; // consonant
+ var v = "[aeiouy]"; // vowel
+ var C = c + "[^aeiouy]*"; // consonant sequence
+ var V = v + "[aeiou]*"; // vowel sequence
+
+ var mgr0 = "^(" + C + ")?" + V + C; // [C]VC... is m>0
+ var meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$"; // [C]VC[V] is m=1
+ var mgr1 = "^(" + C + ")?" + V + C + V + C; // [C]VCVC... is m>1
+ var s_v = "^(" + C + ")?" + v; // vowel in stem
+
+ return function(word) {
+
+ var stem;
+ var suffix;
+ var firstch;
+ var origword = w;
+ var w = word;
+
+ if (word.length < 3) { return word; }
+
+ var re;
+ var re2;
+ var re3;
+ var re4;
+
+ firstch = word.substr(0,1);
+ if (firstch == "y") {
+ w = firstch.toUpperCase() + w.substr(1);
+ }
+
+ // Step 1a
+ re = /^(.+?)(ss|i)es$/;
+ re2 = /^(.+?)([^s])s$/;
+
+ if (re.test(w)) { w = w.replace(re,"$1$2"); }
+ else if (re2.test(w)) { w = w.replace(re2,"$1$2"); }
+
+ // Step 1b
+ re = /^(.+?)eed$/;
+ re2 = /^(.+?)(ed|ing)$/;
+ if (re.test(w)) {
+ var fp = re.exec(w);
+ re = new RegExp(mgr0);
+ if (re.test(fp[1])) {
+ re = /.$/;
+ w = w.replace(re,"");
+ }
+ } else if (re2.test(w)) {
+ var fp = re2.exec(w);
+ stem = fp[1];
+ re2 = new RegExp(s_v);
+ if (re2.test(stem)) {
+ w = stem;
+ re2 = /(at|bl|iz)$/;
+ re3 = new RegExp("([^aeiouylsz])\\1$");
+ re4 = new RegExp("^" + C + v + "[^aeiouwxy]$");
+ if (re2.test(w)) { w = w + "e"; }
+ else if (re3.test(w)) { re = /.$/; w = w.replace(re,""); }
+ else if (re4.test(w)) { w = w + "e"; }
+ }
+ }
+
+ // Step 1c
+ re = /^(.+?)y$/;
+ if (re.test(w)) {
+ var fp = re.exec(w);
+ stem = fp[1];
+ re = new RegExp(s_v);
+ if (re.test(stem)) { w = stem + "i"; }
+ }
+
+ // Step 2
+ re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
+ if (re.test(w)) {
+ var fp = re.exec(w);
+ stem = fp[1];
+ suffix = fp[2];
+ re = new RegExp(mgr0);
+ if (re.test(stem)) {
+ w = stem + step2list[suffix];
+ }
+ }
+
+ // Step 3
+ re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
+ if (re.test(w)) {
+ var fp = re.exec(w);
+ stem = fp[1];
+ suffix = fp[2];
+ re = new RegExp(mgr0);
+ if (re.test(stem)) {
+ w = stem + step3list[suffix];
+ }
+ }
+
+ // Step 4
+ re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
+ re2 = /^(.+?)(s|t)(ion)$/;
+ if (re.test(w)) {
+ var fp = re.exec(w);
+ stem = fp[1];
+ re = new RegExp(mgr1);
+ if (re.test(stem)) {
+ w = stem;
+ }
+ } else if (re2.test(w)) {
+ var fp = re2.exec(w);
+ stem = fp[1] + fp[2];
+ re2 = new RegExp(mgr1);
+ if (re2.test(stem)) {
+ w = stem;
+ }
+ }
+
+ // Step 5
+ re = /^(.+?)e$/;
+ if (re.test(w)) {
+ var fp = re.exec(w);
+ stem = fp[1];
+ re = new RegExp(mgr1);
+ re2 = new RegExp(meq1);
+ re3 = new RegExp("^" + C + v + "[^aeiouwxy]$");
+ if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) {
+ w = stem;
+ }
+ }
+
+ re = /ll$/;
+ re2 = new RegExp(mgr1);
+ if (re.test(w) && re2.test(w)) {
+ re = /.$/;
+ w = w.replace(re,"");
+ }
+
+ // and turn initial Y back to y
+
+ if (firstch == "y") {
+ w = firstch.toLowerCase() + w.substr(1);
+ }
+
+ return w;
+ }
+ })();
+
+
+ var stopwords = {
+ "a" : 1, "a's" : 1, "able" : 1, "about" : 1, "above" : 1, "according" : 1, "accordingly" : 1, "across" : 1,
+ "actually" : 1, "after" : 1, "afterwards" : 1, "again" : 1, "against" : 1, "ain't" : 1, "all" : 1, "allow" : 1,
+ "allows" : 1, "almost" : 1, "alone" : 1, "along" : 1, "already" : 1, "also" : 1, "although" : 1, "always" : 1,
+ "am" : 1, "among" : 1, "amongst" : 1, "an" : 1, "and" : 1, "another" : 1, "any" : 1, "anybody" : 1,
+ "anyhow" : 1, "anyone" : 1, "anything" : 1, "anyway" : 1, "anyways" : 1, "anywhere" : 1, "apart" : 1,
+ "appear" : 1, "appreciate" : 1, "appropriate" : 1, "are" : 1, "aren't" : 1, "around" : 1, "as" : 1,
+ "aside" : 1, "ask" : 1, "asking" : 1, "associated" : 1, "at" : 1, "available" : 1, "away" : 1, "awfully" : 1,
+ "b" : 1, "be" : 1, "became" : 1, "because" : 1, "become" : 1, "becomes" : 1, "becoming" : 1, "been" : 1,
+ "before" : 1, "beforehand" : 1, "behind" : 1, "being" : 1, "believe" : 1, "below" : 1, "beside" : 1,
+ "besides" : 1, "best" : 1, "better" : 1, "between" : 1, "beyond" : 1, "both" : 1, "brief" : 1, "but" : 1,
+ "by" : 1, "c" : 1, "c'mon" : 1, "c's" : 1, "came" : 1, "can" : 1, "can't" : 1, "cannot" : 1, "cant" : 1,
+ "cause" : 1, "causes" : 1, "certain" : 1, "certainly" : 1, "changes" : 1, "clearly" : 1, "co" : 1, "com" : 1,
+ "come" : 1, "comes" : 1, "concerning" : 1, "consequently" : 1, "consider" : 1, "considering" : 1,
+ "contain" : 1, "containing" : 1, "contains" : 1, "corresponding" : 1, "could" : 1, "couldn't" : 1,
+ "course" : 1, "currently" : 1, "d" : 1, "definitely" : 1, "described" : 1, "despite" : 1, "did" : 1,
+ "didn't" : 1, "different" : 1, "do" : 1, "does" : 1, "doesn't" : 1, "doing" : 1, "don't" : 1, "done" : 1,
+ "down" : 1, "downwards" : 1, "during" : 1, "e" : 1, "each" : 1, "edu" : 1, "eg" : 1, "eight" : 1, "either" : 1,
+ "else" : 1, "elsewhere" : 1, "enough" : 1, "entirely" : 1, "especially" : 1, "et" : 1, "etc" : 1, "even" : 1,
+ "ever" : 1, "every" : 1, "everybody" : 1, "everyone" : 1, "everything" : 1, "everywhere" : 1, "ex" : 1,
+ "exactly" : 1, "example" : 1, "except" : 1, "f" : 1, "far" : 1, "few" : 1, "fifth" : 1, "first" : 1,
+ "five" : 1, "followed" : 1, "following" : 1, "follows" : 1, "for" : 1, "former" : 1, "formerly" : 1,
+ "forth" : 1, "four" : 1, "from" : 1, "further" : 1, "furthermore" : 1, "g" : 1, "get" : 1, "gets" : 1,
+ "getting" : 1, "given" : 1, "gives" : 1, "go" : 1, "goes" : 1, "going" : 1, "gone" : 1, "got" : 1,
+ "gotten" : 1, "greetings" : 1, "h" : 1, "had" : 1, "hadn't" : 1, "happens" : 1, "hardly" : 1, "has" : 1,
+ "hasn't" : 1, "have" : 1, "haven't" : 1, "having" : 1, "he" : 1, "he's" : 1, "hello" : 1, "help" : 1,
+ "hence" : 1, "her" : 1, "here" : 1, "here's" : 1, "hereafter" : 1, "hereby" : 1, "herein" : 1, "hereupon" : 1,
+ "hers" : 1, "herself" : 1, "hi" : 1, "him" : 1, "himself" : 1, "his" : 1, "hither" : 1, "hopefully" : 1,
+ "how" : 1, "howbeit" : 1, "however" : 1, "i" : 1, "i'd" : 1, "i'll" : 1, "i'm" : 1, "i've" : 1, "ie" : 1,
+ "if" : 1, "ignored" : 1, "immediate" : 1, "in" : 1, "inasmuch" : 1, "inc" : 1, "indeed" : 1, "indicate" : 1,
+ "indicated" : 1, "indicates" : 1, "inner" : 1, "insofar" : 1, "instead" : 1, "into" : 1, "inward" : 1,
+ "is" : 1, "isn't" : 1, "it" : 1, "it'd" : 1, "it'll" : 1, "it's" : 1, "its" : 1, "itself" : 1, "j" : 1,
+ "just" : 1, "k" : 1, "keep" : 1, "keeps" : 1, "kept" : 1, "know" : 1, "knows" : 1, "known" : 1, "l" : 1,
+ "last" : 1, "lately" : 1, "later" : 1, "latter" : 1, "latterly" : 1, "least" : 1, "less" : 1, "lest" : 1,
+ "let" : 1, "let's" : 1, "like" : 1, "liked" : 1, "likely" : 1, "little" : 1, "look" : 1, "looking" : 1,
+ "looks" : 1, "ltd" : 1, "m" : 1, "mainly" : 1, "many" : 1, "may" : 1, "maybe" : 1, "me" : 1, "mean" : 1,
+ "meanwhile" : 1, "merely" : 1, "might" : 1, "more" : 1, "moreover" : 1, "most" : 1, "mostly" : 1, "much" : 1,
+ "must" : 1, "my" : 1, "myself" : 1, "n" : 1, "name" : 1, "namely" : 1, "nd" : 1, "near" : 1, "nearly" : 1,
+ "necessary" : 1, "need" : 1, "needs" : 1, "neither" : 1, "never" : 1, "nevertheless" : 1, "new" : 1,
+ "next" : 1, "nine" : 1, "no" : 1, "nobody" : 1, "non" : 1, "none" : 1, "noone" : 1, "nor" : 1, "normally" : 1,
+ "not" : 1, "nothing" : 1, "novel" : 1, "now" : 1, "nowhere" : 1, "o" : 1, "obviously" : 1, "of" : 1, "off" : 1,
+ "often" : 1, "oh" : 1, "ok" : 1, "okay" : 1, "old" : 1, "on" : 1, "once" : 1, "one" : 1, "ones" : 1,
+ "only" : 1, "onto" : 1, "or" : 1, "other" : 1, "others" : 1, "otherwise" : 1, "ought" : 1, "our" : 1,
+ "ours" : 1, "ourselves" : 1, "out" : 1, "outside" : 1, "over" : 1, "overall" : 1, "own" : 1, "p" : 1,
+ "particular" : 1, "particularly" : 1, "per" : 1, "perhaps" : 1, "placed" : 1, "please" : 1, "plus" : 1,
+ "possible" : 1, "presumably" : 1, "probably" : 1, "provides" : 1, "q" : 1, "que" : 1, "quite" : 1, "qv" : 1,
+ "r" : 1, "rather" : 1, "rd" : 1, "re" : 1, "really" : 1, "reasonably" : 1, "regarding" : 1, "regardless" : 1,
+ "regards" : 1, "relatively" : 1, "respectively" : 1, "right" : 1, "s" : 1, "said" : 1, "same" : 1, "saw" : 1,
+ "say" : 1, "saying" : 1, "says" : 1, "second" : 1, "secondly" : 1, "see" : 1, "seeing" : 1, "seem" : 1,
+ "seemed" : 1, "seeming" : 1, "seems" : 1, "seen" : 1, "self" : 1, "selves" : 1, "sensible" : 1, "sent" : 1,
+ "serious" : 1, "seriously" : 1, "seven" : 1, "several" : 1, "shall" : 1, "she" : 1, "should" : 1,
+ "shouldn't" : 1, "since" : 1, "six" : 1, "so" : 1, "some" : 1, "somebody" : 1, "somehow" : 1, "someone" : 1,
+ "something" : 1, "sometime" : 1, "sometimes" : 1, "somewhat" : 1, "somewhere" : 1, "soon" : 1, "sorry" : 1,
+ "specified" : 1, "specify" : 1, "specifying" : 1, "still" : 1, "sub" : 1, "such" : 1, "sup" : 1, "sure" : 1,
+ "t" : 1, "t's" : 1, "take" : 1, "taken" : 1, "tell" : 1, "tends" : 1, "th" : 1, "than" : 1, "thank" : 1,
+ "thanks" : 1, "thanx" : 1, "that" : 1, "that's" : 1, "thats" : 1, "the" : 1, "their" : 1, "theirs" : 1,
+ "them" : 1, "themselves" : 1, "then" : 1, "thence" : 1, "there" : 1, "there's" : 1, "thereafter" : 1,
+ "thereby" : 1, "therefore" : 1, "therein" : 1, "theres" : 1, "thereupon" : 1, "these" : 1, "they" : 1,
+ "they'd" : 1, "they'll" : 1, "they're" : 1, "they've" : 1, "think" : 1, "third" : 1, "this" : 1,
+ "thorough" : 1, "thoroughly" : 1, "those" : 1, "though" : 1, "three" : 1, "through" : 1, "throughout" : 1,
+ "thru" : 1, "thus" : 1, "to" : 1, "together" : 1, "too" : 1, "took" : 1, "toward" : 1, "towards" : 1,
+ "tried" : 1, "tries" : 1, "truly" : 1, "try" : 1, "trying" : 1, "twice" : 1, "two" : 1, "u" : 1, "un" : 1,
+ "under" : 1, "unfortunately" : 1, "unless" : 1, "unlikely" : 1, "until" : 1, "unto" : 1, "up" : 1, "upon" : 1,
+ "us" : 1, "use" : 1, "used" : 1, "useful" : 1, "uses" : 1, "using" : 1, "usually" : 1, "uucp" : 1, "v" : 1,
+ "value" : 1, "various" : 1, "very" : 1, "via" : 1, "viz" : 1, "vs" : 1, "w" : 1, "want" : 1, "wants" : 1,
+ "was" : 1, "wasn't" : 1, "way" : 1, "we" : 1, "we'd" : 1, "we'll" : 1, "we're" : 1, "we've" : 1, "welcome" : 1,
+ "well" : 1, "went" : 1, "were" : 1, "weren't" : 1, "what" : 1, "what's" : 1, "whatever" : 1, "when" : 1,
+ "whence" : 1, "whenever" : 1, "where" : 1, "where's" : 1, "whereafter" : 1, "whereas" : 1, "whereby" : 1,
+ "wherein" : 1, "whereupon" : 1, "wherever" : 1, "whether" : 1, "which" : 1, "while" : 1, "whither" : 1,
+ "who" : 1, "who's" : 1, "whoever" : 1, "whole" : 1, "whom" : 1, "whose" : 1, "why" : 1, "will" : 1,
+ "willing" : 1, "wish" : 1, "with" : 1, "within" : 1, "without" : 1, "won't" : 1, "wonder" : 1, "would" : 1,
+ "would" : 1, "wouldn't" : 1, "x" : 1, "y" : 1, "yes" : 1, "yet" : 1, "you" : 1, "you'd" : 1, "you'll" : 1,
+ "you're" : 1, "you've" : 1, "your" : 1, "yours" : 1, "yourself" : 1, "yourselves" : 1, "z" : 1, "zero" : 1 };
+
+ NAMESPACE.english.stopword_remover = function(word, callback) {
+ return NAMESPACE.filter_in_array(word, stopwords, callback);
+ };
+
+ return NAMESPACE;
+
+})(fullproof.normalizer||{});
280 src/normalizers/normalizers-french.js
@@ -0,0 +1,280 @@
+var fullproof = fullproof || {};
+fullproof.normalizer = (function(NAMESPACE) {
+
+ NAMESPACE.french = NAMESPACE.french||{};
+
+ NAMESPACE.french.simple_stemmer = (function(){
+
+ var suffix_removals_verbs_raw = [
+ // Below, common verbs suffix first
+ [/.../, /er(ai([st]?|ent)|i?(on[ts]|ez))$/, "e"]
+ [/.../, /ass(i?(ez?|ons)|e(nt|s)?)$/, "e"], // asse, asses, assez, assiez, assies*, if root length >= 3
+ [/.../, /assions$/, "e"], // assions if root lengh>=3
+ [/.../, /assent$/, "e"], // assent if root lengh>=3
+
+ [/endr(ez?|ai[st]?|on[st])$/, "ã"], // endrez, endrai, endrais, endrait, endrons, endront
+
+ [/.../, /iss(i?(ons|ez)|ai[st]?|ant(es?)?|es?)$/, "" ], // issions, issiez, issais, issait, issai, issant, issante, issantes, isses
+
+ [/irai(s|(en)?t)?$/, ""], // irai, irait, irais, iraient
+
+ [/.../, /e?oi(re?|t|s|ent)$/, ""], // eoir, eoire, oir, oire, oit, ois, oient
+
+ [/.../, /aient$/, ""], // removes aient
+ [/.../, /a[mt]es$/, ""], // removes ames, ates
+ [/i?ons$/, ""], // removes ons, ions
+ [/ait$/, ""], // removes ait
+ [/ent$/, ""], // removes ent
+ [/i?e[rz]$/, "e"], // removes er, ez, iez
+
+ ];
+
+ var suffix_removals_nouns_raw = [
+ [/inages?$/, "1"], // "copinage" > "cop1"
+ [/.../, /ages?$/, ""], // "habillage" > "habill"
+ [/.../, /[aoie]tions?$/, ""], // "déclaration" > "déclar", not "nation"
+ [/og(ies?|ues?)$/, "og"], // "philologie" -> "philolog", "philologue" -> "philolog"
+ [/t(rices?|euses?)$/, "ter"], // "fédératrice" -> "fédérater","flatteuse" -> "flatter" (eur is -> er by another rule)
+ [/.../, /e(uses?|ries?|urs?)$/, "er"], // euse, euses, eries, eries, eur (flatteuse, flatterie, flatteur)
+ [/utions$/, "u"], // "pollution", "attribution" ! produces a "u", because "uer"$ is not removed (but "er"$ is).
+ [/[ae]n[cs]es?$/, "ãS"], // prudence" -> "prudã", "tolérance" -> "tolérã"
+ [/..al/, /ites?$/, ""], // // "anormalite" -> "anormal"
+ [/[ea]mment]/, "ã"], // prudemment -> "prudã"
+ //
+ //not processed:
+ //* usion$ : not an interesting simplification, as there are not
+ // enough nominal cases. i.e. "diffusion", but "illusion",
+ // "exclusion", "contusion", etc.
+ [/ives?$/, "if"], // // "consécutives" -> con
+ [/istes?$/, "isme"], // maybe a bit aggressive ?
+ [/ables?$/, ""], // "chiffrable" -> "chiffr". aggressive ?
+ [/[^ae]/, /ines?$/, "1"],// "citadine"->"citadin"
+ ];
+
+ // Phonetic simplification rules, compiled by post_process_arrays. Rule shapes:
+ //   [pattern, replacement]
+ //   [before, pattern, replacement]
+ //   [before, pattern, after, replacement]
+ // "before" / "after" are contexts that must surround the pattern and are kept in
+ // the word; false in their place means "no constraint". Only the pattern is
+ // replaced. The stemmer applies these rules in order WITHOUT stopping at the
+ // first match, so later rules see the output of earlier ones.
+ var phonetic_transforms_raw = [
+
+ [/n/, /t/, /iel/, "S"],
+ [false, /t/, /i[oea]/, "S"],
+
+ // the A LETTER
+ [false, /ain/, /[^aeiouymn].*|$/, "1" ], // copain->cop1, complainte->compl1te
+ [/ai(s$)?/, "e"],
+ [false, /am/, /[^aeiouymn].*|$/, "ã" ], // crampe->crãpe
+ [/aux?$/, "al"], // tribunaux->tribunal
+ [/e?au(x$)?/, "o"], // beaux->bo, bateau->bato, journaux->journo
+ [/an(te?s?|s)$/, "ã"], //
+ [false, /an/, /[^aeiouymn].*|$/, "ã" ],
+ [/r[dt]s?$/, "r"],
+ // Process the e letter
+ // The e letter is probably the most complicated of all
+ [false, /ein/, /[^aeiouymn].*|$/, "1"], // frein, teint
+ [/e[ui]/, "e"],// peine, pleurer, bleu
+ [/en[td]$/, "ã"], // client, prend, fend
+ [/i/, /en/, /[^aeiouymn].*|$/, "1"], // norvégien, rien
+ [false, /en/, /[^aeiouymn].*|$/, "ã"], // tente->tãte
+ [/ets?$/, "e"], // violet, triplets
+ [false, /e/, /o/, ""], // like surseoir
+
+ // Process the i letter
+
+ [/ier(s|es?)?$/, ""], // ier, iere, iere, ieres
+ [false, /i[nm]/, /[^aeiouymn].*|$/, "1"], // malintentionné->mal1tentionné
+ [/ill/, "y"], // paille->paye, rouille->rouye
+
+ // Process the o letter
+ [false, /on/, /[^aeiouyhnm].*|$/, "ô"],
+ [false, /ouin/, /[^aeiouymn].*|$/, "o1"],
+ [/oe(u(d$)?)?/, "e"],
+
+ // Process the u letter
+ [false, /un/, /[^aeiouymn].*|$/, "1"],
+ [/u[st]$/, "u"], // "résidus", "crut" TODO better remove /[st]$/ ?
+
+ // Process the y letter
+ [/yer$/, "i"], // "ennuyer"->ennui, "appuyer"->appui
+ [/[^aeiouy]/, /ym/, /[^aeiouy].*|$/, "1"], // "symbole", "nymphe", "sympa"
+ [/[^aeiouy]/, /yn/, /[^aeiouynm].*|$/, "1"], // "syndicat", "synchro"
+ [/[^aeiouy]/, /y/, "i"], // "dynamite"
+
+ [/[aeiouy]/, /s/, /[aeiouy]/, "z"],
+ [/sc?h/, "ch"],
+
+ [/gu/, "g"],
+ [false, /g/, /[^aorl].*/, "j"],
+
+ [/ph/, "f"],
+ [/[^t]/, /t/, /ion/, "ss"],
+
+ [/qu?/, "k"],
+ [false, /c/, /[auorlt]/, "k"],
+
+ [/[aeiou]/, /s/, /[aeiou]/, "z"],
+ [/[^c]/, /h/, ""],
+ [/^h/, ""],
+
+ [/[oiua]/, /t$/, false, ""],
+
+ [/es?$/, ""], // final e
+
+ //plural
+ [/[xs]$/, ""],
+
+ ];
+
+ function post_process_arrays(arr) {
+ var result = [];
+ for (var i=0; i<arr.length; ++i) {
+ var obj = arr[i];
+ if (obj) {
+ switch(obj.length) {
+ case 2:
+ result.push([new RegExp("(.*)("+obj[0].source+")(.*)"),obj[1]]);
+// result.push([new RegExp("("+obj[0].source+")"),obj[1]]);
+ break;
+ case 3:
+ result.push([new RegExp( (obj[0]?"(.*"+obj[0].source+")":"(.*)") + "("+obj[1].source+")" + "(.*)"),obj[2]]);
+// result.push([new RegExp( (obj[0]?obj[0].source:"") + "("+obj[1].source+")"),obj[2]]);
+ break;
+ case 4:
+ result.push([new RegExp( (obj[0]?"(.*"+obj[0].source+")":"(.*)") + "("+obj[1].source+")" + (obj[2]?"("+obj[2].source+".*)":"(.*)")),obj[3]]);
+// result.push([new RegExp( (obj[0]?obj[0].source:"") + "("+obj[1].source+")" + (obj[2]?obj[2].source:"")),obj[3]]);
+ break;
+ }
+ }
+ }
+ return result;
+ }
+
+ var suffix_removals_verbs = post_process_arrays(suffix_removals_verbs_raw); // compiled [RegExp, replacement] pairs; applied with stop-on-first-match by the returned normalizer
+ var suffix_removals_nouns = post_process_arrays(suffix_removals_nouns_raw); // compiled [RegExp, replacement] pairs; applied with stop-on-first-match by the returned normalizer
+ var phonetic_transforms = post_process_arrays(phonetic_transforms_raw); // compiled [RegExp, replacement] pairs; every matching rule is applied, in order
+
+ function apply_regexp_array(word, regarray, stopOnFirstMatch) {
+ var org = word;
+ console.log("==== applying rules on " + word + " ========");
+
+ for (var i=0; i<regarray.length; ++i) {
+
+ var res = regarray[i][0].exec(word);
+ if (res) {
+ console.log("matched rule " + regarray[i][0].source + " -> " + regarray[i][1] + ", length: " + res.length);
+ console.log("re: " + regarray[i][0].lastIndex + " / " + res.index);
+ console.log(res);
+
+ var p1 = res[1];
+ var p2 = regarray[i][1];
+ var p3 = res[res.length-1];
+ word = p1 + p2 + p3;
+
+ console.log("word is now " + word + " (" + p1 +" + " + p2 + " + " + p3 + "), before: " + org);
+
+ if (stopOnFirstMatch) {
+ i = regarray.length;
+ }
+ }
+ }
+ return word;
+ }
+
+
+ return function(word, verbs, nouns, phonetic) {
+ verbs = verbs===undefined?true:verbs;
+ nouns = nouns===undefined?true:nouns;
+ phonetic = phonetic===undefined?true:phonetic;
+
+ if (verbs) {
+ word = apply_regexp_array(word, suffix_removals_verbs, true);
+ }
+ if (nouns) {
+ word = apply_regexp_array(word, suffix_removals_nouns, true);
+ }
+ if (phonetic) {
+ word = apply_regexp_array(word, phonetic_transforms, false);
+ }
+ return NAMESPACE.remove_duplicate_letters(word.toLowerCase());
+ };
+ })();
+
+
+
+ /**
+ * Stopword list, based on http://members.unine.ch/jacques.savoy/clef/
+ * Works for lowercased words, with or without diacritical marks
+ */
+ var stopwords = {
+ "a" : 1, "à" : 1, "â" : 1, "abord" : 1, "afin" : 1, "ah" : 1, "ai" : 1, "aie" : 1, "ainsi" : 1, "allaient" : 1,
+ "allo" : 1, "allô" : 1, "allons" : 1, "après" : 1, "apres" : 1, "assez" : 1, "attendu" : 1, "au" : 1,
+ "aucun" : 1, "aucune" : 1, "aujourd" : 1, "aujourd'hui" : 1, "auquel" : 1, "aura" : 1, "auront" : 1,
+ "aussi" : 1, "autre" : 1, "autres" : 1, "aux" : 1, "auxquelles" : 1, "auxquels" : 1, "avaient" : 1,
+ "avais" : 1, "avait" : 1, "avant" : 1, "avec" : 1, "avoir" : 1, "ayant" : 1, "b" : 1, "bah" : 1,
+ "beaucoup" : 1, "bien" : 1, "bigre" : 1, "boum" : 1, "bravo" : 1, "brrr" : 1, "c" : 1, "ça" : 1, "ca" : 1,
+ "car" : 1, "ce" : 1, "ceci" : 1, "cela" : 1, "celle" : 1, "celle-ci" : 1, "celle-là" : 1, "celle-la" : 1,
+ "celles" : 1, "celles-ci" : 1, "celles-là" : 1, "celles-la" : 1, "celui" : 1, "celui-ci" : 1, "celui-là" : 1,
+ "celui-la" : 1, "cent" : 1, "cependant" : 1, "certain" : 1, "certaine" : 1, "certaines" : 1, "certains" : 1,
+ "certes" : 1, "ces" : 1, "cet" : 1, "cette" : 1, "ceux" : 1, "ceux-ci" : 1, "ceux-là" : 1, "ceux-la" : 1,
+ "chacun" : 1, "chaque" : 1, "cher" : 1, "chère" : 1, "chères" : 1, "chere" : 1, "cheres" : 1, "chers" : 1,
+ "chez" : 1, "chiche" : 1, "chut" : 1, "ci" : 1, "cinq" : 1, "cinquantaine" : 1, "cinquante" : 1,
+ "cinquantième" : 1, "cinquieme" : 1, "cinquantieme" : 1, "cinquième" : 1, "clac" : 1, "clic" : 1,
+ "combien" : 1, "comme" : 1, "comment" : 1, "compris" : 1, "concernant" : 1, "contre" : 1, "couic" : 1,
+ "crac" : 1, "d" : 1, "da" : 1, "dans" : 1, "de" : 1, "debout" : 1, "dedans" : 1, "dehors" : 1, "delà" : 1,
+ "dela" : 1, "depuis" : 1, "derrière" : 1, "derriere" : 1, "des" : 1, "dès" : 1, "désormais" : 1,
+ "desormais" : 1, "desquelles" : 1, "desquels" : 1, "dessous" : 1, "dessus" : 1, "deux" : 1, "deuxième" : 1,
+ "deuxièmement" : 1, "deuxieme" : 1, "deuxiemement" : 1, "devant" : 1, "devers" : 1, "devra" : 1,
+ "différent" : 1, "différente" : 1, "différentes" : 1, "différents" : 1, "different" : 1, "differente" : 1,
+ "differentes" : 1, "differents" : 1, "dire" : 1, "divers" : 1, "diverse" : 1, "diverses" : 1, "dix" : 1,
+ "dix-huit" : 1, "dixième" : 1, "dixieme" : 1, "dix-neuf" : 1, "dix-sept" : 1, "doit" : 1, "doivent" : 1,
+ "donc" : 1, "dont" : 1, "douze" : 1, "douzième" : 1, "douzieme" : 1, "dring" : 1, "du" : 1, "duquel" : 1,
+ "durant" : 1, "e" : 1, "effet" : 1, "eh" : 1, "elle" : 1, "elle-même" : 1, "elle-meme" : 1, "elles" : 1,
+ "elles-mêmes" : 1, "elles-memes" : 1, "en" : 1, "encore" : 1, "entre" : 1, "envers" : 1, "environ" : 1,
+ "es" : 1, "ès" : 1, "est" : 1, "et" : 1, "etant" : 1, "étaient" : 1, "étais" : 1, "était" : 1, "étant" : 1,
+ "etaient" : 1, "etais" : 1, "etait" : 1, "etant" : 1, "etc" : 1, "été" : 1, "ete" : 1, "etre" : 1, "être" : 1,
+ "eu" : 1, "euh" : 1, "eux" : 1, "eux-mêmes" : 1, "eux-memes" : 1, "excepté" : 1, "excepte" : 1, "f" : 1,
+ "façon" : 1, "facon" : 1, "fais" : 1, "faisaient" : 1, "faisant" : 1, "fait" : 1, "feront" : 1, "fi" : 1,
+ "flac" : 1, "floc" : 1, "font" : 1, "g" : 1, "gens" : 1, "h" : 1, "ha" : 1, "" : 1, "he" : 1, "hein" : 1,
+ "hélas" : 1, "helas" : 1, "hem" : 1, "hep" : 1, "hi" : 1, "ho" : 1, "holà" : 1, "hola" : 1, "hop" : 1,
+ "hormis" : 1, "hors" : 1, "hou" : 1, "houp" : 1, "hue" : 1, "hui" : 1, "huit" : 1, "huitième" : 1,
+ "huitieme" : 1, "hum" : 1, "hurrah" : 1, "i" : 1, "il" : 1, "ils" : 1, "importe" : 1, "j" : 1, "je" : 1,
+ "jusqu" : 1, "jusque" : 1, "k" : 1, "l" : 1, "la" : 1, "" : 1, "la" : 1, "laquelle" : 1, "las" : 1, "le" : 1,
+ "lequel" : 1, "les" : 1, "lès" : 1, "lesquelles" : 1, "lesquels" : 1, "leur" : 1, "leurs" : 1, "longtemps" : 1,
+ "lorsque" : 1, "lui" : 1, "lui-même" : 1, "lui-meme" : 1, "m" : 1, "ma" : 1, "maint" : 1, "mais" : 1,
+ "malgré" : 1, "malgre" : 1, "me" : 1, "même" : 1, "mêmes" : 1, "meme" : 1, "memes" : 1, "merci" : 1, "mes" : 1,
+ "mien" : 1, "mienne" : 1, "miennes" : 1, "miens" : 1, "mille" : 1, "mince" : 1, "moi" : 1, "moi-même" : 1,
+ "moi-meme" : 1, "moins" : 1, "mon" : 1, "moyennant" : 1, "n" : 1, "na" : 1, "ne" : 1, "néanmoins" : 1,
+ "neanmoins" : 1, "neuf" : 1, "neuvième" : 1, "neuvieme" : 1, "ni" : 1, "nombreuses" : 1, "nombreux" : 1,
+ "non" : 1, "nos" : 1, "notre" : 1, "nôtre" : 1, "nôtres" : 1, "notre" : 1, "notres" : 1, "nous" : 1,
+ "nous-mêmes" : 1, "nous-memes" : 1, "nul" : 1, "o" : 1, "o|" : 1, "ô" : 1, "oh" : 1, "ohé" : 1, "olé" : 1,
+ "ollé" : 1, "ohe" : 1, "ole" : 1, "olle" : 1, "on" : 1, "ont" : 1, "onze" : 1, "onzième" : 1, "onzieme" : 1,
+ "ore" : 1, "ou" : 1, "" : 1, "ouf" : 1, "ouias" : 1, "oust" : 1, "ouste" : 1, "outre" : 1, "p" : 1,
+ "paf" : 1, "pan" : 1, "par" : 1, "parmi" : 1, "partant" : 1, "particulier" : 1, "particulière" : 1,
+ "particulièrement" : 1, "particuliere" : 1, "particulierement" : 1, "pas" : 1, "passé" : 1, "passe" : 1,
+ "pendant" : 1, "personne" : 1, "peu" : 1, "peut" : 1, "peuvent" : 1, "peux" : 1, "pff" : 1, "pfft" : 1,
+ "pfut" : 1, "pif" : 1, "plein" : 1, "plouf" : 1, "plus" : 1, "plusieurs" : 1, "plutôt" : 1, "plutot" : 1,
+ "pouah" : 1, "pour" : 1, "pourquoi" : 1, "premier" : 1, "première" : 1, "premièrement" : 1, "près" : 1,
+ "premiere" : 1, "premierement" : 1, "pres" : 1, "proche" : 1, "psitt" : 1, "puisque" : 1, "q" : 1, "qu" : 1,
+ "quand" : 1, "quant" : 1, "quanta" : 1, "quant-à-soi" : 1, "quant-a-soi" : 1, "quarante" : 1, "quatorze" : 1,
+ "quatre" : 1, "quatre-vingt" : 1, "quatrième" : 1, "quatrièmement" : 1, "quatrieme" : 1, "quatriemement" : 1,
+ "que" : 1, "quel" : 1, "quelconque" : 1, "quelle" : 1, "quelles" : 1, "quelque" : 1, "quelques" : 1,
+ "quelqu'un" : 1, "quels" : 1, "qui" : 1, "quiconque" : 1, "quinze" : 1, "quoi" : 1, "quoique" : 1, "r" : 1,
+ "revoici" : 1, "revoilà" : 1, "revoila" : 1, "rien" : 1, "s" : 1, "sa" : 1, "sacrebleu" : 1, "sans" : 1,
+ "sapristi" : 1, "sauf" : 1, "se" : 1, "seize" : 1, "selon" : 1, "sept" : 1, "septième" : 1, "septieme" : 1,
+ "sera" : 1, "seront" : 1, "ses" : 1, "si" : 1, "sien" : 1, "sienne" : 1, "siennes" : 1, "siens" : 1,
+ "sinon" : 1, "six" : 1, "sixième" : 1, "sixieme" : 1, "soi" : 1, "soi-même" : 1, "soi-meme" : 1, "soit" : 1,
+ "soixante" : 1, "son" : 1, "sont" : 1, "sous" : 1, "stop" : 1, "suis" : 1, "suivant" : 1, "sur" : 1,
+ "surtout" : 1, "t" : 1, "ta" : 1, "tac" : 1, "tant" : 1, "te" : 1, "" : 1, "te" : 1, "tel" : 1, "telle" : 1,
+ "tellement" : 1, "telles" : 1, "tels" : 1, "tenant" : 1, "tes" : 1, "tic" : 1, "tien" : 1, "tienne" : 1,
+ "tiennes" : 1, "tiens" : 1, "toc" : 1, "toi" : 1, "toi-même" : 1, "toi-meme" : 1, "ton" : 1, "touchant" : 1,
+ "toujours" : 1, "tous" : 1, "tout" : 1, "toute" : 1, "toutes" : 1, "treize" : 1, "trente" : 1, "très" : 1,
+ "tres" : 1, "trois" : 1, "troisième" : 1, "troisièmement" : 1, "troisieme" : 1, "troisiemement" : 1,
+ "trop" : 1, "tsoin" : 1, "tsouin" : 1, "tu" : 1, "u" : 1, "un" : 1, "une" : 1, "unes" : 1, "uns" : 1, "v" : 1,
+ "va" : 1, "vais" : 1, "vas" : 1, "" : 1, "ve" : 1, "vers" : 1, "via" : 1, "vif" : 1, "vifs" : 1, "vingt" : 1,
+ "vivat" : 1, "vive" : 1, "vives" : 1, "vlan" : 1, "voici" : 1, "voilà" : 1, "voila" : 1, "vont" : 1, "vos" : 1,
+ "votre" : 1, "vôtre" : 1, "vôtres" : 1, "votre" : 1, "votres" : 1, "vous" : 1, "vous-mêmes" : 1,
+ "vous-memes" : 1, "vu" : 1, "w" : 1, "x" : 1, "y" : 1, "z" : 1, "zut" : 1 };
+
+ NAMESPACE.french.stopword_remover = function(word, callback) {
+ return NAMESPACE.filter_in_array(word, stopwords, callback);
+ };
+
+ return NAMESPACE;
+
+})(fullproof.normalizer||{});
244 src/resultsets.js
@@ -0,0 +1,244 @@
+var fullproof = fullproof || {};
+fullproof = (function(NAMESPACE) {
+"use strict";
+
+ var defaultComparator = {
+ lower_than: function(a,b) {
+ return a<b;
+ },
+ equals: function(a,b) {
+ return a==b;
+ }
+ };
+
+
+ /*
+ * Binary search an array.
+ */
+ NAMESPACE.binary_search = function(array, value, min, max, lower_than) {
+ lower_than=lower_than||defaultComparator.lower_than;
+ if (min===undefined && max===undefined) {
+ if (array.length == 0) {
+ return 0
+ } else {
+ return NAMESPACE.binary_search(array,value,0,array.length,lower_than);
+ }
+ }
+
+ while (max>=min) {
+ var mid = parseInt((max+min)/2);
+ if (mid>=array.length) {
+ return array.length;
+ } else if (lower_than(array[mid], value)) {
+ min = mid+1;
+ } else if (lower_than(value, array[mid])) {
+ max = mid-1;
+ } else {
+ // Found
+ return mid;
+ }
+ }
+ // Not found
+ return min;
+ }
+
+ /**
+ * Provides an object containing one sorted array of value, and the elementary
+ * set operations merge (union), intersect, and substract (complement).
+ * It maintains internally a sorted array of data. The optional comparator
+ * must be an object of the form {lower_than: func_lt, equals: func_equal}
+ */
+ NAMESPACE.ResultSet = function(comparatorObject) {
+ if (!(this instanceof NAMESPACE.ResultSet)) {
+ return new NAMESPACE.ResultSet(comparatorObject);
+ }
+ comparatorObject = comparatorObject||defaultComparator;
+
+ var data = [];
+ this.EXPOSED_REMOVED = data;
+
+ var last_insert = undefined;
+
+ this.insert = function() {
+ for (var i=0; i<arguments.length; ++i) {
+ var obj = arguments[i];
+
+ if (last_insert && comparatorObject.lower_than(last_insert,obj)) {
+ data.push(obj);
+ last_insert = obj
+ } else {
+ var index = NAMESPACE.binary_search(data, obj, undefined, undefined, comparatorObject.lower_than);
+ if (index >= data.length) {
+ data.push(obj);
+ last_insert = obj
+ } else if (comparatorObject.equals(obj, data[index]) == false) {
+ data.splice(index, 0, arguments[i]);
+ last_insert = undefined;
+ }
+ }
+ }
+ return this;
+ };
+
+ this.merge = function(set) {;
+ last_insert = undefined;
+ var other = false;
+ if (set.constructor == Array) {
+ other = set;
+ } else if (set instanceof NAMESPACE.ResultSet) {
+ other = set.getDataUnsafe();
+ }
+
+ if (other) {
+ var result = [];
+ var ddd = data;
+ var i=0,j=0,maxi=data.length,maxj=other.length;
+ var r = -1;
+ while (i<maxi || j<maxj) {
+ var goi = false;
+ if (i<maxi && j<maxj) {
+ if (comparatorObject.lower_than(data[i],other[j])) {
+ goi = true;
+ }
+ } else if (i<maxi) {
+ goi = true;
+ }
+
+ if (goi) {
+ if (result.length==0 || (!comparatorObject.equals(data[i],result[r]))) {
+ result.push(data[i]);
+ ++r;
+ }
+ ++i;
+ } else {
+ if (result.length==0 || (!comparatorObject.equals(other[j],result[r]))) {
+ result.push(other[j]);
+ ++r;
+ }
+ ++j;
+ }
+ }
+ data = result;
+ }
+ return this;
+ };
+
+
+ this.intersect = function(set) {
+ last_insert = undefined;
+ var other = false;
+ if (set.constructor == Array) {
+ other = set;
+ } else if (set instanceof NAMESPACE.ResultSet) {
+ other = set.getDataUnsafe();
+ }
+
+ if (other) {
+ var result = [];
+ var i=0,j=0,maxi=data.length,maxj=other.length;
+ while (i<maxi) {
+ while (j<maxj && comparatorObject.lower_than(other[j],data[i])) {
+ ++j;
+ }
+ if (j<maxj && comparatorObject.equals(other[j],data[i])) {
+ result.push(other[j]);
+ ++i;
+ ++j;
+ } else {
+ i++;
+ }
+ }
+ data = result;
+ } else {
+ data = [];
+ }
+ return this;
+ }
+
+ this.substract = function(set) {
+ last_insert = undefined;
+ var other = false;
+ if (set.constructor == Array) {
+ other = set;
+ } else if (set instanceof NAMESPACE.ResultSet) {
+ other = set.getDataUnsafe();
+ }
+
+ if (other) {
+ var result = [];
+ var i=0,j=0,maxi=data.length,maxj=other.length;
+ while (i<maxi) {
+ while (j<maxj && comparatorObject.lower_than(other[j],data[i])) {
+ ++j;
+ }
+ if (j<maxj && comparatorObject.equals(other[j],data[i])) {
+ ++i;
+ ++j;
+ } else {
+ result.push(data[i]);
+ i++;
+ }
+ }
+ data = result;
+ } else {
+ data = [];
+ }
+
+ return this;
+ };
+
+ this.getDataUnsafe = function() {
+ return data;
+ }
+ this.setDataUnsafe = function(sorted_array) {
+ last_insert = undefined;
+ data = sorted_array;
+ return this;
+ }
+
+ this.toString = function() {
+ return data.join(",");
+ }
+
+ this.forEach = function(callback) {
+ for (var i=0,max=data.length; i<max; ++i) {
+ callback(data[i]);
+ }
+ }
+
+ };
+
+// this.ResultEntry = function(resultArray, setType, initialWeight) {
+//
+// };
+//
+// function array_indexOf_proxy(arr,el) {
+// return arr.indexOf(el);
+// }
+
+// function array_indexOf_hm(arr,el) {
+// for (var i=0,max=arr.length; i<max; ++i) {
+// if (arr[i]===el) {
+// return i;
+// }
+// }
+// return -1;
+// }
+//
+
+// var array_indexOf = Array.indexOf?array_indexOf_proxy:array_indexOf_hm;
+//
+// NAMESPACE.intersects = function(array1, array2) {
+// var result = [];
+// for (var i=0, max=array1.length; i<max; ++i) {
+// if (array_indexOf(array2, array1[i])>=0) {
+// result.push(array1[i]);
+// };
+// }
+// return result;
+// };
+//
+
+ return NAMESPACE;
+
+})(fullproof);
90 src/searchmgr.js
@@ -0,0 +1,90 @@
+var fullproof = (function(NAMESPACE) {
+ "use strict";
+
+ NAMESPACE.manager = function() {
+
+ if (!(this instanceof NAMESPACE.manager)) {
+ return new NAMESPACE.manager();
+ }
+
+ var parser = net.kornr.searchengine.parsers.parse;
+ var normalizers = [];
+ var index = [];
+
+ this.setParser = function(p) {
+ parser=p;
+ };
+
+ this.setNormalizers = function(n) {
+ if (n instanceof Array) {
+ normalizers = n;
+ } else {
+ normalizers = [n];
+ }
+ };
+
+ this.addIndex = function(name, instance, weight, normalizers, parser) {
+ weight = weight||1.0;
+ parser = parser||net.kornr.searchengine.parsers.parse;
+ normalizers = normalizers||[NAMESPACE.normalizer.to_lowercase_nomark, NAMESPACE.normalizer.remove_duplicate_letters];
+ index.push({name: name, index: instance, weight: weight, normalizers: normalizers, parser: parser});
+ };
+
+ this.inject = function(sentence, value) {
+ for (var i=0; i<index.length; ++i) {
+ var ind = index[i];
+ ind.parser(sentence, function(word) {
+ if (word) {
+ for (var i=0,max=normalizers.length; i<max; ++i) {
+ word = normalizers[i](word);
+ }
+ ind.index.inject(word,value);
+ }
+ });
+ }
+ };
+
+ this.search = function(expr) {
+
+ var sets = [];
+
+ for (var i=0; i<index.length; ++i) {
+ var ind = index[i];
+ var indexset = [];
+ ind.parser(sentence, function(word) {
+
+ if (word) {
+
+ var searchType = 0;
+ if (word[0]=='+') {
+ searchType = 1;
+ word = word.substring(1);
+ } else if (word[0]=='-') {
+ searchType = -1;
+ word = word.substring(1);
+ }
+
+ for (var i=0,max=normalizers.length; i<max; ++i) {
+ word = normalizers[i](word);
+ }
+ var res = ind.index.lookup(word,value);
+ indexset.push({result: res, searchtype: searchType, weight: ind.weight});
+ }
+ });
+
+ // make sure the negative sets are last
+ indexset.sort(function(a,b) {
+ if (a.searchType>b.searchType) {
+ return 1;
+ }
+ return a.searchType==b.searchType?0:-1;
+ });
+
+ for (var i=0,max=indexset.length; i<max; ++i) {
+
+ }
+ }
+ };
+ };
+
+})(fullproof||{});
20 src/store/indexeddb_indexer.js
@@ -0,0 +1,20 @@
+var net = net||{};
+net.kornr = net.kornr||{};
+net.kornr.searchengine=net.kornr.searchengine||{};
+net.kornr.searchengine.storage=net.kornr.searchengine.storage||{};
+(function(NAMESPACE) {
+
+ NAMESPACE.indexeddb_indexer = function() {
+
+ if (!(this instanceof NAMESPACE.indexeddb_indexer)) {
+ return new NAMESPACE.indexeddb_indexer();
+ }
+
+ var indexedDB = window.indexedDB||window.moz_indexedDB||window.mozIndexedDB||webkitIndexedDB;
+
+ return {
+
+ };
+ };
+
+})(net.kornr.searchengine.storage);
50 src/store/memory_store.js
@@ -0,0 +1,50 @@
+var fullproof = fullproof || {};
+fullproof.store = (function(NAMESPACE) {
+
+ NAMESPACE.MemoryStore = function(comparatorObject) {
+
+ if (!(this instanceof NAMESPACE.MemoryStore)) {
+ return new NAMESPACE.MemoryStore();
+ }
+
+ this.data= {};
+
+ this.capabilities = {
+ can_store_object: true,
+ memory_based: true,
+ disk_based: false,
+ available : true
+ };
+
+ this.clear = function(callback) {
+ this.data = {};
+ if (callback) {
+ callback();
+ }
+ return this;
+ }
+
+ this.inject = function(key, value, callback) {
+ if (key && key != "") {
+ if (!this.data[key]) {
+ this.data[key] = new fullproof.ResultSet(comparatorObject);
+ }
+ this.data[key].insert(value);
+ }
+ if (callback) {
+ callback(key,value);
+ }
+ return this;
+ };
+
+ this.lookup = function(word, callback) {
+ callback(this.data[word]?this.data[word]:new fullproof.ResultSet);
+ return this;
+ };
+
+ return this;
+ };
+
+ return NAMESPACE;
+
+})(fullproof.store||{});
108 src/store/websql_store.js
@@ -0,0 +1,108 @@
+var fullproof = fullproof || {};
+fullproof.store = (function(NAMESPACE) {
+
+ NAMESPACE.WebSQLStore = function(comparatorObject) {
+
+ if (!(this instanceof NAMESPACE.WebSQLStore)) {
+ return new NAMESPACE.WebSQLStore();
+ }
+
+ var db = null;
+
+ var make_callback_caller = function(callback,value) {
+ return function() {
+ if (callback) {
+ callback(value);
+ }
+ }
+ };
+
+ return {
+
+ capabilities : {
+ can_store_object : false,
+ memory_based : false,
+ disk_based : true,
+ available : window.openDatabase ? true : false
+ },
+
+ tableName : undefined,
+ opened : false,
+
+ open : function(database, tablename, size, callback) {
+ this.opened = false;
+ this.tableName = name;
+ db = openDatabase(database, '1.0', 'javascript search engine', size);
+ this.tableName = tablename;
+ var self = this;
+
+ db.transaction(function(tx) {
+ tx.executeSql("CREATE TABLE IF NOT EXISTS "+ self.tableName +" (id NCHAR(48), value)", [],
+ function() {
+ tx.executeSql("CREATE INDEX IF NOT EXISTS "+ self.tableName +"_indx ON " + self.tableName + " (id)", [],
+ function() {
+ self.opened = true;
+ if (callback) {
+ callback(true);
+ }
+ },
+ function() {
+ console.log(arguments);
+ if (callback) {
+ callback(false);
+ }
+ });
+ },
+ function() {
+ self.opened = false;
+ if (callback) {
+ callback(false);
+ }
+ });
+ });
+ },
+
+ clear: function(callback) {
+ var self = this;
+ db.transaction(function(tx) {
+ tx.executeSql("DELETE FROM "+ self.tableName, [], make_callback_caller(callback, true), make_callback_caller(callback, false));
+ });
+ },
+
+ inject: function(word, value, callback) {
+ var self = this;
+ db.transaction(function(tx) {
+ tx.executeSql("INSERT OR REPLACE INTO " + self.tableName + " (id,value) VALUES (?,?)", [word, value], make_callback_caller(callback, true), make_callback_caller(callback, false));
+ });
+ },
+
+ /**
+ * WebSQLStore does not support object storage, only primary values, so we rely
+ * on the sql engine sorting functions. ORDER BY should provide fine results as long as
+ * the datatype of values is consistant.
+ */
+ lookup: function(word, callback) {
+ var self = this;
+ db.transaction(function(tx) {
+ tx.executeSql("SELECT * FROM " + self.tableName + " WHERE id=? ORDER BY value ASC", [word],
+ function(tx,res) {
+ var result = [];
+ for (var i=0; i<res.rows.length; ++i) {
+// console.log("GOT A RES");
+// console.log(res.rows.item(i));
+ result.push(res.rows.item(i).value);
+ }
+ callback(new fullproof.ResultSet(comparatorObject).setDataUnsafe(result));
+ },
+ function() {
+ console.log(arguments);
+ callback(false);
+ });
+ });
+ }
+ };
+ };
+
+ return NAMESPACE;
+
+})(fullproof.store||{});
58 src/unicode/categ_letters_numbers.js
@@ -0,0 +1,58 @@
+var net = net||{};net.kornr = net.kornr||{};net.kornr.unicode=net.kornr.unicode||{};
+net.kornr.unicode.categ_letters_numbers_data=[[48,57],[65,90],[97,122],170,[178,179],181,[185,186],[188,190],[192,214],[216,246],[248,705],[710,721],[736,740],748
+ ,750,[880,884],[886,887],[890,893],902,[904,906],908,[910,929],[931,1013],[1015,1153],[1162,1319],[1329,1366],1369
+ ,[1377,1415],[1488,1514],[1520,1522],[1568,1610],[1632,1641],[1646,1647],[1649,1747],1749,[1765,1766],[1774,1788],1791
+ ,1808,[1810,1839],[1869,1957],1969,[1984,2026],[2036,2037],2042,[2048,2069],2074,2084,2088,[2112,2136],2208,[2210,2220]
+ ,[2308,2361],2365,2384,[2392,2401],[2406,2415],[2417,2423],[2425,2431],[2437,2444],[2447,2448],[2451,2472],[2474,2480]
+ ,2482,[2486,2489],2493,2510,[2524,2525],[2527,2529],[2534,2545],[2548,2553],[2565,2570],[2575,2576],[2579,2600]
+ ,[2602,2608],[2610,2611],[2613,2614],[2616,2617],[2649,2652],2654,[2662,2671],[2674,2676],[2693,2701],[2703,2705]
+ ,[2707,2728],[2730,2736],[2738,2739],[2741,2745],2749,2768,[2784,2785],[2790,2799],[2821,2828],[2831,2832],[2835,2856]
+ ,[2858,2864],[2866,2867],[2869,2873],2877,[2908,2909],[2911,2913],[2918,2927],[2929,2935],2947,[2949,2954],[2958,2960]
+ ,[2962,2965],[2969,2970],2972,[2974,2975],[2979,2980],[2984,2986],[2990,3001],3024,[3046,3058],[3077,3084],[3086,3088]
+ ,[3090,3112],[3114,3123],[3125,3129],3133,[3160,3161],[3168,3169],[3174,3183],[3192,3198],[3205,3212],[3214,3216]
+ ,[3218,3240],[3242,3251],[3253,3257],3261,3294,[3296,3297],[3302,3311],[3313,3314],[3333,3340],[3342,3344],[3346,3386]
+ ,3389,3406,[3424,3425],[3430,3445],[3450,3455],[3461,3478],[3482,3505],[3507,3515],3517,[3520,3526],[3585,3632]
+ ,[3634,3635],[3648,3654],[3664,3673],[3713,3714],3716,[3719,3720],3722,3725,[3732,3735],[3737,3743],[3745,3747],3749
+ ,3751,[3754,3755],[3757,3760],[3762,3763],3773,[3776,3780],3782,[3792,3801],[3804,3807],3840,[3872,3891],[3904,3911]
+ ,[3913,3948],[3976,3980],[4096,4138],[4159,4169],[4176,4181],[4186,4189],4193,[4197,4198],[4206,4208],[4213,4225],4238
+ ,[4240,4249],[4256,4293],4295,4301,[4304,4346],[4348,4680],[4682,4685],[4688,4694],4696,[4698,4701],[4704,4744]
+ ,[4746,4749],[4752,4784],[4786,4789],[4792,4798],4800,[4802,4805],[4808,4822],[4824,4880],[4882,4885],[4888,4954]
+ ,[4969,4988],[4992,5007],[5024,5108],[5121,5740],[5743,5759],[5761,5786],[5792,5866],[5870,5872],[5888,5900]
+ ,[5902,5905],[5920,5937],[5952,5969],[5984,5996],[5998,6000],[6016,6067],6103,6108,[6112,6121],[6128,6137],[6160,6169]
+ ,[6176,6263],[6272,6312],6314,[6320,6389],[6400,6428],[6470,6509],[6512,6516],[6528,6571],[6593,6599],[6608,6618]
+ ,[6656,6678],[6688,6740],[6784,6793],[6800,6809],6823,[6917,6963],[6981,6987],[6992,7001],[7043,7072],[7086,7141]
+ ,[7168,7203],[7232,7241],[7245,7293],[7401,7404],[7406,7409],[7413,7414],[7424,7615],[7680,7957],[7960,7965]
+ ,[7968,8005],[8008,8013],[8016,8023],8025,8027,8029,[8031,8061],[8064,8116],[8118,8124],8126,[8130,8132],[8134,8140]
+ ,[8144,8147],[8150,8155],[8160,8172],[8178,8180],[8182,8188],[8304,8305],[8308,8313],[8319,8329],[8336,8348],8450,8455
+ ,[8458,8467],8469,[8473,8477],8484,8486,8488,[8490,8493],[8495,8505],[8508,8511],[8517,8521],8526,[8528,8585]
+ ,[9312,9371],[9450,9471],[10102,10131],[11264,11310],[11312,11358],[11360,11492],[11499,11502],[11506,11507],11517
+ ,[11520,11557],11559,11565,[11568,11623],11631,[11648,11670],[11680,11686],[11688,11694],[11696,11702],[11704,11710]
+ ,[11712,11718],[11720,11726],[11728,11734],[11736,11742],11823,[12293,12295],[12321,12329],[12337,12341],[12344,12348]
+ ,[12353,12438],[12445,12447],[12449,12538],[12540,12543],[12549,12589],[12593,12686],[12690,12693],[12704,12730]
+ ,[12784,12799],[12832,12841],[12872,12879],[12881,12895],[12928,12937],[12977,12991],13312,19893,19968,40908
+ ,[40960,42124],[42192,42237],[42240,42508],[42512,42539],[42560,42606],[42623,42647],[42656,42735],[42775,42783]
+ ,[42786,42888],[42891,42894],[42896,42899],[42912,42922],[43000,43009],[43011,43013],[43015,43018],[43020,43042]
+ ,[43056,43061],[43072,43123],[43138,43187],[43216,43225],[43250,43255],43259,[43264,43301],[43312,43334],[43360,43388]
+ ,[43396,43442],[43471,43481],[43520,43560],[43584,43586],[43588,43595],[43600,43609],[43616,43638],43642,[43648,43695]
+ ,43697,[43701,43702],[43705,43709],43712,43714,[43739,43741],[43744,43754],[43762,43764],[43777,43782],[43785,43790]
+ ,[43793,43798],[43808,43814],[43816,43822],[43968,44002],[44016,44025],44032,55203,[55216,55238],[55243,55291]
+ ,[63744,64109],[64112,64217],[64256,64262],[64275,64279],64285,[64287,64296],[64298,64310],[64312,64316],64318
+ ,[64320,64321],[64323,64324],[64326,64433],[64467,64829],[64848,64911],[64914,64967],[65008,65019],[65136,65140]
+ ,[65142,65276],[65296,65305],[65313,65338],[65345,65370],[65382,65470],[65474,65479],[65482,65487],[65490,65495]
+ ,[65498,65500],[65536,65547],[65549,65574],[65576,65594],[65596,65597],[65599,65613],[65616,65629],[65664,65786]
+ ,[65799,65843],[65856,65912],65930,[66176,66204],[66208,66256],[66304,66334],[66336,66339],[66352,66378],[66432,66461]
+ ,[66464,66499],[66504,66511],[66513,66517],[66560,66717],[66720,66729],[67584,67589],67592,[67594,67637],[67639,67640]
+ ,67644,[67647,67669],[67672,67679],[67840,67867],[67872,67897],[67968,68023],[68030,68031],68096,[68112,68115]
+ ,[68117,68119],[68121,68147],[68160,68167],[68192,68222],[68352,68405],[68416,68437],[68440,68466],[68472,68479]
+ ,[68608,68680],[69216,69246],[69635,69687],[69714,69743],[69763,69807],[69840,69864],[69872,69881],[69891,69926]
+ ,[69942,69951],[70019,70066],[70081,70084],[70096,70105],[71296,71338],[71360,71369],[73728,74606],[74752,74850]
+ ,[77824,78894],[92160,92728],[93952,94020],94032,[94099,94111],[110592,110593],[119648,119665],[119808,119892]
+ ,[119894,119964],[119966,119967],119970,[119973,119974],[119977,119980],[119982,119993],119995,[119997,120003]
+ ,[120005,120069],[120071,120074],[120077,120084],[120086,120092],[120094,120121],[120123,120126],[120128,120132],120134
+ ,[120138,120144],[120146,120485],[120488,120512],[120514,120538],[120540,120570],[120572,120596],[120598,120628]
+ ,[120630,120654],[120656,120686],[120688,120712],[120714,120744],[120746,120770],[120772,120779],[120782,120831]
+ ,[126464,126467],[126469,126495],[126497,126498],126500,126503,[126505,126514],[126516,126519],126521,126523,126530
+ ,126535,126537,126539,[126541,126543],[126545,126546],126548,126551,126553,126555,126557,126559,[126561,126562],126564
+ ,[126567,126570],[126572,126578],[126580,126583],[126585,126588],126590,[126592,126601],[126603,126619],[126625,126627]
+ ,[126629,126633],[126635,126651],[127232,127242],131072,173782,173824,177972,177984,178205,[194560,195101]];
+;
700 src/unicode/normalizer_lowercase.js
@@ -0,0 +1,700 @@
+var net = net||{};net.kornr = net.kornr||{};net.kornr.unicode=net.kornr.unicode||{};
+net.kornr.unicode.norm_lowercase_data=[[65,90,'R',32],[160,32],[168,[32, 776]],[170,97],[175,[32, 772]],[178,179,'R',-128],[180,[32, 769]],[181,956],
+ [184,[32, 807]],[185,49],[186,111],[188,[49, 8260, 52]],[189,[49, 8260, 50]],[190,[51, 8260, 52]],[192,[97, 768]],
+ [193,[97, 769]],[194,[97, 770]],[195,[97, 771]],[196,[97, 776]],[197,[97, 778]],[198,230],[199,[99, 807]],
+ [200,[101, 768]],[201,[101, 769]],[202,[101, 770]],[203,[101, 776]],[204,[105, 768]],[205,[105, 769]],[206,[105, 770]],
+ [207,[105, 776]],[208,240],[209,[110, 771]],[210,[111, 768]],[211,[111, 769]],[212,[111, 770]],[213,[111, 771]],
+ [214,[111, 776]],[216,248],[217,[117, 768]],[218,[117, 769]],[219,[117, 770]],[220,[117, 776]],[221,[121, 769]],
+ [222,254],[224,[97, 768]],[225,[97, 769]],[226,[97, 770]],[227,[97, 771]],[228,[97, 776]],[229,[97, 778]],
+ [231,[99, 807]],[232,[101, 768]],[233,[101, 769]],[234,[101, 770]],[235,[101, 776]],[236,[105, 768]],[237,[105, 769]],
+ [238,[105, 770]],[239,[105, 776]],[241,[110, 771]],[242,[111, 768]],[243,[111, 769]],[244,[111, 770]],[245,[111, 771]],
+ [246,[111, 776]],[249,[117, 768]],[250,[117, 769]],[251,[117, 770]],[252,[117, 776]],[253,[121, 769]],[255,[121, 776]],
+ [256,[97, 772]],[257,[97, 772]],[258,[97, 774]],[259,[97, 774]],[260,[97, 808]],[261,[97, 808]],[262,[99, 769]],
+ [263,[99, 769]],[264,[99, 770]],[265,[99, 770]],[266,[99, 775]],[267,[99, 775]],[268,[99, 780]],[269,[99, 780]],
+ [270,[100, 780]],[271,[100, 780]],[272,273],[274,[101, 772]],[275,[101, 772]],[276,[101, 774]],[277,[101, 774]],
+ [278,[101, 775]],[279,[101, 775]],[280,[101, 808]],[281,[101, 808]],[282,[101, 780]],[283,[101, 780]],[284,[103, 770]],
+ [285,[103, 770]],[286,[103, 774]],[287,[103, 774]],[288,[103, 775]],[289,[103, 775]],[290,[103, 807]],[291,[103, 807]],
+ [292,[104, 770]],[293,[104, 770]],[294,295],[296,[105, 771]],[297,[105, 771]],[298,[105, 772]],[299,[105, 772]],
+ [300,[105, 774]],[301,[105, 774]],[302,[105, 808]],[303,[105, 808]],[304,105],[306,[105, 106]],[307,[105, 106]],
+ [308,[106, 770]],[309,[106, 770]],[310,[107, 807]],[311,[107, 807]],[313,[108, 769]],[314,[108, 769]],[315,[108, 807]],
+ [316,[108, 807]],[317,[108, 780]],[318,[108, 780]],[319,[108, 183]],[320,[108, 183]],[321,322],[323,[110, 769]],
+ [324,[110, 769]],[325,[110, 807]],[326,[110, 807]],[327,[110, 780]],[328,[110, 780]],[329,[700, 110]],[330,331],
+ [332,[111, 772]],[333,[111, 772]],[334,[111, 774]],[335,[111, 774]],[336,[111, 779]],[337,[111, 779]],[338,339],
+ [340,[114, 769]],[341,[114, 769]],[342,[114, 807]],[343,[114, 807]],[344,[114, 780]],[345,[114, 780]],[346,[115, 769]],
+ [347,[115, 769]],[348,[115, 770]],[349,[115, 770]],[350,[115, 807]],[351,[115, 807]],[352,[115, 780]],[353,[115, 780]],
+ [354,[116, 807]],[355,[116, 807]],[356,[116, 780]],[357,[116, 780]],[358,359],[360,[117, 771]],[361,[117, 771]],
+ [362,[117, 772]],[363,[117, 772]],[364,[117, 774]],[365,[117, 774]],[366,[117, 778]],[367,[117, 778]],[368,[117, 779]],
+ [369,[117, 779]],[370,[117, 808]],[371,[117, 808]],[372,[119, 770]],[373,[119, 770]],[374,[121, 770]],[375,[121, 770]],
+ [376,[121, 776]],[377,[122, 769]],[378,[122, 769]],[379,[122, 775]],[380,[122, 775]],[381,[122, 780]],[382,[122, 780]],
+ [383,115],[385,595],[386,388,'R',1],[390,596],[391,392],[393,394,'R',205],[395,396],[398,477],[399,601],[400,603],
+ [401,402],[403,608],[404,611],[406,617],[407,616],[408,409],[412,623],[413,626],[415,629],[416,[111, 795]],
+ [417,[111, 795]],[418,420,'R',1],[422,640],[423,424],[425,643],[428,429],[430,648],[431,[117, 795]],[432,[117, 795]],
+ [433,434,'R',217],[435,437,'R',1],[439,658],[440,444,'R',1],[452,[100, 382]],[453,[100, 382]],[454,[100, 382]],
+ [455,[108, 106]],[456,[108, 106]],[457,[108, 106]],[458,[110, 106]],[459,[110, 106]],[460,[110, 106]],[461,[97, 780]],
+ [462,[97, 780]],[463,[105, 780]],[464,[105, 780]],[465,[111, 780]],[466,[111, 780]],[467,[117, 780]],[468,[117, 780]],
+ [469,[252, 772]],[470,[252, 772]],[471,[252, 769]],[472,[252, 769]],[473,[252, 780]],[474,[252, 780]],[475,[252, 768]],
+ [476,[252, 768]],[478,[228, 772]],[479,[228, 772]],[480,[551, 772]],[481,[551, 772]],[482,[230, 772]],[483,[230, 772]],
+ [484,485],[486,[103, 780]],[487,[103, 780]],[488,[107, 780]],[489,[107, 780]],[490,[111, 808]],[491,[111, 808]],
+ [492,[491, 772]],[493,[491, 772]],[494,[658, 780]],[495,[658, 780]],[496,[106, 780]],[497,[100, 122]],[498,[100, 122]],
+ [499,[100, 122]],[500,[103, 769]],[501,[103, 769]],[502,405],[503,447],[504,[110, 768]],[505,[110, 768]],
+ [506,[229, 769]],[507,[229, 769]],[508,[230, 769]],[509,[230, 769]],[510,[248, 769]],[511,[248, 769]],[512,[97, 783]],
+ [513,[97, 783]],[514,[97, 785]],[515,[97, 785]],[516,[101, 783]],[517,[101, 783]],[518,[101, 785]],[519,[101, 785]],
+ [520,[105, 783]],[521,[105, 783]],[522,[105, 785]],[523,[105, 785]],[524,[111, 783]],[525,[111, 783]],[526,[111, 785]],
+ [527,[111, 785]],[528,[114, 783]],[529,[114, 783]],[530,[114, 785]],[531,[114, 785]],[532,[117, 783]],[533,[117, 783]],
+ [534,[117, 785]],[535,[117, 785]],[536,[115, 806]],[537,[115, 806]],[538,[116, 806]],[539,[116, 806]],[540,541],
+ [542,[104, 780]],[543,[104, 780]],[544,414],[546,548,'R',1],[550,[97, 775]],[551,[97, 775]],[552,[101, 807]],
+ [553,[101, 807]],[554,[246, 772]],[555,[246, 772]],[556,[245, 772]],[557,[245, 772]],[558,[111, 775]],[559,[111, 775]],
+ [560,[559, 772]],[561,[559, 772]],[562,[121, 772]],[563,[121, 772]],[570,11365],[571,572],[573,410],[574,11366],
+ [577,578],[579,384],[580,649],[581,652],[582,590,'R',1],[688,104],[689,614],[690,106],[691,114],[692,633],[693,635],
+ [694,641],[695,119],[696,121],[728,[32, 774]],[729,[32, 775]],[730,[32, 778]],[731,[32, 808]],[732,[32, 771]],
+ [733,[32, 779]],[736,611],[737,108],[738,115],[739,120],[740,661],[832,833,'R',-64],[835,787],[836,[776, 769]],
+ [880,882,'R',1],[884,697],[886,887],[890,[32, 837]],[894,59],[900,[32, 769]],[901,[168, 769]],[902,[945, 769]],[903,183],
+ [904,[949, 769]],[905,[951, 769]],[906,[953, 769]],[908,[959, 769]],[910,[965, 769]],[911,[969, 769]],[912,[970, 769]],
+ [913,937,'R',32],[938,[953, 776]],[939,[965, 776]],[940,[945, 769]],[941,[949, 769]],[942,[951, 769]],[943,[953, 769]],
+ [944,[971, 769]],[970,[953, 776]],[971,[965, 776]],[972,[959, 769]],[973,[965, 769]],[974,[969, 769]],[975,983],
+ [976,946],[977,952],[978,933],[979,[978, 769]],[980,[978, 776]],[981,966],[982,960],[984,1006,'R',1],[1008,954],
+ [1009,1010,'R',-48],[1012,952],[1013,949],[1015,1016],[1017,962],[1018,1019],[1021,1023,'R',-130],[1024,[1077, 768]],
+ [1025,[1077, 776]],[1026,1106],[1027,[1075, 769]],[1028,1030,'R',80],[1031,[1110, 776]],[1032,1035,'R',80],
+ [1036,[1082, 769]],[1037,[1080, 768]],[1038,[1091, 774]],[1039,1119],[1040,1048,'R',32],[1049,[1080, 774]],
+ [1050,1071,'R',32],[1081,[1080, 774]],[1104,[1077, 768]],[1105,[1077, 776]],[1107,[1075, 769]],[1111,[1110, 776]],
+ [1116,[1082, 769]],[1117,[1080, 768]],[1118,[1091, 774]],[1120,1140,'R',1],[1142,[1141, 783]],[1143,[1141, 783]],
+ [1144,1214,'R',1],[1216,1231],[1217,[1078, 774]],[1218,[1078, 774]],[1219,1229,'R',1],[1232,[1072, 774]],
+ [1233,[1072, 774]],[1234,[1072, 776]],[1235,[1072, 776]],[1236,1237],[1238,[1077, 774]],[1239,[1077, 774]],[1240,1241],
+ [1242,[1241, 776]],[1243,[1241, 776]],[1244,[1078, 776]],[1245,[1078, 776]],[1246,[1079, 776]],[1247,[1079, 776]],
+ [1248,1249],[1250,[1080, 772]],[1251,[1080, 772]],[1252,[1080, 776]],[1253,[1080, 776]],[1254,[1086, 776]],
+ [1255,[1086, 776]],[1256,1257],[1258,[1257, 776]],[1259,[1257, 776]],[1260,[1101, 776]],[1261,[1101, 776]],
+ [1262,[1091, 772]],[1263,[1091, 772]],[1264,[1091, 776]],[1265,[1091, 776]],[1266,[1091, 779]],[1267,[1091, 779]],
+ [1268,[1095, 776]],[1269,[1095, 776]],[1270,1271],[1272,[1099, 776]],[1273,[1099, 776]],[1274,1318,'R',1],
+ [1329,1366,'R',48],[1415,[1381, 1410]],[1570,[1575, 1619]],[1571,[1575, 1620]],[1572,[1608, 1620]],[1573,[1575, 1621]],
+ [1574,[1610, 1620]],[1653,[1575, 1652]],[1654,[1608, 1652]],[1655,[1735, 1652]],[1656,[1610, 1652]],[1728,[1749, 1620]],
+ [1730,[1729, 1620]],[1747,[1746, 1620]],[2345,[2344, 2364]],[2353,[2352, 2364]],[2356,[2355, 2364]],[2392,[2325, 2364]],
+ [2393,[2326, 2364]],[2394,[2327, 2364]],[2395,[2332, 2364]],[2396,[2337, 2364]],[2397,[2338, 2364]],[2398,[2347, 2364]],
+ [2399,[2351, 2364]],[2507,[2503, 2494]],[2508,[2503, 2519]],[2524,[2465, 2492]],[2525,[2466, 2492]],[2527,[2479, 2492]],
+ [2611,[2610, 2620]],[2614,[2616, 2620]],[2649,[2582, 2620]],[2650,[2583, 2620]],[2651,[2588, 2620]],[2654,[2603, 2620]],
+ [2888,[2887, 2902]],[2891,[2887, 2878]],[2892,[2887, 2903]],[2908,[2849, 2876]],[2909,[2850, 2876]],[2964,[2962, 3031]],
+ [3018,[3014, 3006]],[3019,[3015, 3006]],[3020,[3014, 3031]],[3144,[3142, 3158]],[3264,[3263, 3285]],[3271,[3270, 3285]],
+ [3272,[3270, 3286]],[3274,[3270, 3266]],[3275,[3274, 3285]],[3402,[3398, 3390]],[3403,[3399, 3390]],[3404,[3398, 3415]],
+ [3546,[3545, 3530]],[3548,[3545, 3535]],[3549,[3548, 3530]],[3550,[3545, 3551]],[3635,[3661, 3634]],[3763,[3789, 3762]],
+ [3804,[3755, 3737]],[3805,[3755, 3745]],[3852,3851],[3907,[3906, 4023]],[3917,[3916, 4023]],[3922,[3921, 4023]],
+ [3927,[3926, 4023]],[3932,[3931, 4023]],[3945,[3904, 4021]],[3955,[3953, 3954]],[3957,[3953, 3956]],[3958,[4018, 3968]],
+ [3959,[4018, 3969]],[3960,[4019, 3968]],[3961,[4019, 3969]],[3969,[3953, 3968]],[3987,[3986, 4023]],[3997,[3996, 4023]],
+ [4002,[4001, 4023]],[4007,[4006, 4023]],[4012,[4011, 4023]],[4025,[3984, 4021]],[4134,[4133, 4142]],[4256,4301,'R',7264],
+ [4348,4316],[6918,[6917, 6965]],[6920,[6919, 6965]],[6922,[6921, 6965]],[6924,[6923, 6965]],[6926,[6925, 6965]],
+ [6930,[6929, 6965]],[6971,[6970, 6965]],[6973,[6972, 6965]],[6976,[6974, 6965]],[6977,[6975, 6965]],[6979,[6978, 6965]],
+ [7468,65],[7469,198],[7470,7473,'R',-7404],[7474,398],[7475,7482,'R',-7404],[7484,79],[7485,546],[7486,80],[7487,82],
+ [7488,7489,'R',-7404],[7490,87],[7491,97],[7492,7493,'R',-6900],[7494,7426],[7495,98],[7496,7497,'R',-7396],[7498,601],
+ [7499,7500,'R',-6896],[7501,103],[7503,107],[7504,109],[7505,331],[7506,111],[7507,596],[7508,7509,'R',-62],[7510,112],
+ [