Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Fix issue #10: broken tutorial caused by the English normalizers not using the callback chaining.

Also added a few unit tests for English in tests/normalizers.html.
  • Loading branch information...
commit 72298d6a8bda366904404650c9dea0c2409594c9 1 parent 5951081
reyesr authored
View
6 examples/tutorial.html
@@ -114,13 +114,13 @@
var index1 = {
name: "normalindex",
analyzer: new fullproof.StandardAnalyzer(fullproof.normalizer.to_lowercase_nomark, fullproof.normalizer.remove_duplicate_letters),
- capabilities: new fullproof.Capabilities().setStoreObjects(false).setUseScores(false).setDbName(dbName),
+ capabilities: new fullproof.Capabilities().setStoreObjects(false).setUseScores(false).setDbName(dbName).setDbSize(1*1024*1024),
initializer: initializer
};
var index2 = {
name: "stemmedindex",
analyzer: new fullproof.StandardAnalyzer(fullproof.normalizer.to_lowercase_nomark, fullproof.english.metaphone),
- capabilities: new fullproof.Capabilities().setStoreObjects(false).setUseScores(false).setDbName(dbName),
+ capabilities: new fullproof.Capabilities().setStoreObjects(false).setUseScores(false).setDbName(dbName).setDbSize(1*1024*1024),
initializer: initializer
};
@@ -138,7 +138,7 @@
if (resultset && resultset.getSize()) {
var rsize = resultset.getSize();
result = "<h1>Found " + rsize + " character"+(rsize>1?"s":"")+" matching your request.</h1>";
- result += "<table><tr><th>Name</th><th>Role</th></tr>"
+ result += "<table><tr><th>Name</th><th>Role</th></tr>";
resultset.forEach(function(e) {
var c = marioData[e];
result += "<tr><td style='font-weight: bold;'>"+ c.name+"</td>";
View
291 src/normalizers/english/metaphone.js
@@ -26,148 +26,153 @@ var fullproof = (function(NAMESPACE) {
/*
* Borrowed from https://github.com/NaturalNode/natural/blob/master/lib/natural/phonetics/metaphone.js
*/
- NAMESPACE.english.metaphone = (function(){
- "use strict";
-
- function dedup(token) {
- return token.replace(/([^c])\1/g, '$1');
- }
-
- function dropInitialLetters(token) {
- if(token.match(/^(kn|gn|pn|ae|wr)/))
- return token.substr(1, token.length - 1);
-
- return token;
- }
-
- function dropBafterMAtEnd(token) {
- return token.replace(/mb$/, 'm');
- }
-
- function cTransform(token) {
- token = token.replace(/([^s]|^)(c)(h)/g, '$1x$3').trim();
- token = token.replace(/cia/g, 'xia');
- token = token.replace(/c(i|e|y)/g, 's$1');
- token = token.replace(/c/g, 'k');
-
- return token;
- }
-
- function dTransform(token) {
- token = token.replace(/d(ge|gy|gi)/g, 'j$1');
- token = token.replace(/d/g, 't');
-
- return token;
- }
-
- function dropG(token) {
- token = token.replace(/gh(^$|[^aeiou])/g, 'h$1');
- token = token.replace(/g(n|ned)$/g, '$1');
-
- return token;
- }
-
- function transformG(token) {
- token = token.replace(/([^g]|^)(g)(i|e|y)/g, '$1j$3');
- token = token.replace(/gg/g, 'g');
- token = token.replace(/g/g, 'k');
-
- return token;
- }
-
- function dropH(token) {
- return token.replace(/([aeiou])h([^aeiou])/g, '$1$2');
- }
-
- function transformCK(token) {
- return token.replace(/ck/g, 'k');
- }
- function transformPH(token) {
- return token.replace(/ph/g, 'f');
- }
-
- function transformQ(token) {
- return token.replace(/q/g, 'k');
- }
-
- function transformS(token) {
- return token.replace(/s(h|io|ia)/g, 'x$1');
- }
-
- function transformT(token) {
- token = token.replace(/t(ia|io)/g, 'x$1');
- token = token.replace(/th/, '0');
-
- return token;
- }
-
- function dropT(token) {
- return token.replace(/tch/g, 'ch');
- }
-
- function transformV(token) {
- return token.replace(/v/g, 'f');
- }
-
- function transformWH(token) {
- return token.replace(/^wh/, 'w');
- }
-
- function dropW(token) {
- return token.replace(/w([^aeiou]|$)/g, '$1');
- }
-
- function transformX(token) {
- token = token.replace(/^x/, 's');
- token = token.replace(/x/g, 'ks');
- return token;
- }
-
- function dropY(token) {
- return token.replace(/y([^aeiou]|$)/g, '$1');
- }
-
- function transformZ(token) {
- return token.replace(/z/, 's');
- }
-
- function dropVowels(token) {
- return token.charAt(0) + token.substr(1, token.length).replace(/[aeiou]/g, '');
- }
-
- return function(token, maxLength) {
- maxLength = maxLength || 32;
- token = token.toLowerCase();
- token = dedup(token);
- token = dropInitialLetters(token);
- token = dropBafterMAtEnd(token);
- token = transformCK(token);
- token = cTransform(token);
- token = dTransform(token);
- token = dropG(token);
- token = transformG(token);
- token = dropH(token);
- token = transformPH(token);
- token = transformQ(token);
- token = transformS(token);
- token = transformX(token);
- token = transformT(token);
- token = dropT(token);
- token = transformV(token);
- token = transformWH(token);
- token = dropW(token);
- token = dropY(token);
- token = transformZ(token);
- token = dropVowels(token);
-
- token.toUpperCase();
- if(token.length >= maxLength)
- token = token.substring(0, maxLength);
-
- return token.toUpperCase();
- };
-
- })();
- return NAMESPACE;
+
+ NAMESPACE.english.metaphone_make = function(maxLength) {
+ "use strict";
+
+ function dedup(token) {
+ return token.replace(/([^c])\1/g, '$1');
+ }
+
+ function dropInitialLetters(token) {
+ if(token.match(/^(kn|gn|pn|ae|wr)/))
+ return token.substr(1, token.length - 1);
+
+ return token;
+ }
+
+ function dropBafterMAtEnd(token) {
+ return token.replace(/mb$/, 'm');
+ }
+
+ function cTransform(token) {
+ token = token.replace(/([^s]|^)(c)(h)/g, '$1x$3').trim();
+ token = token.replace(/cia/g, 'xia');
+ token = token.replace(/c(i|e|y)/g, 's$1');
+ token = token.replace(/c/g, 'k');
+
+ return token;
+ }
+
+ function dTransform(token) {
+ token = token.replace(/d(ge|gy|gi)/g, 'j$1');
+ token = token.replace(/d/g, 't');
+
+ return token;
+ }
+
+ function dropG(token) {
+ token = token.replace(/gh(^$|[^aeiou])/g, 'h$1');
+ token = token.replace(/g(n|ned)$/g, '$1');
+
+ return token;
+ }
+
+ function transformG(token) {
+ token = token.replace(/([^g]|^)(g)(i|e|y)/g, '$1j$3');
+ token = token.replace(/gg/g, 'g');
+ token = token.replace(/g/g, 'k');
+
+ return token;
+ }
+
+ function dropH(token) {
+ return token.replace(/([aeiou])h([^aeiou])/g, '$1$2');
+ }
+
+ function transformCK(token) {
+ return token.replace(/ck/g, 'k');
+ }
+ function transformPH(token) {
+ return token.replace(/ph/g, 'f');
+ }
+
+ function transformQ(token) {
+ return token.replace(/q/g, 'k');
+ }
+
+ function transformS(token) {
+ return token.replace(/s(h|io|ia)/g, 'x$1');
+ }
+
+ function transformT(token) {
+ token = token.replace(/t(ia|io)/g, 'x$1');
+ token = token.replace(/th/, '0');
+
+ return token;
+ }
+
+ function dropT(token) {
+ return token.replace(/tch/g, 'ch');
+ }
+
+ function transformV(token) {
+ return token.replace(/v/g, 'f');
+ }
+
+ function transformWH(token) {
+ return token.replace(/^wh/, 'w');
+ }
+
+ function dropW(token) {
+ return token.replace(/w([^aeiou]|$)/g, '$1');
+ }
+
+ function transformX(token) {
+ token = token.replace(/^x/, 's');
+ token = token.replace(/x/g, 'ks');
+ return token;
+ }
+
+ function dropY(token) {
+ return token.replace(/y([^aeiou]|$)/g, '$1');
+ }
+
+ function transformZ(token) {
+ return token.replace(/z/, 's');
+ }
+
+ function dropVowels(token) {
+ return token.charAt(0) + token.substr(1, token.length).replace(/[aeiou]/g, '');
+ }
+
+ return function(token, callback) {
+ maxLength = maxLength || 32;
+ token = token.toLowerCase();
+ token = dedup(token);
+ token = dropInitialLetters(token);
+ token = dropBafterMAtEnd(token);
+ token = transformCK(token);
+ token = cTransform(token);
+ token = dTransform(token);
+ token = dropG(token);
+ token = transformG(token);
+ token = dropH(token);
+ token = transformPH(token);
+ token = transformQ(token);
+ token = transformS(token);
+ token = transformX(token);
+ token = transformT(token);
+ token = dropT(token);
+ token = transformV(token);
+ token = transformWH(token);
+ token = dropW(token);
+ token = dropY(token);
+ token = transformZ(token);
+ token = dropVowels(token);
+
+ token.toUpperCase();
+ if(token.length >= maxLength) {
+ token = token.substring(0, maxLength);
+ }
+ token = token.toUpperCase();
+
+ return callback?callback(token):token;
+ };
+ };
+
+ NAMESPACE.english.metaphone = NAMESPACE.english.metaphone_make(32);
+
+ return NAMESPACE;
})(fullproof||{});
View
7 src/normalizers/english/porter-stemmer.js
@@ -64,8 +64,9 @@ var fullproof = (function(NAMESPACE) {
var mgr1 = "^(" + C + ")?" + V + C + V + C; // [C]VCVC... is m>1
var s_v = "^(" + C + ")?" + v; // vowel in stem
- return function(word) {
-
+ return function(word, callback) {
+ word = word.toLowerCase();
+
var stem;
var suffix;
var firstch;
@@ -194,7 +195,7 @@ var fullproof = (function(NAMESPACE) {
w = firstch.toLowerCase() + w.substr(1);
}
- return w;
+ return callback?callback(w):w;
}
})();
View
4 src/normalizers/english/stopword-remover.js
@@ -77,11 +77,11 @@ var fullproof = (function(NAMESPACE) {
"wherein" : 1, "whereupon" : 1, "wherever" : 1, "whether" : 1, "which" : 1, "while" : 1, "whither" : 1,
"who" : 1, "who's" : 1, "whoever" : 1, "whole" : 1, "whom" : 1, "whose" : 1, "why" : 1, "will" : 1,
"willing" : 1, "wish" : 1, "with" : 1, "within" : 1, "without" : 1, "won't" : 1, "wonder" : 1, "would" : 1,
- "would" : 1, "wouldn't" : 1, "x" : 1, "y" : 1, "yes" : 1, "yet" : 1, "you" : 1, "you'd" : 1, "you'll" : 1,
+ "wouldn't" : 1, "x" : 1, "y" : 1, "yes" : 1, "yet" : 1, "you" : 1, "you'd" : 1, "you'll" : 1,
"you're" : 1, "you've" : 1, "your" : 1, "yours" : 1, "yourself" : 1, "yourselves" : 1, "z" : 1, "zero" : 1 };
NAMESPACE.english.stopword_remover = function(word, callback) {
- return NAMESPACE.filter_in_object(word, stopwords, callback);
+ return NAMESPACE.normalizer.filter_in_object(word, stopwords, callback);
};
return NAMESPACE;
View
2  tests/normalizers.html
@@ -15,6 +15,8 @@
<script type="text/javascript" src="../src/normalizers.js"></script>
<script type="text/javascript" src="../src/utils.js"></script>
<script type="text/javascript" src="../src/normalizers/english/porter-stemmer.js"></script>
+ <script type="text/javascript" src="../src/normalizers/english/metaphone.js"></script>
+ <script type="text/javascript" src="../src/normalizers/english/stopword-remover.js"></script>
<script type="text/javascript" src="test-normalizers.js"></script>
View
28 tests/test-normalizers.js
@@ -72,4 +72,30 @@ test("scoring analyzer", function() {
console.log("scoring 1", words);
});
stda.parse("this is a test longer than the previous one, but not that long for a test though", sync);
-});
+});
+
+function make_normalizer_test(name, normalizerRef, input, expected) {
+ test(name + " " + input + " to " + expected, function() {
+ expect(2);
+ var test1 = normalizerRef(input);
+ equal(test1, expected);
+ QUnit.stop();
+ normalizerRef(input, function(result) {
+ equal(result, expected);
+ QUnit.start();
+ });
+ });
+}
+// Metaphone: mixed-case words are expected to map to upper-case phonetic codes.
+make_normalizer_test("english metaphone", fullproof.english.metaphone, "Absolutly", "ABSLTL");
+make_normalizer_test("english metaphone", fullproof.english.metaphone, "Indications", "INTKXNS");
+make_normalizer_test("english metaphone", fullproof.english.metaphone, "John", "JN");
+// Porter stemmer: upper-case and mixed-case inputs are expected to yield lower-case stems.
+make_normalizer_test("english porter-stemmer", fullproof.english.porter_stemmer, "ABSOLUTLY", "absolutli");
+make_normalizer_test("english porter-stemmer", fullproof.english.porter_stemmer, "Indications", "indic");
+make_normalizer_test("english porter-stemmer", fullproof.english.porter_stemmer, "JOHN", "john");
+// Stop words: "so" and "is" are expected to give false; other words come back unchanged.
+make_normalizer_test("english stopwords", fullproof.english.stopword_remover, "JOHN", "JOHN");
+make_normalizer_test("english stopwords", fullproof.english.stopword_remover, "so", false);
+make_normalizer_test("english stopwords", fullproof.english.stopword_remover, "is", false);
+make_normalizer_test("english stopwords", fullproof.english.stopword_remover, "valid", "valid");
Please sign in to comment.
Something went wrong with that request. Please try again.