primaryobjects · primaryobjects · Dec 7, 2018 · Dec 7, 2018 · Jul 15, 2019 · Jul 15, 2019
diff --git a/Readme.md b/Readme.md
@@ -69,20 +69,20 @@ var result = lda(documents, 2, 5);
 for (var i in result) {
 	var row = result[i];
 	console.log('Topic ' + (parseInt(i) + 1));
-	
+
 	// For each term.
 	for (var j in row) {
 		var term = row[j];
 		console.log(term.term + ' (' + term.probability + '%)');
 	}
-	
+
 	console.log('');
 }
 ```
 
 ## Additional Languages
 
-LDA uses [stop-words](https://en.wikipedia.org/wiki/Stop_words) to ignore common terms in the text (for example: this, that, it, we). By default, the stop-words list uses English. To use additional languages, you can specify an array of language ids, as follows: 
+LDA uses [stop-words](https://en.wikipedia.org/wiki/Stop_words) to ignore common terms in the text (for example: this, that, it, we). By default, the stop-words list uses English. To use additional languages, you can specify an array of language ids, as follows:
 
 ```javascript
 // Use English (this is the default).
@@ -95,7 +95,13 @@ result = lda(documents, 2, 5, ['de']);
 result = lda(documents, 2, 5, ['en', 'de']);
 ```
 
-To add a new language-specific stop-words list, create a file /lda/lib/stopwords_XX.js where XX is the id for the language. For example, a French stop-words list could be named "stopwords_fr.js". The contents of the file should follow the format of an [existing](https://github.com/primaryobjects/lda/blob/master/lib/stopwords_en.js) stop-words list. The format is, as follows:
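+To add a new language-specific stop-words list, register the path to a stop-words file under that language's id. Use an absolute path, because the file is loaded with `require` from inside the library, so a relative path would be resolved against the library's own directory rather than yours. For example, to register a French stop-words list, use the following code: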
+
+```js
+lda.registerStopwords('fr', '/path/to/the/french/stopwords.js')
+```
+
+The contents of the file should follow the format of an [existing](https://github.com/primaryobjects/lda/blob/master/lib/stopwords_en.js) stop-words list. The format is shown below.
 
 ```javascript
 exports.stop_words = [

diff --git a/lib/lda.js b/lib/lda.js
@@ -1,5 +1,11 @@
 var stem = require('stem-porter');
 
+// Language id -> stop-words module path. Null prototype, so ids like "constructor" are not found by accident.
+var STOP_WORDS_MAP = Object.create(null);
+STOP_WORDS_MAP.en = './stopwords_en.js';
+STOP_WORDS_MAP.de = './stopwords_de.js';
+STOP_WORDS_MAP.es = './stopwords_es.js';
+
 //
 // Based on javascript implementation https://github.com/awaisathar/lda.js
 // Original code based on http://www.arbylon.net/projects/LdaGibbsSampler.java
@@ -22,7 +28,21 @@ var process = function(sentences, numberOfTopics, numberOfTermsPerTopic, languag
       var stopwords = new Array();
 
       languages.forEach(function(value) {
-          var stopwordsLang = require('./stopwords_' + value + ".js");
+          // Prefer a path registered in STOP_WORDS_MAP; otherwise fall back to ./stopwords_<id>.js next to this module.
+          var stopwordsLang;
+          var path = STOP_WORDS_MAP[value];
+          if (!path) {
+              // Try loading the file directly.
+              try {
+                  stopwordsLang = require('./stopwords_' + value + ".js");
+              }
+              catch (err) { // Named binding on purpose: a bare `catch {` is ES2019 syntax and fails to parse on older Node.
+                  console.log('Warning: Ignoring invalid stop-word list "' + value + '". Please register your stop-words file using: lda.registerStopwords(\'' + value + '\', \'/path/to/stopwords_' + value + '.js\')');
+                  return; // Skip this language only; the remaining languages are still processed.
+              }
+          }
+
+          stopwordsLang = stopwordsLang || require(path); // A registered path is not guarded: a failing require() propagates.
           stopwords = stopwords.concat(stopwordsLang.stop_words);
       });
 
@@ -37,15 +57,15 @@ var process = function(sentences, numberOfTopics, numberOfTermsPerTopic, languag
               var w=words[wc].toLowerCase().replace(/[^a-z\'A-Z0-9\u00C0-\u00ff ]+/g, '');
               var wStemmed = stem(w);
               if (w=="" || !wStemmed || w.length==1 || stopwords.indexOf(w.replace("'", "")) > -1 || stopwords.indexOf(wStemmed) > -1 || w.indexOf("http")==0) continue;
-              if (f[wStemmed]) { 
+              if (f[wStemmed]) {
                   f[wStemmed]=f[wStemmed]+1;
-              } 
-              else if(wStemmed) { 
-                  f[wStemmed]=1; 
+              }
+              else if(wStemmed) {
+                  f[wStemmed]=1;
                   vocab.push(wStemmed);
                   vocabOrig[wStemmed] = w;
               };
-              
+
               documents[i].push(vocab.indexOf(wStemmed));
           }
       }
@@ -78,14 +98,14 @@ var process = function(sentences, numberOfTopics, numberOfTermsPerTopic, languag
 
           //console.log('Topic ' + (k + 1));
           var row = [];
-          
+
           for (var t = 0; t < topTerms; t++) {
               var topicTerm=things[t].split("_")[2];
               var prob=parseInt(things[t].split("_")[0]*100);
               if (prob<2) continue;
-              
+
               //console.log('Top Term: ' + topicTerm + ' (' + prob + '%)');
-              
+
               var term = {};
               term.term = topicTerm;
               term.probability = parseFloat(things[t].split("_")[0]);
@@ -95,20 +115,25 @@ var process = function(sentences, numberOfTopics, numberOfTermsPerTopic, languag
           result.push(row);
       }
     }
-    
+
     return result;
 }
 
+process.registerStopwords = function(language, path) { // Register (or override) the stop-words module path used for a language id.
+    STOP_WORDS_MAP[language] = path; // Looked up per requested language in process() and handed to require().
+    return this; // Hands back the receiver so the call can be chained.
+};
+
 function makeArray(x) {
-    var a = new Array();    
+    var a = new Array();
     for (var i=0;i<x;i++)  {
         a[i]=0;
     }
     return a;
 }
 
 function make2DArray(x,y) {
-    var a = new Array();    
+    var a = new Array();
     for (var i=0;i<x;i++)  {
         a[i]=new Array();
         for (var j=0;j<y;j++)
@@ -118,7 +143,7 @@ function make2DArray(x,y) {
 }
 
 var lda = new function() {
-    var documents,z,nw,nd,nwsum,ndsum,thetasum,phisum,V,K,alpha,beta; 
+    var documents,z,nw,nd,nwsum,ndsum,thetasum,phisum,V,K,alpha,beta;
     var THIN_INTERVAL = 20;
     var BURN_IN = 100;
     var ITERATIONS = 1000;
@@ -135,21 +160,21 @@ var lda = new function() {
         this.documents = docs;
         this.V = v;
         this.dispcol=0;
-        this.numstats=0; 
+        this.numstats=0;
     }
     this.initialState = function (K) {
         var i;
         var M = this.documents.length;
-        this.nw = make2DArray(this.V,K); 
-        this.nd = make2DArray(M,K); 
-        this.nwsum = makeArray(K); 
+        this.nw = make2DArray(this.V,K);
+        this.nd = make2DArray(M,K);
+        this.nwsum = makeArray(K);
         this.ndsum = makeArray(M);
         this.z = new Array();   for (i=0;i<M;i++) this.z[i]=new Array();
         for (var m = 0; m < M; m++) {
                 var N = this.documents[m].length;
                 this.z[m] = new Array();
                 for (var n = 0; n < N; n++) {
-                    var topic = parseInt(""+(this.getRandom() * K));                 
+                    var topic = parseInt(""+(this.getRandom() * K));
                     this.z[m][n] = topic;
                     this.nw[this.documents[m][n]][topic]++;
                     this.nd[m][topic]++;
@@ -158,7 +183,7 @@ var lda = new function() {
                 this.ndsum[m] = N;
         }
     }
-    
+
     this.gibbs = function (K,alpha,beta) {
         var i;
         this.K = K;
@@ -190,17 +215,17 @@ var lda = new function() {
             }
             if ((i > this.BURN_IN) && (this.SAMPLE_LAG > 0) && (i % this.SAMPLE_LAG == 0)) {
                 this.updateParams();
-                //document.write("|");                
+                //document.write("|");
                 if (i % this.THIN_INTERVAL != 0)
                     this.dispcol++;
             }
             if (this.dispcol >= 100) {
-                //document.write("*<br/>");                
+                //document.write("*<br/>");
                 this.dispcol = 0;
             }
         }
     }
-    
+
     this.sampleFullConditional = function(m,n) {
         var topic = this.z[m][n];
         this.nw[this.documents[m][n]][topic]--;
@@ -226,7 +251,7 @@ var lda = new function() {
         this.ndsum[m]++;
         return topic;
     }
-    
+
     this.updateParams =function () {
         for (var m = 0; m < this.documents.length; m++) {
             for (var k = 0; k < this.K; k++) {
@@ -240,7 +265,7 @@ var lda = new function() {
         }
         this.numstats++;
     }
-    
+
     this.getTheta = function() {
         var theta = new Array(); for(var i=0;i<this.documents.length;i++) theta[i] = new Array();
         if (this.SAMPLE_LAG > 0) {
@@ -258,7 +283,7 @@ var lda = new function() {
         }
         return theta;
     }
-    
+
     this.getPhi = function () {
         var phi = new Array(); for(var i=0;i<this.K;i++) phi[i] = new Array();
         if (this.SAMPLE_LAG > 0) {

diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "lda",
-  "version": "0.2.0",
+  "version": "0.3.0",
   "description": "LDA topic modeling for node.js.",
   "author": {
     "name": "Kory Becker",
@@ -13,7 +13,7 @@
   },
   "main": "./lib",
   "dependencies": {
-  "stem-porter": "*"
+    "stem-porter": "*"
   },
   "engines": {
     "node": ">= 0.8.x"

diff --git a/test4.js b/test4.js
@@ -0,0 +1,77 @@
+// Regression test: empty or null documents must not change the LDA output when a registered stop-words list is used.
+const lda = require('./lib/lda');
+const path = require('path');
+
+// Register the English list under a custom id so the lookup goes through lda.registerStopwords().
+lda.registerStopwords('en_override', path.resolve(__dirname, './lib/stopwords_en.js'));
+
+// The same three sentences, with an empty or null entry placed in the middle, at the start, or at the end.
+const collection = [
+  [
+    'Ruby slippers are pretty and fun.',
+    'Long walks in the park are fun.',
+    '',
+    'Slippers are soft on your feet.'
+  ],
+  [
+    'Ruby slippers are pretty and fun.',
+    'Long walks in the park are fun.',
+    null,
+    'Slippers are soft on your feet.'
+  ],
+  [
+    '',
+    'Ruby slippers are pretty and fun.',
+    'Long walks in the park are fun.',
+    'Slippers are soft on your feet.'
+  ],
+  [
+    null,
+    'Ruby slippers are pretty and fun.',
+    'Long walks in the park are fun.',
+    'Slippers are soft on your feet.'
+  ],
+  [
+    'Ruby slippers are pretty and fun.',
+    'Long walks in the park are fun.',
+    'Slippers are soft on your feet.',
+    ''
+  ],
+  [
+    'Ruby slippers are pretty and fun.',
+    'Long walks in the park are fun.',
+    'Slippers are soft on your feet.',
+    null
+  ]
+];
+
+// Expected term probabilities, in result order, for 3 topics with 2 terms each.
+const expected = [0.15, 0.14, 0.16, 0.15, 0.16, 0.14];
+
+// Flatten the term probabilities of every run. The values should be the same, since we're using the same random seed.
+const probabilities = collection.map((documents) => {
+  const groupProbs = [];
+  lda(documents, 3, 2, ['en_override'], null, null, 123).forEach((group) => {
+    group.forEach((row) => {
+      groupProbs.push(row.probability);
+    });
+  });
+  return groupProbs;
+});
+
+let success = true;
+
+// Verify the probabilities for each group are the same, even with empty and null values in the docs.
+probabilities.forEach((group, i) => {
+  if (expected.some((value, j) => group[j] !== value)) {
+    console.log('Failed expected values for group ' + i);
+    success = false;
+  }
+});
+
+if (success) {
+  console.log('\nResult OK.');
+} else {
+  // Report the failure through the exit status so a test runner or CI job can detect it.
+  process.exitCode = 1;
+}