Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Stopwords register merge #20

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 10 additions & 4 deletions Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,20 +69,20 @@ var result = lda(documents, 2, 5);
// Walk the topics by numeric index: for...in on an array yields string keys
// (and any enumerable properties added to Array.prototype), which is why the
// previous version needed parseInt() to print the topic number.
for (var i = 0; i < result.length; i++) {
  var row = result[i];
  console.log('Topic ' + (i + 1));

  // For each term.
  for (var j = 0; j < row.length; j++) {
    var term = row[j];
    console.log(term.term + ' (' + term.probability + '%)');
  }

  console.log('');
}
```

## Additional Languages

LDA uses [stop-words](https://en.wikipedia.org/wiki/Stop_words) to ignore common terms in the text (for example: this, that, it, we). By default, the stop-words list uses English. To use additional languages, you can specify an array of language ids, as follows:
LDA uses [stop-words](https://en.wikipedia.org/wiki/Stop_words) to ignore common terms in the text (for example: this, that, it, we). By default, the stop-words list uses English. To use additional languages, you can specify an array of language ids, as follows:

```javascript
// Use English (this is the default).
Expand All @@ -95,7 +95,13 @@ result = lda(documents, 2, 5, ['de']);
result = lda(documents, 2, 5, ['en', 'de']);
```

To add a new language-specific stop-words list, create a file /lda/lib/stopwords_XX.js where XX is the id for the language. For example, a French stop-words list could be named "stopwords_fr.js". The contents of the file should follow the format of an [existing](https://github.com/primaryobjects/lda/blob/master/lib/stopwords_en.js) stop-words list. The format is, as follows:
To add a new language-specific stop-words list, register a file for the specific language. For example, to register a French stop-words list, use the following code:

```js
lda.registerStopwords('fr', '/path/to/the/french/stopwords.js')
```

The contents of the file should follow the format of an [existing](https://github.com/primaryobjects/lda/blob/master/lib/stopwords_en.js) stop-words list. The format is shown below.

```javascript
exports.stop_words = [
Expand Down
75 changes: 50 additions & 25 deletions lib/lda.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
var stem = require('stem-porter');

// Registry of stop-word modules, keyed by language id. Each value is the
// module path that is handed to require() when that language is requested.
//
// The registry has no prototype, so a lookup for an id such as "constructor"
// or "toString" yields undefined instead of an inherited Object.prototype
// member (which would otherwise be passed to require() as if it were a path).
var STOP_WORDS_MAP = Object.assign(Object.create(null), {
  en: './stopwords_en.js',
  de: './stopwords_de.js',
  es: './stopwords_es.js',
});

//
// Based on javascript implementation https://github.com/awaisathar/lda.js
// Original code based on http://www.arbylon.net/projects/LdaGibbsSampler.java
Expand All @@ -22,7 +28,21 @@ var process = function(sentences, numberOfTopics, numberOfTermsPerTopic, languag
var stopwords = new Array();

// Collect the stop words for every requested language id.
//
// Resolution order for each id:
//   1. a path present in STOP_WORDS_MAP (built-in or registered) is loaded
//      with require(); a failure here propagates to the caller;
//   2. otherwise ./stopwords_<id>.js is tried, and if that cannot be loaded
//      the id is skipped with a warning.
languages.forEach(function(value) {
  // Only own entries count, so ids such as "constructor" never pick up an
  // inherited Object.prototype member as their "path".
  var path = Object.prototype.hasOwnProperty.call(STOP_WORDS_MAP, value) ? STOP_WORDS_MAP[value] : undefined;
  var stopwordsLang;

  if (path) {
    stopwordsLang = require(path);
  }
  else {
    // Try loading the file directly.
    try {
      stopwordsLang = require('./stopwords_' + value + ".js");
    }
    catch (err) {
      // Named binding instead of a bare `catch {`, which older Node
      // releases reject as a syntax error.
      console.log('Warning: Ignoring invalid stop-word list "' + value + '". Please register your stop-words file using: lda.registerStopwords(\'' + value + '\', \'/path/to/stopwords_' + value + '.js\')');
      return;
    }
  }

  stopwords = stopwords.concat(stopwordsLang.stop_words);
});

Expand All @@ -37,15 +57,15 @@ var process = function(sentences, numberOfTopics, numberOfTermsPerTopic, languag
var w=words[wc].toLowerCase().replace(/[^a-z\'A-Z0-9\u00C0-\u00ff ]+/g, '');
var wStemmed = stem(w);
if (w=="" || !wStemmed || w.length==1 || stopwords.indexOf(w.replace("'", "")) > -1 || stopwords.indexOf(wStemmed) > -1 || w.indexOf("http")==0) continue;
if (f[wStemmed]) {
if (f[wStemmed]) {
f[wStemmed]=f[wStemmed]+1;
}
else if(wStemmed) {
f[wStemmed]=1;
}
else if(wStemmed) {
f[wStemmed]=1;
vocab.push(wStemmed);
vocabOrig[wStemmed] = w;
};

documents[i].push(vocab.indexOf(wStemmed));
}
}
Expand Down Expand Up @@ -78,14 +98,14 @@ var process = function(sentences, numberOfTopics, numberOfTermsPerTopic, languag

//console.log('Topic ' + (k + 1));
var row = [];

for (var t = 0; t < topTerms; t++) {
var topicTerm=things[t].split("_")[2];
var prob=parseInt(things[t].split("_")[0]*100);
if (prob<2) continue;

//console.log('Top Term: ' + topicTerm + ' (' + prob + '%)');

var term = {};
term.term = topicTerm;
term.probability = parseFloat(things[t].split("_")[0]);
Expand All @@ -95,20 +115,25 @@ var process = function(sentences, numberOfTopics, numberOfTermsPerTopic, languag
result.push(row);
}
}

return result;
}

/**
 * Registers (or replaces) the stop-words module used for a language id.
 *
 * @param {string} language - Language id to register, e.g. 'fr'.
 * @param {string} path - Module path later passed to require() when the
 *   language is requested.
 * @returns {Function} The receiver (`this`), so calls can be chained.
 * @throws {TypeError} If either argument is not a non-empty string. Rejecting
 *   it here avoids storing an entry that would only fail, or be silently
 *   ignored, when the language is used later.
 */
process.registerStopwords = function(language, path) {
  if (typeof language !== 'string' || language === '') {
    throw new TypeError('registerStopwords: "language" must be a non-empty string.');
  }

  if (typeof path !== 'string' || path === '') {
    throw new TypeError('registerStopwords: "path" must be a non-empty string.');
  }

  STOP_WORDS_MAP[language] = path;
  return this;
};

// Builds a one-dimensional array holding `x` zeros.
// Returns an empty array when `x` is zero or negative.
function makeArray(x) {
  var zeros = [];
  var idx = 0;

  while (idx < x) {
    zeros[idx] = 0;
    idx += 1;
  }

  return zeros;
}

function make2DArray(x,y) {
var a = new Array();
var a = new Array();
for (var i=0;i<x;i++) {
a[i]=new Array();
for (var j=0;j<y;j++)
Expand All @@ -118,7 +143,7 @@ function make2DArray(x,y) {
}

var lda = new function() {
var documents,z,nw,nd,nwsum,ndsum,thetasum,phisum,V,K,alpha,beta;
var documents,z,nw,nd,nwsum,ndsum,thetasum,phisum,V,K,alpha,beta;
var THIN_INTERVAL = 20;
var BURN_IN = 100;
var ITERATIONS = 1000;
Expand All @@ -135,21 +160,21 @@ var lda = new function() {
this.documents = docs;
this.V = v;
this.dispcol=0;
this.numstats=0;
this.numstats=0;
}
this.initialState = function (K) {
var i;
var M = this.documents.length;
this.nw = make2DArray(this.V,K);
this.nd = make2DArray(M,K);
this.nwsum = makeArray(K);
this.nw = make2DArray(this.V,K);
this.nd = make2DArray(M,K);
this.nwsum = makeArray(K);
this.ndsum = makeArray(M);
this.z = new Array(); for (i=0;i<M;i++) this.z[i]=new Array();
for (var m = 0; m < M; m++) {
var N = this.documents[m].length;
this.z[m] = new Array();
for (var n = 0; n < N; n++) {
var topic = parseInt(""+(this.getRandom() * K));
var topic = parseInt(""+(this.getRandom() * K));
this.z[m][n] = topic;
this.nw[this.documents[m][n]][topic]++;
this.nd[m][topic]++;
Expand All @@ -158,7 +183,7 @@ var lda = new function() {
this.ndsum[m] = N;
}
}

this.gibbs = function (K,alpha,beta) {
var i;
this.K = K;
Expand Down Expand Up @@ -190,17 +215,17 @@ var lda = new function() {
}
if ((i > this.BURN_IN) && (this.SAMPLE_LAG > 0) && (i % this.SAMPLE_LAG == 0)) {
this.updateParams();
//document.write("|");
//document.write("|");
if (i % this.THIN_INTERVAL != 0)
this.dispcol++;
}
if (this.dispcol >= 100) {
//document.write("*<br/>");
//document.write("*<br/>");
this.dispcol = 0;
}
}
}

this.sampleFullConditional = function(m,n) {
var topic = this.z[m][n];
this.nw[this.documents[m][n]][topic]--;
Expand All @@ -226,7 +251,7 @@ var lda = new function() {
this.ndsum[m]++;
return topic;
}

this.updateParams =function () {
for (var m = 0; m < this.documents.length; m++) {
for (var k = 0; k < this.K; k++) {
Expand All @@ -240,7 +265,7 @@ var lda = new function() {
}
this.numstats++;
}

this.getTheta = function() {
var theta = new Array(); for(var i=0;i<this.documents.length;i++) theta[i] = new Array();
if (this.SAMPLE_LAG > 0) {
Expand All @@ -258,7 +283,7 @@ var lda = new function() {
}
return theta;
}

this.getPhi = function () {
var phi = new Array(); for(var i=0;i<this.K;i++) phi[i] = new Array();
if (this.SAMPLE_LAG > 0) {
Expand Down
4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "lda",
"version": "0.2.0",
"version": "0.3.0",
"description": "LDA topic modeling for node.js.",
"author": {
"name": "Kory Becker",
Expand All @@ -13,7 +13,7 @@
},
"main": "./lib",
"dependencies": {
"stem-porter": "*"
"stem-porter": "*"
},
"engines": {
"node": ">= 0.8.x"
Expand Down
77 changes: 77 additions & 0 deletions test4.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
const lda = require('./lib/lda');
const path = require('path');

// Regression test: empty ('') and null documents must not change the topics
// that lda() produces. The English stop-words list is registered under a
// custom id so the registerStopwords() code path is exercised as well.
lda.registerStopwords('en_override', path.resolve(__dirname, './lib/stopwords_en.js'));

const RUBY = 'Ruby slippers are pretty and fun.';
const WALKS = 'Long walks in the park are fun.';
const SLIPPERS = 'Slippers are soft on your feet.';

// The same three sentences with an empty or null entry placed in the middle,
// at the start, and at the end.
const collection = [
  [RUBY, WALKS, '', SLIPPERS],
  [RUBY, WALKS, null, SLIPPERS],
  ['', RUBY, WALKS, SLIPPERS],
  [null, RUBY, WALKS, SLIPPERS],
  [RUBY, WALKS, SLIPPERS, ''],
  [RUBY, WALKS, SLIPPERS, null]
];

// Probabilities expected for the first six terms of every set (3 topics x 2 terms).
const EXPECTED = [0.15, 0.14, 0.16, 0.15, 0.16, 0.14];

var probabilities = [];

collection.forEach((documents, i) => {
  const results = lda(documents, 3, 2, ['en_override'], null, null, 123);

  // Save the probabilities for each group. The values should be the same, since we're using the same random seed.
  const groupProbs = [];
  results.forEach(group => {
    group.forEach(row => {
      groupProbs.push(row.probability);
    });
  });

  // Store the entire group in an array.
  probabilities.push(groupProbs);

  //console.log('\nSet ' + (i + 1));
  //console.log(results);
});

var success = true;

// Verify the probabilities for each group are the same, even with empty and null values in the docs.
probabilities.forEach((group, i) => {
  const mismatch = EXPECTED.some((value, k) => group[k] !== value);
  if (mismatch) {
    console.log('Failed expected values for group ' + i);
    success = false;
  }
});

if (success) {
  console.log('\nResult OK.');
}
else {
  // Report the failure through the exit status so scripts and CI can detect
  // it; logging alone leaves the process exiting with 0.
  process.exitCode = 1;
}
Loading