Merge pull request #40 from spencermountain/dev
Dev
spencermountain committed May 18, 2018
2 parents 78d74dd + 78dda82 commit cca9de7
Showing 23 changed files with 359 additions and 260 deletions.
3 changes: 3 additions & 0 deletions .npmignore
@@ -0,0 +1,3 @@
tests
contributing.md
scratch.js
37 changes: 30 additions & 7 deletions README.md
@@ -24,17 +24,19 @@

<h2 align="center">💂 Yup 💂</h2>
<div><sup>do it on your laptop.</sup></div>
<img src="https://user-images.githubusercontent.com/399657/39391259-b57ca9e0-4a6e-11e8-8b33-2064e5fc187e.png"/>
</div>

![image](https://user-images.githubusercontent.com/399657/39391259-b57ca9e0-4a6e-11e8-8b33-2064e5fc187e.png)
`dumpster-dive` is a **nodejs** script that puts a **highly-queryable** wikipedia on your computer in a nice afternoon.

It uses [worker-nodes](https://github.com/allegro/node-worker-nodes) to process pages in parallel, and [wtf_wikipedia](https://github.com/spencermountain/wtf_wikipedia) to turn ***wikiscript*** into whatever json.
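
Roughly, that second step turns each page's wikiscript into a plain object — a hedged sketch, assuming wtf_wikipedia's default export parses a wikitext string into the same `Doc` used by the custom-formatting example further down (the snippet of wikiscript here is made up):

```js
const wtf = require('wtf_wikipedia')

// a made-up scrap of wikiscript for one tiny page
let doc = wtf('[[Toronto]] is in [[Ontario]]. [[Category:Cities in Ontario]]')
doc.categories() // roughly: [ 'Cities in Ontario' ]
doc.plaintext()  // roughly: 'Toronto is in Ontario.'
```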

<div align="center">
-- <b>en-wikipedia</b> takes about 7-hours, end-to-end --
-- <b>en-wikipedia</b> takes about 5-hours, end-to-end --
</div>

![dumpster](https://user-images.githubusercontent.com/399657/40262198-a268b95a-5ad3-11e8-86ef-29c2347eec81.gif)

```bash
npm install -g dumpster-dive
```
@@ -53,11 +55,11 @@ dumpster /path/to/my-wikipedia-article-dump.xml --citations=false --html=true
````bash
$ mongo #enter the mongo shell
use enwiki #grab the database
db.wikipedia.find({title:"Toronto"})[0].categories
db.pages.count()
# 4,926,056...
db.pages.find({title:"Toronto"})[0].categories
#[ "Former colonial capitals in Canada",
# "Populated places established in 1793" ...]
db.wikipedia.count()
# 4,926,056...
````

# Steps:
@@ -106,16 +108,21 @@ The en-wiki dump should take a few hours. Maybe 8. Should be done before dinner.
The console will update you every couple seconds to let you know where it's at.

### 7️⃣ done!
![image](https://user-images.githubusercontent.com/399657/40262181-7c1f17bc-5ad3-11e8-95ab-55f324022d43.png)

go check-out the data! to view your data in the mongo console:
````javascript
$ mongo
use afwiki //your db name

//show a random page
db.wikipedia.find().skip(200).limit(2)
db.pages.find().skip(200).limit(2)

//find a specific page
db.wikipedia.findOne({title:"Toronto"}).categories
db.pages.findOne({title:"Toronto"}).categories

//find the last page
db.wikipedia.find().sort({$natural:-1}).limit(1)
````
alternatively, you can run `dumpster-report afwiki` to see a quick spot-check of the records it has created across the database.

@@ -154,6 +161,22 @@ you can tell wtf_wikipedia what you want it to parse, and which data you don't n
```bash
dumpster ./my-wiki-dump.xml --infoboxes=false --citations=false --categories=false --links=false
```
* **custom json formatting**
you can grab whatever data you want, by passing-in a `custom` function. It takes a [wtf_wikipedia](https://github.com/spencermountain/wtf_wikipedia) `Doc` object, and you can return your cool data:
```js
let obj={
file: path,
db: dbName,
custom: function(doc) {
return {
_id: doc.title(), //for duplicate-detection
title: doc.title(), //for the logger..
categories: doc.categories() //whatever you want!
}
}
}
dumpster(obj, () => console.log('custom wikipedia!') )
```

## how it works:
this library uses:
3 changes: 1 addition & 2 deletions bin/dumpster.js
@@ -56,7 +56,7 @@ Object.keys(options).forEach((k) => {

//grab the wiki file
if (!file) {
console.log('please supply a filename to the wikipedia article dump')
console.log('please supply a filename to the wikipedia article dump')
process.exit(1)
}
//try to make-up the language name for the db
@@ -67,5 +67,4 @@ if (file.match(/-latest-pages-articles/)) {
}
options.file = file
options.db = db
// console.log(options)
dumpster(options)
104 changes: 45 additions & 59 deletions bin/report.js
@@ -1,68 +1,54 @@
const config = require('../config');
const chalk = require('chalk');
const niceNumber = require('../lib/fns').niceNumber;
const MongoClient = require('mongodb').MongoClient
const openDb = require('../src/lib/open-db')
const niceNumber = require('../src/lib/fns').niceNumber;
const dbName = process.argv[2] || 'enwiki'

const open = function(_dbName, callback) {
let url = 'mongodb://localhost:27017/' + _dbName
MongoClient.connect(url, function(err, db) {
if (err) {
console.log(err)
process.exit(1)
const showPage = async function(col) {
// int = parseInt(int, 10)
let docs = await col.aggregate(
{
$sample: {
size: 1
}
}
callback(db)
})
}
)
console.log(docs)
// let docs = await col.find({}, {
// skip: int,
// limit: 1
// })
// console.log(docs.toArray())
// let doc = docs[0]
// console.log(chalk.blue('\npage #' + niceNumber(int) + `: -- ${chalk.green(chalk.underline(doc.title))} --`))
// let sections = doc.sections || []
// let str = ' ' + chalk.red(`${(doc.sections || []).length} sections`)
// str += ' - ' + chalk.red(`${(doc.infoboxes || []).length} infoboxes`)
// str += ' - ' + chalk.red(`${(doc.categories || []).length} categories`)
// str += ' - ' + chalk.red(`${(doc.citations || []).length} citations`)
// console.log(str, '\n')
// sections.forEach((sec) => {
// let heading = '='.repeat(sec.depth + 2)
// console.log(chalk.grey(' ' + heading + ' ' + (sec.title || '(intro)') + ' ' + heading))
// //print first sentence
// if (sec.sentences && sec.sentences[0]) {
// let sen = sec.sentences[0].text || ''
// console.log(chalk.yellow(` "${sen.slice(0, 170)}..."`))
// }
// })
console.log('\n\n\n')

const showPage = function(col, int) {
col.find({}, {
skip: int,
limit: 1
}).toArray(function(err, docs) {
let doc = docs[0]
console.log(chalk.blue('\npage #' + niceNumber(int) + `: -- ${chalk.green(chalk.underline(doc.title))} --`))
let sections = doc.sections || []
let str = ' ' + chalk.red(`${(doc.sections || []).length} sections`)
str += ' - ' + chalk.red(`${(doc.infoboxes || []).length} infoboxes`)
str += ' - ' + chalk.red(`${(doc.categories || []).length} categories`)
str += ' - ' + chalk.red(`${(doc.citations || []).length} citations`)
console.log(str, '\n')
sections.forEach((sec) => {
let heading = '='.repeat(sec.depth + 2)
console.log(chalk.grey(' ' + heading + ' ' + (sec.title || '(intro)') + ' ' + heading))
//print first sentence
if (sec.sentences && sec.sentences[0]) {
let sen = sec.sentences[0].text || ''
console.log(chalk.yellow(` "${sen.slice(0, 170)}..."`))
}
})
console.log('\n\n\n')
})
}


open(dbName, (db) => {
let col = db.collection(config.collection)
col.count().then((count) => {
console.log(chalk.blue('\n\n ----------- ' + niceNumber(count) + ' pages total -----------\n'))
let showPages = [1]
showPages.push(Math.floor(count / 6))
showPages.push(Math.floor(count / 5))
showPages.push(Math.floor(count / 4))
showPages.push(Math.floor(count / 3))
showPages.push(Math.floor(count / 2))
showPages.push(Math.floor(count / 1.5))
let i = 0
let repeat = setInterval(function() {
if (!showPages[i]) {
clearInterval(repeat)
db.close()
return
}
showPage(col, showPages[i])
i += 1
}, 2000)

//cool moves,
const main = async function() {
let obj = await openDb({
db: dbName
})
})
let count = await obj.col.count()
console.log(chalk.blue('\n\n ----------- ' + niceNumber(count) + ' pages total -----------\n'))
await showPage(obj.col)
// await showPage(obj.col, count / 5)
await obj.client.close()
}
main()
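
The rewrite above swaps the old skip-to-an-offset lookup for MongoDB's `$sample` aggregation stage, so the report grabs a genuinely random page. The same spot-check works by hand from the mongo shell (assuming the default `pages` collection):

```js
// one random page, the same way bin/report.js samples it
db.pages.aggregate([{ $sample: { size: 1 } }])
```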
4 changes: 4 additions & 0 deletions changelog.md
@@ -16,3 +16,7 @@
* rename from `wikipedia-to-mongo` to `dumpster-dive`
* use wtf_wikipedia v3 (a big re-factor too!)
* use `line-by-line`, and `worker-nodes` to run parsing in parallel
### v3.1.0
* fix connection time-outs & improve logging output
* change default collection name to `pages`
* add `.custom()` function support
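
Because the default collection name changed, shell queries written against a pre-3.1.0 dump need updating to the new name, e.g.:

```js
db.wikipedia.findOne({title: 'Toronto'}) // default collection before v3.1.0
db.pages.findOne({title: 'Toronto'})     // default collection from v3.1.0 on
```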
4 changes: 2 additions & 2 deletions config.js
@@ -2,7 +2,7 @@ module.exports = {
//number of pages to write at a time, to the queue
"batch_size": 1000,
//the default name of the collection to write to
"collection": "wikipedia",
"collection": "pages",
//update interval
"logInterval": 4000,
"logInterval": 10000,
}
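
The batch size, at least, can also be set per run — src/01-prepwork.js falls back to this config value when the options object doesn't supply one. A minimal sketch, with made-up path and numbers:

```js
const dumpster = require('dumpster-dive')

dumpster({
  file: './simplewiki-latest-pages-articles.xml', // made-up path
  db: 'simplewiki',
  batch_size: 500 // overrides the 1000-page default above
}, () => console.log('all done!'))
```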
14 changes: 6 additions & 8 deletions package.json
@@ -2,37 +2,35 @@
"author": "Spencer Kelly <spencermountain@gmail.com> (http://spencermounta.in)",
"name": "dumpster-dive",
"description": "get a wikipedia dump parsed into mongodb",
"version": "3.0.4",
"version": "3.1.0",
"repository": {
"type": "git",
"url": "git://github.com/spencermountain/wikipedia-to-mongodb.git"
},
"bin": {
"dumpster": "./bin/dumpster.js",
"dumpster-report": "./bin/report.js"
"dumpster": "./bin/dumpster.js"
},
"engines": {
"node": ">=6.0.0"
},
"main": "./src/index.js",
"scripts": {
"test": "\"node_modules/.bin/tape\" \"./tests/*.test.js\" | \"node_modules/.bin/tap-spec\" --color",
"test": "\"node_modules/.bin/tape\" \"./tests/*.test.js\" | \"node_modules/.bin/tap-dancer\" --color",
"cleanup": "rm /tmp/worker.logs && touch /tmp/worker.logs",
"watch": "node ./scratch.js"
},
"dependencies": {
"chalk": "2.4.1",
"line-by-line": "0.1.6",
"mongodb": "3.0.7",
"ora": "2.1.0",
"prettysize": "1.1.0",
"sunday-driver": "1.0.1",
"worker-nodes": "1.6.0",
"wtf_wikipedia": "^3.1.1",
"yargs": "11.0.0"
},
"devDependencies": {
"shelljs": "^0.8.1",
"tap-spec": "4.1.1",
"shelljs": "0.8.2",
"tap-dancer": "0.0.3",
"tape": "4.9.0"
},
"license": "MIT"
33 changes: 25 additions & 8 deletions scratch.js
@@ -1,24 +1,41 @@
const dumpster = require('./src')
const drop = require('./src/lib/drop-db')

const path = '/Users/spencer/data/wikipedia/simplewiki-latest-pages-articles.xml'
// const path = '/Users/spencer/data/wikipedia/eswiki-latest-pages-articles.xml'
// const path = '/Users/spencer/data/wikipedia/enwiki-latest-pages-articles.xml'
// const path = './tests/smallwiki-latest-pages-articles.xml'
// const path = './tests/tinywiki-latest-pages-articles.xml'
//144mb → 2.5 minutes = 57mb per worker per minute

// const path = '/Users/spencer/data/wikipedia/afwiki-latest-pages-articles.xml' //4.3mins
const path = '/Users/spencer/data/wikipedia/simplewiki-latest-pages-articles.xml' //5mins //144 MB each
// const path = '/Users/spencer/data/wikipedia/eswiki-latest-pages-articles.xml' //2hrs - 12gb→5gb
// const path = '/Users/spencer/data/wikipedia/enwiki-latest-pages-articles.xml' //6hrs
// const path = './tests/smallwiki-latest-pages-articles.xml' //3s
// const path = './tests/tinywiki-latest-pages-articles.xml' //2s
const dbName = path.match(/\/([a-z-]+)-latest-pages/)[1]
let options = {
file: path,
db: dbName,
plaintext: true,
html: true,
markdown: true,
custom: function(doc) {
return {
_id: doc.title(),
title: doc.title(),
categories: doc.categories(),
}
}
}

//delete all pages
drop(options).then(() => {
dumpster(options)
})

// const fs = require('fs');
// let str = fs.readFileSync(path).toString()
// let str = `
// <text xml:space="preserve">
// this duplicate should stay
// from here too
// </text>`
// console.log(str.match(/<text xml:space="preserve">([\s\S]*?)<\/text>/))


// half- 6021472
// Euston Road - 5888070
13 changes: 7 additions & 6 deletions src/01-prelim-stuff.js → src/01-prepwork.js
@@ -3,7 +3,7 @@ const fs = require("fs")
const config = require("../config")
const cpuCount = require('os').cpus().length

const guardFile = function(options) {
const guardIO = function(options) {
if (!options.file || !fs.existsSync(options.file)) {
console.log(chalk.red('\n --can\'t find file: "' + chalk.blue(options.file) + '" ---'));
console.log(chalk.grey(' please supply a filename for the wikipedia article dump in xml format'));
@@ -16,18 +16,19 @@ const guardFile = function(options) {
}
}

//a little housework first,
const prepare = function(options) {
//a little housework first, for our config object
const prepWork = function(options) {
options = options || {}
options = Object.assign({}, options);

//guess an appropriate dbName
if (!options.db) {
options.db = options.file.match(/\/([a-z-]+)-latest-pages/)[1] || 'wikipedia'
}
guardFile(options)
//make sure the file looks good..
guardIO(options)

//few defaults
//set a few defaults
options.dbName = options.db
options.workers = options.workers || cpuCount
options.batch_size = options.batch_size || config.batch_size
@@ -38,4 +39,4 @@ const prepare = function(options) {
});
return options
}
module.exports = prepare
module.exports = prepWork
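
For what it's worth, the db-name guess in `prepWork` just pulls the wiki prefix out of the dump's filename, with the same regex scratch.js uses — roughly:

```js
// guess a db name from the dump's filename (the path here is made up)
let file = '/data/wikipedia/simplewiki-latest-pages-articles.xml'
let db = file.match(/\/([a-z-]+)-latest-pages/)[1] // → 'simplewiki'
```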
