Permalink
Browse files

Improvements

  • Loading branch information...
1 parent 86d9088 commit 63e556e7eb12980f1e48c3d450a1f8a10372ebee @skid committed Feb 29, 2012
Showing with 12 additions and 3 deletions.
  1. +12 −3 extractor.js
View
@@ -1,3 +1,11 @@
+// TODO: The tuning settings need explanation on their overall effect
+// TODO: Look at URL 28. Score and tag count need a stronger influence on text node selection
+// Content node is decided on title only
+// div height: 5 words: 602|94 score: 11.835164835164836 tags: 89 longest: 54 title: YES
+// Otherwise comments node is taken
+// div height: 6 words: 843|91 score: 56.3929122574956 tags: 229 longest: 237 title:
+
+
/**
* Refinery Extractor.
* Extracts text content from a DOM tree.
@@ -57,8 +65,6 @@ var legend = { a: "a", h1: "b", h2: "c", h3: "d", h4: "e", h5: "f", h6: "g", ul:
u: "I", strong: "J", em: "K", q: "L", sub: "M", sup: "N", abbr: "O", address: "D", li: "E", dl: "R",
}
-// TODO: The tuning settings need explanation on their overall effect
-
// Tuning settings for the extraction heuristics.
var heightThreshold = 5; // Stop comparing and producing node patterns for nodes that are higher than this
var anchorWeight = 4; // Added to the node's score if it is an anchor tag
var titleThreshold = 5; // Max number of words added to the page title (Name of site, etc...)
@@ -324,6 +330,10 @@ function analyze(subtree, parent, root, depth) {
score += anchorWeight;
node.aWords += node.words;
}
+ else if(name === 'h1') {
+ node.title = true;
+ }
+
if(node.title) {
parent.title = true;
@@ -400,7 +410,6 @@ function extract(root, options) {
var to = 'textOnly' in options ? options.textOnly : true;
var candidate = null;
- debugger;
(function walk(parent) {
if(parent.type !== 'tag' || !parent.children || !parent.children.length) {
return;

0 comments on commit 63e556e

Please sign in to comment.