Skip to content
This repository has been archived by the owner on Mar 7, 2021. It is now read-only.

Commit

Permalink
Performance optimization for very large sitemaps.
Browse files Browse the repository at this point in the history
Fixes #364
  • Loading branch information
Konstantin Bläsi committed Jul 24, 2017
1 parent 750511d commit b290215
Showing 1 changed file with 31 additions and 35 deletions.
66 changes: 31 additions & 35 deletions lib/crawler.js
Original file line number Diff line number Diff line change
Expand Up @@ -832,52 +832,48 @@ function cleanURL (URL, queueItem) {
* @param {QueueItem} queueItem The queue item representing the resource where the URLs were discovered
* @return {Array} Returns an array of unique and absolute URLs
*/
Crawler.prototype.cleanExpandResources = function(urlMatch, queueItem) {
Crawler.prototype.cleanExpandResources = function (urlMatch, queueItem) {
var crawler = this;

if (!urlMatch) {
return [];
}
const URLs = new Set();
let URL;
for (let i = 0; i < urlMatch.length; i++) {
URL = urlMatch[i];

return urlMatch
.filter(Boolean)
.map(function(url) {
return cleanURL(url, queueItem);
})
.reduce(function(list, URL) {

// Ensure URL is whole and complete
try {
URL = uri(URL)
.absoluteTo(queueItem.url || "")
.normalize()
.href();
} catch (e) {
// But if URI.js couldn't parse it - nobody can!
return list;
}
if (!URL) {
continue;
}

// If we hit an empty item, don't return it
if (!URL.length) {
return list;
}
URL = cleanURL(URL, queueItem);

// Ensure URL is whole and complete
try {
URL = uri(URL)
.absoluteTo(queueItem.url || "")
.normalize()
.href();
} catch (e) {
// But if URI.js couldn't parse it - nobody can!
continue;
}

// If we don't support the protocol in question
if (!crawler.protocolSupported(URL)) {
return list;
}
// If we hit an empty item, don't return it
if (!URL.length) {
continue;
}

// Does the item already exist in the list?
var exists = list.some(function(entry) {
return entry === URL;
});
// If we don't support the protocol in question
if (!crawler.protocolSupported(URL)) {
continue;
}

if (exists) {
return list;
}
URLs.add(URL);
}

return list.concat(URL);
}, []);
return Array.from(URLs);
};

/**
Expand Down

0 comments on commit b290215

Please sign in to comment.