Skip to content

Commit

Permalink
Output the referer if a page gets a status code > 400 when verified
Browse files Browse the repository at this point in the history
  • Loading branch information
soulgalore committed Mar 31, 2014
1 parent 2e8601a commit e5fd0b5
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 1 deletion.
5 changes: 5 additions & 0 deletions CHANGELOG
@@ -1,4 +1,9 @@
CHANGELOG crawler

version 1.5.14
------------------------
* Log the referer URL if a page gets a status code > 400 when verified

version 1.5.13
------------------------
* Catch an "a href" that has a newline between "a" and "href" instead of a space (some WP plugins create them that way)
Expand Down
6 changes: 5 additions & 1 deletion src/main/java/com/soulgalore/crawler/run/CrawlToFile.java
Expand Up @@ -32,6 +32,7 @@
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.http.HttpStatus;

import com.google.inject.Guice;
import com.google.inject.Injector;
Expand Down Expand Up @@ -106,7 +107,10 @@ private void crawl() {
if (result.getNonWorkingUrls().size() > 0) {
for (HTMLPageResponse nonWorkingUrl : result.getNonWorkingUrls()) {
nonWorkingUrls.append(StatusCode.toFriendlyName(nonWorkingUrl.getResponseCode()))
.append(",").append(nonWorkingUrl.getUrl()).append("\n");
.append(",").append(nonWorkingUrl.getUrl());
if (nonWorkingUrl.getResponseCode() >= HttpStatus.SC_NOT_FOUND)
nonWorkingUrls.append(" from ").append(nonWorkingUrl.getPageUrl().getReferer());
nonWorkingUrls.append("\n");
}

if (verbose) System.out.println("Start storing file non working urls " + errorFileName);
Expand Down

0 comments on commit e5fd0b5

Please sign in to comment.