Skip to content

Commit

Permalink
Better handling of multiple RobotsTxt clauses
Browse files Browse the repository at this point in the history
  • Loading branch information
emmanuel-keller committed Jun 18, 2017
1 parent 8b264c9 commit b3548c0
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference;

/**
* Contains the clause list of a "robots.txt" file for one "User-agent".
Expand All @@ -27,16 +29,13 @@ public final class RobotsTxtClauseSet {
private final static RobotsTxtClauseSet EMPTY = new RobotsTxtClauseSet();

private final Map<RobotsTxtPathMatcher, Boolean> clauses;
private final boolean defaultValue;

/**
 * Creates a clause set from the clauses collected by the given builder.
 * The builder's map is exposed through a read-only view (it is wrapped, not
 * copied) and the builder's default value is carried over.
 * <p>
 * NOTE(review): Builder.clauses is only allocated on the first add(), and
 * Collections.unmodifiableMap does not accept null; presumably the caller
 * never passes a builder that holds no clause -- confirm against build().
 *
 * @param builder the builder holding the collected clauses
 */
private RobotsTxtClauseSet(Builder builder) {
clauses = Collections.unmodifiableMap(builder.clauses);
defaultValue = builder.defautValue;
}

/**
 * Creates a clause set that holds no clause: the clause map is left null and
 * the default value is true. This is the constructor used for the shared
 * EMPTY instance.
 */
private RobotsTxtClauseSet() {
clauses = null;
defaultValue = true;
}

/**
Expand All @@ -46,12 +45,19 @@ private RobotsTxtClauseSet() {
final boolean isAllowed(String path) {
    // Tells whether the given path may be crawled according to this clause set.
    // Every clause whose matcher accepts the path is a candidate; the clause with
    // the longest pattern (the most specific one) decides. When two matching
    // patterns have the same length, the clause registered first is kept.
    // A path that no clause matches is allowed, as is any path when there is no
    // clause at all or when the path is null.
    if (clauses == null || path == null)
        return true;
    // An empty path designates the root of the site.
    final String checkedPath = path.isEmpty() ? "/" : path;
    boolean allowed = true;
    int longestPattern = -1; // -1: no clause has matched yet
    for (Map.Entry<RobotsTxtPathMatcher, Boolean> clause : clauses.entrySet()) {
        final RobotsTxtPathMatcher matcher = clause.getKey();
        if (!matcher.match(checkedPath))
            continue;
        final int patternLength = matcher.getPattern().length();
        // Strictly longer only, so that the first of two equally long patterns wins.
        if (patternLength > longestPattern) {
            longestPattern = patternLength;
            allowed = clause.getValue();
        }
    }
    return allowed;
}

public Map<RobotsTxtPathMatcher, Boolean> getClauses() {
Expand All @@ -65,15 +71,13 @@ static Builder of() {
final static class Builder {

private Map<RobotsTxtPathMatcher, Boolean> clauses;
private boolean defautValue = true;

private boolean add(final RobotsTxtPathMatcher matcher, final Boolean result) {
private void add(final RobotsTxtPathMatcher matcher, final Boolean result) {
if (matcher == null)
return false;
return;
if (clauses == null)
clauses = new LinkedHashMap<>();
clauses.put(matcher, result);
return true;
}

/**
Expand All @@ -82,8 +86,7 @@ private boolean add(final RobotsTxtPathMatcher matcher, final Boolean result) {
* @param pattern the path of the clause
*/
final void allow(String pattern) {
    // Registers an "Allow" clause for the given path pattern. The clause is
    // registered exactly once. Registering an allow clause does not switch any
    // default to "disallow": a path that matches no clause stays allowed, and
    // conflicts between clauses are settled when a path is evaluated.
    add(RobotsTxtPathMatcher.of(pattern), true);
}

final void disallow(String pattern) {
Expand Down
21 changes: 21 additions & 0 deletions src/test/java/com/qwazr/crawler/web/robotstxt/RobotsTxtTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,15 @@
*/
package com.qwazr.crawler.web.robotstxt;

import com.qwazr.utils.CharsetUtils;
import org.apache.commons.io.IOUtils;
import org.junit.Assert;
import org.junit.Test;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

public class RobotsTxtTest {

void shouldMatch(String pattern, String... paths) {
Expand Down Expand Up @@ -61,4 +67,19 @@ public void pathMatcherTests() {
shouldNotMatch("/fish*.php", "/Fish.PHP");
}

/**
 * Parses a robots.txt made of one wildcard user-agent group holding a single
 * Allow line followed by a single Disallow line, then asserts the status
 * computed for the given URL and the user-agent "ua".
 *
 * @param url      the URL to evaluate
 * @param allow    the pattern written on the Allow line
 * @param disallow the pattern written on the Disallow line
 * @param status   the expected status
 */
public void checkAllowDisallow(String url, String allow, String disallow, RobotsTxt.Status status)
        throws IOException, URISyntaxException {
    final String content = "user-agent: *\nAllow: " + allow + "\nDisallow: " + disallow;
    final RobotsTxt robotsTxt = new RobotsTxt(
            IOUtils.toInputStream(content, CharsetUtils.CharsetUTF8), CharsetUtils.CharsetUTF8);
    final RobotsTxt.Status actual = robotsTxt.getStatus(URI.create(url), "ua");
    Assert.assertEquals(status, actual);
}

/**
 * Checks the status obtained when an Allow and a Disallow clause both apply
 * to the same URL.
 */
@Test
public void allowDisallowTests() throws IOException, URISyntaxException {
// Both patterns match "/page"; the Allow pattern "/p" is longer than "/".
checkAllowDisallow("http://example.com/page", "/p", "/", RobotsTxt.Status.ALLOW);
// Both patterns match; the Allow pattern "/folder/" is longer than "/folder".
checkAllowDisallow("http://example.com/folder/page", "/folder/", "/folder", RobotsTxt.Status.ALLOW);
// NOTE(review): disabled case. The Disallow pattern "/*.htm" is longer than the
// Allow pattern "/page", so ALLOW is presumably not what a longest-pattern rule
// returns here -- confirm the intended expectation before re-enabling.
//checkAllowDisallow("http://example.com/page.htm", "/page", "/*.htm", RobotsTxt.Status.ALLOW);
// "/$" presumably matches the root path only (end-of-path anchor) -- it is
// longer than "/", so the root is allowed...
checkAllowDisallow("http://example.com/", "/$", "/", RobotsTxt.Status.ALLOW);
// ...while any other path is expected to fall under the Disallow "/" clause.
checkAllowDisallow("http://example.com/page.htm", "/$", "/", RobotsTxt.Status.DISALLOW);
}
}

0 comments on commit b3548c0

Please sign in to comment.