Skip to content

Commit

Permalink
[Robots.txt] Handle allow/disallow directives containing unescaped
Browse files Browse the repository at this point in the history
Unicode characters, fixes crawler-commons#389
- use UTF-8 as default input encoding of robots.txt files
- add unit test
  - test matching of Unicode paths in allow/disallow directives
  - test for proper matching of ASCII paths if encoding is not
    UTF-8 (and no byte order mark present)
  • Loading branch information
sebastian-nagel committed Apr 24, 2023
1 parent d8a6126 commit 2c2cb3b
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 1 deletion.
12 changes: 11 additions & 1 deletion src/main/java/crawlercommons/robots/SimpleRobotRulesParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -492,7 +492,17 @@ private SimpleRobotRules parseContent(String url, byte[] content, String content

int bytesLen = content.length;
int offset = 0;
Charset encoding = StandardCharsets.US_ASCII;

/*
* RFC 9309 requires that it is "UTF-8 encoded" (<a href=
* "https://www.rfc-editor.org/rfc/rfc9309.html#name-access-method"> RFC
* 9309, section 2.3 Access Method</a>), but
* "Implementors MAY bridge encoding mismatches if they detect that the robots.txt file is not UTF-8 encoded."
* (<a href=
* "https://www.rfc-editor.org/rfc/rfc9309.html#name-the-allow-and-disallow-line"
* > RFC 9309, section 2.2.2. The "Allow" and "Disallow" Lines</a>)
*/
Charset encoding = StandardCharsets.UTF_8;

// Check for a UTF-8 BOM at the beginning (EF BB BF)
if ((bytesLen >= 3) && (content[0] == (byte) 0xEF) && (content[1] == (byte) 0xBB) && (content[2] == (byte) 0xBF)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

import java.io.InputStream;
import java.net.HttpURLConnection;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
Expand Down Expand Up @@ -226,6 +227,44 @@ void testNonAsciiEncoding() {
assertTrue(rules.isAllowed("http://www.domain.com/anypage.html"));
}

@Test
void testUnicodeUnescapedPaths() {
    // robots.txt mixing an unescaped Unicode path with a percent-encoded one,
    // plus a second agent group ("GoodBot") which is allowed everything
    final String robotsTxt = String.join(CRLF, //
                    "User-agent: *", //
                    "Disallow: /bücher/", //
                    "Disallow: /k%C3%B6nyvek/", //
                    "", //
                    "User-agent: GoodBot", //
                    "Allow: /");

    BaseRobotRules rules = createRobotRules("mybot", robotsTxt);
    assertTrue(rules.isAllowed("https://www.example.com/"));

    // the unescaped rule must match both the escaped and the unescaped URL form
    assertFalse(rules.isAllowed("https://www.example.com/b%C3%BCcher/book1.html"));
    assertFalse(rules.isAllowed("https://www.example.com/bücher/book2.html"));

    // (for completeness) the percent-encoded rule must match both URL forms, too
    assertFalse(rules.isAllowed("https://www.example.com/k%C3%B6nyvek/book1.html"));
    assertFalse(rules.isAllowed("https://www.example.com/könyvek/book2.html"));

    // invalid encoding (Latin-1 bytes of the same text): characters that are
    // not valid UTF-8 must not break parsing of the rules below them
    final byte[] latin1Bytes = robotsTxt.getBytes(StandardCharsets.ISO_8859_1);
    rules = createRobotRules("goodbot", latin1Bytes);
    assertTrue(rules.isAllowed("https://www.example.com/"));
    assertTrue(rules.isAllowed("https://www.example.com/b%C3%BCcher/book1.html"));

    // invalid encoding: only the rules containing invalid characters should be
    // ignored
    rules = createRobotRules("mybot", latin1Bytes);
    assertTrue(rules.isAllowed("https://www.example.com/"));
    assertFalse(rules.isAllowed("https://www.example.com/k%C3%B6nyvek/book1.html"));
    assertFalse(rules.isAllowed("https://www.example.com/könyvek/book2.html"));
    // if URL paths in disallow rules are not properly encoded, these two
    // URLs are not matched:
    // assertFalse(rules.isAllowed("https://www.example.com/b%C3%BCcher/book2.html"));
    // assertFalse(rules.isAllowed("https://www.example.com/bücher/book1.html"));
}

@Test
void testSimplestAllowAll() {
final String simpleRobotsTxt = "User-agent: *" + CRLF //
Expand Down

0 comments on commit 2c2cb3b

Please sign in to comment.