Skip to content

Commit

Permalink
Only prefix absolute URLs with / (#34)
Browse files Browse the repository at this point in the history
  • Loading branch information
Gallaecio committed Aug 8, 2023
1 parent 6c7c318 commit 741d95f
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 10 deletions.
14 changes: 5 additions & 9 deletions src/protego.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,13 +49,6 @@ def _is_valid_directive_field(field):
)


def _enforce_path(pattern):
    """Return *pattern* with a leading "/" guaranteed.

    A pattern that already begins with "/" is returned unchanged; any other
    value (including the empty string) gets "/" prepended.
    """
    return pattern if pattern.startswith("/") else "/" + pattern


class _URLPattern(object):
"""Internal class which represents a URL pattern."""

Expand Down Expand Up @@ -179,6 +172,9 @@ def _quote_path(self, path):
return path or "/"

def _quote_pattern(self, pattern):
if pattern.startswith("https://") or pattern.startswith("http://"):
pattern = "/" + pattern

# Corner case for query-only (e.g. '/abc?') and param-only (e.g. '/abc;') URLs.
# Save the last character; otherwise, urlparse will kill it.
last_char = ""
Expand Down Expand Up @@ -444,11 +440,11 @@ def _parse_robotstxt(self, content):

elif field in _ALLOW_DIRECTIVE:
for rule_set in current_rule_sets:
rule_set.allow(_enforce_path(value))
rule_set.allow(value)

elif field in _DISALLOW_DIRECTIVE:
for rule_set in current_rule_sets:
rule_set.disallow(_enforce_path(value))
rule_set.disallow(value)

elif field in _SITEMAP_DIRECTIVE:
self._sitemap_list.append(value)
Expand Down
16 changes: 15 additions & 1 deletion tests/test_protego.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# encoding=utf-8
from datetime import time
from unittest import TestCase

import pytest

from protego import Protego, _RuleSet


Expand Down Expand Up @@ -1139,3 +1140,16 @@ def test_parse_time_period(self):
start_time, end_time = rs._parse_time_period("0500 0600", separator=" ")
self.assertEqual(start_time, time(5, 0))
self.assertEqual(end_time, time(6, 0))


@pytest.mark.parametrize(
    "allow,disallow,url,allowed",
    [
        ("*/p", "/", "http://example.com/page", True),
        ("/page", "*/*.htm", "https://example.com/page.htm", False),
    ],
)
def test_leading_asterisk(allow, disallow, url, allowed):
    """Check rules whose pattern begins with "*" against the expected verdict.

    Each case builds a robots.txt with one allow and one disallow rule for
    the wildcard user agent, then compares ``can_fetch`` with ``allowed``.
    """
    robots_txt = (
        "User-Agent: *\n"
        f"allow: {allow}\n"
        f"disallow: {disallow}\n"
    )
    parser = Protego.parse(robots_txt)
    verdict = parser.can_fetch(url, "*")
    assert verdict == allowed

0 comments on commit 741d95f

Please sign in to comment.