✨ Match or skip handler (function) names
roniemartinez committed May 21, 2022
1 parent 681b905 commit 72d1cfe
Showing 12 changed files with 84 additions and 7 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -224,7 +224,7 @@ Here is the summary of features supported by each parser backend.
</tbody>
</table>

## using the Docker image
## Using the Docker image

Pull the docker image using the following command.

14 changes: 14 additions & 0 deletions dude/__init__.py
@@ -195,6 +195,18 @@ def cli() -> None: # pragma: no cover
action="store_true",
help="Flag to ignore robots.txt.",
)
optional.add_argument(
"--pattern",
dest="pattern",
action="append",
help="Run handlers that match the provided patterns.",
)
optional.add_argument(
"--skip",
dest="skip",
action="append",
help="Skip handlers that match the provided patterns.",
)
arguments = parser.parse_args()

if arguments.version:
@@ -253,4 +265,6 @@ def cli() -> None: # pragma: no cover
follow_urls=arguments.follow_urls,
save_per_page=arguments.save_per_page,
ignore_robots_txt=arguments.ignore_robots_txt,
pattern=arguments.pattern,
skip=arguments.skip,
)
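
For reference, a minimal standalone sketch (not dude's actual parser setup, just the same `action="append"` idiom used above) showing how repeated `--pattern`/`--skip` flags accumulate into the lists that are later handed to `run()`:

```python
import argparse

# Minimal sketch: action="append" collects each repeated flag into a list,
# so `--pattern product --pattern price --skip debug` yields two lists.
parser = argparse.ArgumentParser()
parser.add_argument("--pattern", dest="pattern", action="append")
parser.add_argument("--skip", dest="skip", action="append")

args = parser.parse_args(["--pattern", "product", "--pattern", "price", "--skip", "debug"])
print(args.pattern)  # ['product', 'price']
print(args.skip)     # ['debug']
```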
20 changes: 17 additions & 3 deletions dude/base.py
@@ -74,6 +74,8 @@ def __init__(
self.requests: Deque = requests or collections.deque() # allows dynamically appending new requests for crawling
self.allowed_domains: Set[str] = set()
self.ignore_robots_txt: bool = False
self.pattern: Set[str] = set()
self.skip: Set[str] = set()

@abstractmethod
def run(
@@ -86,6 +88,8 @@ def run(
follow_urls: bool = False,
save_per_page: bool = False,
ignore_robots_txt: bool = False,
pattern: Sequence[str] = None,
skip: Sequence[str] = None,
**kwargs: Any,
) -> None:
"""
@@ -99,9 +103,15 @@ def run(
:param follow_urls: Automatically follow URLs.
:param save_per_page: Flag to save data on every page extraction or not. If not, saves all the data at the end.
:param ignore_robots_txt: Flag to ignore robots.txt.
:param pattern: Run handlers that match the provided patterns.
:param skip: Skip handlers that match the provided patterns.
"""
self.initialize_scraper(urls)
self.ignore_robots_txt = ignore_robots_txt
if pattern:
self.pattern = set(pattern)
if skip:
self.skip = set(skip)

logger.info("Using %s...", self.__class__.__name__)

@@ -573,13 +583,17 @@ async def extract_all_async(self, page_number: int, **kwargs: Any) -> AsyncItera
yield scraped_data

def get_scraping_rules(self, url: str) -> Iterable[Rule]:
return filter(rule_filter(url), self.rules)
return filter(rule_filter(url, self.pattern, self.skip), self.rules)

def get_setup_rules(self, url: str) -> Iterable[Rule]:
return sorted(filter(rule_filter(url, setup=True), self.rules), key=lambda r: r.priority)
return sorted(
filter(rule_filter(url, self.pattern, self.skip, setup=True), self.rules), key=lambda r: r.priority
)

def get_navigate_rules(self, url: str) -> Iterable[Rule]:
return sorted(filter(rule_filter(url, navigate=True), self.rules), key=lambda r: r.priority)
return sorted(
filter(rule_filter(url, self.pattern, self.skip, navigate=True), self.rules), key=lambda r: r.priority
)

def get_flattened_data(self) -> List[Dict]:
items = []
6 changes: 6 additions & 0 deletions dude/optional/beautifulsoup_scraper.py
@@ -29,6 +29,8 @@ def run(
follow_urls: bool = False,
save_per_page: bool = False,
ignore_robots_txt: bool = False,
pattern: Sequence[str] = None,
skip: Sequence[str] = None,
**kwargs: Any,
) -> None:
"""
@@ -42,6 +44,8 @@ def run(
:param follow_urls: Automatically follow URLs.
:param save_per_page: Flag to save data on every page extraction or not. If not, saves all the data at the end.
:param ignore_robots_txt: Flag to ignore robots.txt.
:param pattern: Run handlers that match the provided patterns.
:param skip: Skip handlers that match the provided patterns.
"""
super(BeautifulSoupScraper, self).run(
urls=urls,
Expand All @@ -52,6 +56,8 @@ def run(
follow_urls=follow_urls,
save_per_page=save_per_page,
ignore_robots_txt=ignore_robots_txt,
pattern=pattern,
skip=skip,
**kwargs,
)

6 changes: 6 additions & 0 deletions dude/optional/lxml_scraper.py
@@ -30,6 +30,8 @@ def run(
follow_urls: bool = False,
save_per_page: bool = False,
ignore_robots_txt: bool = False,
pattern: Sequence[str] = None,
skip: Sequence[str] = None,
**kwargs: Any,
) -> None:
"""
@@ -43,6 +45,8 @@ def run(
:param follow_urls: Automatically follow URLs.
:param save_per_page: Flag to save data on every page extraction or not. If not, saves all the data at the end.
:param ignore_robots_txt: Flag to ignore robots.txt.
:param pattern: Run handlers that match the provided patterns.
:param skip: Skip handlers that match the provided patterns.
"""
super(LxmlScraper, self).run(
urls=urls,
Expand All @@ -53,6 +57,8 @@ def run(
follow_urls=follow_urls,
save_per_page=save_per_page,
ignore_robots_txt=ignore_robots_txt,
pattern=pattern,
skip=skip,
**kwargs,
)

6 changes: 6 additions & 0 deletions dude/optional/parsel_scraper.py
@@ -29,6 +29,8 @@ def run(
follow_urls: bool = False,
save_per_page: bool = False,
ignore_robots_txt: bool = False,
pattern: Sequence[str] = None,
skip: Sequence[str] = None,
**kwargs: Any,
) -> None:
"""
@@ -42,6 +44,8 @@ def run(
:param follow_urls: Automatically follow URLs.
:param save_per_page: Flag to save data on every page extraction or not. If not, saves all the data at the end.
:param ignore_robots_txt: Flag to ignore robots.txt.
:param pattern: Run handlers that match the provided patterns.
:param skip: Skip handlers that match the provided patterns.
"""
super(ParselScraper, self).run(
urls=urls,
Expand All @@ -52,6 +56,8 @@ def run(
follow_urls=follow_urls,
save_per_page=save_per_page,
ignore_robots_txt=ignore_robots_txt,
pattern=pattern,
skip=skip,
**kwargs,
)

6 changes: 6 additions & 0 deletions dude/optional/pyppeteer_scraper.py
@@ -34,6 +34,8 @@ def run(
follow_urls: bool = False,
save_per_page: bool = False,
ignore_robots_txt: bool = False,
pattern: Sequence[str] = None,
skip: Sequence[str] = None,
headless: bool = True,
**kwargs: Any,
) -> None:
@@ -48,6 +50,8 @@ def run(
:param follow_urls: Automatically follow URLs.
:param save_per_page: Flag to save data on every page extraction or not. If not, saves all the data at the end.
:param ignore_robots_txt: Flag to ignore robots.txt.
:param pattern: Run handlers that match the provided patterns.
:param skip: Skip handlers that match the provided patterns.
:param headless: Enables headless browser. (default=True)
"""
Expand All @@ -60,6 +64,8 @@ def run(
follow_urls=follow_urls,
save_per_page=save_per_page,
ignore_robots_txt=ignore_robots_txt,
pattern=pattern,
skip=skip,
headless=headless,
**kwargs,
)
6 changes: 6 additions & 0 deletions dude/optional/selenium_scraper.py
@@ -40,6 +40,8 @@ def run(
follow_urls: bool = False,
save_per_page: bool = False,
ignore_robots_txt: bool = False,
pattern: Sequence[str] = None,
skip: Sequence[str] = None,
headless: bool = True,
browser_type: str = "chromium",
**kwargs: Any,
@@ -55,6 +57,8 @@ def run(
:param follow_urls: Automatically follow URLs.
:param save_per_page: Flag to save data on every page extraction or not. If not, saves all the data at the end.
:param ignore_robots_txt: Flag to ignore robots.txt.
:param pattern: Run handlers that match the provided patterns.
:param skip: Skip handlers that match the provided patterns.
:param headless: Enables headless browser. (default=True)
:param browser_type: Selenium supported browser types ("chromium", "firefox").
@@ -68,6 +72,8 @@ def run(
follow_urls=follow_urls,
save_per_page=save_per_page,
ignore_robots_txt=ignore_robots_txt,
pattern=pattern,
skip=skip,
headless=headless,
browser_type=browser_type,
**kwargs,
6 changes: 6 additions & 0 deletions dude/playwright_scraper.py
@@ -28,6 +28,8 @@ def run(
follow_urls: bool = False,
save_per_page: bool = False,
ignore_robots_txt: bool = False,
pattern: Sequence[str] = None,
skip: Sequence[str] = None,
headless: bool = True,
browser_type: str = "chromium",
**kwargs: Any,
@@ -43,6 +45,8 @@ def run(
:param follow_urls: Automatically follow URLs.
:param save_per_page: Flag to save data on every page extraction or not. If not, saves all the data at the end.
:param ignore_robots_txt: Flag to ignore robots.txt.
:param pattern: Run handlers that match the provided patterns.
:param skip: Skip handlers that match the provided patterns.
:param headless: Enables headless browser. (default=True)
:param browser_type: Playwright supported browser types ("chromium", "webkit" or "firefox").
@@ -56,6 +60,8 @@ def run(
follow_urls=follow_urls,
save_per_page=save_per_page,
ignore_robots_txt=ignore_robots_txt,
pattern=pattern,
skip=skip,
**{**kwargs, "headless": headless, "browser_type": browser_type},
)

11 changes: 9 additions & 2 deletions dude/rule.py
@@ -1,6 +1,6 @@
import fnmatch
from enum import Enum, auto
from typing import Callable, NamedTuple, Optional, Tuple, Union
from typing import Callable, NamedTuple, Optional, Set, Tuple, Union


class SelectorType(Enum):
@@ -75,12 +75,19 @@ def rule_grouper(rule: Rule) -> Selector:
return rule.group


def rule_filter(url: str, setup: bool = False, navigate: bool = False) -> Callable:
def rule_filter(url: str, pattern: Set[str], skip: Set[str], setup: bool = False, navigate: bool = False) -> Callable:
def wrapper(rule: Rule) -> bool:
if len(pattern) and not any(p in rule.handler.__name__ for p in pattern):
return False

if len(skip) and any(p in rule.handler.__name__ for p in skip):
return False

if callable(rule.url_matcher):
matches = rule.url_matcher(url)
else:
matches = fnmatch.fnmatch(url, rule.url_matcher)

return matches and rule.setup is setup and rule.navigate is navigate

return wrapper
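
A self-contained sketch of the name-matching check added above (the handler names here are illustrative, not from the codebase): a handler passes only if its `__name__` contains at least one `pattern` substring, when patterns are given, and none of the `skip` substrings.

```python
from typing import Callable, Set


def name_allowed(handler: Callable, pattern: Set[str], skip: Set[str]) -> bool:
    # Mirrors the checks in rule_filter: plain substring match against __name__.
    name = handler.__name__
    if pattern and not any(p in name for p in pattern):
        return False
    if skip and any(s in name for s in skip):
        return False
    return True


def product_title() -> None: ...
def debug_dump() -> None: ...


print(name_allowed(product_title, {"product"}, {"debug"}))  # True
print(name_allowed(debug_dump, {"product"}, {"debug"}))     # False
print(name_allowed(debug_dump, set(), set()))                # True (no filters given)
```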
6 changes: 6 additions & 0 deletions dude/scraper.py
@@ -22,6 +22,8 @@ def run(
follow_urls: bool = False,
save_per_page: bool = False,
ignore_robots_txt: bool = False,
pattern: Sequence[str] = None,
skip: Sequence[str] = None,
# extra args
parser: str = "playwright",
headless: bool = True,
@@ -39,6 +41,8 @@ def run(
:param follow_urls: Automatically follow URLs.
:param save_per_page: Flag to save data on every page extraction or not. If not, saves all the data at the end.
:param ignore_robots_txt: Flag to ignore robots.txt.
:param pattern: Run handlers that match the provided patterns.
:param skip: Skip handlers that match the provided patterns.
:param parser: Parser backend ["playwright" (default), "bs4", "parsel, "lxml", "pyppeteer" or "selenium"]
:param headless: Enables headless browser. (default=True)
@@ -99,5 +103,7 @@ def run(
follow_urls=follow_urls,
save_per_page=save_per_page or follow_urls,
ignore_robots_txt=ignore_robots_txt,
pattern=pattern,
skip=skip,
**{"headless": headless, "browser_type": browser_type},
)
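
A hedged end-to-end sketch of how the new keyword arguments could be used from the Python API; the selectors, URL, and handler names are illustrative, but `pattern=` and `skip=` correspond to the `run()` parameters added in this commit.

```python
from dude import select


@select(css=".product__title")  # illustrative selector, not from the repo
def product_title(element):
    return {"title": element.text_content()}


@select(css=".debug-panel")  # illustrative selector, not from the repo
def debug_dump(element):
    return {"debug": element.text_content()}


if __name__ == "__main__":
    import dude

    # Only handlers whose names contain "product" run; any handler whose
    # name contains "debug" is skipped.
    dude.run(urls=["https://example.com"], pattern=["product"], skip=["debug"])
```

The same values can be passed on the command line via the repeated `--pattern`/`--skip` flags added to `cli()` above.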
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pydude"
version = "0.20.1"
version = "0.21.0"
repository = "https://github.com/roniemartinez/dude"
description = "dude uncomplicated data extraction"
authors = ["Ronie Martinez <ronmarti18@gmail.com>"]
