✨ Match or skip handler (function) names
roniemartinez committed May 21, 2022
1 parent 681b905 commit 72d1cfe
Showing 12 changed files with 84 additions and 7 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -224,7 +224,7 @@ Here is the summary of features supported by each parser backend.
</tbody>
</table>

## using the Docker image
## Using the Docker image

Pull the docker image using the following command.

14 changes: 14 additions & 0 deletions dude/__init__.py
@@ -195,6 +195,18 @@ def cli() -> None: # pragma: no cover
action="store_true",
help="Flag to ignore robots.txt.",
)
optional.add_argument(
"--pattern",
dest="pattern",
action="append",
help="Run handlers that match the provided patterns.",
)
optional.add_argument(
"--skip",
dest="skip",
action="append",
help="Skip handlers that match the provided patterns.",
)
arguments = parser.parse_args()

if arguments.version:
@@ -253,4 +265,6 @@ def cli() -> None: # pragma: no cover
follow_urls=arguments.follow_urls,
save_per_page=arguments.save_per_page,
ignore_robots_txt=arguments.ignore_robots_txt,
pattern=arguments.pattern,
skip=arguments.skip,
)
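
For reference, a minimal standalone sketch (not dude's actual parser setup, just the same `action="append"` idiom used above) showing how repeated `--pattern`/`--skip` flags accumulate into the lists that are later handed to `run()`:

```python
import argparse

# Minimal sketch: action="append" collects each repeated flag into a list,
# so `--pattern product --pattern price --skip debug` yields two lists.
parser = argparse.ArgumentParser()
parser.add_argument("--pattern", dest="pattern", action="append")
parser.add_argument("--skip", dest="skip", action="append")

args = parser.parse_args(["--pattern", "product", "--pattern", "price", "--skip", "debug"])
print(args.pattern)  # ['product', 'price']
print(args.skip)     # ['debug']
```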
20 changes: 17 additions & 3 deletions dude/base.py
@@ -74,6 +74,8 @@ def __init__(
self.requests: Deque = requests or collections.deque() # allows dynamically appending new requests for crawling
self.allowed_domains: Set[str] = set()
self.ignore_robots_txt: bool = False
self.pattern: Set[str] = set()
self.skip: Set[str] = set()

@abstractmethod
def run(
@@ -86,6 +88,8 @@ def run(
follow_urls: bool = False,
save_per_page: bool = False,
ignore_robots_txt: bool = False,
pattern: Sequence[str] = None,
skip: Sequence[str] = None,
**kwargs: Any,
) -> None:
"""
@@ -99,9 +103,15 @@ def run(
:param follow_urls: Automatically follow URLs.
:param save_per_page: Flag to save data on every page extraction or not. If not, saves all the data at the end.
:param ignore_robots_txt: Flag to ignore robots.txt.
:param pattern: Run handlers that match the provided patterns.
:param skip: Skip handlers that match the provided patterns.
"""
self.initialize_scraper(urls)
self.ignore_robots_txt = ignore_robots_txt
if pattern:
self.pattern = set(pattern)
if skip:
self.skip = set(skip)

logger.info("Using %s...", self.__class__.__name__)

@@ -573,13 +583,17 @@ async def extract_all_async(self, page_number: int, **kwargs: Any) -> AsyncItera
yield scraped_data

def get_scraping_rules(self, url: str) -> Iterable[Rule]:
return filter(rule_filter(url), self.rules)
return filter(rule_filter(url, self.pattern, self.skip), self.rules)

def get_setup_rules(self, url: str) -> Iterable[Rule]:
return sorted(filter(rule_filter(url, setup=True), self.rules), key=lambda r: r.priority)
return sorted(
filter(rule_filter(url, self.pattern, self.skip, setup=True), self.rules), key=lambda r: r.priority
)

def get_navigate_rules(self, url: str) -> Iterable[Rule]:
return sorted(filter(rule_filter(url, navigate=True), self.rules), key=lambda r: r.priority)
return sorted(
filter(rule_filter(url, self.pattern, self.skip, navigate=True), self.rules), key=lambda r: r.priority
)

def get_flattened_data(self) -> List[Dict]:
items = []
6 changes: 6 additions & 0 deletions dude/optional/beautifulsoup_scraper.py
@@ -29,6 +29,8 @@ def run(
follow_urls: bool = False,
save_per_page: bool = False,
ignore_robots_txt: bool = False,
pattern: Sequence[str] = None,
skip: Sequence[str] = None,
**kwargs: Any,
) -> None:
"""
@@ -42,6 +44,8 @@ def run(
:param follow_urls: Automatically follow URLs.
:param save_per_page: Flag to save data on every page extraction or not. If not, saves all the data at the end.
:param ignore_robots_txt: Flag to ignore robots.txt.
:param pattern: Run handlers that match the provided patterns.
:param skip: Skip handlers that match the provided patterns.
"""
super(BeautifulSoupScraper, self).run(
urls=urls,
Expand All @@ -52,6 +56,8 @@ def run(
follow_urls=follow_urls,
save_per_page=save_per_page,
ignore_robots_txt=ignore_robots_txt,
pattern=pattern,
skip=skip,
**kwargs,
)

6 changes: 6 additions & 0 deletions dude/optional/lxml_scraper.py
@@ -30,6 +30,8 @@ def run(
follow_urls: bool = False,
save_per_page: bool = False,
ignore_robots_txt: bool = False,
pattern: Sequence[str] = None,
skip: Sequence[str] = None,
**kwargs: Any,
) -> None:
"""
@@ -43,6 +45,8 @@ def run(
:param follow_urls: Automatically follow URLs.
:param save_per_page: Flag to save data on every page extraction or not. If not, saves all the data at the end.
:param ignore_robots_txt: Flag to ignore robots.txt.
:param pattern: Run handlers that match the provided patterns.
:param skip: Skip handlers that match the provided patterns.
"""
super(LxmlScraper, self).run(
urls=urls,
Expand All @@ -53,6 +57,8 @@ def run(
follow_urls=follow_urls,
save_per_page=save_per_page,
ignore_robots_txt=ignore_robots_txt,
pattern=pattern,
skip=skip,
**kwargs,
)

6 changes: 6 additions & 0 deletions dude/optional/parsel_scraper.py
@@ -29,6 +29,8 @@ def run(
follow_urls: bool = False,
save_per_page: bool = False,
ignore_robots_txt: bool = False,
pattern: Sequence[str] = None,
skip: Sequence[str] = None,
**kwargs: Any,
) -> None:
"""
@@ -42,6 +44,8 @@ def run(
:param follow_urls: Automatically follow URLs.
:param save_per_page: Flag to save data on every page extraction or not. If not, saves all the data at the end.
:param ignore_robots_txt: Flag to ignore robots.txt.
:param pattern: Run handlers that match the provided patterns.
:param skip: Skip handlers that match the provided patterns.
"""
super(ParselScraper, self).run(
urls=urls,
Expand All @@ -52,6 +56,8 @@ def run(
follow_urls=follow_urls,
save_per_page=save_per_page,
ignore_robots_txt=ignore_robots_txt,
pattern=pattern,
skip=skip,
**kwargs,
)

6 changes: 6 additions & 0 deletions dude/optional/pyppeteer_scraper.py
@@ -34,6 +34,8 @@ def run(
follow_urls: bool = False,
save_per_page: bool = False,
ignore_robots_txt: bool = False,
pattern: Sequence[str] = None,
skip: Sequence[str] = None,
headless: bool = True,
**kwargs: Any,
) -> None:
@@ -48,6 +50,8 @@ def run(
:param follow_urls: Automatically follow URLs.
:param save_per_page: Flag to save data on every page extraction or not. If not, saves all the data at the end.
:param ignore_robots_txt: Flag to ignore robots.txt.
:param pattern: Run handlers that match the provided patterns.
:param skip: Skip handlers that match the provided patterns.
:param headless: Enables headless browser. (default=True)
"""
Expand All @@ -60,6 +64,8 @@ def run(
follow_urls=follow_urls,
save_per_page=save_per_page,
ignore_robots_txt=ignore_robots_txt,
pattern=pattern,
skip=skip,
headless=headless,
**kwargs,
)
6 changes: 6 additions & 0 deletions dude/optional/selenium_scraper.py
@@ -40,6 +40,8 @@ def run(
follow_urls: bool = False,
save_per_page: bool = False,
ignore_robots_txt: bool = False,
pattern: Sequence[str] = None,
skip: Sequence[str] = None,
headless: bool = True,
browser_type: str = "chromium",
**kwargs: Any,
@@ -55,6 +57,8 @@ def run(
:param follow_urls: Automatically follow URLs.
:param save_per_page: Flag to save data on every page extraction or not. If not, saves all the data at the end.
:param ignore_robots_txt: Flag to ignore robots.txt.
:param pattern: Run handlers that match the provided patterns.
:param skip: Skip handlers that match the provided patterns.
:param headless: Enables headless browser. (default=True)
:param browser_type: Selenium supported browser types ("chromium", "firefox").
@@ -68,6 +72,8 @@ def run(
follow_urls=follow_urls,
save_per_page=save_per_page,
ignore_robots_txt=ignore_robots_txt,
pattern=pattern,
skip=skip,
headless=headless,
browser_type=browser_type,
**kwargs,
6 changes: 6 additions & 0 deletions dude/playwright_scraper.py
@@ -28,6 +28,8 @@ def run(
follow_urls: bool = False,
save_per_page: bool = False,
ignore_robots_txt: bool = False,
pattern: Sequence[str] = None,
skip: Sequence[str] = None,
headless: bool = True,
browser_type: str = "chromium",
**kwargs: Any,
@@ -43,6 +45,8 @@ def run(
:param follow_urls: Automatically follow URLs.
:param save_per_page: Flag to save data on every page extraction or not. If not, saves all the data at the end.
:param ignore_robots_txt: Flag to ignore robots.txt.
:param pattern: Run handlers that match the provided patterns.
:param skip: Skip handlers that match the provided patterns.
:param headless: Enables headless browser. (default=True)
:param browser_type: Playwright supported browser types ("chromium", "webkit" or "firefox").
@@ -56,6 +60,8 @@ def run(
follow_urls=follow_urls,
save_per_page=save_per_page,
ignore_robots_txt=ignore_robots_txt,
pattern=pattern,
skip=skip,
**{**kwargs, "headless": headless, "browser_type": browser_type},
)

11 changes: 9 additions & 2 deletions dude/rule.py
@@ -1,6 +1,6 @@
import fnmatch
from enum import Enum, auto
from typing import Callable, NamedTuple, Optional, Tuple, Union
from typing import Callable, NamedTuple, Optional, Set, Tuple, Union


class SelectorType(Enum):
@@ -75,12 +75,19 @@ def rule_grouper(rule: Rule) -> Selector:
return rule.group


def rule_filter(url: str, setup: bool = False, navigate: bool = False) -> Callable:
def rule_filter(url: str, pattern: Set[str], skip: Set[str], setup: bool = False, navigate: bool = False) -> Callable:
def wrapper(rule: Rule) -> bool:
if len(pattern) and not any(p in rule.handler.__name__ for p in pattern):
return False

if len(skip) and any(p in rule.handler.__name__ for p in skip):
return False

if callable(rule.url_matcher):
matches = rule.url_matcher(url)
else:
matches = fnmatch.fnmatch(url, rule.url_matcher)

return matches and rule.setup is setup and rule.navigate is navigate

return wrapper
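
A self-contained sketch of the name-matching check added above (the handler names here are illustrative, not from the codebase): a handler passes only if its `__name__` contains at least one `pattern` substring, when patterns are given, and none of the `skip` substrings.

```python
from typing import Callable, Set


def name_allowed(handler: Callable, pattern: Set[str], skip: Set[str]) -> bool:
    # Mirrors the checks in rule_filter: plain substring match against __name__.
    name = handler.__name__
    if pattern and not any(p in name for p in pattern):
        return False
    if skip and any(s in name for s in skip):
        return False
    return True


def product_title() -> None: ...
def debug_dump() -> None: ...


print(name_allowed(product_title, {"product"}, {"debug"}))  # True
print(name_allowed(debug_dump, {"product"}, {"debug"}))     # False
print(name_allowed(debug_dump, set(), set()))                # True (no filters given)
```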
6 changes: 6 additions & 0 deletions dude/scraper.py
@@ -22,6 +22,8 @@ def run(
follow_urls: bool = False,
save_per_page: bool = False,
ignore_robots_txt: bool = False,
pattern: Sequence[str] = None,
skip: Sequence[str] = None,
# extra args
parser: str = "playwright",
headless: bool = True,
@@ -39,6 +41,8 @@ def run(
:param follow_urls: Automatically follow URLs.
:param save_per_page: Flag to save data on every page extraction or not. If not, saves all the data at the end.
:param ignore_robots_txt: Flag to ignore robots.txt.
:param pattern: Run handlers that match the provided patterns.
:param skip: Skip handlers that match the provided patterns.
:param parser: Parser backend ["playwright" (default), "bs4", "parsel, "lxml", "pyppeteer" or "selenium"]
:param headless: Enables headless browser. (default=True)
@@ -99,5 +103,7 @@ def run(
follow_urls=follow_urls,
save_per_page=save_per_page or follow_urls,
ignore_robots_txt=ignore_robots_txt,
pattern=pattern,
skip=skip,
**{"headless": headless, "browser_type": browser_type},
)
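
A hedged end-to-end sketch of how the new keyword arguments could be used from the Python API; the selectors, URL, and handler names are illustrative, but `pattern=` and `skip=` correspond to the `run()` parameters added in this commit.

```python
from dude import select


@select(css=".product__title")  # illustrative selector, not from the repo
def product_title(element):
    return {"title": element.text_content()}


@select(css=".debug-panel")  # illustrative selector, not from the repo
def debug_dump(element):
    return {"debug": element.text_content()}


if __name__ == "__main__":
    import dude

    # Only handlers whose names contain "product" run; any handler whose
    # name contains "debug" is skipped.
    dude.run(urls=["https://example.com"], pattern=["product"], skip=["debug"])
```

The same values can be passed on the command line via the repeated `--pattern`/`--skip` flags added to `cli()` above.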
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pydude"
version = "0.20.1"
version = "0.21.0"
repository = "https://github.com/roniemartinez/dude"
description = "dude uncomplicated data extraction"
authors = ["Ronie Martinez <ronmarti18@gmail.com>"]
