From 3d69a7d8a2c5b1630edb7f4c90b71e9c7c0f1c16 Mon Sep 17 00:00:00 2001 From: Lucia Seggiaro Date: Mon, 5 May 2025 10:42:32 +0200 Subject: [PATCH] feat: amazon SDEs --- src/scraperapi_mcp_server/scrape.py | 57 ------------ .../{model.py => scraping/models.py} | 3 +- src/scraperapi_mcp_server/scraping/scrape.py | 34 ++++++++ src/scraperapi_mcp_server/sdes/amazon.py | 35 ++++++++ src/scraperapi_mcp_server/sdes/base.py | 14 +++ src/scraperapi_mcp_server/sdes/models.py | 34 ++++++++ src/scraperapi_mcp_server/server.py | 87 +++++++++++++++++-- src/scraperapi_mcp_server/utils/__init__.py | 1 + .../{ => utils}/country_codes.py | 0 .../utils/make_request.py | 40 +++++++++ 10 files changed, 241 insertions(+), 64 deletions(-) delete mode 100644 src/scraperapi_mcp_server/scrape.py rename src/scraperapi_mcp_server/{model.py => scraping/models.py} (97%) create mode 100644 src/scraperapi_mcp_server/scraping/scrape.py create mode 100644 src/scraperapi_mcp_server/sdes/amazon.py create mode 100644 src/scraperapi_mcp_server/sdes/base.py create mode 100644 src/scraperapi_mcp_server/sdes/models.py create mode 100644 src/scraperapi_mcp_server/utils/__init__.py rename src/scraperapi_mcp_server/{ => utils}/country_codes.py (100%) create mode 100644 src/scraperapi_mcp_server/utils/make_request.py diff --git a/src/scraperapi_mcp_server/scrape.py b/src/scraperapi_mcp_server/scrape.py deleted file mode 100644 index fb4ac80..0000000 --- a/src/scraperapi_mcp_server/scrape.py +++ /dev/null @@ -1,57 +0,0 @@ -import requests -from requests.exceptions import RequestException, HTTPError as RequestsHTTPError -from mcp.shared.exceptions import McpError -from scraperapi_mcp_server.config import settings -from mcp.types import ( - ErrorData, - INTERNAL_ERROR, -) - -def basic_scrape( - url: str, - render: bool = None, - country_code: str = None, - premium: bool = None, - ultra_premium: bool = None, - device_type: str = None - ) -> str: - payload = { - 'api_key': settings.API_KEY, - 'url': url, - 
'output_format': 'markdown' - } - - optional_params = { - 'render': (render, lambda v: str(v).lower()), - 'country_code': (country_code, str), - 'premium': (premium, lambda v: str(v).lower()), - 'ultra_premium': (ultra_premium, lambda v: str(v).lower()), - 'device_type': (device_type, str) - } - - for key, (value, formatter) in optional_params.items(): - if value is not None: - payload[key] = formatter(value) - - try: - response = requests.get(settings.API_URL, params=payload, timeout=settings.API_TIMEOUT_SECONDS) - response.raise_for_status() - - return response.text - except RequestsHTTPError as e: - status_code = e.response.status_code if hasattr(e, 'response') else 500 - error_message = f"HTTP error {status_code} when scraping '{url}': {str(e)}" - raise McpError(ErrorData( - code=INTERNAL_ERROR, - message=error_message, - )) - except RequestException as e: - raise McpError(ErrorData( - code=INTERNAL_ERROR, - message=f"Connection error when scraping '{url}': {str(e)}", - )) - except Exception as e: - raise McpError(ErrorData( - code=INTERNAL_ERROR, - message=f"Unexpected error when scraping '{url}': {str(e)}", - )) \ No newline at end of file diff --git a/src/scraperapi_mcp_server/model.py b/src/scraperapi_mcp_server/scraping/models.py similarity index 97% rename from src/scraperapi_mcp_server/model.py rename to src/scraperapi_mcp_server/scraping/models.py index d0e2992..7de9b7d 100644 --- a/src/scraperapi_mcp_server/model.py +++ b/src/scraperapi_mcp_server/scraping/models.py @@ -1,6 +1,7 @@ from typing import Annotated from pydantic import BaseModel, Field, AnyUrl + class Scrape(BaseModel): """Parameters for scraping a URL.""" @@ -9,4 +10,4 @@ class Scrape(BaseModel): country_code: Annotated[str, Field(default=None, description="Country code to scrape from")] premium: Annotated[bool, Field(default=False, description="Whether to use premium scraping")] ultra_premium: Annotated[bool, Field(default=False, description="Whether to use ultra premium scraping")] - 
device_type: Annotated[str, Field(default=None, description="Device type to scrape from. Set request to use `mobile` or `desktop` user agents")] \ No newline at end of file + device_type: Annotated[str, Field(default=None, description="Device type to scrape from. Set request to use `mobile` or `desktop` user agents")] diff --git a/src/scraperapi_mcp_server/scraping/scrape.py b/src/scraperapi_mcp_server/scraping/scrape.py new file mode 100644 index 0000000..0d9b180 --- /dev/null +++ b/src/scraperapi_mcp_server/scraping/scrape.py @@ -0,0 +1,34 @@ +from scraperapi_mcp_server.config import settings +from scraperapi_mcp_server.utils.make_request import make_request + +def basic_scrape( + url: str, + render: bool = None, + country_code: str = None, + premium: bool = None, + ultra_premium: bool = None, + device_type: str = None + ) -> str: + payload = { + 'api_key': settings.API_KEY, + 'url': url, + 'output_format': 'markdown' + } + + optional_params = { + 'render': (render, lambda v: str(v).lower()), + 'country_code': (country_code, str), + 'premium': (premium, lambda v: str(v).lower()), + 'ultra_premium': (ultra_premium, lambda v: str(v).lower()), + 'device_type': (device_type, str) + } + + for key, (value, formatter) in optional_params.items(): + if value is not None: + payload[key] = formatter(value) + + return make_request( + url=settings.API_URL, + params=payload, + context=f"scraping '{url}'" + ) \ No newline at end of file diff --git a/src/scraperapi_mcp_server/sdes/amazon.py b/src/scraperapi_mcp_server/sdes/amazon.py new file mode 100644 index 0000000..56a6129 --- /dev/null +++ b/src/scraperapi_mcp_server/sdes/amazon.py @@ -0,0 +1,35 @@ +from .base import ScraperEndpoint + +amazon_product = ScraperEndpoint( + endpoint_path="/structured/amazon/product", + context_template="fetching Amazon product '{asin}'" +) + +amazon_search = ScraperEndpoint( + endpoint_path="/structured/amazon/search", + context_template="fetching Amazon search results for '{query}'" +) + 
+amazon_offers = ScraperEndpoint(
+    endpoint_path="/structured/amazon/offers",
+    context_template="fetching Amazon offers for '{asin}'"
+)
+
+def scrape_amazon_product(asin: str, tld: str, country: str, output_format: str) -> str:
+    return amazon_product.call(asin=asin, tld=tld, country=country, output_format=output_format)
+
+def scrape_amazon_search(query: str, tld: str, country: str, output_format: str, page: int = 1) -> str:  # page defaults to the first results page; the server.py tool wrapper does not pass 'page'
+    return amazon_search.call(query=query, tld=tld, country=country, output_format=output_format, page=page)
+
+def scrape_amazon_offers(asin: str, tld: str, country: str, output_format: str, f_new: bool, f_used_good: bool, f_used_like_new: bool, f_used_very_good: bool, f_used_acceptable: bool) -> str:
+    return amazon_offers.call(
+        asin=asin,
+        tld=tld,
+        country=country,
+        output_format=output_format,
+        f_new=f_new,
+        f_used_good=f_used_good,
+        f_used_like_new=f_used_like_new,
+        f_used_very_good=f_used_very_good,
+        f_used_acceptable=f_used_acceptable
+    )
diff --git a/src/scraperapi_mcp_server/sdes/base.py b/src/scraperapi_mcp_server/sdes/base.py
new file mode 100644
index 0000000..b93c7c7
--- /dev/null
+++ b/src/scraperapi_mcp_server/sdes/base.py
@@ -0,0 +1,14 @@
+from scraperapi_mcp_server.config import settings
+from scraperapi_mcp_server.utils.make_request import make_request
+
+class ScraperEndpoint:
+    def __init__(self, endpoint_path, context_template):
+        self.endpoint_path = endpoint_path
+        self.context_template = context_template
+
+    def call(self, **params):
+        payload = {'api_key': settings.API_KEY}
+        payload.update(params)
+        url = f"{settings.API_URL}{self.endpoint_path}"
+        context = self.context_template.format(**params)
+        return make_request(url=url, params=payload, context=context)
\ No newline at end of file
diff --git a/src/scraperapi_mcp_server/sdes/models.py b/src/scraperapi_mcp_server/sdes/models.py
new file mode 100644
index 0000000..68f6e79
--- /dev/null
+++ b/src/scraperapi_mcp_server/sdes/models.py
@@ -0,0 +1,34 @@
+from 
typing import Annotated
+from pydantic import BaseModel, Field, AnyUrl
+
+
+# Amazon
+class ScrapeAmazonProductParams(BaseModel):
+    """Parameters for scraping an Amazon product."""
+
+    asin: Annotated[str, Field(description="ASIN of the Amazon product page.")]
+    tld: Annotated[str, Field(description="Top-level domain to scrape.")]
+    country: Annotated[str, Field(description="Country to scrape from.")]
+    output_format: Annotated[str, Field(description="Output format to scrape from. We offer 'csv' and 'json' output. JSON is default if parameter is not added.")]
+
+
+class ScrapeAmazonSearchParams(BaseModel):
+    """Parameters for scraping an Amazon search."""
+
+    query: Annotated[str, Field(description="Query to scrape.")]
+    tld: Annotated[str, Field(description="Top-level domain to scrape.")]
+    country: Annotated[str, Field(description="Country to scrape from.")]
+    output_format: Annotated[str, Field(description="Output format to scrape from. We offer 'csv' and 'json' output. JSON is default if parameter is not added.")]
+
+
+class ScrapeAmazonOffersParams(BaseModel):
+    """Parameters for scraping Amazon offers."""
+    asin: Annotated[str, Field(description="ASIN of the Amazon product page.")]
+    tld: Annotated[str, Field(description="Top-level domain to scrape.")]
+    country: Annotated[str, Field(description="Country to scrape from.")]
+    output_format: Annotated[str, Field(description="Output format to scrape from. We offer 'csv' and 'json' output. 
JSON is default if parameter is not added.")] + f_new: Annotated[bool, Field(description="Whether to scrape new offers.")] + f_used_good: Annotated[bool, Field(description="Whether to scrape used good offers.")] + f_used_like_new: Annotated[bool, Field(description="Whether to scrape used like new offers.")] + f_used_very_good: Annotated[bool, Field(description="Whether to scrape used very good offers.")] + f_used_acceptable: Annotated[bool, Field(description="Whether to scrape used acceptable offers.")] diff --git a/src/scraperapi_mcp_server/server.py b/src/scraperapi_mcp_server/server.py index cc9e4b4..18e8a1e 100644 --- a/src/scraperapi_mcp_server/server.py +++ b/src/scraperapi_mcp_server/server.py @@ -5,9 +5,11 @@ ErrorData, INTERNAL_ERROR, ) -from scraperapi_mcp_server.model import Scrape -from scraperapi_mcp_server.country_codes import COUNTRY_CODES -from scraperapi_mcp_server.scrape import basic_scrape +from scraperapi_mcp_server.scraping.models import Scrape +from scraperapi_mcp_server.scraping.scrape import basic_scrape +from scraperapi_mcp_server.utils.country_codes import COUNTRY_CODES +from scraperapi_mcp_server.sdes.models import ScrapeAmazonProductParams, ScrapeAmazonSearchParams, ScrapeAmazonOffersParams +from scraperapi_mcp_server.sdes import amazon mcp = FastMCP("mcp-scraperapi") @@ -21,8 +23,7 @@ def scrape(params: Scrape) -> str: Args: params: A Scrape model instance containing all scraping parameters - url: The URL to scrape (required) - - render: Set to True ONLY if the page requires JavaScript to load content. - Default is False, which is sufficient for most static websites. + - render: Set to True ONLY if the page requires JavaScript to load content. Default is False, which is sufficient for most static websites. 
- country_code: Two-letter country code to scrape from (optional) - premium: Whether to use premium proxies (optional) - ultra_premium: Whether to use ultra premium proxies (optional) @@ -95,4 +96,78 @@ def scrape_prompt(params: str) -> str: scrape_params.country_code = code break - return scrape(scrape_params) \ No newline at end of file + return scrape(scrape_params) + + +# SDEs + + +# Amazon +@mcp.tool() +def scrape_amazon_product(params: ScrapeAmazonProductParams) -> str: + """ + Scrape a product from Amazon. + + Args: + params: + - asin: The ASIN of the product to scrape + - tld: The top-level domain to scrape + - country: The country to scrape + - output_format: The output format to scrape, we offer 'csv' and 'json' output. JSON is default if parameter is not added + """ + return amazon.scrape_amazon_product( + asin=params.asin, + tld=params.tld, + country=params.country, + output_format=params.output_format, + ) + + +@mcp.tool() +def scrape_amazon_search(params: ScrapeAmazonSearchParams) -> str: + """ + Scrape a search from Amazon. + + Args: + params: + - query: The query to scrape + - tld: The top-level domain to scrape + - country: The country to scrape + - output_format: The output format to scrape, we offer 'csv' and 'json' output. JSON is default if parameter is not added + """ + return amazon.scrape_amazon_search( + query=params.query, + tld=params.tld, + country=params.country, + output_format=params.output_format + ) + + +@mcp.tool() +def scrape_amazon_offers(params: ScrapeAmazonOffersParams) -> str: + """ + Scrape offers from Amazon. + + Args: + params: + - asin: The ASIN of the product to scrape + - tld: The top-level domain to scrape + - country: The country to scrape + - output_format: The output format to scrape, we offer 'csv' and 'json' output. 
JSON is default if parameter is not added + - f_new: Whether to scrape new offers + - f_used_good: Whether to scrape used good offers + - f_used_like_new: Whether to scrape used like new offers + - f_used_very_good: Whether to scrape used very good offers + - f_used_acceptable: Whether to scrape used acceptable offers + """ + return amazon.scrape_amazon_offers( + asin=params.asin, + tld=params.tld, + country=params.country, + output_format=params.output_format, + f_new=params.f_new, + f_used_good=params.f_used_good, + f_used_like_new=params.f_used_like_new, + f_used_very_good=params.f_used_very_good, + f_used_acceptable=params.f_used_acceptable + ) \ No newline at end of file diff --git a/src/scraperapi_mcp_server/utils/__init__.py b/src/scraperapi_mcp_server/utils/__init__.py new file mode 100644 index 0000000..5af8be3 --- /dev/null +++ b/src/scraperapi_mcp_server/utils/__init__.py @@ -0,0 +1 @@ +# Package marker \ No newline at end of file diff --git a/src/scraperapi_mcp_server/country_codes.py b/src/scraperapi_mcp_server/utils/country_codes.py similarity index 100% rename from src/scraperapi_mcp_server/country_codes.py rename to src/scraperapi_mcp_server/utils/country_codes.py diff --git a/src/scraperapi_mcp_server/utils/make_request.py b/src/scraperapi_mcp_server/utils/make_request.py new file mode 100644 index 0000000..1765ecc --- /dev/null +++ b/src/scraperapi_mcp_server/utils/make_request.py @@ -0,0 +1,40 @@ +import requests +from scraperapi_mcp_server.config import settings +from requests.exceptions import RequestException, HTTPError as RequestsHTTPError +from mcp.shared.exceptions import McpError +from mcp.types import ErrorData, INTERNAL_ERROR + + +def make_request(url: str, params: dict, context: str = "request") -> str: + """ + Make an HTTP GET request with unified error handling. + + Args: + url (str): The URL to request. + params (dict): Query parameters for the request. 
+        context (str): Context string for error messages (e.g., 'scraping', 'fetching eBay product').
+
+    Returns:
+        str: The response text.
+    """
+    try:
+        response = requests.get(url, params=params, timeout=settings.API_TIMEOUT_SECONDS)
+        response.raise_for_status()
+        return response.text
+    except RequestsHTTPError as e:
+        status_code = e.response.status_code if hasattr(e, 'response') else 500
+        error_message = f"HTTP error {status_code} when {context}: {str(e)}"
+        raise McpError(ErrorData(
+            code=INTERNAL_ERROR,
+            message=error_message,
+        ))
+    except RequestException as e:
+        raise McpError(ErrorData(
+            code=INTERNAL_ERROR,
+            message=f"Connection error when {context}: {str(e)}",
+        ))
+    except Exception as e:
+        raise McpError(ErrorData(
+            code=INTERNAL_ERROR,
+            message=f"Unexpected error when {context}: {str(e)}",
+        ))
\ No newline at end of file