
Commit

Fix security vulnerability
ranjan-mohanty committed Mar 19, 2024
1 parent a03ad48 commit 772e60e
Showing 5 changed files with 61 additions and 3 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -6,6 +6,7 @@
[![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/ranjan-mohanty/amazon-product-details-scraper/build.yml)](https://github.com/ranjan-mohanty/amazon-product-details-scraper/actions/workflows/build.yml)
[![Downloads](https://static.pepy.tech/badge/amazon-product-details-scraper)](https://pepy.tech/project/amazon-product-details-scraper)
[![GitHub Issues or Pull Requests](https://img.shields.io/github/issues/ranjan-mohanty/amazon-product-details-scraper)](https://github.com/ranjan-mohanty/amazon-product-details-scraper/issues)
![Libraries.io dependency status for GitHub repo](https://img.shields.io/librariesio/github/ranjan-mohanty/amazon-product-details-scraper)
[![GitHub forks](https://img.shields.io/github/forks/ranjan-mohanty/amazon-product-details-scraper)](https://github.com/ranjan-mohanty/amazon-product-details-scraper/forks)
[![GitHub Repo stars](https://img.shields.io/github/stars/ranjan-mohanty/amazon-product-details-scraper)](https://github.com/ranjan-mohanty/amazon-product-details-scraper/stargazers)

4 changes: 3 additions & 1 deletion amazon_product_details_scraper/app.py
@@ -67,6 +67,7 @@ def main():
url_list = [input_url]
else:
try:
url_list_path = os.path.normpath(url_list_path)
url_list = read_file(url_list_path)
except FileNotFoundError as e:
logging.error(f"Error: URL list file not found: {url_list_path}")
@@ -82,6 +83,7 @@
# Log details with indentation for readability
# logging.info(json.dumps(product_details, indent=2))

output_dir = os.path.normpath(output_dir)
product_output_dir = os.path.join(output_dir, f"item_{item_num}")
create_folder(product_output_dir, overwrite=True)

@@ -95,7 +97,7 @@ def main():
else:
logging.error(f"Failed to fetch product details for {url}")
except Exception as e:
logging.exception(f"Failed to fetch product details for {url}", e)
logging.error(f"Failed to fetch product details for {url}: {e}")


if __name__ == "__main__":
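The two os.path.normpath calls added above canonicalize the user-supplied URL-list path and output directory before they are used. A minimal sketch of what normalization does, and of the kind of base-directory containment check it is commonly paired with; the sample paths and the commonpath comparison are illustrative assumptions, not code from this commit:

import os

# Illustrative paths only -- not values from the repository.
base_dir = os.path.abspath("output")
requested = os.path.normpath(os.path.join(base_dir, "..", "..", "etc", "passwd"))

print(requested)  # the "../.." segments are collapsed into an explicit path

# normpath only canonicalizes the string; a containment check like this one
# is what would actually reject a path that escapes the intended base directory.
print(os.path.commonpath([base_dir, requested]) == base_dir)  # False for this input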
2 changes: 2 additions & 0 deletions amazon_product_details_scraper/config/__init__.py
@@ -2,11 +2,13 @@
DEFAULT_LOG_LEVEL,
DEFAULT_OUTPUT_FILENAME,
DEFAULT_OUTPUT_FOLDER,
VALID_DOMAIN_NAMES,
)


__all__ = [
"DEFAULT_LOG_LEVEL",
"DEFAULT_OUTPUT_FILENAME",
"DEFAULT_OUTPUT_FOLDER",
"VALID_DOMAIN_NAMES",
]
24 changes: 24 additions & 0 deletions amazon_product_details_scraper/config/config.py
@@ -6,3 +6,27 @@

DEFAULT_OUTPUT_FILENAME = "product-info.json"
DEFAULT_OUTPUT_FOLDER = os.path.expanduser("../output")

VALID_DOMAIN_NAMES = [
"amazon.com",
"amazon.ca",
"amazon.co.uk",
"amazon.de",
"amazon.fr",
"amazon.in",
"amazon.it",
"amazon.co.jp",
"amazon.cn",
"amazon.com.mx",
"amazon.com.au",
"amazon.nl",
"amazon.pl",
"amazon.sg",
"amazon.sa",
"amazon.es",
"amazon.se",
"amazon.ae",
"amazon.br",
"amazon.com.tr",
"amzn.to",
]
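This allow-list is consumed by the is_valid_url helper added to scraper.py below, which compares the URL's host against each entry with a suffix match. A short illustration of that matching rule; the hostnames here are examples, not part of the commit:

from amazon_product_details_scraper.config import VALID_DOMAIN_NAMES

for host in ["www.amazon.de", "smile.amazon.com", "example.com"]:
    # Accept the host if it ends with any allowed domain -- the rule the scraper applies.
    allowed = any(host.lower().endswith(domain) for domain in VALID_DOMAIN_NAMES)
    print(host, allowed)

# Expected output: www.amazon.de True, smile.amazon.com True, example.com False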
33 changes: 31 additions & 2 deletions amazon_product_details_scraper/core/scraper.py
@@ -2,11 +2,15 @@
import re
import json
import codecs
from urllib3.util.url import parse_url

import requests
from bs4 import BeautifulSoup

from amazon_product_details_scraper.config import DEFAULT_OUTPUT_FILENAME
from amazon_product_details_scraper.config import (
DEFAULT_OUTPUT_FILENAME,
VALID_DOMAIN_NAMES,
)
from amazon_product_details_scraper.core.utils import (
create_folder,
extract_image_extension,
@@ -31,7 +35,8 @@ def get_product_detail(url):
Raises:
Exception: An exception may be raised if there's an issue fetching or parsing the product details.
"""

if not is_valid_url(url):
raise Exception("Invalid amazon product URL")
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"
}
@@ -96,3 +101,27 @@ def download_product_images(image_urls, output_dir):
for i, image_url in enumerate(image_urls, start=1):
file_name = f"image_{i}.{extract_image_extension(image_url)}"
download_image(image_url, output_dir, file_name)


def is_valid_url(url):
"""Validates the provided URL.
This function validates the URL and checks if the hostname is a valid domain name.
Args:
url (str): The URL to validate.
Returns:
bool: True if the URL is valid, False otherwise.
"""

try:
parsed_url = parse_url(url)
if not parsed_url.netloc or not parsed_url.scheme:
return False # Not a valid URL format
domain = parsed_url.netloc.lower()

# Check if the domain ends with any of the valid domains
return any(domain.endswith(tld.lower()) for tld in VALID_DOMAIN_NAMES)
except Exception as e:
return False
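With this check in place, get_product_detail raises before any HTTP request is made when the URL does not point at a recognized Amazon domain. A small usage sketch; the URLs are placeholders chosen for illustration:

from amazon_product_details_scraper.core.scraper import get_product_detail, is_valid_url

print(is_valid_url("https://www.amazon.com/dp/EXAMPLEASIN"))   # True
print(is_valid_url("https://not-a-real-store.example/dp/X"))   # False

try:
    get_product_detail("https://not-a-real-store.example/dp/X")
except Exception as err:
    print(err)  # "Invalid amazon product URL", raised before any network call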
