Commit: Fix code scanning issues and add flake config

ranjan-mohanty committed May 18, 2024
1 parent e532bff commit 8e972ad
Showing 6 changed files with 32 additions and 21 deletions.
12 changes: 12 additions & 0 deletions .flake8
@@ -0,0 +1,12 @@
[flake8]
exclude =
.git,
.github,
__pycache__,
venv,
dist,
config,
build

max-complexity = 10
max-line-length = 120
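
For context on how this config is consumed: flake8 discovers a `.flake8` file in the directory it runs from, so a bare invocation at the repo root applies the excludes and both limits automatically. A minimal sketch, assuming flake8 (with its bundled mccabe plugin, which implements `max-complexity`) is installed in the active environment:

```python
# A minimal sketch, assuming flake8 is installed in the active environment:
# invoking it from the repo root makes it read the .flake8 file above, so the
# excludes, max-complexity=10, and max-line-length=120 all apply implicitly.
import subprocess

result = subprocess.run(["flake8", "."], capture_output=True, text=True)
print(result.stdout or "no violations")  # one "path:line:col: code message" per hit
```
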
2 changes: 1 addition & 1 deletion CODE_OF_CONDUCT.md
@@ -60,7 +60,7 @@ representative at an online or offline event.

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement at
ranjan@duck.com.
<ranjan@duck.com>.
All complaints will be reviewed and investigated promptly and fairly.

All community leaders are obligated to respect the privacy and security of the
16 changes: 8 additions & 8 deletions README.md
@@ -78,7 +78,7 @@ amazon-scraper --url-list product_urls.txt

This will process each URL in the file and save the scraped details for each product in separate directories within "output".

**Optional: Downloading Images**
**Optional: Downloading Images:**

```bash
amazon-scraper --url https://www.amazon.com/product-1 --download-image
@@ -109,16 +109,16 @@ Scrape details for two products from a file named "products.txt" and download im

1. Create a file named "products.txt" with the following content:

```
https://www.amazon.com/product-1
https://www.amazon.com/product-2
```
   ```
   https://www.amazon.com/product-1
   https://www.amazon.com/product-2
   ```

2. Run the script with the following command:

```bash
amazon-scraper --url-list products.txt --download-image
```
   ```bash
   amazon-scraper --url-list products.txt --download-image
   ```

This will process both URLs in the file, scrape details, create separate output directories for each product, and download images.

4 changes: 2 additions & 2 deletions amazon_product_details_scraper/app.py
@@ -69,12 +69,12 @@ def main():
try:
url_list_path = os.path.normpath(url_list_path)
url_list = read_file(url_list_path)
except FileNotFoundError as e:
except FileNotFoundError:
logging.error(f"Error: URL list file not found: {url_list_path}")
exit(1)

# Process each product URL
for item_num, url in enumerate(url_list, start=1):
for url in url_list:
try:
product_details = get_product_detail(url)

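A note on the new loop: with the unused `item_num` gone, plain iteration over `url_list` is enough; keeping `enumerate` would rebind `url` to `(index, url)` tuples and break `get_product_detail(url)`. A minimal sketch with hypothetical URLs:

```python
# A minimal sketch with hypothetical URLs: plain iteration yields each URL
# string directly, which is what get_product_detail(url) expects.
url_list = [
    "https://www.amazon.com/product-1",
    "https://www.amazon.com/product-2",
]

for url in url_list:
    print(url)  # a str; enumerate(url_list, start=1) would yield (1, url) tuples
```
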
17 changes: 8 additions & 9 deletions amazon_product_details_scraper/core/scraper.py
@@ -1,3 +1,4 @@
import logging
import os
import re
import json
@@ -12,11 +13,7 @@
DEFAULT_OUTPUT_FILENAME,
VALID_DOMAIN_NAMES,
)
from amazon_product_details_scraper.core.utils import (
create_folder,
extract_image_extension,
download_image,
)
from amazon_product_details_scraper.core.utils import create_folder, download_image, extract_image_extension


def get_product_detail(url):
@@ -39,16 +36,17 @@ def get_product_detail(url):
if not is_valid_url(url):
raise Exception("Invalid amazon product URL")
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)" +
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"
}

response = requests.get(url, headers=headers)
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status() # Raise an exception for non-200 status codes

soup = BeautifulSoup(response.content, "html.parser")

asin_element = soup.find("input", id="ASIN")
id = asin_element["value"] if asin_element else str(uuid.uuid4())
item_id = asin_element["value"] if asin_element else str(uuid.uuid4())

title_element = soup.find("span", id="productTitle")
title = title_element.text.strip() if title_element else None
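
An aside on the `id` → `item_id` rename above: it stops the local variable from shadowing Python's built-in `id()`, the kind of redefinition code scanning tools flag. A minimal sketch of the same lookup against a hypothetical HTML fragment:

```python
# A minimal sketch with a hypothetical HTML fragment: the ASIN comes from the
# hidden <input id="ASIN"> when present, with a random UUID as the fallback.
import uuid

from bs4 import BeautifulSoup

html = '<input id="ASIN" value="B000000000">'  # hypothetical fragment
soup = BeautifulSoup(html, "html.parser")

asin_element = soup.find("input", id="ASIN")
item_id = asin_element["value"] if asin_element else str(uuid.uuid4())
print(item_id)  # "B000000000"; a uuid4 string if the element were missing
```
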
@@ -67,7 +65,7 @@ def get_product_detail(url):
image_urls.extend(re.findall(image_url_pattern, script_text))

return {
"id": id,
"id": item_id,
"title": title,
"description": description,
"image_urls": image_urls,
@@ -129,4 +127,5 @@ def is_valid_url(url):
# Check if the domain ends with any of the valid domains
return any(domain.endswith(tld.lower()) for tld in VALID_DOMAIN_NAMES)
except Exception as e:
logging.error(f"Error validating URL: {e}")
return False
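
Two behaviors in this file are worth spelling out: `requests` interprets `timeout` in seconds and raises `requests.exceptions.Timeout` rather than hanging, and `is_valid_url` now logs why validation failed instead of silently returning `False`. A sketch of the timeout handling, assuming the public `requests` API and a placeholder URL:

```python
# A minimal sketch, assuming the public requests API: timeout is in seconds
# and bounds both the connect and each read, so a stalled server raises
# instead of blocking the scraper forever. The URL is a placeholder.
import requests

try:
    response = requests.get("https://www.amazon.com/product-1", timeout=10)
    response.raise_for_status()
except requests.exceptions.Timeout:
    print("request timed out after 10 seconds")
except requests.exceptions.HTTPError as err:
    print(f"non-2xx response: {err}")
```
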
2 changes: 1 addition & 1 deletion amazon_product_details_scraper/core/utils.py
@@ -72,7 +72,7 @@ def download_image(
image_path = os.path.join(output_dir, file_name)

# Download the image using requests
response = requests.get(url, stream=True)
response = requests.get(url, stream=True, timeout=10)
response.raise_for_status() # Raise an exception for unsuccessful requests

# Write the image data to the file
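The same timeout reasoning applies here, and `stream=True` adds one more wrinkle: the response body is fetched lazily, so the image should be written in chunks rather than loaded whole. A minimal sketch with a hypothetical URL and filename:

```python
# A minimal sketch with a hypothetical URL and filename: stream=True defers
# the body download, and iter_content writes it to disk in fixed-size chunks.
import requests

response = requests.get("https://example.com/image.jpg", stream=True, timeout=10)
response.raise_for_status()

with open("image.jpg", "wb") as fh:
    for chunk in response.iter_content(chunk_size=8192):
        fh.write(chunk)
```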
