Commit: Fix code scanning issues and add flake config

ranjan-mohanty committed May 18, 2024
1 parent e532bff commit 8e972ad
Showing 6 changed files with 32 additions and 21 deletions.
12 changes: 12 additions & 0 deletions .flake8
@@ -0,0 +1,12 @@
[flake8]
exclude =
.git,
.github,
__pycache__,
venv,
dist,
config,
build

max-complexity = 10
max-line-length = 120
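
For context on how this config is consumed: flake8 discovers a `.flake8` file in the directory it runs from, so a bare invocation at the repo root applies the excludes and both limits automatically. A minimal sketch, assuming flake8 (with its bundled mccabe plugin, which implements `max-complexity`) is installed in the active environment:

```python
# A minimal sketch, assuming flake8 is installed in the active environment:
# invoking it from the repo root makes it read the .flake8 file above, so the
# excludes, max-complexity=10, and max-line-length=120 all apply implicitly.
import subprocess

result = subprocess.run(["flake8", "."], capture_output=True, text=True)
print(result.stdout or "no violations")  # one "path:line:col: code message" per hit
```
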
2 changes: 1 addition & 1 deletion CODE_OF_CONDUCT.md
@@ -60,7 +60,7 @@ representative at an online or offline event.

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement at
ranjan@duck.com.
<ranjan@duck.com>.
All complaints will be reviewed and investigated promptly and fairly.

All community leaders are obligated to respect the privacy and security of the
16 changes: 8 additions & 8 deletions README.md
@@ -78,7 +78,7 @@ amazon-scraper --url-list product_urls.txt

This will process each URL in the file and save the scraped details for each product in separate directories within "output".

**Optional: Downloading Images**
**Optional: Downloading Images:**

```bash
amazon-scraper --url https://www.amazon.com/product-1 --download-image
@@ -109,16 +109,16 @@ Scrape details for two products from a file named "products.txt" and download im

1. Create a file named "products.txt" with the following content:

```
https://www.amazon.com/product-1
https://www.amazon.com/product-2
```
   ```
   https://www.amazon.com/product-1
   https://www.amazon.com/product-2
   ```

2. Run the script with the following command:

```bash
amazon-scraper --url-list products.txt --download-image
```
   ```bash
   amazon-scraper --url-list products.txt --download-image
   ```

This will process both URLs in the file, scrape details, create separate output directories for each product, and download images.

4 changes: 2 additions & 2 deletions amazon_product_details_scraper/app.py
@@ -69,12 +69,12 @@ def main():
try:
url_list_path = os.path.normpath(url_list_path)
url_list = read_file(url_list_path)
except FileNotFoundError as e:
except FileNotFoundError:
logging.error(f"Error: URL list file not found: {url_list_path}")
exit(1)

# Process each product URL
for item_num, url in enumerate(url_list, start=1):
for url in url_list:
try:
product_details = get_product_detail(url)

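A note on the new loop: with the unused `item_num` gone, plain iteration over `url_list` is enough; keeping `enumerate` would rebind `url` to `(index, url)` tuples and break `get_product_detail(url)`. A minimal sketch with hypothetical URLs:

```python
# A minimal sketch with hypothetical URLs: plain iteration yields each URL
# string directly, which is what get_product_detail(url) expects.
url_list = [
    "https://www.amazon.com/product-1",
    "https://www.amazon.com/product-2",
]

for url in url_list:
    print(url)  # a str; enumerate(url_list, start=1) would yield (1, url) tuples
```
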
17 changes: 8 additions & 9 deletions amazon_product_details_scraper/core/scraper.py
@@ -1,3 +1,4 @@
import logging
import os
import re
import json
@@ -12,11 +13,7 @@
DEFAULT_OUTPUT_FILENAME,
VALID_DOMAIN_NAMES,
)
from amazon_product_details_scraper.core.utils import (
create_folder,
extract_image_extension,
download_image,
)
from amazon_product_details_scraper.core.utils import create_folder, download_image, extract_image_extension


def get_product_detail(url):
@@ -39,16 +36,17 @@ def get_product_detail(url):
if not is_valid_url(url):
raise Exception("Invalid amazon product URL")
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)" +
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"
}

response = requests.get(url, headers=headers)
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status() # Raise an exception for non-200 status codes

soup = BeautifulSoup(response.content, "html.parser")

asin_element = soup.find("input", id="ASIN")
id = asin_element["value"] if asin_element else str(uuid.uuid4())
item_id = asin_element["value"] if asin_element else str(uuid.uuid4())

title_element = soup.find("span", id="productTitle")
title = title_element.text.strip() if title_element else None
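
An aside on the `id` → `item_id` rename above: it stops the local variable from shadowing Python's built-in `id()`, the kind of redefinition code scanning tools flag. A minimal sketch of the same lookup against a hypothetical HTML fragment:

```python
# A minimal sketch with a hypothetical HTML fragment: the ASIN comes from the
# hidden <input id="ASIN"> when present, with a random UUID as the fallback.
import uuid

from bs4 import BeautifulSoup

html = '<input id="ASIN" value="B000000000">'  # hypothetical fragment
soup = BeautifulSoup(html, "html.parser")

asin_element = soup.find("input", id="ASIN")
item_id = asin_element["value"] if asin_element else str(uuid.uuid4())
print(item_id)  # "B000000000"; a uuid4 string if the element were missing
```
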
@@ -67,7 +65,7 @@ def get_product_detail(url):
image_urls.extend(re.findall(image_url_pattern, script_text))

return {
"id": id,
"id": item_id,
"title": title,
"description": description,
"image_urls": image_urls,
@@ -129,4 +127,5 @@ def is_valid_url(url):
# Check if the domain ends with any of the valid domains
return any(domain.endswith(tld.lower()) for tld in VALID_DOMAIN_NAMES)
except Exception as e:
logging.error(f"Error validating URL: {e}")
return False
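
Two behaviors in this file are worth spelling out: `requests` interprets `timeout` in seconds and raises `requests.exceptions.Timeout` rather than hanging, and `is_valid_url` now logs why validation failed instead of silently returning `False`. A sketch of the timeout handling, assuming the public `requests` API and a placeholder URL:

```python
# A minimal sketch, assuming the public requests API: timeout is in seconds
# and bounds both the connect and each read, so a stalled server raises
# instead of blocking the scraper forever. The URL is a placeholder.
import requests

try:
    response = requests.get("https://www.amazon.com/product-1", timeout=10)
    response.raise_for_status()
except requests.exceptions.Timeout:
    print("request timed out after 10 seconds")
except requests.exceptions.HTTPError as err:
    print(f"non-2xx response: {err}")
```
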
2 changes: 1 addition & 1 deletion amazon_product_details_scraper/core/utils.py
@@ -72,7 +72,7 @@ def download_image(
image_path = os.path.join(output_dir, file_name)

# Download the image using requests
response = requests.get(url, stream=True)
response = requests.get(url, stream=True, timeout=10)
response.raise_for_status() # Raise an exception for unsuccessful requests

# Write the image data to the file
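The same timeout reasoning applies here, and `stream=True` adds one more wrinkle: the response body is fetched lazily, so the image should be written in chunks rather than loaded whole. A minimal sketch with a hypothetical URL and filename:

```python
# A minimal sketch with a hypothetical URL and filename: stream=True defers
# the body download, and iter_content writes it to disk in fixed-size chunks.
import requests

response = requests.get("https://example.com/image.jpg", stream=True, timeout=10)
response.raise_for_status()

with open("image.jpg", "wb") as fh:
    for chunk in response.iter_content(chunk_size=8192):
        fh.write(chunk)
```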
