Extract the ASIN from the page for directory naming, or fall back to a UUID

ranjan-mohanty · Mar 20, 2024 · 0c14cfc · 0c14cfc
1 parent 772e60e
commit 0c14cfc
Show file tree

Hide file tree

Showing 4 changed files with 15 additions and 8 deletions.
diff --git a/.gitignore b/.gitignore
@@ -3,4 +3,5 @@ build
 dist
 my_env
 venv
-output
+output
+data
diff --git a/README.md b/README.md
@@ -1,15 +1,16 @@
 ## Amazon Product Details Scraper
 
-[![GitHub Release](https://img.shields.io/github/v/release/ranjan-mohanty/amazon-product-details-scraper)](https://github.com/ranjan-mohanty/amazon-product-details-scraper/releases)
 [![GitHub License](https://img.shields.io/github/license/ranjan-mohanty/amazon-product-details-scraper)](https://github.com/ranjan-mohanty/amazon-product-details-scraper/blob/main/LICENSE)
+[![GitHub Release](https://img.shields.io/github/v/release/ranjan-mohanty/amazon-product-details-scraper)](https://github.com/ranjan-mohanty/amazon-product-details-scraper/releases)
 [![PyPI - Version](https://img.shields.io/pypi/v/amazon-product-details-scraper)](https://pypi.org/project/amazon-product-details-scraper/)
-[![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/ranjan-mohanty/amazon-product-details-scraper/build.yml)](https://github.com/ranjan-mohanty/amazon-product-details-scraper/actions/workflows/build.yml)
 [![Downloads](https://static.pepy.tech/badge/amazon-product-details-scraper)](https://pepy.tech/project/amazon-product-details-scraper)
-[![GitHub Issues or Pull Requests](https://img.shields.io/github/issues/ranjan-mohanty/amazon-product-details-scraper)](https://github.com/ranjan-mohanty/amazon-product-details-scraper/issues)
-![Libraries.io dependency status for GitHub repo](https://img.shields.io/librariesio/github/ranjan-mohanty/amazon-product-details-scraper)
 [![GitHub forks](https://img.shields.io/github/forks/ranjan-mohanty/amazon-product-details-scraper)](https://github.com/ranjan-mohanty/amazon-product-details-scraper/forks)
 [![GitHub Repo stars](https://img.shields.io/github/stars/ranjan-mohanty/amazon-product-details-scraper)](https://github.com/ranjan-mohanty/amazon-product-details-scraper/stargazers)
 
+[![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/ranjan-mohanty/amazon-product-details-scraper/build.yml)](https://github.com/ranjan-mohanty/amazon-product-details-scraper/actions/workflows/build.yml)
+[![GitHub Issues or Pull Requests](https://img.shields.io/github/issues/ranjan-mohanty/amazon-product-details-scraper)](https://github.com/ranjan-mohanty/amazon-product-details-scraper/issues)
+![Libraries.io dependency status for GitHub repo](https://img.shields.io/librariesio/github/ranjan-mohanty/amazon-product-details-scraper)
+
 This script helps you scrape product details from Amazon product pages. It extracts information like title, description, and image URLs, saving them to JSON files.
 
 ### Features

diff --git a/amazon_product_details_scraper/app.py b/amazon_product_details_scraper/app.py
@@ -84,7 +84,7 @@ def main():
                 # logging.info(json.dumps(product_details, indent=2))
 
                 output_dir = os.path.normpath(output_dir)
-                product_output_dir = os.path.join(output_dir, f"item_{item_num}")
+                product_output_dir = os.path.join(output_dir, product_details["id"])
                 create_folder(product_output_dir, overwrite=True)
 
                 write_product_details(product_details, product_output_dir)

diff --git a/amazon_product_details_scraper/core/scraper.py b/amazon_product_details_scraper/core/scraper.py
@@ -1,11 +1,12 @@
 import os
 import re
 import json
+import uuid
 import codecs
-from urllib3.util.url import parse_url
-
 import requests
+
 from bs4 import BeautifulSoup
+from urllib3.util.url import parse_url
 
 from amazon_product_details_scraper.config import (
     DEFAULT_OUTPUT_FILENAME,
@@ -46,6 +47,9 @@ def get_product_detail(url):
 
     soup = BeautifulSoup(response.content, "html.parser")
 
+    asin_element = soup.find("input", id="ASIN")
+    id = asin_element["value"] if asin_element else uuid.uuid4()
+
     title_element = soup.find("span", id="productTitle")
     title = title_element.text.strip() if title_element else None
 
@@ -63,6 +67,7 @@ def get_product_detail(url):
                 image_urls.extend(re.findall(image_url_pattern, script_text))
 
     return {
+        "id": id,
         "title": title,
         "description": description,
         "image_urls": image_urls,