Initial commit
ranjan-mohanty committed Mar 10, 2024
0 parents commit 20d8415
Showing 12 changed files with 603 additions and 0 deletions.
33 changes: 33 additions & 0 deletions .github/workflows/build.yml
@@ -0,0 +1,33 @@
name: Build
on:
  workflow_call:
  push:
    branches-ignore:
      - main
    tags-ignore:
      - "**"

jobs:
  build:
    name: Build distribution
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.x"
      - name: Install pypa/build
        run: >-
          python3 -m
          pip install
          build
          --user
      - name: Build a binary wheel and a source tarball
        run: python3 -m build
      - name: Store the distribution packages
        uses: actions/upload-artifact@v3
        with:
          name: python-package-distributions
          path: dist/
96 changes: 96 additions & 0 deletions .github/workflows/publish.yml
@@ -0,0 +1,96 @@
name: Publish
on:
  push:
    branches:
      - main
    tags:
      - "**"

jobs:
  build:
    uses: ./.github/workflows/build.yml

  publish-to-pypi:
    name: Publish to PyPI
    if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes
    needs:
      - build
    runs-on: ubuntu-latest
    environment:
      name: pypi
      url: https://pypi.org/p/amazon-product-details-scraper
    permissions:
      id-token: write

    steps:
      - name: Download all the dists
        uses: actions/download-artifact@v3
        with:
          name: python-package-distributions
          path: dist/
      - name: Publish distribution to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1

  github-release:
    name: >-
      Sign the Python distribution with Sigstore
      and upload them to GitHub Release
    needs:
      - publish-to-pypi
    runs-on: ubuntu-latest

    permissions:
      contents: write
      id-token: write

    steps:
      - name: Download all the dists
        uses: actions/download-artifact@v3
        with:
          name: python-package-distributions
          path: dist/
      - name: Sign the dists with Sigstore
        uses: sigstore/gh-action-sigstore-python@v1.2.3
        with:
          inputs: >-
            ./dist/*.tar.gz
            ./dist/*.whl
      - name: Create GitHub Release
        env:
          GITHUB_TOKEN: ${{ github.token }}
        run: >-
          gh release create
          '${{ github.ref_name }}'
          --repo '${{ github.repository }}'
          --notes ""
      - name: Upload artifact signatures to GitHub Release
        env:
          GITHUB_TOKEN: ${{ github.token }}
        run: >-
          gh release upload
          '${{ github.ref_name }}' dist/**
          --repo '${{ github.repository }}'

  publish-to-testpypi:
    name: Publish to TestPyPI
    if: ${{ !startsWith(github.ref, 'refs/tags') }}
    needs:
      - build
    runs-on: ubuntu-latest

    environment:
      name: testpypi
      url: https://test.pypi.org/p/amazon-product-details-scraper

    permissions:
      id-token: write

    steps:
      - name: Download all the dists
        uses: actions/download-artifact@v3
        with:
          name: python-package-distributions
          path: dist/
      - name: Publish distribution to TestPyPI
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          repository-url: https://test.pypi.org/legacy/
6 changes: 6 additions & 0 deletions .gitignore
@@ -0,0 +1,6 @@
*.egg-info
build
dist
my_env
venv
output
21 changes: 21 additions & 0 deletions LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 Ranjan Mohanty

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
114 changes: 114 additions & 0 deletions README.md
@@ -0,0 +1,114 @@
## Amazon Product Details Scraper

This script helps you scrape product details from Amazon product pages. It extracts information like title, description, and image URLs, saving them to JSON files.

### Features

- Fetches product details from a single Amazon product URL or a list of URLs in a file.
- Writes extracted data to JSON files for easy storage and processing.
- Optionally downloads product images along with details.

### Installation

**Requirements:**

- Python 3 (tested with 3.7+)
- Libraries (installed automatically; versions pinned in `setup.py`):
  - requests 2.31.0
  - beautifulsoup4 4.11.1
  - urllib3 1.26.18

**Instructions:**

1. Make sure you have Python 3 installed. You can check by running `python3 --version` in your terminal.
2. **Create a virtual environment (recommended):**

   - Virtual environments help isolate project dependencies and avoid conflicts with other Python installations on your system.
   - Here's how to create a virtual environment using `venv`:

     ```bash
     python3 -m venv my_env # Replace "my_env" with your desired environment name
     ```

   - Activate the virtual environment:

     ```bash
     source my_env/bin/activate
     ```

3. **Install:**

   ```bash
   python3 setup.py install
   ```

   This installs the package into the activated virtual environment and automatically downloads the libraries listed in `setup.py`.

### Usage

**Basic Usage:**

```bash
amazon-scraper --url https://www.amazon.com/product-1 # Replace with your product URL
```

This will scrape details from the provided Amazon product URL and write them to a JSON file in the "output" directory (default).
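
A quick way to sanity-check the result is to load the generated file back in Python. This is only a sketch: the `product-1` sub-directory and the `title` key are assumptions for illustration, while the `product-info.json` filename comes from `DEFAULT_OUTPUT_FILENAME` in `src/config.py`.

```python
import json
from pathlib import Path

# Path is illustrative: the scraper writes "product-info.json" (see src/config.py)
# somewhere under the "output" directory -- adjust to whatever it created for your URL.
details = json.loads(Path("output/product-1/product-info.json").read_text(encoding="utf-8"))

# Field names such as "title" are assumed here, not guaranteed by the scraper.
print(details.get("title"))
```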

**Using a URL List:**

1. Create a text file containing a list of Amazon product URLs (one per line).
2. Run the script with the `--url-list` option and provide the file path:

   ```bash
   amazon-scraper --url-list product_urls.txt
   ```

This will process each URL in the file and save the scraped details for each product in separate directories within "output".
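
For example, after a run against two URLs the output folder might look roughly like this (the per-product directory names are illustrative; only the `product-info.json` filename is fixed by `src/config.py`):

```
output/
├── product-1/
│   └── product-info.json
└── product-2/
    └── product-info.json
```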

**Optional: Downloading Images**

```bash
amazon-scraper --url https://www.amazon.com/product-1 --download-image
```

The `--download-image` flag enables downloading product images along with other details.

**Getting Help:**

The script offers a built-in help message that provides a quick overview of available options and usage instructions. To access the help, run the script with the `--help` option:

```bash
amazon-scraper --help
```

### Configuration

**Logging:**

- The script uses basic logging for information and error messages.
- You can change the logging level by editing `DEFAULT_LOG_LEVEL` in `src/config.py` (see the Python `logging` documentation for the available levels); a minimal example follows.
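
For example, a small tweak to `src/config.py` (the constant already exists in this commit; only the level value changes) makes the output more verbose:

```python
import logging

# In src/config.py: raise verbosity from the default INFO to DEBUG
DEFAULT_LOG_LEVEL = logging.DEBUG
```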

### Example

**Scenario:**

Scrape details for two products from a file named "products.txt" and download images:

1. Create a file named "products.txt" with the following content:

   ```
   https://www.amazon.com/product-1
   https://www.amazon.com/product-2
   ```

2. Run the script with the following command:

   ```bash
   amazon-scraper --url-list products.txt --download-image
   ```

This will process both URLs in the file, scrape details, create separate output directories for each product, and download images.

### Disclaimer

This script is for educational purposes only. Please be respectful of Amazon's terms of service when using it. Consider using official APIs provided by Amazon for extensive data collection.
5 changes: 5 additions & 0 deletions pyproject.toml
@@ -0,0 +1,5 @@
[build-system]
# These are the assumed default build requirements from pip:
# https://pip.pypa.io/en/stable/reference/pip/#pep-517-and-518-support
requires = ["setuptools>=43.0.0", "wheel"]
build-backend = "setuptools.build_meta"
4 changes: 4 additions & 0 deletions setup.cfg
@@ -0,0 +1,4 @@
[metadata]
# This includes the license file(s) in the wheel.
# https://wheel.readthedocs.io/en/stable/user_guide.html#including-license-files-in-the-generated-wheel-file
license_files = LICENSE
28 changes: 28 additions & 0 deletions setup.py
@@ -0,0 +1,28 @@
from setuptools import setup, find_packages
import pathlib

here = pathlib.Path(__file__).parent.resolve()
# Get the long description from the README file
long_description = (here / "README.md").read_text(encoding="utf-8")

setup(
    name="amazon-product-details-scraper",
    version="1.0.0",
    description="Scrapes product details from Amazon product pages and also downloads the images",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/ranjan-mohanty/amazon-product-details-scraper/blob/main/README.md",
    author="Ranjan Mohanty",
    author_email="ranjan@duck.com",
    packages=find_packages(),
    keywords="amazon, scraper",
    entry_points={
        "console_scripts": [
            "amazon-scraper=src.scraper:main",
        ]
    },
    install_requires=["requests==2.31.0", "beautifulsoup4==4.11.1", "urllib3==1.26.18"],
    project_urls={  # Optional
        "Source": "https://github.com/ranjan-mohanty/amazon-product-details-scraper",
    },
)
Empty file added src/__init__.py
Empty file.
8 changes: 8 additions & 0 deletions src/config.py
@@ -0,0 +1,8 @@
import os
import logging


DEFAULT_LOG_LEVEL = logging.INFO

DEFAULT_OUTPUT_FOLDER = os.path.expanduser("../output")
DEFAULT_OUTPUT_FILENAME = "product-info.json"
