<a href="https://colab.research.google.com/github/shaju4568/shajahan/blob/main/Amazon_scrap.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import json
import time

# Function to scrape product data from a single URL
def scrape_product_data(url):
    """Fetch one product page and extract its title, image, price and description.

    Args:
        url: Address of the product page to download.

    Returns:
        A dict with the keys "Product Title", "Product Image URL",
        "Price of the Product" and "Product Details". A field whose element
        is absent from the page is ``None``. Returns ``None`` when the
        request fails or when none of the fields could be found.
    """
    try:
        # Without a timeout requests waits indefinitely, so one unresponsive
        # server would stall the whole run.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Turn HTTP error statuses into exceptions
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return None

    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, 'html.parser')

    # find() returns None for a missing element, so every lookup is guarded
    # instead of letting an AttributeError/TypeError abort the entire scrape.
    image_tag = soup.find('img', {'id': 'landingImage'})
    product = {
        "Product Title": _element_text(soup, 'span', 'productTitle'),
        "Product Image URL": image_tag.get('src') if image_tag is not None else None,
        "Price of the Product": _element_text(soup, 'span', 'priceblock_ourprice'),
        "Product Details": _element_text(soup, 'div', 'productDescription'),
    }

    # A page exposing none of the expected elements carries no usable data;
    # report it as unavailable rather than storing an all-empty record.
    if all(value is None for value in product.values()):
        print(f"Error: no product data found at {url}")
        return None

    return product


def _element_text(soup, tag_name, element_id):
    """Return the stripped text of the element with this tag and id, or None if absent."""
    element = soup.find(tag_name, {'id': element_id})
    return element.text.strip() if element is not None else None

# Function to scrape data from a batch of URLs
def scrape_batch(urls):
    """Scrape every URL in ``urls`` and collect the successful results.

    Args:
        urls: Iterable of product page URLs.

    Returns:
        A list with the data returned by ``scrape_product_data`` for each URL
        that produced a truthy result, in input order. URLs that produced
        nothing are reported on stdout and left out.
    """
    scraped = []

    for link in urls:
        record = scrape_product_data(link)

        # Guard clause: report the failure and move on to the next URL.
        if not record:
            print(f"URL not available: {link}")
            continue

        scraped.append(record)

    return scraped

# Main function
def main():
    """Download the URL list, scrape every URL in batches, and save the results as JSON.

    The URL list is read from a Google Sheets CSV export whose first row is
    treated as a header; each remaining non-blank line is used as one URL.
    Results are written to ``amazon_product_data.json`` in the working
    directory.

    Raises:
        requests.exceptions.RequestException: If the URL list cannot be
            downloaded (connection failure, timeout or HTTP error status).
    """
    # Load URLs from the provided CSV file
    csv_url = "https://docs.google.com/spreadsheets/d/1BZSPhk1LDrx8ytywMHWVpCqbm8URTxE4mTJrIRkD7PnGTM/export?format=csv"
    response = requests.get(csv_url, timeout=30)
    # Fail loudly on an HTTP error instead of treating an error page as URLs.
    response.raise_for_status()

    # Drop the header row (slicing is safe even when the response is empty)
    # and ignore blank lines or stray surrounding whitespace.
    lines = response.text.splitlines()
    urls = [line.strip() for line in lines[1:] if line.strip()]

    # Split the URLs into batches of 100 each
    batch_size = 100
    url_batches = [urls[i:i + batch_size] for i in range(0, len(urls), batch_size)]

    # Scrape data from each batch
    total_results = []
    last_batch_index = len(url_batches) - 1
    for i, url_batch in enumerate(url_batches):
        print(f"Scraping batch {i + 1}...")
        total_results.extend(scrape_batch(url_batch))

        # Pause between batches to avoid overloading the server; there is
        # nothing left to throttle after the final batch.
        if i < last_batch_index:
            time.sleep(5)

    # Save the results to a JSON file
    with open('amazon_product_data.json', 'w', encoding='utf-8') as json_file:
        json.dump(total_results, json_file, ensure_ascii=False, indent=4)

# Run the scraper only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()
