In [5]:
import os
import csv
from bs4 import BeautifulSoup
import requests
import re

category = 'Computer'

# Everything for this category (the CSV log and the downloaded images) is kept under one root folder.
base_directory = f"D:\\NYX\\pond5\\{category}"  # Update this to your desired path
image_directory = os.path.join(base_directory, 'images')
csv_directory = base_directory

# Make sure both output folders exist before anything is written into them.
for directory in (image_directory, csv_directory):
    os.makedirs(directory, exist_ok=True)

# Full path of the CSV file that collects one row per scraped post.
csv_file_name = os.path.join(csv_directory, f'{category}_post_urls.csv')

# Browser-like User-Agent sent along with the search-page request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
}

# Function to fetch and process data from the webpage
def fetch_data_from_page(url, timeout=30):
    """Scrape one search-results page: log every result to the CSV and download its image.

    For each matching result link on the page, a row of
    ``[ID, Image URL, Post URL, Title]`` is appended to ``csv_file_name``
    (the header row is written only when the file is still empty), and the
    thumbnail is saved into ``image_directory`` as ``<post id>.<extension>``.

    Failures are reported with ``print`` instead of raising, so one bad page
    or one bad image does not abort the remaining work.

    Args:
        url: Address of the search-results page to scrape.
        timeout: Seconds to wait for each HTTP request before giving up.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to fetch page: {url} ({exc})")
        return

    if response.status_code != 200:
        print(f"Failed to fetch page: {url}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    # NOTE(review): a multi-word class_ string is compared against the whole
    # class attribute, so this presumably stops matching if the site adds,
    # removes or reorders a class - confirm against the live markup.
    a_tags = soup.find_all('a', class_='SearchResultDSM SearchResultDSM--large Link--dark p5_photo js-searchResult js-searchResultLarge js-awProductLink', href=True)

    with open(csv_file_name, mode='a', newline='', encoding='utf-8') as file:  # Append so earlier pages are kept
        writer = csv.writer(file)
        if file.tell() == 0:  # Position 0 in append mode means the file is empty
            writer.writerow(['ID', 'Image URL', 'Post URL', 'Title'])

        for a_tag in a_tags:
            post_url = a_tag['href']

            # The thumbnail lives in an <img> nested inside the result link.
            img_tag = a_tag.find('img', class_='SearchResultsV3-mosaicItemImg')
            if img_tag:
                image_url = img_tag.get('src') or img_tag.get('data-src')
                # .get() so an <img> without an alt attribute yields None instead of KeyError.
                title = img_tag.get('alt')
            else:
                image_url = None
                title = None

            post_id = _extract_post_id(post_url)

            # The row is written even when the image is missing or fails to download.
            writer.writerow([post_id, image_url, post_url, title])

            _download_image(image_url, post_id, timeout)

    print("Data extraction complete.")


def _extract_post_id(post_url):
    """Return the digits following ``/item/`` in *post_url*, or None when the URL has no such part."""
    if not post_url:
        return None
    match = re.search(r'/item/(\d+)-', post_url)
    return match.group(1) if match else None


def _image_extension(image_url, default='jpeg'):
    """Return the file extension of *image_url*'s path (without the dot), or *default* if it has none."""
    from urllib.parse import urlparse

    # Work on the URL path only so a query string or fragment never ends up in the filename.
    extension = os.path.splitext(urlparse(image_url).path)[1].lstrip('.')
    return extension or default


def _download_image(image_url, post_id, timeout):
    """Save the image at *image_url* into ``image_directory`` as ``<post_id>.<ext>``, reporting the outcome."""
    if not image_url:
        print("Image URL not found in the post.")
        return
    if post_id is None:
        # Without an ID every such image would be written to the same "None.<ext>" file.
        print(f"Skipping image without a post ID: {image_url}")
        return

    try:
        image_response = requests.get(image_url, headers=headers, timeout=timeout)
    except requests.RequestException:
        print(f"Failed to download image from URL: {image_url}")
        return

    if image_response.status_code != 200:
        print(f"Failed to download image from URL: {image_url}")
        return

    image_filename = f"{post_id}.{_image_extension(image_url)}"
    image_path = os.path.join(image_directory, image_filename)
    with open(image_path, 'wb') as image_file:
        image_file.write(image_response.content)
    print(f"Downloaded image: {image_filename}")

# Search-results listing to scrape, and how many of its pages to walk through
base_url = "https://www.pond5.com/stock-images/photos/tag/computer/"
num_pages = 1

# Build every page address up front; pages are numbered from 1 via the "pp" query parameter.
page_urls = [f"{base_url}?pp={page_num}" for page_num in range(1, num_pages + 1)]

# Scrape the pages one after another, in order.
for page_num, page_url in enumerate(page_urls, start=1):
    print("Constructed URL:", page_url)
    print(f"Scraping page {page_num}...")
    fetch_data_from_page(page_url)

print("URLs download complete.")

Constructed URL: https://www.pond5.com/stock-images/photos/tag/computer/?pp=1
Scraping page 1...
Downloaded image: 157525847.jpeg
Downloaded image: 150323500.jpeg
Downloaded image: 101529771.jpeg
Downloaded image: 154330880.jpeg
Downloaded image: 31773263.jpeg
Downloaded image: 11657580.jpeg
Downloaded image: 11769410.jpeg
Downloaded image: 243412296.jpeg
Downloaded image: 92123587.jpeg
Downloaded image: 21623256.jpeg
Downloaded image: 165455012.jpeg
Downloaded image: 234407882.jpeg
Downloaded image: 234626620.jpeg
Downloaded image: 244729366.jpeg
Downloaded image: 13890725.jpeg
Downloaded image: 244578422.jpeg
Downloaded image: 172460373.jpeg
Downloaded image: 21579524.jpeg
Downloaded image: 222767744.jpeg
Downloaded image: 148423436.jpeg
Downloaded image: 170588220.jpeg
Downloaded image: 21687884.jpeg
Downloaded image: 152109357.jpeg
Downloaded image: 153904454.jpeg
Downloaded image: 24721437.jpeg
Downloaded image: 90751127.jpeg
Downloaded image: 11338243.jpeg
Downloaded image: 14355