In [None]:
# Dependencies:
# pip install selenium webdriver-manager requests tqdm

import os
import time
import requests
from tqdm import tqdm

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import StaleElementReferenceException
from webdriver_manager.chrome import ChromeDriverManager

def scrape_unsplash_images(query, target_folder, total=400):
    """
    Scrape images from the Unsplash search page and save them to disk.

    The search page is opened in headless Chrome and scrolled to the bottom
    repeatedly so that more results load. Image URLs are read from each
    ``<img>`` element's ``srcset`` attribute; stale element references are
    skipped. The collected URLs are then downloaded with ``requests``.

    Args:
        query: Search term. Spaces become hyphens and the result is
            percent-encoded before being placed in the URL path.
        target_folder: Directory that receives the files; created if missing.
        total: Maximum number of distinct image URLs to collect. When it is
            zero or negative no browser is started.

    Returns:
        None. Files are written as ``<query with underscores>_<index>.jpg``,
        where the index is the position of the URL in collection order, and a
        summary line with the number of files actually saved is printed.
    """
    # Local import so this function does not depend on a file-level import.
    from urllib.parse import quote

    os.makedirs(target_folder, exist_ok=True)
    if total <= 0:
        # Nothing was requested: skip the (expensive) browser start-up.
        print(f"Unsplash: download 0 image {target_folder}")
        return

    options = Options()
    # `options.headless = True` was deprecated and later removed in
    # Selenium 4; passing the Chrome switch works on every 4.x release.
    options.add_argument("--headless=new")

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)

    # A dict is used as an insertion-ordered set so files are numbered in the
    # order the images were discovered on the page.
    img_urls = {}
    # Number of consecutive scrolls that may leave the page height unchanged
    # before giving up; lazy loading can take longer than one sleep interval.
    max_stalls = 3
    try:
        slug = quote(query.replace(' ', '-'))
        driver.get(f"https://unsplash.com/s/photos/{slug}")
        last_height = driver.execute_script("return document.body.scrollHeight")
        stalls = 0

        while True:
            for img in driver.find_elements("tag name", "img"):
                try:
                    srcset = img.get_attribute("srcset") or ""
                except StaleElementReferenceException:
                    # The element was re-rendered while we were iterating.
                    continue

                parts = [p.strip() for p in srcset.split(",") if p.strip()]
                if not parts:
                    continue
                # Use the last candidate listed in srcset (presumably the
                # largest rendition) and drop its width/density descriptor.
                best_url = parts[-1].split()[0]
                if best_url.startswith("http"):
                    img_urls[best_url] = None
                if len(img_urls) >= total:
                    break

            if len(img_urls) >= total:
                # Enough URLs: no need for another scroll and sleep.
                break

            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                stalls += 1
                if stalls >= max_stalls:
                    break
            else:
                stalls = 0
                last_height = new_height
    finally:
        # Always shut the browser down, even if scraping raised.
        driver.quit()

    prefix = query.replace(' ', '_')
    saved = 0
    for i, url in enumerate(tqdm(list(img_urls)[:total], desc="Downloading Unsplash")):
        try:
            resp = requests.get(url, timeout=10)
            resp.raise_for_status()
            fname = f"{prefix}_{i}.jpg"
            with open(os.path.join(target_folder, fname), "wb") as f:
                f.write(resp.content)
        except (requests.RequestException, OSError) as e:
            # Best effort: report the failure and continue with the next URL.
            print(f"[Unsplash] download fail {url}: {e}")
        else:
            saved += 1

    # Report files actually written, not merely the number of URLs found.
    print(f"Unsplash: download {saved} image {target_folder}")


def scrape_pinterest_images(query, target_folder, total=400):
    """
    Scrape images from the Pinterest pin search page and save them to disk.

    The search page is opened in headless Chrome and scrolled to the bottom
    repeatedly so that more results load. Only ``<img>`` elements whose
    ``src`` starts with ``http`` and ends with ``.jpg`` are kept; stale
    element references are skipped. The collected URLs are then downloaded
    with ``requests``.

    Args:
        query: Search term; percent-encoded into the ``q`` URL parameter.
        target_folder: Directory that receives the files; created if missing.
        total: Maximum number of distinct image URLs to collect. When it is
            zero or negative no browser is started.

    Returns:
        None. Files are written as ``<query with underscores>_<index>.jpg``,
        where the index is the position of the URL in collection order, and a
        summary line with the number of files actually saved is printed.
    """
    # Local import so this function does not depend on a file-level import.
    from urllib.parse import quote

    os.makedirs(target_folder, exist_ok=True)
    if total <= 0:
        # Nothing was requested: skip the (expensive) browser start-up.
        print(f"Pinterest: download 0 into {target_folder}")
        return

    options = Options()
    # `options.headless = True` was deprecated and later removed in
    # Selenium 4; passing the Chrome switch works on every 4.x release.
    options.add_argument("--headless=new")

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)

    # A dict is used as an insertion-ordered set so files are numbered in the
    # order the images were discovered on the page.
    img_urls = {}
    # Number of consecutive scrolls that may leave the page height unchanged
    # before giving up; lazy loading can take longer than one sleep interval.
    max_stalls = 3
    try:
        # safe="" also encodes "/", "&", "#" etc. so the query stays one value.
        encoded_query = quote(query, safe="")
        driver.get(f"https://www.pinterest.com/search/pins/?q={encoded_query}")
        last_height = driver.execute_script("return document.body.scrollHeight")
        stalls = 0

        while True:
            for img in driver.find_elements("tag name", "img"):
                try:
                    src = img.get_attribute("src") or ""
                except StaleElementReferenceException:
                    # The element was re-rendered while we were iterating.
                    continue
                if src.startswith("http") and src.endswith(".jpg"):
                    img_urls[src] = None
                if len(img_urls) >= total:
                    break

            if len(img_urls) >= total:
                # Enough URLs: no need for another scroll and sleep.
                break

            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                stalls += 1
                if stalls >= max_stalls:
                    break
            else:
                stalls = 0
                last_height = new_height
    finally:
        # Always shut the browser down, even if scraping raised.
        driver.quit()

    prefix = query.replace(' ', '_')
    saved = 0
    for i, url in enumerate(tqdm(list(img_urls)[:total], desc="Downloading Pinterest")):
        try:
            resp = requests.get(url, timeout=10)
            resp.raise_for_status()
            fname = f"{prefix}_{i}.jpg"
            with open(os.path.join(target_folder, fname), "wb") as f:
                f.write(resp.content)
        except (requests.RequestException, OSError) as e:
            # Best effort: report the failure and continue with the next URL.
            print(f"[Pinterest] download fail {url}: {e}")
        else:
            saved += 1

    # Report files actually written, not merely the number of URLs found.
    print(f"Pinterest: download {saved} into {target_folder}")


if __name__ == "__main__":
    # Script entry point: collect up to 400 Unsplash results for the query
    # "ring" and store them in the "earring" folder.
    scrape_unsplash_images(query="ring", target_folder="earring", total=400)



Downloading Unsplash: 100%|██████████| 37/37 [00:23<00:00,  1.55it/s]

Unsplash: 已下载 37 张到 earring



