In [128]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import csv
import requests
import os

def init_webdriver():
    """Create a Chrome WebDriver (constructed with no arguments) and return it."""
    chrome_driver = webdriver.Chrome()
    return chrome_driver

def download_image(image_url, folder_name, image_name, timeout=30):
    """Download ``image_url`` and store it as ``folder_name/image_name``.

    The download is best-effort: every failure (directory creation, network
    error, non-200 response, write error) is reported with ``print`` and
    never raised to the caller.

    Args:
        image_url: Address of the image to fetch.
        folder_name: Destination directory; created if it does not exist.
        image_name: File name to use inside ``folder_name``.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block the caller indefinitely.

    Returns:
        ``True`` if the image was written to disk, ``False`` otherwise.
    """
    try:
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(folder_name, exist_ok=True)

        # Only a browser-like User-Agent is sent; no Referer header is set.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        response = requests.get(image_url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to download {image_url} with status code {response.status_code}")
            return False

        file_path = os.path.join(folder_name, image_name)
        with open(file_path, 'wb') as f:
            f.write(response.content)
        print(f"Image downloaded: {file_path}")
        return True
    except Exception as e:
        # Deliberately broad: one bad image must not stop a long scraping run.
        print(f"An error occurred while downloading {image_url}: {e}")
        return False


def _element_text(driver, xpath, label, after_colon=False):
    """Return the text of the element at ``xpath``, or 'Not found' on failure.

    With ``after_colon=True`` only the part after the first ':' is returned,
    stripped of surrounding whitespace; a missing colon or an empty value
    counts as a failure. Failures print "Error getting <label>".
    """
    try:
        text = driver.find_element(By.XPATH, xpath).text
    except Exception:
        text = None

    if text is not None and after_colon:
        # Keep everything after the first ':' so a value that itself
        # contains a colon is not cut short.
        text = text.partition(':')[2].strip() or None

    if text is None:
        print(f"Error getting {label}")
        return 'Not found'
    return text


def _distinct_hrefs(anchors):
    """Return the non-empty ``href`` values of ``anchors``, de-duplicated in page order."""
    hrefs = (anchor.get_attribute('href') for anchor in anchors)
    # dict.fromkeys drops repeats while preserving first-seen order.
    return list(dict.fromkeys(href for href in hrefs if href))


def _scrape_detail_row(driver, url, images_folder):
    """Build one CSV row from the detail page currently loaded in ``driver``.

    Also downloads the hero image via ``download_image``. The last column is
    the intended image path, or '' when no image URL could be read.
    """
    description = _element_text(driver, '//div[@class="section-content"]//p', 'description')
    stock_id = _element_text(driver, '//div[@class="asset-id"]', 'stock ID', after_colon=True)
    location = _element_text(driver, '//div[@data-testid="location"]', 'location', after_colon=True)
    date = _element_text(driver, '//div[@class="asset-upload-date"]', 'date', after_colon=True)

    image_path = ''
    try:
        image_download_url = driver.find_element(By.XPATH, '//picture[@data-testid="hero-picture"]//img').get_attribute('src')
        if stock_id != 'Not found':
            image_name = f"{stock_id}.jpg"
        else:
            # Fallback name from the last URL segment; the query string is
            # dropped because '?' is not a legal file-name character on Windows.
            slug = url.split('/')[-1].split('?')[0]
            image_name = f"Image_{slug}.jpg"
        download_image(image_download_url, images_folder, image_name)
        image_path = os.path.join(images_folder, image_name)
    except Exception:
        print("Error getting image download URL")

    return [url, description, stock_id, location, date, image_path]


def scrape_images_and_details(driver, start_page=1, end_page=1):
    """Scrape listing pages ``start_page``..``end_page`` and record each image.

    For every link found inside the gallery container of a listing page, the
    detail page is opened, its description / stock ID / location / upload
    date are read, the hero image is downloaded, and one row is appended to
    a CSV file.

    The output folder and CSV file names are built from the module-level
    globals ``category`` and ``subcategory``, which must be defined before
    this function is called.

    Args:
        driver: Selenium WebDriver used for all page loads.
        start_page: First listing page number (inclusive).
        end_page: Last listing page number (inclusive).

    Errors while loading a listing page or locating its gallery container
    propagate to the caller; errors on an individual detail page are printed
    and recorded as an 'Error processing this URL' row.
    """
    # NOTE(review): base_url is empty here, so driver.get() receives only the
    # page number — presumably the listing URL prefix must be filled in.
    base_url = ''
    images_folder = f"{category}_{subcategory}_iStock_downloaded_images"
    csv_file_name = f"{category}_{subcategory}_iStock_image_details.csv"
    header = ['URL', 'Description', 'Stock ID', 'Location', 'Date', 'Downloaded Image Path']

    with open(csv_file_name, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(header)

        for page in range(start_page, end_page + 1):
            print(f"Processing page: {page}")
            driver.get(base_url + str(page))
            time.sleep(2)

            container = driver.find_element(By.XPATH, "//div[@data-testid='gallery-items-container']")
            image_urls = _distinct_hrefs(container.find_elements(By.XPATH, ".//a"))

            for url in image_urls:
                print(f"Processing URL: {url}")
                try:
                    # The page load is inside the try so one unreachable
                    # detail page does not abort the whole run.
                    driver.get(url)
                    time.sleep(3)  # Adjust time as needed
                    writer.writerow(_scrape_detail_row(driver, url, images_folder))
                except Exception as e:
                    print(f"General error for URL {url}: {e}")
                    # Pad so the error row has as many columns as the header.
                    writer.writerow([url, 'Error processing this URL'] + [''] * (len(header) - 2))

# Run configuration. `category` and `subcategory` are read as module-level
# globals by scrape_images_and_details to name the output folder and CSV file.
category = 'Animal'
subcategory = 'snake'
last_page = 3  # Last listing page to scrape; change as needed

driver = init_webdriver()
try:
    scrape_images_and_details(driver, start_page=1, end_page=last_page)
finally:
    # Always close the browser, even if scraping raised.
    driver.quit()


Processing page: 1
Processing URL: https://www.istockphoto.com/photo/cobra-gm540197184-96396255?phrase=indian%20snakes%20and%20cobras&searchscope=image%2Cfilm
Error getting location
Image downloaded: Animal_snake_iStock_downloaded_images\540197184.jpg
Processing URL: https://www.istockphoto.com/photo/naja-naja-common-spectacled-cobra-displaying-the-classic-snake-charmer-defensive-gm1164895397-320334465?phrase=indian%20snakes%20and%20cobras&searchscope=image%2Cfilm
Image downloaded: Animal_snake_iStock_downloaded_images\1164895397.jpg
Processing URL: https://www.istockphoto.com/photo/cobra-gm108126578-1208461?phrase=indian%20snakes%20and%20cobras&searchscope=image%2Cfilm
Error getting location
Image downloaded: Animal_snake_iStock_downloaded_images\108126578.jpg
Processing URL: https://www.istockphoto.com/photo/portrait-of-snake-with-mouth-open-king-cobra-gm1256132476-367692279?phrase=indian%20snakes%20and%20cobras&searchscope=image%2Cfilm
Image downloaded: Animal_snake_iStock_downloade