In [1]:
import os
import csv
import urllib.parse
import re
import requests
import logging
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import time

# Configure the root logger once so INFO-level progress messages from the
# download functions below are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger shared by every function in this file.
logger = logging.getLogger(__name__)

def download_image(image_data):
    """Download a single image and save it as ``<sanitized title>.jpg``.

    Args:
        image_data: A ``(image_url, output_dir, title)`` tuple. It is passed
            as one tuple so the function can be used directly with
            ``Executor.map``.

    Returns:
        The path of the newly written file, or ``None`` when the file was
        already present, the server did not answer with status 200, or any
        error occurred (errors are logged, never raised).
    """
    image_url, output_dir, title = image_data
    try:
        # Strip characters that are not allowed in Windows file names.
        image_name = re.sub(r'[\\/:*?"<>|]', '', title)
        image_path = os.path.join(output_dir, f"{image_name}.jpg")  # Save as JPG

        # Check for an existing file BEFORE touching the network, so
        # re-running the script does not re-download every image only to
        # throw the bytes away.
        if os.path.exists(image_path):
            logger.info(f"Image already exists: {image_path}")
            return None

        start_time = time.time()
        response = requests.get(image_url, timeout=180)  # Bound the wait per request
        if response.status_code != 200:
            logger.error(f"Failed to download image from: {image_url}. Status code: {response.status_code}")
            return None

        # Leaving the ``with`` block flushes and closes the file.
        with open(image_path, 'wb') as f:
            f.write(response.content)

        end_time = time.time()
        logger.info(f"Image downloaded: {image_path}. Time taken: {end_time - start_time} seconds")
        return image_path
    except Exception as e:
        # Best-effort worker: one failed image must not take down the pool.
        logger.error(f"Error downloading image from {image_url}: {e}")
    return None

def extract_and_download_images(url_template, output_dir, start_page, end_page):
    """Scrape ``.jpg`` images from a range of result pages and download them.

    For every page from ``start_page`` to ``end_page`` (inclusive) the page
    is fetched with ``?search_page=<n>`` as its query string, every ``<img>``
    whose ``src`` ends with ``.jpg`` is recorded in ``images.csv`` inside
    ``output_dir`` (columns ``Title``, ``URL``) and handed to a thread pool
    that calls ``download_image``.

    Args:
        url_template: Base URL of the listing; the page query is joined onto it.
        output_dir: Existing directory that receives the CSV and the images.
        start_page: First page number to fetch.
        end_page: Last page number to fetch (inclusive).

    Returns:
        None. Failures are logged rather than raised.
    """
    output_csv = os.path.join(output_dir, 'images.csv')
    csv_header = ['Title', 'URL']
    unique_urls = set()  # Absolute URLs already queued, across all pages

    try:
        with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(csv_header)

            # Leaving this ``with`` block waits for all queued downloads.
            with ThreadPoolExecutor(max_workers=20) as executor:
                for page_number in range(start_page, end_page + 1):
                    current_url = urllib.parse.urljoin(url_template, f"?search_page={page_number}")

                    # A network error on one page must not abort the
                    # remaining pages, so it is handled per page.
                    try:
                        response = requests.get(current_url, timeout=180)
                    except requests.RequestException as e:
                        logger.error(f"Failed to fetch content from {current_url}: {e}")
                        continue

                    if response.status_code != 200:
                        logger.error(f"Failed to fetch content from {current_url}. Status code: {response.status_code}")
                        continue

                    soup = BeautifulSoup(response.content, 'html.parser')
                    image_data_list = []
                    for img_element in soup.find_all('img'):
                        src_image_url = img_element.get('src')
                        if not src_image_url or not src_image_url.endswith('.jpg'):
                            continue

                        # Resolve relative and protocol-relative ``src``
                        # values against the page URL; absolute URLs pass
                        # through unchanged.
                        image_url = urllib.parse.urljoin(current_url, src_image_url)
                        if image_url in unique_urls:
                            continue

                        title = img_element.get('alt', '')
                        writer.writerow([title, image_url])
                        image_data_list.append((image_url, output_dir, title))
                        unique_urls.add(image_url)

                    # ``map`` submits every task immediately; the results are
                    # not needed because ``download_image`` logs its own
                    # outcome and never raises.
                    executor.map(download_image, image_data_list)
    except Exception as e:
        # Top-level boundary: log anything unexpected (e.g. CSV I/O errors).
        logger.error(f"An error occurred: {e}")

if __name__ == "__main__":
    # Script entry point: configure the scrape, make sure the output
    # directory exists, then run it.
    url_template = "https://www.vecteezy.com/free-photos/nature" # Add website url 
    output_directory = "C:/nyx/vect/downloaded_image3.3" # Add the desired output directory 
    start_page = 1
    end_page = 2 # Modify to the desired last page 

    # ``exist_ok=True`` creates the directory if needed without the
    # check-then-create race of a separate ``os.path.exists`` test.
    os.makedirs(output_directory, exist_ok=True)

    extract_and_download_images(url_template, output_directory, start_page, end_page)

    logger.info("Data download and CSV generation complete.")


INFO:__main__:Image downloaded: C:/nyx/vect/downloaded_image3.3\Category image for Banners.jpg. Time taken: 7.392866849899292 seconds
INFO:__main__:Image downloaded: C:/nyx/vect/downloaded_image3.3\Category image for Textures.jpg. Time taken: 7.45831298828125 seconds
INFO:__main__:Image downloaded: C:/nyx/vect/downloaded_image3.3\Category image for People.jpg. Time taken: 7.512415885925293 seconds
INFO:__main__:Image downloaded: C:/nyx/vect/downloaded_image3.3\Category image for Animals.jpg. Time taken: 7.50049614906311 seconds
INFO:__main__:Image downloaded: C:/nyx/vect/downloaded_image3.3\Category image for Travel.jpg. Time taken: 7.548670053482056 seconds
INFO:__main__:Image downloaded: C:/nyx/vect/downloaded_image3.3\Category image for Wedding.jpg. Time taken: 7.566527366638184 seconds
INFO:__main__:Image downloaded: C:/nyx/vect/downloaded_image3.3\Category image for Nature.jpg. Time taken: 7.592215299606323 seconds
INFO:__main__:Image downloaded: C:/nyx/vect/downloaded_image3.3\Ca