In [None]:
import os                         # For operating system
import re                         # Regular expression
import cv2                        # Image processing
import glob                       # Unix style pathname pattern expansion
import random                     # For random number generation
import shutil                     # High-level file operations
import requests                   # Reading url
from PIL import Image             # Python Image Library
from bs4 import BeautifulSoup     # Web scrapping
import matplotlib.pyplot as plt   # Making plots
import matplotlib.image as mpimg  # To view color image
%matplotlib inline
import time

In [None]:
from keywords import fire_keywords, night_keywords

In [None]:
# Size threshold in bytes: download_images treats any response body smaller
# than this as a blank image and skips it instead of writing it to disk.
MIN_FILE_SIZE_BYTES = 2 ** 10  # 1 KiB

In [None]:

def _fetch_page_html(url, page, max_attempts=3, retry_delay=5, timeout=30):
    """Fetch one search-result page and return its HTML text.

    Args:
        url: Fully built page URL.
        page: Page number, used only in log messages.
        max_attempts: How many times to try before giving up.
        retry_delay: Seconds to wait between attempts.
        timeout: Per-request timeout in seconds.

    Returns:
        The response body as text on HTTP 200, or ``None`` if every attempt
        failed (network error or non-200 status).
    """
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as e:
            print(f"Request failed: {e}")
        else:
            if response.status_code == 200:
                return response.text
            print(f"Failed to fetch page {page}. Status code: {response.status_code}")

        # Back off before the next attempt, but do not sleep after the last one.
        if attempt < max_attempts:
            print(f"Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)
    return None


def _fetch_image_bytes(img_url, timeout=30):
    """Download a single image and return its bytes, or ``None`` on failure.

    A failure here (network error or HTTP error status) only skips this one
    image; it must not trigger a re-fetch of the whole result page.
    """
    try:
        response = requests.get(img_url, timeout=timeout)
        # Reject 4xx/5xx bodies so error pages are never saved as ".jpg" files.
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Skipping image {img_url}: {e}")
        return None
    return response.content


def download_images(query, dl_path, base_url, max_images=20000):
    """Scrape paginated search results and save the images found to disk.

    Pages are requested as ``{base_url}&search_page={page}`` starting at
    page 1. Every ``<img>`` whose ``src`` starts with ``https://`` is
    downloaded once and written to ``dl_path`` as ``{query}_{n}.jpg``, where
    ``n`` counts saved images from 1. Bodies smaller than
    ``MIN_FILE_SIZE_BYTES`` are treated as blank and skipped.

    The loop stops when ``max_images`` files have been saved, or when several
    consecutive pages yield no new image (results exhausted or the site keeps
    failing). Without that second condition the function would page forever
    whenever fewer than ``max_images`` images are available.

    Args:
        query: Label used as the filename prefix. It is not added to the URL;
            ``base_url`` is expected to already contain the search term.
        dl_path: Destination directory; created if missing.
        base_url: Search URL to which ``&search_page=N`` is appended.
        max_images: Upper bound on the number of files to save.

    Returns:
        The number of images written to ``dl_path``.
    """
    os.makedirs(dl_path, exist_ok=True)
    page = 1
    total_downloaded = 0

    # Give up after this many pages in a row that produced no saved image.
    max_unproductive_pages = 3
    unproductive_pages = 0

    # URLs already fetched successfully (saved or rejected as blank), so the
    # same asset is never downloaded twice across pages.
    seen_urls = set()

    while total_downloaded < max_images:
        # Construct the URL with the pagination parameter
        url = f"{base_url}&search_page={page}"
        saved_on_page = 0

        html = _fetch_page_html(url, page)
        if html is not None:
            soup = BeautifulSoup(html, "html.parser")

            for img_element in soup.find_all("img"):
                if total_downloaded >= max_images:
                    break

                img_url = img_element.get("src")
                # Ignore missing, relative, data: and plain-http sources.
                if not img_url or not img_url.startswith("https://"):
                    continue

                if img_url in seen_urls:
                    print(f"Skipping duplicate image: {img_url}")
                    continue

                img_data = _fetch_image_bytes(img_url)
                if img_data is None:
                    # Not marked as seen: a later page may serve it successfully.
                    continue
                seen_urls.add(img_url)

                # Check if the image is blank (below the minimum file size)
                if len(img_data) < MIN_FILE_SIZE_BYTES:
                    print(f"Skipping blank image: {img_url}")
                    continue

                file_path = os.path.join(dl_path, f"{query}_{total_downloaded + 1}.jpg")
                with open(file_path, "wb") as img_file:
                    img_file.write(img_data)

                total_downloaded += 1
                saved_on_page += 1

        if saved_on_page:
            unproductive_pages = 0
        else:
            unproductive_pages += 1
            if unproductive_pages >= max_unproductive_pages:
                print(
                    f"No new images on {unproductive_pages} consecutive pages; "
                    f"stopping at page {page} with {total_downloaded} images."
                )
                break

        # Move on to the next results page
        page += 1

    return total_downloaded


In [None]:
if __name__ == "__main__":
    # Upper bound on how many images this run may save.
    max_images = 20000

    # Search term; also used by download_images as the filename prefix.
    query = "fire"

    # Search endpoint with the term already embedded; download_images appends
    # the page number to it.
    base_url = f"https://stock.adobe.com/search/images?hide_panel=true&k={query}+&search_type=usertyped"

    # NOTE(review): machine-specific absolute path — adjust before running elsewhere.
    dl_path = "E:/Project/Fire_det/Fire2_set"

    # Kick off the scrape.
    download_images(
        query=query,
        dl_path=dl_path,
        base_url=base_url,
        max_images=max_images,
    )