In [1]:
!pip3 install selenium




In [2]:
import base64
import os
import time
from urllib.parse import quote_plus

import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

def create_driver():
    """Create and return a Chrome WebDriver configured for scraping.

    The browser is started with GPU acceleration disabled, the sandbox
    disabled, ``/dev/shm`` usage disabled and the window maximized. The
    driver executable is obtained through ``ChromeDriverManager().install()``.

    Returns:
        The ``webdriver.Chrome`` instance; the caller is responsible for
        calling ``quit()`` on it.
    """
    chrome_flags = (
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--start-maximized",
    )

    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    # install() hands back the driver location that Service expects.
    chromedriver_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chromedriver_service, options=chrome_options)

def scroll_down(driver, scroll_pause_time=2, scroll_limit=10):
    """Scroll the page to the bottom repeatedly so more content can load.

    After each scroll the function waits ``scroll_pause_time`` seconds and
    re-reads ``document.body.scrollHeight``. It stops as soon as the height
    no longer changes, or after ``scroll_limit`` scrolls, whichever is first.

    Args:
        driver: Object exposing ``execute_script(script)``.
        scroll_pause_time: Seconds to sleep after every scroll.
        scroll_limit: Maximum number of scrolls to perform.
    """
    read_height_js = "return document.body.scrollHeight"
    jump_to_bottom_js = "window.scrollTo(0, document.body.scrollHeight);"

    previous_height = driver.execute_script(read_height_js)
    for _ in range(scroll_limit):
        driver.execute_script(jump_to_bottom_js)
        time.sleep(scroll_pause_time)

        current_height = driver.execute_script(read_height_js)
        if current_height == previous_height:
            # Page did not grow: nothing more was loaded by this scroll.
            return
        previous_height = current_height

def _parse_dimension(raw_value):
    """Convert an HTML ``width``/``height`` attribute to an int, or 0 if unusable."""
    try:
        return int(raw_value or 0)
    except (TypeError, ValueError):
        # Values such as "100%" or "auto" are not pixel counts; treat as unknown.
        return 0


def scrape_all_images(driver):
    """Collect URLs of sufficiently large ``<img>`` elements on the current page.

    An image is kept when it has a ``src`` (or, failing that, ``data-src``)
    attribute that does not contain ``"data:image/gif"`` and both its
    ``width`` and ``height`` attributes are at least 100.

    A problem with a single element (non-numeric dimension, or an error
    raised while reading its attributes) only skips that element; URLs that
    were already collected are still returned.

    Args:
        driver: Object exposing ``find_elements(by, value)``.

    Returns:
        List of image URL strings in page order. Empty if the element lookup
        itself fails.
    """
    try:
        images = driver.find_elements(By.TAG_NAME, 'img')
    except Exception as e:
        print(f"Error scraping images: {e}")
        return []

    image_urls = []
    for img in images:
        try:
            image_url = img.get_attribute('src') or img.get_attribute('data-src')
            if not image_url or "data:image/gif" in image_url:
                continue
            width = _parse_dimension(img.get_attribute('width'))
            height = _parse_dimension(img.get_attribute('height'))
        except Exception as e:
            # Best-effort scrape: one unreadable element must not discard the rest.
            print(f"Error scraping images: {e}")
            continue

        if width >= 100 and height >= 100:
            image_urls.append(image_url)
    return image_urls

def _download_with_retries(image_url, retry_count, retry_delay=2):
    """Fetch ``image_url`` over HTTP, returning the body bytes or None on failure."""
    for attempt in range(1, retry_count + 1):
        try:
            response = requests.get(image_url, timeout=10)
        except requests.RequestException as e:
            # Timeouts and connection errors are transient: retry them as well.
            print(f"Failed attempt {attempt} for image: {image_url} ({e})")
        else:
            if response.status_code == 200:
                return response.content
            print(f"Failed attempt {attempt} for image: {image_url}")

        # No point in waiting once the last attempt has been used up.
        if attempt < retry_count:
            time.sleep(retry_delay)
    return None


def save_image(image_url, folder_name, file_name, retry_count=3):
    """Save one image to ``<folder_name>/<file_name>.jpg``.

    ``data:image/...`` URLs are decoded from their base64 payload; any other
    URL is downloaded with ``requests`` using up to ``retry_count`` attempts.
    The file is only created once the image bytes are available, and it is
    always written with a ``.jpg`` extension regardless of the source format.

    Errors are reported with ``print`` and never propagate to the caller.

    Args:
        image_url: Data URL or HTTP(S) URL of the image.
        folder_name: Existing directory to write into.
        file_name: File name without extension.
        retry_count: Maximum number of download attempts for HTTP(S) URLs.

    Returns:
        True if the file was written, False otherwise.
    """
    try:
        file_path = os.path.join(folder_name, f"{file_name}.jpg")

        if image_url.startswith('data:image/'):
            # Data URL layout is "<header>,<payload>"; only the payload is needed.
            _header, encoded = image_url.split(',', 1)
            image_data = base64.b64decode(encoded)
        else:
            image_data = _download_with_retries(image_url, retry_count)
            if image_data is None:
                print(f"Giving up on image {file_name} after {retry_count} attempts")
                return False

        with open(file_path, 'wb') as f:
            f.write(image_data)
        return True
    except Exception as e:
        # Deliberate best-effort boundary: one bad image must not stop a batch.
        print(f"Error saving image {file_name}: {e}")
        return False

def scrape_and_save_images(base_folder, search_term, foldername, num_images=100):
    """Run a Google Images search and save the results into a folder.

    Creates ``<base_folder>/<foldername>`` if needed, opens the image search
    for ``search_term`` in a fresh browser, scrolls to load more results,
    collects image URLs and saves at most ``num_images`` of them as
    ``<foldername>_<index>.jpg`` (index starting at 1).

    Args:
        base_folder: Directory under which the target folder is created.
        search_term: Free-text query; it is URL-encoded before use.
        foldername: Sub-folder name, also used as the file name prefix.
        num_images: Maximum number of images to save.
    """
    folder_path = os.path.join(base_folder, foldername)
    os.makedirs(folder_path, exist_ok=True)

    driver = create_driver()
    try:
        # quote_plus escapes spaces and reserved characters (&, #, +, ...) that
        # would otherwise truncate or alter the query string.
        driver.get(f"https://www.google.com/search?q={quote_plus(search_term)}&tbm=isch")
        time.sleep(5)  # fixed wait before scrolling; presumably lets results render
        scroll_down(driver)

        image_urls = scrape_all_images(driver)[:num_images]
    finally:
        # Always release the browser, even if navigation or scraping raised.
        driver.quit()

    print(f"Found {len(image_urls)} images for {search_term}")

    # The collected URLs are plain strings, so saving does not need the browser.
    for index, image_url in enumerate(image_urls, start=1):
        file_name = f'{foldername}_{index}'
        save_image(image_url, folder_path, file_name)

    print(f"Finished scraping {len(image_urls)} images for {foldername}\n")

if __name__ == "__main__":
    # Root directory; one "<fruit>_<condition>" sub-folder is created per query.
    base_folder = r"C:\Users\sandi\Downloads\new"

    # Search phrase for every fruit at each ripeness stage.
    fruit_descriptions = {
        "banana": {
            "raw": "isolated single raw green banana",
            "ripe": "isolated single ripe yellow banana",
            "rotten": "isolated single rotten banana with black spots and bruises",
        },
        "mango": {
            "raw": "isolated single raw dark green mango",
            "ripe": "isolated single ripe yellow orange mango",
            "rotten": "isolated single rotten mango with black skin and oozing",
        },
        "apple": {
            "raw": "isolated single raw green or light red apple",
            "ripe": "isolated single ripe glossy red yellow apple",
            "rotten": "isolated single rotten apple with brown spots and wrinkles",
        },
        "orange": {
            "raw": "isolated single raw greenish orange",
            "ripe": "isolated single ripe bright orange",
            "rotten": "isolated single rotten orange with mold and dark soft spots",
        },
        "papaya": {
            "raw": "isolated single raw dark green papaya",
            "ripe": "isolated single ripe yellow orange papaya",
            "rotten": "isolated single rotten papaya with mushy skin and mold patches",
        },
    }

    # Walk fruits in declaration order and, within each, the three stages in
    # the fixed order raw -> ripe -> rotten.
    for fruit, stage_queries in fruit_descriptions.items():
        for condition in ("raw", "ripe", "rotten"):
            search_term = stage_queries[condition]
            foldername = f"{fruit}_{condition}"
            scrape_and_save_images(
                base_folder, search_term, foldername, num_images=100
            )

Found 100 images for isolated single raw green banana
Finished scraping 100 images for banana_raw

Found 100 images for isolated single ripe yellow banana
Finished scraping 100 images for banana_ripe

Found 100 images for isolated single rotten banana with black spots and bruises
Finished scraping 100 images for banana_rotten

Found 100 images for isolated single raw dark green mango
Finished scraping 100 images for mango_raw

Found 100 images for isolated single ripe yellow orange mango
Finished scraping 100 images for mango_ripe

Found 37 images for isolated single rotten mango with black skin and oozing
Finished scraping 37 images for mango_rotten

Found 100 images for isolated single raw green or light red apple
Finished scraping 100 images for apple_raw

Found 100 images for isolated single ripe glossy red yellow apple
Finished scraping 100 images for apple_ripe

Found 100 images for isolated single rotten apple with brown spots and wrinkles
Finished scraping 100 images for apple_