In [3]:
pip install selenium


Collecting selenium
  Downloading selenium-4.29.0-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.29.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting attrs>=23.2.0 (from trio~=0.17->selenium)
  Downloading attrs-25.1.0-py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.29.0-py3-none-any.whl (9.5 MB)
   ---------------------------------------- 0.0/9.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.5 MB 1.3 MB/s eta 0:00:08
   ---------------------------------------- 0.1/9.5 MB 1.3 MB/s eta 0:00:08
    --------------------------------------- 0.2/9.5 MB 1.8 MB/s 

In [1]:
import os
import time
from urllib.parse import quote_plus

import pandas as pd
import requests
from PIL import Image, UnidentifiedImageError
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys



In [2]:
# Search terms to scrape; each entry is also used as a sub-folder name and as
# the filename prefix of the images saved for it.
labels = [
    "cats",
    "dogs",
    "birds",
    "cars",
    "airplanes",
    "mountains",
    "beaches",
    "flowers",
    "fruits",
    "insects",
    "trees",
    "cityscapes",
    "food",
    "boats",
    "bikes",
    "historical monuments",
    "laptops",
    "watches",
    "sports equipment",
    "musical instruments",
]

In [5]:
# Set up the Chrome WebDriver session used by the scraping loop below.
# "--headless" is passed so Chrome runs without opening a visible window.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(options=chrome_options)

# Root folder of the dataset; one sub-folder per label is created beneath it.
dataset_path = "image_dataset"
os.makedirs(dataset_path, exist_ok=True)

# One row per saved image: [label, url, filepath, width, height].
# Turned into a DataFrame and written to CSV once scraping finishes.
metadata = []

def _discard_file(path):
    """Delete ``path`` if present; a failed cleanup must not mask the real error."""
    try:
        os.remove(path)
    except OSError:
        pass


def download_image(url, folder, filename):
    """Download ``url`` into ``folder`` and store it as an RGB JPEG.

    The raw response is first written to ``folder/filename``, validated with
    Pillow, converted to RGB and saved next to it with a ``.jpeg`` extension.
    The raw download is removed afterwards (unless it already is the
    ``.jpeg`` target, in which case it is simply overwritten).

    Args:
        url: HTTP(S) address of the image.
        folder: Existing directory the image is written to.
        filename: Name used for the raw download, e.g. ``"cats_1.jpg"``.

    Returns:
        ``(jpeg_path, width, height)`` on success, ``(None, None, None)`` when
        the download fails, the server does not answer 200, or the payload is
        not a decodable image. Failures are reported with ``print`` and never
        raised to the caller.
    """
    failure = (None, None, None)
    filepath = os.path.join(folder, filename)
    # Swap only the final extension. A plain str.replace(".jpg", ".jpeg") would
    # also rewrite ".jpg" inside folder names, and would be a no-op for other
    # extensions (making the cleanup below delete the converted file).
    jpeg_filepath = os.path.splitext(filepath)[0] + ".jpeg"

    try:
        response = requests.get(url, timeout=5)
        if response.status_code != 200:
            print(f"Skipping {filename}: HTTP status {response.status_code}.")
            return failure
        with open(filepath, 'wb') as f:
            f.write(response.content)
    except Exception as e:
        print(f"Error downloading {url}: {e}")
        _discard_file(filepath)  # drop a partially written download, if any
        return failure

    try:
        with Image.open(filepath) as probe:
            probe.verify()
        # verify() leaves the image object unusable, so the file is reopened
        # for the actual conversion; both handles are closed by their `with`.
        with Image.open(filepath) as raw:
            rgb = raw.convert("RGB")
        rgb.save(jpeg_filepath, "JPEG")
    except UnidentifiedImageError:
        print(f"Skipping {filename}: Invalid image file.")
        _discard_file(filepath)
        return failure
    except Exception as e:
        # Truncated/corrupt payloads can raise errors other than
        # UnidentifiedImageError; treat them the same and leave no junk behind.
        print(f"Skipping {filename}: {e}")
        _discard_file(filepath)
        return failure

    if jpeg_filepath != filepath:
        _discard_file(filepath)
    return jpeg_filepath, rgb.width, rgb.height


# Scrape settings, named here so they can be tuned without touching the loop.
IMAGES_PER_LABEL = 50        # stop after this many saved images per label
SCROLL_STEPS = 10            # number of PAGE_DOWN presses on the results page
SCROLL_PAUSE_SECONDS = 4     # wait after each PAGE_DOWN press
PAGE_LOAD_PAUSE_SECONDS = 2  # wait after opening the results page

try:
    for label in labels:
        folder_path = os.path.join(dataset_path, label)
        os.makedirs(folder_path, exist_ok=True)

        # URL-encode the query so labels containing spaces produce a valid URL.
        query = quote_plus(f"{label} photo -stock -clipart")
        search_url = f"https://www.google.com/search?q={query}&tbm=isch&tbs=isz:l"
        driver.get(search_url)
        time.sleep(PAGE_LOAD_PAUSE_SECONDS)

        # Scroll the page step by step before collecting the <img> elements.
        body = driver.find_element(By.TAG_NAME, 'body')
        for _ in range(SCROLL_STEPS):
            body.send_keys(Keys.PAGE_DOWN)
            time.sleep(SCROLL_PAUSE_SECONDS)

        images = driver.find_elements(By.CSS_SELECTOR, 'img')
        count = 0

        for img in images:
            if count >= IMAGES_PER_LABEL:
                break
            src = img.get_attribute('src')
            # Only real HTTP(S) sources can be fetched; this skips missing
            # attributes and inline data: URIs.
            if not src or not src.startswith(("http://", "https://")):
                continue
            # count only advances on success, so a failed download's number
            # is reused by the next candidate.
            filename = f"{label}_{count+1}.jpg"
            filepath, width, height = download_image(src, folder_path, filename)
            if filepath:
                metadata.append([label, src, filepath, width, height])
                count += 1
finally:
    # Runs even when scraping aborts half-way: keep an index of the images
    # already on disk, and always shut the browser down.
    try:
        df = pd.DataFrame(metadata, columns=["label", "url", "filepath", "width", "height"])
        df.to_csv(os.path.join(dataset_path, "image_metadata.csv"), index=False)
    finally:
        driver.quit()

print("Downloaded images successfully")

Skipping cats_4.jpg: Invalid image file.
Skipping dogs_4.jpg: Invalid image file.
Skipping birds_4.jpg: Invalid image file.
Skipping cars_5.jpg: Invalid image file.
Skipping airplanes_4.jpg: Invalid image file.
Skipping mountains_4.jpg: Invalid image file.
Skipping beaches_3.jpg: Invalid image file.
Skipping flowers_4.jpg: Invalid image file.
Skipping fruits_4.jpg: Invalid image file.
Skipping insects_4.jpg: Invalid image file.
Skipping trees_4.jpg: Invalid image file.
Skipping cityscapes_4.jpg: Invalid image file.
Skipping food_3.jpg: Invalid image file.
Skipping boats_4.jpg: Invalid image file.
Skipping bikes_4.jpg: Invalid image file.
Skipping historical monuments_4.jpg: Invalid image file.
Skipping laptops_4.jpg: Invalid image file.
Skipping watches_5.jpg: Invalid image file.
Skipping sports equipment_4.jpg: Invalid image file.
Skipping musical instruments_4.jpg: Invalid image file.
Downloaded images successfully
