In [19]:
import csv
import os
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from io import BytesIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from duckduckgo_search import DDGS
from PIL import Image
from scipy import stats

# 📂 Image Dataset Categories (Hierarchical)

A structured collection of image categories, organized by theme and subject matter.

## 🌿 Nature
Images showcasing the beauty of natural landscapes and formations.

- **Forests** – Dense woodlands, rainforests, and serene green landscapes.  
- **Deserts**  
  - Expansive sandy dunes, arid landscapes, and unique desert formations.  
- **Mountains** – Majestic peaks, rugged terrain, and breathtaking highlands.  
- **Ice Formations**  
  - Stunning glacial structures, ice caves, and frozen landscapes.  

## 🏛️ Architecture
A collection of architectural wonders, ranging from historic sites to futuristic designs.

- **Modern**  
  - Sleek, cutting-edge buildings with contemporary designs.  
- **Historical**  
  - Iconic landmarks, heritage buildings, and old-world architecture.  
  - Rustic countryside barns with a vintage charm.  
- **Industrial**  
  - Factories, machinery, and mechanical structures showcasing human engineering.  
- **Urban**  
  - Vibrant cityscapes, abandoned buildings, and neon-lit nightlife.  

## 🎨 Art & Design
A mix of artistic expressions, from abstract creativity to digital art.

- **Abstract**  
  - Unique patterns, colors, and non-representational artwork.  
- **Minimalist**  
  - Simple yet elegant visuals, focused on clean lines and aesthetics.  
- **Digital**  
  - AI-generated and hand-crafted digital masterpieces.  
- **Street Art**  
  - Expressive murals, graffiti, and urban art styles.  

## 🔬 Science & Space
Exploring the wonders of the universe and the unseen microscopic world.

- **Astronomy**  
  - Space phenomena, nebulae, galaxies, and celestial wonders.  
- **Microscopic**  
  - Close-up views of tiny organisms, cells, and intricate structures.  
- **Surreal**  
  - Dreamlike, imaginative compositions with an artistic touch.  
- **Futuristic**  
  - Visionary concepts, sci-fi landscapes, and advanced technological designs.  

## 🌍 Culture & Events
Celebrating human traditions, history, and vintage aesthetics.

- **Festivals**  
  - Colorful cultural celebrations, parades, and traditional events.  
- **Landmarks**  
  - Famous global sites that define history and heritage. *(Thematically close to Architecture, but kept under Culture & Events in the dataset.)*  
- **Vintage**  
  - Retro-style posters, advertisements, and nostalgic artwork.  
- **Retro**  
  - Classic themes and throwback aesthetics from past decades.  

# Scraping Script Using DuckDuckGo
```python
# Top-level category -> list of subcategories. Each subcategory name is used
# both as the image-search query and as the on-disk folder name.
categories = {
    "Nature": [
        "Forests",
        "Deserts",
        "Mountains",
        "Ice Formations",
    ],
    "Architecture": [
        "Modern",
        "Historical",
        "Industrial",
        "Urban",
    ],
    "Art & Design": [
        "Abstract",
        "Minimalist",
        "Digital",
        "Street Art",
    ],
    "Science & Space": [
        "Astronomy",
        "Microscopic",
        "Surreal",
        "Futuristic",
    ],
    "Culture & Events": [
        "Festivals",
        "Landmarks",
        "Vintage",
        "Retro",
    ],
}

# Root folder of the dataset; created up front so the metadata CSV can be
# written before any image has been downloaded.
dataset_name = "VisualSpectrumDataset"
os.makedirs(dataset_name, exist_ok=True)

# Metadata CSV: column order first, then the file location inside the
# dataset root.
csv_columns = ["category", "subcategory", "image_url", "filename"]
csv_path = os.path.join(dataset_name, "metadata.csv")

# Serializes appends to the metadata CSV. download_image is submitted to a
# ThreadPoolExecutor, so without this several workers could write to the same
# file at once and interleave their rows.
_csv_lock = threading.Lock()


def download_image(img_url, category, subcategory, img_id):
    """Download one image and record it in the metadata CSV.

    The image is stored as ``<dataset>/<category>/<subcategory>/
    <subcategory>_<img_id>.jpg`` and, on success, one row is appended to the
    CSV at ``csv_path``.

    Args:
        img_url: URL of the image to fetch.
        category: Top-level category name (used as a folder name).
        subcategory: Subcategory name (used as folder name and filename prefix).
        img_id: Identifier that makes the filename unique within the
            subcategory.

    Returns:
        True if the image was saved and its metadata row written, False for
        any failure (non-200 status, empty body, network or file error).
    """
    subcat_dir = os.path.join(dataset_name, category, subcategory)
    filename = f"{subcategory}_{img_id}.jpg"
    filepath = os.path.join(subcat_dir, filename)

    # The broad catch is deliberate: this runs as a worker task and the caller
    # only looks at the boolean result, so one bad URL or filesystem hiccup
    # must not abort the whole scrape.
    try:
        os.makedirs(subcat_dir, exist_ok=True)

        response = requests.get(img_url, timeout=10)
        if response.status_code != 200:
            print(f" Skipped {img_url}: HTTP {response.status_code}")
            return False
        if not response.content:
            # A 200 with no payload would otherwise be saved as a 0-byte
            # "image" and counted as a successful download.
            print(f" Skipped {img_url}: empty response body")
            return False

        with open(filepath, "wb") as f:
            f.write(response.content)

        # Append metadata to CSV; one writer at a time keeps rows intact.
        with _csv_lock:
            with open(csv_path, "a", newline="", encoding="utf-8") as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
                writer.writerow({
                    "category": category,
                    "subcategory": subcategory,
                    "image_url": img_url,
                    "filename": os.path.join(category, subcategory, filename),
                })
        return True
    except Exception as e:
        print(f" Failed to download {img_url}: {e}")
    return False

def scrape_images(target_images=70, max_workers=30, query_delay=1):
    """Search DuckDuckGo for every subcategory and download the results.

    The metadata CSV at ``csv_path`` is recreated with only a header row, then
    each subcategory name in ``categories`` is used as an image-search query
    and the hits are downloaded concurrently via ``download_image``.

    Args:
        target_images: Maximum number of successful downloads per *category*
            (shared by all of its subcategories). Also used as the
            ``max_results`` of each search.
        max_workers: Size of the download thread pool.
        query_delay: Seconds to sleep after each subcategory, to space out
            search queries.

    Returns:
        None. Progress is reported with ``print``.
    """
    # Create CSV file with header (truncates any metadata from an earlier run).
    with open(csv_path, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
        writer.writeheader()

    with DDGS() as ddgs, ThreadPoolExecutor(max_workers=max_workers) as executor:
        for category, subcategories in categories.items():
            print(f"\n Scraping images for category: {category}")
            images_downloaded = 0

            # NOTE(review): the quota is per category, not per subcategory, so
            # the first subcategories can consume most of it and later ones
            # may get few or no images. Confirm this skew is intended.
            for subcategory in subcategories:
                if images_downloaded >= target_images:
                    break

                print(f"    Fetching images for subcategory: {subcategory}")
                try:
                    # Materialize inside the try: depending on the library
                    # version the result may be lazy, and errors raised while
                    # iterating would otherwise escape this handler. `or []`
                    # tolerates a None result.
                    results = list(
                        ddgs.images(subcategory, max_results=target_images) or []
                    )
                except Exception as e:
                    print(f"    Error retrieving images for {subcategory}: {e}")
                    continue

                # Schedule at most as many downloads as are still needed for
                # this category (successes so far + tasks already queued).
                futures = []
                for img_id, img in enumerate(results):
                    if images_downloaded + len(futures) >= target_images:
                        break

                    img_url = img.get("image")
                    if img_url:
                        futures.append(
                            executor.submit(
                                download_image, img_url, category, subcategory, img_id
                            )
                        )

                # Wait for tasks to complete and count only successful downloads
                for future in futures:
                    if future.result():
                        images_downloaded += 1

                print(f"    Completed subcategory {subcategory}, Total successful downloads: {images_downloaded}")

                time.sleep(query_delay)  # Delay between subcategory queries

            print(f" Finished category {category}: {images_downloaded} images downloaded.")

# Run the scraper only when this module is executed directly, not when it is
# imported.
if __name__ == "__main__":
    scrape_images()
```
