In [None]:
import os

# Read keywords from external file
def read_keywords(filename="keywords.txt"):
    """Load search keywords from a UTF-8 text file, one keyword per line.

    Surrounding whitespace is stripped from every line and blank lines are
    dropped. If ``filename`` does not exist, a three-line sample file is
    written at that path and an empty list is returned.

    Args:
        filename: Path of the keyword file.

    Returns:
        list[str]: The keywords in file order, or ``[]`` when the file had to
        be created.
    """
    if os.path.exists(filename):
        with open(filename, 'r', encoding='utf-8') as source:
            stripped_lines = (raw.strip() for raw in source)
            # Materialise while the file is still open
            return [entry for entry in stripped_lines if entry]

    print(f"Error: {filename} not found. Creating sample file...")
    # Seed the file with sample entries the user is expected to replace
    sample_keywords = ("croissant", "단팥빵", "소보로 빵")
    with open(filename, 'w', encoding='utf-8') as sink:
        for sample in sample_keywords:
            sink.write(f"{sample}\n")
    print(f"Sample {filename} created. Please edit it with your desired keywords.")
    return []

# Crawl settings shared by the cells below
base_dir = "raw_images_crawled"
TARGET_IMAGES = 200  # Target number of images per keyword
MAX_IMAGES_PER_SEARCH = 150  # Images to request per individual search

# Keywords come from keywords.txt (one per line)
bread_types = read_keywords("keywords.txt")

print(f"Loaded {len(bread_types)} keywords: {bread_types}")

# Make sure the root output directory is present
if os.path.exists(base_dir):
    print(f"Base directory already exists: {base_dir}")
else:
    os.makedirs(base_dir)
    print(f"Created base directory: {base_dir}")

# One "<base_dir>/<safe_name>/images" folder per keyword; spaces and slashes
# in the keyword become underscores so the name is a single path component
for bread in bread_types:
    safe_name = bread.translate(str.maketrans(" /", "__"))
    image_dir = os.path.join(base_dir, safe_name, "images")
    os.makedirs(image_dir, exist_ok=True)
    print(f"Prepared directory for '{bread}': {image_dir}")

In [None]:
# Cell 2: Simple image crawling (based on working first version)

from icrawler.builtin import GoogleImageCrawler
import time
import random

# Configuration options
# These flags are read by enhanced_crawl() and by the main crawling loop in this cell.
USE_KEYWORD_VARIATIONS = False  # True: search several phrasings of each keyword; False: one search per keyword
ADD_BREAD_SUFFIX = True         # True: search "<keyword> bread" instead of the bare keyword (used when variations are off)
SKIP_CRAWLING = False          # True: download nothing and only report the image counts already on disk

def simple_crawl(keyword, save_path, max_images, file_idx_offset=0):
    """Run one Google image search and store the results under ``save_path``.

    Args:
        keyword: Search phrase handed to the crawler.
        save_path: Directory the crawler writes images into.
        max_images: Upper bound on images requested from this search.
        file_idx_offset: Forwarded to ``GoogleImageCrawler.crawl``. The
            default ``0`` is the library default and keeps the previous
            behaviour. Pass ``"auto"`` when ``save_path`` already holds
            images from another search so that new files are numbered after
            the existing ones (see the icrawler documentation).

    Returns:
        int: Number of image files present in ``save_path`` after the crawl.
        This is the directory total, not only what this call fetched.
        Returns 0 if the directory does not exist or the crawl raised.
    """
    try:
        print(f"Starting image crawl for: {keyword} (target: {max_images})")

        # Use simple settings like the original
        crawler = GoogleImageCrawler(storage={"root_dir": save_path})
        crawler.crawl(
            keyword=keyword,
            max_num=max_images,
            file_idx_offset=file_idx_offset,
        )

        if not os.path.exists(save_path):
            return 0

        # Count what is on disk rather than trusting the crawler's own tally
        image_suffixes = ('.jpg', '.jpeg', '.png', '.gif', '.webp')
        count = sum(
            1 for name in os.listdir(save_path)
            if name.lower().endswith(image_suffixes)
        )
        print(f"Downloaded {count} images for '{keyword}'")
        return count

    except Exception as e:
        # Best effort: a failed search must not abort the whole notebook run
        print(f"Error during crawling with keyword '{keyword}': {e}")
        return 0


def enhanced_crawl(base_keyword, save_path, target_count):
    """Crawl one or more phrasings of ``base_keyword`` into ``save_path``.

    With ``USE_KEYWORD_VARIATIONS`` enabled, five phrasings are searched in
    turn until ``target_count`` images are on disk; otherwise a single search
    is made (with a " bread" suffix when ``ADD_BREAD_SUFFIX`` is set).

    Args:
        base_keyword: Keyword the search phrases are built from.
        save_path: Directory shared by all searches for this keyword.
        target_count: Stop issuing searches once this many images exist.

    Returns:
        int: Highest image count observed in ``save_path`` across the
        searches that were run.
    """
    # Generate keyword list
    if USE_KEYWORD_VARIATIONS:
        keywords = [
            base_keyword,
            f"{base_keyword} bread",
            f"{base_keyword} bakery",
            f"fresh {base_keyword}",
            f"{base_keyword} food"
        ]
    elif ADD_BREAD_SUFFIX:
        keywords = [f"{base_keyword} bread"]  # Like original working version
    else:
        keywords = [base_keyword]

    total_downloaded = 0
    last_index = len(keywords) - 1

    for i, keyword in enumerate(keywords):
        if total_downloaded >= target_count:
            break

        remaining = target_count - total_downloaded
        max_for_this_search = min(remaining + 20, MAX_IMAGES_PER_SEARCH)

        print(f"\nSearch {i+1}/{len(keywords)}: '{keyword}'")

        # All searches share one directory. Follow-up searches continue the
        # file numbering ("auto") so they add images instead of targeting the
        # file names the earlier searches already produced.
        offset = 0 if i == 0 else "auto"
        count = simple_crawl(
            keyword, save_path, max_for_this_search, file_idx_offset=offset
        )

        # simple_crawl reports the directory total, and 0 on failure; never
        # let a failed search wipe out what earlier searches achieved.
        total_downloaded = max(total_downloaded, count)

        # Small delay between searches, skipped when no further search follows
        if i < last_index and total_downloaded < target_count:
            delay = random.uniform(2, 4)
            print(f"Waiting {delay:.1f} seconds...")
            time.sleep(delay)

    return total_downloaded

# Main crawling loop
# Downloads images for every keyword, or only reports what is already on disk
# when SKIP_CRAWLING is set.
if not SKIP_CRAWLING:
    print("Configuration:")
    print(f"- Keyword variations: {'Enabled' if USE_KEYWORD_VARIATIONS else 'Disabled'}")
    print(f"- Add 'bread' suffix: {'Enabled' if ADD_BREAD_SUFFIX else 'Disabled'}")
    print()

    last_position = len(bread_types) - 1

    for position, bread in enumerate(bread_types):
        safe_name = bread.replace(" ", "_").replace("/", "_")
        save_path = os.path.join(base_dir, safe_name, "images")

        print(f"Processing: {bread}")
        print(f"Target: {TARGET_IMAGES} images")

        if USE_KEYWORD_VARIATIONS or not ADD_BREAD_SUFFIX:
            # Use enhanced crawl
            final_count = enhanced_crawl(bread, save_path, TARGET_IMAGES)
        else:
            # Use simple crawl (exactly like original). This branch is only
            # reached with ADD_BREAD_SUFFIX enabled, so the suffix is always added.
            keyword = f"{bread} bread"
            final_count = simple_crawl(keyword, save_path, TARGET_IMAGES)

        print(f"Completed: '{bread}' - {final_count} images downloaded")

        # Delay between different bread types. Compare positions, not values:
        # a keyword that duplicates the last entry must still be followed by
        # a pause.
        if position < last_position:
            delay = random.uniform(3, 6)
            print(f"Waiting {delay:.1f} seconds before next category...\n")
            time.sleep(delay)

else:
    print("Crawling skipped.")

    # Show existing counts
    for bread in bread_types:
        safe_name = bread.replace(" ", "_").replace("/", "_")
        image_dir = os.path.join(base_dir, safe_name, "images")

        if os.path.exists(image_dir):
            count = len([f for f in os.listdir(image_dir) if f.lower().endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp'))])
            print(f"Existing images for '{bread}': {count}")

print("\nImage crawling completed.")

In [None]:
# Cell 3: Rename all remaining image files to 0000.jpg, 0001.jpg, etc.
# Run this after manual filtering of irrelevant images

import cv2
from glob import glob

def rename_images_in_directory(image_dir, bread_name):
    """Re-encode the images in ``image_dir`` as 0000.jpg, 0001.jpg, ...

    Files are taken oldest-first by modification time, decoded with OpenCV
    and written as JPEG into a temporary sub-directory. The originals are
    then deleted and the new files are moved back into ``image_dir``.

    Numbering is gap-free: only images that were actually written consume an
    index. Files OpenCV cannot decode are reported and removed, as before.
    An original whose JPEG copy could not be written is left untouched so a
    write failure never destroys the only copy.

    Args:
        image_dir: Directory holding the images to renumber.
        bread_name: Category label, used only in progress messages.

    Returns:
        int: Number of images written under their new sequential name.
    """
    image_suffixes = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp')

    # Match extensions case-insensitively in a single directory listing.
    # Globbing every pattern in lower and upper case returns each file twice
    # on case-insensitive filesystems and misses mixed-case names ("a.Jpg").
    image_paths = []
    for name in os.listdir(image_dir):
        if name.startswith('.'):
            continue  # glob's "*" never matched hidden files either
        candidate = os.path.join(image_dir, name)
        if name.lower().endswith(image_suffixes) and os.path.isfile(candidate):
            image_paths.append(candidate)

    # Sort by modification time; the path breaks ties deterministically
    image_paths.sort(key=lambda p: (os.path.getmtime(p), p))

    print(f"Renaming images for category: {bread_name} ({len(image_paths)} files)")

    # Create temporary directory to avoid naming conflicts
    temp_dir = os.path.join(image_dir, "temp_rename")
    os.makedirs(temp_dir, exist_ok=True)

    successfully_renamed = 0
    removable_originals = []

    # First, write every readable image into the temp directory under its new name
    for path in image_paths:
        temp_path = os.path.join(temp_dir, f"{successfully_renamed:04d}.jpg")
        written = False
        try:
            img = cv2.imread(path)
            if img is None:
                print(f"Warning: Could not read image {os.path.basename(path)}")
                removable_originals.append(path)
                continue
            # imwrite signals failure through its return value, not an exception
            written = bool(cv2.imwrite(temp_path, img))
            if not written:
                print(f"Error processing {os.path.basename(path)}: could not write JPEG copy")
        except Exception as e:
            print(f"Error processing {os.path.basename(path)}: {e}")

        if written:
            successfully_renamed += 1
            removable_originals.append(path)
        elif os.path.exists(temp_path):
            # Do not let a partial file be moved back as if it were an image
            try:
                os.remove(temp_path)
            except OSError as e:
                print(f"Error removing {os.path.basename(temp_path)}: {e}")

    # Remove originals that were converted (or could not be decoded at all)
    for path in removable_originals:
        try:
            os.remove(path)
        except Exception as e:
            print(f"Error removing {os.path.basename(path)}: {e}")

    # Move renamed files back to original directory
    for temp_file in sorted(glob(os.path.join(temp_dir, "*.jpg"))):
        final_path = os.path.join(image_dir, os.path.basename(temp_file))
        try:
            if os.path.exists(final_path):
                # Only possible when a kept original already uses this name
                raise FileExistsError(f"{final_path} already exists")
            os.rename(temp_file, final_path)
        except Exception as e:
            print(f"Error moving {os.path.basename(temp_file)}: {e}")

    # Remove temporary directory (fails only if something was left inside)
    try:
        os.rmdir(temp_dir)
    except OSError as e:
        print(f"Warning: could not remove {temp_dir}: {e}")

    print(f"Successfully renamed {successfully_renamed} images for {bread_name}")
    return successfully_renamed

# Renumber the images of every keyword that has a download folder
for bread in bread_types:
    safe_name = bread.translate(str.maketrans(" /", "__"))
    image_dir = os.path.join(base_dir, safe_name, "images")

    if not os.path.exists(image_dir):
        print(f"Warning: Directory not found for {bread}: {image_dir}")
        continue

    rename_images_in_directory(image_dir, bread)

print("\nImage renaming completed.")

In [None]:
# Cell 4: Preprocessing functions - filter and resize images

import numpy as np
import cv2

def is_blurry(image, threshold=100.0):
    """Return whether the variance of the image's Laplacian is below ``threshold``.

    The image is converted from BGR to grayscale before the Laplacian is taken.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    laplacian = cv2.Laplacian(grayscale, cv2.CV_64F)
    sharpness = laplacian.var()
    return sharpness < threshold

def is_too_dark_or_bright(image, dark_threshold=30, bright_threshold=220):
    """Return whether the mean grayscale brightness lies outside the given thresholds.

    True when the mean is below ``dark_threshold`` or above ``bright_threshold``.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    brightness = np.mean(grayscale)
    too_dark = brightness < dark_threshold
    too_bright = brightness > bright_threshold
    return too_dark or too_bright

def has_sufficient_content(image, min_std=15):
    """Return whether the grayscale standard deviation exceeds ``min_std``."""
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    contrast = np.std(grayscale)
    return contrast > min_std

def process_image(input_path, output_path, target_size=(640, 640),
                  min_side=300, min_size_ratio=0.75, pad_value=114):
    """
    Process a single image: validate, filter, and resize

    The image is rejected when it cannot be read, is smaller than the size
    limits, or fails any of the blur / brightness / content checks.
    Accepted images are scaled to fit inside ``target_size`` with their
    aspect ratio preserved and centred on a uniformly padded canvas.

    Args:
        input_path: Path to input image
        output_path: Path to save processed image
        target_size: Target size tuple (width, height)
        min_side: Minimum accepted width and height of the source, in pixels
        min_size_ratio: Minimum source size as a fraction of the target size
            on each axis; limits how much an image may be upscaled
        pad_value: Gray level used to fill the canvas around the image

    Returns:
        bool: True if image was successfully processed and written,
        False otherwise
    """
    try:
        # Read image
        image = cv2.imread(input_path)
        if image is None:
            return False

        # Get original dimensions
        h, w = image.shape[:2]
        target_w, target_h = target_size

        # Filter out images that are too small
        if h < min_side or w < min_side:
            return False

        # Filter out images that would need significant upscaling.
        # target_size is (width, height), so height is checked against
        # target_h and width against target_w.
        if h < target_h * min_size_ratio or w < target_w * min_size_ratio:
            return False

        # Apply quality filters
        if is_blurry(image):
            return False

        if is_too_dark_or_bright(image):
            return False

        if not has_sufficient_content(image):
            return False

        # Resize image maintaining aspect ratio
        scale = min(target_w / w, target_h / h)

        # Round instead of truncating so float error cannot shave a pixel off
        # the fitted side, and clamp so extreme aspect ratios never yield 0.
        new_w = max(1, min(target_w, round(w * scale)))
        new_h = max(1, min(target_h, round(h * scale)))

        # INTER_AREA is meant for shrinking; use linear when enlarging
        interpolation = cv2.INTER_AREA if scale < 1 else cv2.INTER_LINEAR
        resized = cv2.resize(image, (new_w, new_h), interpolation=interpolation)

        # Create canvas with target size, filled with gray (common in YOLO)
        canvas = np.full((target_h, target_w, 3), pad_value, dtype=np.uint8)

        # Calculate position to center the image
        y_offset = (target_h - new_h) // 2
        x_offset = (target_w - new_w) // 2

        canvas[y_offset:y_offset + new_h, x_offset:x_offset + new_w] = resized

        # Save processed image; imwrite reports failure via its return value
        return bool(cv2.imwrite(output_path, canvas))

    except Exception as e:
        print(f"Error processing {os.path.basename(input_path)}: {e}")
        return False

# Status line confirming this cell ran and the helpers above are defined
print("Preprocessing functions defined successfully.")

In [None]:
# Cell 5: Apply preprocessing to all crawled images

from tqdm import tqdm
import shutil

def preprocess_category_images(bread_name, image_dir, target_size=(640, 640)):
    """Run ``process_image`` over every ``*.jpg`` in one category folder.

    Results go to a ``processed`` directory next to ``image_dir``. Each
    output keeps the position of its source in the sorted input list as its
    file name, so rejected images leave gaps in the numbering.

    Args:
        bread_name: Category label used in progress output.
        image_dir: Directory containing the source ``*.jpg`` files.
        target_size: Target size tuple (width, height) passed to ``process_image``.

    Returns:
        int: Number of images that were processed successfully.
    """
    # Inputs are expected to be the sequentially named files from the rename step
    source_images = glob(os.path.join(image_dir, "*.jpg"))
    source_images.sort()

    if len(source_images) == 0:
        print(f"No images found for {bread_name}")
        return 0

    print(f"Processing images for: {bread_name} ({len(source_images)} files)")

    # Outputs live beside the input folder, not inside it
    processed_dir = os.path.join(os.path.dirname(image_dir), "processed")
    os.makedirs(processed_dir, exist_ok=True)

    kept_total = 0
    progress = tqdm(source_images, desc=f"Processing {bread_name}")
    for position, source in enumerate(progress):
        destination = os.path.join(processed_dir, f"{position:04d}.jpg")

        if process_image(source, destination, target_size):
            kept_total += 1
            continue

        # A rejected image must not leave a file at its output slot
        if os.path.exists(destination):
            os.remove(destination)

    rejected_total = len(source_images) - kept_total

    print(f"Results for {bread_name}:")
    print(f"  Successfully processed: {kept_total}")
    print(f"  Failed/filtered out: {rejected_total}")
    print(f"  Final images saved to: {processed_dir}")

    return kept_total

# Preprocess every keyword's images and keep a running total
total_processed = 0

for bread in bread_types:
    safe_name = bread.translate(str.maketrans(" /", "__"))
    image_dir = os.path.join(base_dir, safe_name, "images")

    if not os.path.exists(image_dir):
        print(f"Warning: Directory not found for {bread}: {image_dir}")
        continue

    total_processed += preprocess_category_images(bread, image_dir)

print("\nAll image preprocessing completed.")
print(f"Total images successfully processed: {total_processed}")

# Summary report: count the JPEGs actually present in each processed folder
print("\nSummary Report:")
print("-" * 50)
for bread in bread_types:
    safe_name = bread.translate(str.maketrans(" /", "__"))
    processed_dir = os.path.join(base_dir, safe_name, "processed")

    if os.path.exists(processed_dir):
        count = len(glob(os.path.join(processed_dir, "*.jpg")))
    else:
        count = 0
    print(f"{bread}: {count} processed images")