In [None]:
pip install icrawler


In [None]:
import os
import requests
import time
import json
from bs4 import BeautifulSoup
from urllib.parse import quote, urljoin
import re
from PIL import Image
from io import BytesIO
import logging
from typing import List, Dict
import random

# Configure the root logger: emit INFO and above, with each record prefixed
# by its timestamp and level name.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class GettyImagesDownloader:
    """Search the Getty Images editorial listing and save result images to disk.

    For each name, the downloader fetches the search results page, collects
    the ``<img>`` sources hosted on ``media.gettyimages.com``, and stores each
    one in ``output_dir`` as ``<name>_<NNN>.jpg``.

    Attributes:
        output_dir: Directory that receives every downloaded file.
        max_images_per_person: Upper bound on images stored per name, counted
            across all calls made on this instance.
        request_delay: Seconds to wait after each download attempt.
        celebrity_delay: Seconds to wait between two consecutive names.
        downloaded_counts: Successful downloads per name.
        failed_downloads: Failures per name (empty searches and failed
            individual downloads).
    """

    # Only <img> tags whose src points at the Getty media host are collected.
    _MEDIA_SRC_PATTERN = re.compile(r'https://media\.gettyimages\.com/')

    def __init__(self, output_dir: str, max_images_per_person: int = 70,
                 request_delay: float = 1.0, celebrity_delay: float = 2.0):
        """Create the output directory and an HTTP session.

        Args:
            output_dir: Directory to save images into; created if missing.
            max_images_per_person: Maximum number of images kept per name.
            request_delay: Seconds to pause after each download attempt.
            celebrity_delay: Seconds to pause between two names.
        """
        self.output_dir = output_dir
        self.max_images_per_person = max_images_per_person
        self.request_delay = request_delay
        self.celebrity_delay = celebrity_delay
        self.downloaded_counts = {}
        self.failed_downloads = {}
        self.session = requests.Session()

        # Create output directory
        os.makedirs(output_dir, exist_ok=True)

        # User agents to rotate
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        ]

    @staticmethod
    def _safe_stem(name: str) -> str:
        """Return *name* reduced to characters that are safe in a filename.

        Keeps alphanumerics, spaces, hyphens and underscores and strips
        trailing whitespace. Falls back to ``"unnamed"`` when nothing is left,
        so filenames never start with a bare underscore.
        """
        stem = "".join(c for c in name if c.isalnum() or c in (' ', '-', '_')).rstrip()
        return stem or "unnamed"

    @staticmethod
    def _pause(seconds: float) -> None:
        """Sleep for *seconds*; a zero or negative value disables the pause."""
        if seconds > 0:
            time.sleep(seconds)

    def get_random_user_agent(self):
        """Return one of the configured User-Agent strings, chosen at random."""
        return random.choice(self.user_agents)

    def search_getty_images(self, query: str, max_results: int = 100):
        """Search for images on Getty Images.

        Args:
            query: Search phrase (URL-quoted before use).
            max_results: Maximum number of distinct image URLs to return.

        Returns:
            A list of image URLs in page order without duplicates; an empty
            list when the request or the parsing fails (the error is logged).
        """
        search_url = f"https://www.gettyimages.in/photos/{quote(query)}?family=editorial&phrase={quote(query)}&sort=mostpopular"

        headers = {
            'User-Agent': self.get_random_user_agent(),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }

        try:
            response = self.session.get(search_url, headers=headers, timeout=30)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            image_elements = soup.find_all('img', {'src': self._MEDIA_SRC_PATTERN})

            image_urls = []
            seen = set()
            for img in image_elements:
                if len(image_urls) >= max_results:
                    break
                src = img.get('src')
                if not src:
                    continue
                # Convert thumbnail URL to higher resolution if possible.
                # NOTE(review): this is a plain substring replacement over the
                # whole URL; confirm it cannot alter unrelated parts of it.
                high_res_url = src.replace('thumb', 'medium').replace('small', 'medium')
                # The same source can appear in several <img> tags; fetch it once.
                if high_res_url not in seen:
                    seen.add(high_res_url)
                    image_urls.append(high_res_url)

            return image_urls

        except Exception as e:
            # Best-effort: a failed search for one name must not stop the batch.
            logging.error(f"Error searching for {query}: {e}")
            return []

    def download_image(self, url: str, filename: str):
        """Download an image from URL into ``output_dir``.

        The payload is validated in memory before anything is written, so an
        invalid download never leaves a file behind.

        Args:
            url: Image URL to fetch.
            filename: File name (not a path) to store the image under.

        Returns:
            True when a valid image was saved, False otherwise (the reason is
            logged).
        """
        try:
            headers = {
                'User-Agent': self.get_random_user_agent(),
                'Accept': 'image/webp,*/*',
                'Referer': 'https://www.gettyimages.in/',
            }

            response = self.session.get(url, headers=headers, timeout=30)
            response.raise_for_status()

            # Verify it's an image
            if 'image' not in response.headers.get('Content-Type', ''):
                logging.error(f"URL did not return an image: {url}")
                return False

            payload = response.content
            filepath = os.path.join(self.output_dir, filename)

            # Verify the image is valid before touching the disk; the context
            # manager releases Pillow's handle on the buffer.
            try:
                with Image.open(BytesIO(payload)) as img:
                    img.verify()
            except Exception:
                # Pillow raises several unrelated exception types for bad data.
                logging.error(f"Downloaded file is not a valid image: {filepath}")
                return False

            # Save image
            with open(filepath, 'wb') as f:
                f.write(payload)

            return True

        except Exception as e:
            # Best-effort: one failed download must not stop the batch.
            logging.error(f"Error downloading {url}: {e}")
            return False

    def download_celebrity_images(self, celebrity_name: str):
        """Download images for a specific celebrity.

        Stops once the per-person limit is reached, taking into account images
        already counted for this name by earlier calls. Files that already
        exist on disk are skipped. Every failed download is recorded in
        ``failed_downloads``.

        Args:
            celebrity_name: Name used both as search phrase and filename stem.
        """
        already_downloaded = self.downloaded_counts.get(celebrity_name, 0)
        if already_downloaded >= self.max_images_per_person:
            logging.info(f"Already downloaded maximum images for {celebrity_name}")
            return

        logging.info(f"Searching for images of {celebrity_name}...")
        image_urls = self.search_getty_images(celebrity_name, max_results=100)

        if not image_urls:
            logging.warning(f"No images found for {celebrity_name}")
            self.failed_downloads[celebrity_name] = self.failed_downloads.get(celebrity_name, 0) + 1
            return

        logging.info(f"Found {len(image_urls)} images for {celebrity_name}")

        # Only the part of the budget not used by earlier calls is available.
        remaining = self.max_images_per_person - already_downloaded
        safe_name = self._safe_stem(celebrity_name)

        downloaded = 0
        for position, url in enumerate(image_urls, start=1):
            if downloaded >= remaining:
                break

            filename = f"{safe_name}_{position:03d}.jpg"
            filepath = os.path.join(self.output_dir, filename)

            if os.path.exists(filepath):
                logging.info(f"Skipping {filename}, already exists")
                continue

            logging.info(f"Downloading {filename} from {url}")
            if self.download_image(url, filename):
                downloaded += 1
                self.downloaded_counts[celebrity_name] = already_downloaded + downloaded
            else:
                self.failed_downloads[celebrity_name] = self.failed_downloads.get(celebrity_name, 0) + 1

            # Be respectful with delays between requests, whether or not the
            # attempt succeeded.
            self._pause(self.request_delay)

        logging.info(f"Downloaded {downloaded} images for {celebrity_name}")

    def download_all_celebrities(self, celebrities: List[str]):
        """Download images for all celebrities in the list, then print a summary.

        Args:
            celebrities: Names to process, in order.
        """
        for index, celebrity in enumerate(celebrities):
            # Be respectful with delays between celebrities (none before the first).
            if index:
                self._pause(self.celebrity_delay)
            self.download_celebrity_images(celebrity)

        self.print_summary()

    def print_summary(self):
        """Print a summary of the download process to standard output."""
        print("\n" + "="*50)
        print("DOWNLOAD SUMMARY")
        print("="*50)
        total_downloaded = sum(self.downloaded_counts.values())
        print(f"Total images downloaded: {total_downloaded}")

        print("\nImages per celebrity:")
        for celebrity, count in self.downloaded_counts.items():
            print(f"  {celebrity}: {count} images")

        if self.failed_downloads:
            print("\nFailed downloads:")
            for celebrity, count in self.failed_downloads.items():
                print(f"  {celebrity}: {count} failures")

        print("="*50)

# Names to search for, processed in the order listed.
CELEBRITIES = [
    "Robert Downey Jr",
    "Chris Evans",
    "Scarlett Johansson",
    "Tom Holland",
    "Zendaya",
    "Emma Watson",
    "Leonardo DiCaprio",
    "Brad Pitt",
    "Angelina Jolie",
    "Jennifer Lawrence",
    "Taylor Swift",
    "Ariana Grande",
    "Justin Bieber",
    "Selena Gomez",
    "Billie Eilish",
    "Ed Sheeran",
    "Beyoncé",
    "Rihanna",
    "Drake",
    "Shawn Mendes",
    "Lionel Messi",
    "Cristiano Ronaldo",
    "Neymar Jr",
    "Kylian Mbappé",
    "Virat Kohli",
    "Serena Williams",
    "Roger Federer",
    "LeBron James",
    "Michael Jordan",
    "Usain Bolt",
    "Kim Kardashian",
    "Kylie Jenner",
    "Dwayne Johnson",
    "Kevin Hart",
    "Will Smith",
    "Priyanka Chopra",
    "Deepika Padukone",
    "Shahrukh Khan",
    "Amitabh Bachchan",
    "Hrithik Roshan",
    "Barack Obama",
    "Elon Musk",
    "Jeff Bezos",
    "Bill Gates",
    "Mark Zuckerberg",
    "Oprah Winfrey",
    "Malala Yousafzai",
    "Pope Francis",
    "Jackie Chan",
]

def main():
    """Build a downloader with the script's fixed settings and process CELEBRITIES."""
    # Fixed settings for this script run.
    target_folder = r"D:\test"
    per_person_cap = 500

    # Construct the downloader and run the whole batch in one go.
    GettyImagesDownloader(target_folder, per_person_cap).download_all_celebrities(CELEBRITIES)

# Run the batch only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()

2025-09-22 12:03:30,080 - INFO - Searching for images of Robert Downey Jr...
