In [3]:
from __future__ import unicode_literals
from pybooru import Danbooru
import urllib.request
import os
import tqdm
import time
import random
from concurrent.futures import ThreadPoolExecutor, as_completed

def _fetch_all_posts(client, tags, page_size=100):
    """Collect every post matching ``tags`` (animated posts excluded), page by page.

    Paging stops at the first empty page. If a request raises, the error is
    printed and whatever was gathered so far is returned (best effort).
    """
    posts = []
    page = 1
    while True:
        try:
            print(f"Fetching page {page}...", end='\r')
            current_posts = client.post_list(tags=f"{tags} -animated", page=page, limit=page_size)
        except Exception as e:
            print(f"\nError fetching page {page}: {e}")
            break

        if not current_posts:
            print(f"\nPage {page} is empty. Reached end of results.")
            break

        posts.extend(current_posts)
        page += 1
    return posts


def _pick_sample_url(post):
    """Return the 'sample' variant URL of a post, else its large/original file URL, else None."""
    media_asset = post.get('media_asset') or {}
    for variant in media_asset.get('variants') or []:
        if variant.get('type') == 'sample' and variant.get('url'):
            return variant['url']
    return post.get('large_file_url') or post.get('file_url')


def _build_download_items(posts):
    """Turn raw posts into ``{'url', 'id', 'ext'}`` dicts, one per distinct post id.

    Posts without an id or without any usable URL are skipped.
    """
    items = []
    seen_ids = set()
    for post in posts:
        post_id = post.get('id')
        url = _pick_sample_url(post)
        if post_id is None or not url:
            continue
        # The same post can be returned twice if results shift between page
        # requests; two workers must never write to the same file.
        if post_id in seen_ids:
            continue
        seen_ids.add(post_id)
        # Drop any query string before taking the extension.
        ext = url.split('?', 1)[0].rsplit('.', 1)[-1]
        items.append({'url': url, 'id': post_id, 'ext': ext})
    return items


def _download_image(item, tags_dir, max_retries=5):
    """Download one image into ``tags_dir``, retrying with exponential backoff.

    Returns True if a new file was written, False if the file already existed
    or every attempt failed.
    """
    file_name = os.path.join(tags_dir, f"{item['id']}_sample.{item['ext']}")
    if os.path.exists(file_name):
        return False

    # Download to a temporary name first so an interrupted transfer never
    # leaves a truncated file under the final name.
    partial_name = file_name + '.part'

    for attempt in range(max_retries):
        try:
            urllib.request.urlretrieve(item['url'], partial_name)
            os.replace(partial_name, file_name)
            return True
        except Exception as e:
            # Best-effort cleanup of whatever was written before the failure.
            try:
                os.remove(partial_name)
            except OSError:
                pass

            if attempt < max_retries - 1:
                # Exponential backoff: 1s, 2s, 4s, 8s... + up to 1s of random jitter
                time.sleep((2 ** attempt) + random.uniform(0, 1))
            else:
                print(f"Failed {item['id']} after {max_retries} attempts: {e}")
    return False


def download_danbooru_dataset(tags, base_dir=r'C:\Users\fixgk\Kuroko\data-resized'):
    """Download the sample-size image of every Danbooru post matching ``tags``.

    Posts are listed sequentially through the API, then the images are fetched
    in parallel by a thread pool into ``<base_dir>/<tags>``, one file per post
    named ``<post id>_sample.<ext>``.

    Args:
        tags: Danbooru tag query; also used as the name of the output folder.
        base_dir: Folder under which the per-tag folder is created. Defaults to
            the location this notebook originally hard-coded.

    Returns:
        None. Progress and a final summary are printed.

    If the output folder already exists and is not empty, nothing is done.
    An empty folder (for example one left behind by a run whose API fetch
    failed) is not treated as a finished download.
    """
    client = Danbooru('danbooru')
    max_workers = max(1, int((os.cpu_count() or 1) * 2 / 3))

    save_dir = os.path.join(base_dir, tags)

    dir_is_empty = os.path.isdir(save_dir) and not os.listdir(save_dir)
    if os.path.exists(save_dir) and not dir_is_empty:
        print(f"Directory {save_dir} already exists. Skipping download.")
        return

    print(f"Starting API fetching for tags: {tags} (Sequential)...")
    all_posts = _fetch_all_posts(client, tags)
    print(f"Total posts retrieved: {len(all_posts)}")

    download_items = _build_download_items(all_posts)
    print(f"Prepared {len(download_items)} links for download.")
    success_count = 0

    if download_items:
        # Create the folder only once there is something to put in it, so a
        # failed fetch cannot make later runs skip this tag.
        os.makedirs(save_dir, exist_ok=True)

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit download tasks
            download_futures = [executor.submit(_download_image, item, save_dir) for item in download_items]

            for future in tqdm.tqdm(as_completed(download_futures), total=len(download_futures), desc="Downloading samples"):
                if future.result():
                    success_count += 1

    print(f"Downloaded {success_count} new images at {save_dir}")


In [None]:
# Danbooru tag query to download; the downloader saves into a folder named after it.
tags = "tiger_june"


download_danbooru_dataset(tags)

Starting API fetching for tags: shuan_0420 (Sequential)...
Fetching page 1...
Error fetching page 1: HTTPSConnectionPool(host='danbooru.donmai.us', port=443): Max retries exceeded with url: /posts.json?tags=shuan_0420+-animated&page=1&limit=100 (Caused by SSLError(SSLCertVerificationError(1, "[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: Hostname mismatch, certificate is not valid for 'danbooru.donmai.us'. (_ssl.c:1000)")))
Total posts retrieved: 0
Prepared 0 links for download.
Downloaded 0 new images at C:\Users\fixgk\Kuroko\data-resized\shuan_0420
