In [1]:
import os
from joblib import Parallel, delayed
from tqdm import tqdm
import shutil
import zipfile
import json
from PIL import Image,ImageFile
import math
global COUNTER
import cairosvg
import io
import re
import random
import numpy as np
import subprocess
from realesrgan_ncnn_py import Realesrgan
import multiprocessing
import logging
from multiprocessing import Manager
from multiprocessing import Lock
from PIL import Image, ImageFile,ImageFilter  # Ensure this import is at the top
from utils.LLM_API import AsyncLLMProcessor
from utils.caption_utils import write_tags_file
from utils.image_utils import resize_and_crop_to_fit,fill_transparent_with_color,center_crop_square,svg_scaling
from utils.io_utils import ensure_directory_exists
import nltk
from utils.process_image_API import ProcessImageAPI
import time  # Import time module for timing

# Make sure the NLTK 'punkt' package is available before any processing starts.
nltk.download('punkt')

# Uncomment to stream multiprocessing debug output to stderr:
# multiprocessing.log_to_stderr(logging.DEBUG)

# Explicitly keep PIL's truncated-image loading switched off.
ImageFile.LOAD_TRUNCATED_IMAGES = False

# Resolution pairs passed to ProcessImageAPI under the 'target_resolutions' key.
# NOTE(review): presumably (width, height) buckets — confirm against ProcessImageAPI.
target_resolutions = [
    (1152, 896),
    (896, 1152),
    (1216, 832),
    (832, 1216),
    (1344, 768),
    (768, 1344),
    (1536, 640),
    (640, 1536),
    (1024, 1024),
]


def create_tags_file(annotation, tags, folder_path, output_path, augment=None):
    """Create the ``.txt`` tags file for one item unless it already exists.

    The file lives at ``<folder_path>/<output_path>.txt``; its parent
    directory is created first. When the module-level ``use_LLM`` flag is set
    the request is queued on ``llm_processor``. Otherwise the content is built
    with ``prepare_content``, passed through ``remove_duplicate_phrases`` and
    written with ``write_tags_file``.

    Args:
        annotation: Annotation value forwarded to the content builder / LLM queue.
        tags: Tags forwarded to the content builder / LLM queue.
        folder_path: Directory the tags file is joined onto.
        output_path: File name (without extension) of the tags file.
        augment: Optional augmentation spec forwarded unchanged.

    Returns:
        None.
    """
    target_file = os.path.join(folder_path, f"{output_path}.txt")
    ensure_directory_exists(os.path.dirname(target_file))

    # An existing tags file is never overwritten.
    if os.path.exists(target_file):
        return

    if not use_LLM:
        # Non-LLM path: helpers are imported lazily, exactly where they are needed.
        from utils.caption_utils import prepare_content, remove_duplicate_phrases
        caption = prepare_content(annotation, tags, add_tags, shuffle_content, augment=augment)
        caption = remove_duplicate_phrases(caption)
        write_tags_file(output_path=target_file, content_list=[caption])
        return

    llm_processor.add_to_queue(folder_path, output_path, annotation, tags, augment)

def process_image_batch(api_instance, images_batch, folder_path, basefolder, image_paths_batch, counter, lock, augment=None):
    """Forward one batch of images to ``api_instance.process_images``.

    This is a module-level wrapper so the call can be scheduled through
    ``joblib.delayed``. All arguments are passed through unchanged;
    ``augment`` is passed by keyword.

    Returns:
        None (the result of ``process_images`` is discarded).
    """
    api_instance.process_images(
        images_batch,
        folder_path,
        basefolder,
        image_paths_batch,
        counter,
        lock,
        augment=augment,
    )

  
def make_folders_recursively(root, folder, images, basefolder, processed_dir, counter, lock, augment=None):
    """Mirror ``folder`` (and its children) on disk and process the images it contains.

    A directory named after ``folder["name"]`` (with ``/`` replaced by ``_``)
    is created under ``root``. Every image whose ``"folders"`` entry contains
    ``folder["id"]`` is grouped into batches of 10 and each batch is handed to
    ``process_image_batch`` through ``joblib.Parallel`` using the module-level
    ``number_of_jobs``. The function then recurses into
    ``folder["children"]`` with the new directory as ``root``.

    Args:
        root: Directory under which this folder's directory is created.
        folder: Mapping with ``"name"``, ``"id"`` and optional ``"children"``.
        images: Iterable of image mappings with ``"id"``, ``"name"``,
            ``"ext"`` and ``"folders"`` keys.
        basefolder: Forwarded unchanged to the image processor.
        processed_dir: Directory holding the extracted ``<id>.info`` folders.
        counter: Shared counter forwarded to the image processor.
        lock: Shared lock forwarded to the image processor.
        augment: Optional augmentation spec forwarded unchanged.

    Returns:
        None.

    Raises:
        Whatever the workers or a recursive call raise; the
        ``ProcessImageAPI`` instance is closed before the error propagates.
    """
    safe_folder_name = folder["name"].replace("/", "_")
    current_path = os.path.join(root, safe_folder_name)
    ensure_directory_exists(current_path)

    # Hoisted: the id is invariant across the membership scan below.
    folder_id = folder["id"]
    images_in_folder = [image for image in images if folder_id in image["folders"]]

    batch_size = 10  # Adjust batch size as needed
    api_instance = ProcessImageAPI({
        'pixelart': pixelart,
        'append_filename_to_captions': append_filename_to_captions,
        'doBucketing': doBucketing,
        'isEsganUpscale': isEsganUpscale,
        'usePilSave': usePilSave,
        'do_center_square_crop': do_center_square_crop,
        'padding': padding,
        'add_tags': add_tags,
        'shuffle_content': shuffle_content,
        'use_LLM': use_LLM,
        'target_resolutions': target_resolutions
    })

    # try/finally guarantees close() runs even when a worker or a child
    # folder raises; previously the instance was leaked on any error.
    try:
        image_processing_tasks = []
        for start in range(0, len(images_in_folder), batch_size):
            images_batch = images_in_folder[start:start + batch_size]
            # Source files live at <processed_dir>/<id>.info/<name>.<ext>.
            image_paths_batch = [
                os.path.join(processed_dir, f"{image['id']}.info", f"{image['name']}.{image['ext']}")
                for image in images_batch
            ]
            image_processing_tasks.append(
                delayed(process_image_batch)(
                    api_instance, images_batch, current_path, basefolder,
                    image_paths_batch, counter, lock, augment
                )
            )

        # Skip the pool entirely for folders that hold no images.
        if image_processing_tasks:
            Parallel(n_jobs=number_of_jobs)(
                tqdm(image_processing_tasks, desc="Processing image batches")
            )

        for child in folder.get("children", []):
            make_folders_recursively(
                current_path, child, images, basefolder, processed_dir,
                counter, lock, augment=augment
            )
    finally:
        api_instance.close()  # Ensure any resources are properly released
    
    
def extract_EaglePack_and_process(eaglepacks_path):
    """Extract every ``.eaglepack`` archive in ``eaglepacks_path`` and process its images.

    Archives are unzipped into ``<eaglepacks_path>/processed``. If that
    directory then contains ``pack.json``, its ``"images"`` and ``"folder"``
    entries are processed once per entry of the module-level ``augment_list``
    via ``make_folders_recursively``, with ``<eaglepacks_path>/base`` passed
    as the base folder. The duration of each pass is printed. Nothing
    happens beyond extraction when ``pack.json`` is absent.

    NOTE(review): all archives are extracted into the same directory; if
    several packs each ship a top-level ``pack.json``, later ones would
    overwrite earlier ones — confirm whether multiple packs are expected.

    Args:
        eaglepacks_path: Directory containing the ``.eaglepack`` files.

    Returns:
        None.
    """
    processed_dir = os.path.join(eaglepacks_path, "processed")
    basefolder = os.path.join(eaglepacks_path, "base")

    # Step 1: unzip every valid .eaglepack archive into the processed directory.
    ensure_directory_exists(processed_dir)
    for entry in os.listdir(eaglepacks_path):
        if not entry.endswith(".eaglepack"):
            continue
        archive_path = os.path.join(eaglepacks_path, entry)
        if not zipfile.is_zipfile(archive_path):
            continue
        with zipfile.ZipFile(archive_path, 'r') as archive:
            archive.extractall(processed_dir)

    # Step 2: turn the pack manifest into folders on disk.
    manifest_path = os.path.join(processed_dir, "pack.json")
    if not os.path.exists(manifest_path):
        return

    with open(manifest_path, 'r', encoding='utf-8') as handle:
        manifest = json.load(handle)
    images = manifest["images"]
    folders = manifest["folder"]

    ensure_directory_exists(basefolder)
    with Manager() as manager:
        counter = manager.Value('i', 0)  # Counter shared with the parallel workers
        lock = manager.Lock()  # Lock shared with the parallel workers

        for augment in augment_list:
            pass_started = time.time()
            if augment is None:
                # Plain pass without augmentation.
                augment_arg = None
            else:
                # Unpacking enforces the three-element (type, value, percentage) shape.
                aug_type, aug_value, aug_percentage = augment
                augment_arg = (aug_type, aug_value, aug_percentage)
            make_folders_recursively(
                processed_dir, folders, images, basefolder, processed_dir, counter, lock,
                augment=augment_arg
            )
            elapsed = time.time() - pass_started
            print(f"Time taken for augmentation {augment}: {elapsed:.2f} seconds")


# Global variables (read by the functions in this module and forwarded to ProcessImageAPI)
pixelart = False
append_filename_to_captions = True
doBucketing = True
isEsganUpscale = False
usePilSave = False
number_of_jobs = 16  # n_jobs handed to joblib.Parallel
do_center_square_crop = False
padding = random.randint(80, 100)  # drawn once when this code runs, then reused
add_tags = False
shuffle_content = False
use_LLM = False
# target_resolutions = [(768, 768),]

if use_LLM:
    # The LLM path uses one shared processor and forces a single job.
    llm_processor = AsyncLLMProcessor(add_tags=True, shuffle_content=False)
    number_of_jobs = 1

# One processing pass is run per entry of augment_list. An entry is either
# None (no augmentation) or a (type, value, percentage) tuple.
#
# Entries that have been used before and can be re-enabled:
#   None,                                    # pass without augmentation
#   ('tokenOnly', '@regular_icon', 0.4),
#   ('tokenOnly', '@Bold_icon', 1),
#   ('tokenOnly', '@bold_icon', 0.2),
#   ('tokenOnly', 'Simple Icon', 0.2),
#   ('tokenOnly', 'regular icon', 0.2),
#   ('random_dropout_keep_1', None, 1),
#   ('random_dropout_keep_1', None, 0.7),
#   ('random_dropout_keep_2', None, 0.7),
#   ('random_dropout_keep_3', None, 0.7),
#   ('random_dropout_keep_4', None, 0.7),
#   ('prepend', 'start_', 1),                # Prepend augmentation
#   ('append', '_end', 1),                   # Append augmentation
#   ('append', ' ,regular_icon', 0.8),
augment_list = [
    ('prepend', 'ohwx artstyle, ', 1),
]

if __name__ == "__main__":
    # Script entry point: process the dataset directory and report the total wall time.
    try:
        run_started = time.time()
        extract_EaglePack_and_process(r"P:\datasets\wallhaven")
        total_elapsed = time.time() - run_started
        print(f"Total processing time: {total_elapsed:.2f} seconds")
    finally:
        # Stop the LLM processor even when processing failed.
        if use_LLM:
            llm_processor.stop()
    print("Processing completed")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Daniel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Processing image batches: 100%|██████████| 333/333 [01:38<00:00,  3.37it/s]
Processing image batches: 100%|██████████| 1/1 [00:00<00:00, 1002.46it/s]
Processing image batches: 100%|██████████| 1/1 [00:00<00:00, 1000.55it/s]
Processing image batches: 100%|██████████| 1/1 [00:00<00:00, 500.16it/s]
Processing image batches: 100%|██████████| 1/1 [00:00<00:00, 994.15it/s]
Processing image batches: 100%|██████████| 1/1 [00:00<00:00, 998.88it/s]
Processing image batches: 100%|██████████| 1/1 [00:00<00:00, 663.87it/s]
Processing image batches: 100%|██████████| 1/1 [00:00<00:00, 999.83it/s]
Processing image batches: 100%|██████████| 2/2 [00:00<00:00, 1999.19it/s]
Processing image batches: 100%|██████████| 1/1 [00:00<00:00, 1000.31it/s]
Processing image batches: 100%|██████████| 1/1 [00:00<00:00, 1001.27it/s]


Time taken for augmentation ('prepend', 'ohwx artstyle, ', 1): 125.26 seconds
Total processing time: 168.31 seconds
Processing completed
