<a href="https://colab.research.google.com/github/steinhaug/stable-diffusion/blob/main/tool/hf-dataset-collage-maker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Huggingface dataset collage maker

This notebook downloads image archives from a Hugging Face dataset,
creates one collage image from all images inside each extracted folder,
and uploads the collage images back to the Hugging Face dataset.

**Requirements:** _a Hugging Face token with write access, stored in the Colab secrets as `HF_TOKEN`_


In [4]:
from google.colab import userdata
from huggingface_hub import login, snapshot_download

# Read the token from the Colab secrets store so it never appears in the notebook source.
HF_TOKEN = userdata.get('HF_TOKEN')

# Log in through the Python API rather than `!huggingface-cli login --token {HF_TOKEN}`:
# the secret is then never interpolated into a shell command line.
login(token=HF_TOKEN)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [71]:
#@markdown load notebook functions
import os
import shutil
import tarfile
import glob
from PIL import Image
import cv2
import random
from IPython.display import clear_output

def delete_dir(directory_path):
    """Recursively remove ``directory_path`` and report the outcome on stdout.

    Args:
        directory_path (str): Path of the directory tree to delete.

    Returns:
        None. A message is printed whether or not the path existed.
    """
    # Guard clause: nothing to remove when the path is absent.
    if not os.path.exists(directory_path):
        print("The directory does not exist.")
        return

    shutil.rmtree(directory_path)
    print(f"Directory and all contents deleted: {directory_path}")

def decompress_tar(tar_file, destination=None, flatten_structure=False):
    """Extract every member of a tar archive.

    Where members end up depends on the two options:

    * ``destination`` given, ``flatten_structure`` False: members keep their
      archive-relative paths underneath ``destination``.
    * ``destination`` given, ``flatten_structure`` True: each member is written
      under its base name into ``<destination>/<archive name without extension>/``.
    * ``destination`` None, ``flatten_structure`` True: each member is written
      under its base name into the directory that contains ``tar_file``.
    * ``destination`` None, ``flatten_structure`` False: each member is extracted
      below the directory named by its own ``dirname`` (see the review note in
      the loop).

    Args:
        tar_file (str): Path to the tar archive to read.
        destination (str | None): Target directory; created when missing.
        flatten_structure (bool): Drop the directory part of every member name.

    Raises:
        tarfile.TarError: If the archive cannot be read, or (when the
            interpreter supports extraction filters) a member is rejected by
            the ``data`` filter, e.g. because it would land outside the target.
        OSError: If a file or directory cannot be created.
    """
    # Archives come from a download, so use the "data" extraction filter when this
    # Python provides it: it rejects members that escape the target directory and
    # avoids the "no filter specified" DeprecationWarning on Python 3.12/3.13.
    extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}

    with tarfile.open(tar_file, 'r') as tar:
        if destination is not None:
            os.makedirs(destination, exist_ok=True)

        # Archive file name without its extension; names the sub-folder used
        # when flattening into a destination.
        archive_stem = os.path.splitext(os.path.basename(os.path.normpath(tar_file)))[0]

        for member in tar.getmembers():
            if flatten_structure:
                # Use just the filename without directories.
                member.name = os.path.basename(member.name)
                if destination is not None:
                    member_path = os.path.join(destination, archive_stem)
                else:
                    member_path = os.path.dirname(tar_file)
            elif destination is not None:
                member_path = destination
            else:
                # NOTE(review): extracting "a/b.jpg" with path="a" writes "a/a/b.jpg";
                # kept as-is for compatibility - confirm whether this is intended.
                member_path = os.path.dirname(member.name)

            tar.extract(member, path=member_path, **extract_kwargs)

def return__folderName(directory_path, verify_folder=False):
    """Return the last path component of ``directory_path``.

    Args:
        directory_path (str): Path whose final component is wanted; a trailing
            separator is ignored because the path is normalised first.
        verify_folder (bool): When True, the path must be an existing directory.

    Returns:
        str | None: The final component, or None when ``verify_folder`` is set
        and the path is not a directory.
    """
    # Only touch the filesystem when the caller asked for verification.
    if verify_folder and not os.path.isdir(directory_path):
        return None  # invalid path
    normalised = os.path.normpath(directory_path)
    return os.path.basename(normalised)

def get_image_dimensions(image_path):
    """Return the pixel dimensions of the image stored at ``image_path``.

    Args:
        image_path (str): Path of the image file to inspect.

    Returns:
        tuple | None: ``(width, height)`` as reported by PIL, or None when the
        file cannot be opened (the error is printed).
    """
    try:
        with Image.open(image_path) as picture:
            dimensions = picture.size  # (width, height)
    except IOError as err:
        print(f"Error opening image: {err}")
        return None
    return dimensions

def _draw_shadowed_text(img, text, x, y, font, font_scale, font_color, shadow_color, font_thickness, shadow_offset):
    """Draw ``text`` on ``img`` at (x, y), preceded by a copy shifted by ``shadow_offset`` as a shadow."""
    # Shadow first so the foreground text is painted over it.
    cv2.putText(img, text, (x + shadow_offset, y + shadow_offset), font, font_scale, shadow_color, font_thickness, cv2.LINE_AA)
    cv2.putText(img, text, (x, y), font, font_scale, font_color, font_thickness, cv2.LINE_AA)


def add_title_and_subtitle_opencv(directory_path, image_path):
    """
    Opens an image, adds a title and a subtitle with shadows using OpenCV, and saves it as a JPG.

    The title is the name of ``directory_path``; the subtitle is
    "Total images: N", where N counts every regular file directly inside
    ``directory_path`` (not only image files). The result is written next to
    ``image_path`` with the extension replaced by ``.jpg``.

    Args:
    directory_path (str): The path to the directory containing images.
    image_path (str): The path to the image to be modified.

    Returns:
    None. Progress and failures are reported on stdout.
    """
    # Load the image
    img = cv2.imread(image_path)
    if img is None:
        print("Image could not be read.")
        return

    # Text appearance
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 2  # Increased font scale for larger text
    font_color = (255, 255, 255)
    shadow_color = (50, 50, 50)  # Dark gray shadow
    font_thickness = 3
    shadow_offset = 2  # Shadow offset
    text_x = 10
    line_gap = 10

    # Title text: directory name (normalised so a trailing slash does not yield an empty title)
    title = os.path.basename(os.path.normpath(directory_path))
    # Subtitle text: number of regular files in the directory
    image_count = sum(
        1 for name in os.listdir(directory_path)
        if os.path.isfile(os.path.join(directory_path, name))
    )
    subtitle = f"Total images: {image_count}"

    # Title baseline sits one text height (plus a gap) below the top edge
    title_size, _ = cv2.getTextSize(title, font, font_scale, font_thickness)
    text_y = title_size[1] + line_gap
    _draw_shadowed_text(img, title, text_x, text_y, font, font_scale, font_color, shadow_color, font_thickness, shadow_offset)

    # Subtitle goes one text height (plus a gap) below the title
    subtitle_size, _ = cv2.getTextSize(subtitle, font, font_scale, font_thickness)
    subtitle_y = text_y + subtitle_size[1] + line_gap
    _draw_shadowed_text(img, subtitle, text_x, subtitle_y, font, font_scale, font_color, shadow_color, font_thickness, shadow_offset)

    # Replace the real extension (whatever its length) instead of chopping 4 characters
    save_path = os.path.splitext(image_path)[0] + ".jpg"
    # imwrite reports failure through its return value; do not claim success blindly
    if not cv2.imwrite(save_path, img):
        print(f"Image could not be written: {save_path}")
        return
    print(f"Modified image saved to {save_path}")


def make_collage(images, filename, width, init_height):
    """
    Make a collage image with a width equal to `width` from `images` and save to `filename`.

    Images are placed left to right in rows. Each image is first scaled to fit
    within ``(width, init_height)``; once the accumulated row width exceeds
    ``width`` a new row starts. Every row gets a coefficient
    ``row_width / width`` by which it is later scaled so it spans the collage
    width. If more than one row exists and any row holds a single image or
    none, ``init_height`` is reduced by 10 and the layout is recomputed.

    Args:
        images (list[str]): Paths of the images to place, in placement order.
        filename (str): Output path passed to ``Image.save``.
        width (int): Width of the collage in pixels.
        init_height (int): Starting row height in pixels.

    Returns:
        bool: True when the collage was saved, False when there were no images
        or the computed collage height was 0.
    """
    if not images:
        print('No images for collage found!')
        return False

    margin_size = 2
    shrink_step = 10
    # run until a suitable arrangement of images is found
    while True:
        coefs_lines = []
        images_line = []
        x = 0
        for img_path in images:
            # Measure the image at the current row height; close the file right away
            # so repeated layout passes do not accumulate open handles.
            with Image.open(img_path) as img:
                img.thumbnail((width, init_height))
                thumb_width = img.size[0]
            # when `x` has gone beyond the `width`, start the next line
            if x > width:
                coefs_lines.append((float(x) / width, images_line))
                images_line = []
                x = 0
            x += thumb_width + margin_size
            images_line.append(img_path)
        # finally add the last line with images
        coefs_lines.append((float(x) / width, images_line))

        # compact the lines, by reducing the `init_height`, if any with one or less images
        if len(coefs_lines) <= 1:
            break
        if any(len(line) <= 1 for _, line in coefs_lines):
            # Never shrink to a non-positive height: that cannot be laid out and
            # would otherwise fail inside PIL or keep shrinking forever.
            if init_height - shrink_step <= 0:
                break
            init_height -= shrink_step
        else:
            break

    # get output height
    out_height = 0
    for coef, imgs_line in coefs_lines:
        if imgs_line:
            out_height += int(init_height / coef) + margin_size
    if not out_height:
        print('Height of collage could not be 0!')
        return False

    collage_image = Image.new('RGB', (width, int(out_height)), (35, 35, 35))
    # put images to the collage
    y = 0
    for coef, imgs_line in coefs_lines:
        if not imgs_line:
            continue
        row_height = init_height / coef
        x = 0
        for img_path in imgs_line:
            with Image.open(img_path) as source:
                # if need to enlarge an image - use `resize`, otherwise use `thumbnail`, it's faster
                k = row_height / source.size[1]
                if k > 1:
                    tile = source.resize((int(source.size[0] * k), int(source.size[1] * k)), Image.LANCZOS)
                else:
                    source.thumbnail((int(width / coef), int(row_height)), Image.LANCZOS)
                    tile = source
                collage_image.paste(tile, (int(x), int(y)))
                x += tile.size[0] + margin_size
        y += int(row_height) + margin_size
    collage_image.save(filename)
    return True


def collage(image_directory):
    """Build a shuffled collage of the images in ``image_directory``.

    Files ending in .jpg, .jpeg or .png (case-insensitive) directly inside
    ``image_directory`` are shuffled and handed to ``make_collage``; the result
    is written to ``<image_directory>.coll.png``, 1600 px wide with a starting
    row height of 250 px.

    Args:
        image_directory (str): Directory holding the source images.

    Returns:
        bool: True when the collage was written, False when the directory held
        no images or ``make_collage`` failed. Failures are reported and
        returned instead of calling ``exit()``, which inside a notebook would
        shut down the kernel.
    """
    collage_width = 1600
    start_row_height = 250
    output_path = f"{image_directory}.coll.png"
    image_extensions = ('.jpg', '.jpeg', '.png')

    # get images (regular files only, so a folder named like an image is skipped)
    candidates = (os.path.join(image_directory, fn) for fn in os.listdir(image_directory))
    images = [
        path for path in candidates
        if os.path.splitext(path)[1].lower() in image_extensions and os.path.isfile(path)
    ]
    if not images:
        print('No images for making collage! Please select other directory with images!')
        return False

    # shuffle so every run produces a different arrangement
    random.shuffle(images)

    print(f"Making collage: {image_directory}")
    if not make_collage(images, output_path, collage_width, start_row_height):
        print('Failed to create collage!')
        return False
    print('-- Collage is ready!')
    return True


In [75]:
# Remove the whole working directory so the following cells start from an empty /content/datasets.
delete_dir('/content/datasets')

Directory and all contents deleted: /content/datasets


In [None]:
#@markdown Download image sets
sub_folder = "MetArt" # @param {type:"string"}
allow_patterns = "MetArt/*.tar" # @param {type:"string"}

import os
SAVE_PATH = '/content/datasets'
REPO_ID = 'steinhaug/onceUponAtimeInPornVille'

# The dataset snapshot is stored under <SAVE_PATH>/<REPO_ID>.
local_repo_dir = f"{SAVE_PATH}/{REPO_ID}"
os.makedirs(local_repo_dir, exist_ok=True)

# Fetch only the files matching `allow_patterns` from the main revision.
path = snapshot_download(
    repo_id=REPO_ID,
    repo_type="dataset",
    revision="main",
    allow_patterns=allow_patterns,
    local_dir=local_repo_dir,
    local_dir_use_symlinks=False,
)

In [64]:
#@title decompress folder

IMAGE_FOLDER = os.path.join(SAVE_PATH, REPO_ID, sub_folder)
output_directory = f"/content/datasets/{sub_folder}"

# Unpack every .tar archive found in the downloaded sub-folder, keeping the
# archive's internal layout, and delete each archive once it has been extracted.
for item_name in os.listdir(IMAGE_FOLDER):
    if os.path.splitext(item_name)[1] != '.tar':
        continue
    file_path = os.path.join(IMAGE_FOLDER, item_name)
    decompress_tar(file_path, output_directory, False)
    os.remove(file_path)
    print(f"Decompressed: {file_path}")

#delete_dir(IMAGE_FOLDER)
# Replace the per-archive log lines with a single completion message.
clear_output()
print("Done")

Done


In [72]:
#@markdown Create the collage images
directory = f"/content/datasets/{sub_folder}"

# Folders whose collage could not be produced; reported after the output is
# cleared so problems are not silently wiped by clear_output().
failed_folders = []

for item in os.listdir(directory):
    # os.path.join() constructs full path
    image_folder = os.path.join(directory, item)
    if item == '.ipynb_checkpoints' or not os.path.isdir(image_folder):
        continue

    collage_png = f"{image_folder}.coll.png"
    collage_jpg = f"{image_folder}.coll.jpg"
    if os.path.isfile(collage_jpg):
        print(f"Collage exists.")
        continue

    collage(image_folder)
    # Check the file itself rather than a return value: the PNG is the only
    # reliable sign that the collage step produced something to annotate.
    if not os.path.isfile(collage_png):
        failed_folders.append(image_folder)
        continue

    add_title_and_subtitle_opencv(image_folder, collage_png)
    xy = get_image_dimensions(collage_jpg)
    if xy is None:
        # The titled JPG is missing or unreadable; keep the PNG for inspection.
        failed_folders.append(image_folder)
        continue

    os.remove(collage_png)
    print(f"Collage created: {xy[0]} x {xy[1]}")

clear_output();
for failed_folder in failed_folders:
    print(f"No collage created for: {failed_folder}")
print(f"Done")

Done


In [73]:
#@markdown connect to huggingface
from slugify import slugify
from huggingface_hub import HfApi, HfFolder, CommitOperationAdd
from huggingface_hub import create_repo
from IPython.display import display_markdown
from IPython.display import clear_output
from IPython.utils import capture
from google.colab import files
import shutil
import time
import os

# Flag kept for configuration; it is not read by any of the visible cells.
Create_repo = False

# Re-read the token from the Colab secrets; both names refer to the same value.
HF_TOKEN = userdata.get('HF_TOKEN')
hf_token = HF_TOKEN
your_repository = 'onceUponAtimeInPornVille'

# Resolve the account that owns the token and build the "<user>/<repo>" id.
api = HfApi()
account_info = api.whoami(token=hf_token)
your_username = account_info["name"]
repo_id = f"{your_username}/{your_repository}"

In [None]:
#@markdown Upload collage images
print(f"Uploading folder /content/datasets/{sub_folder}")

# upload_folder creates the commit itself, so the commit message is passed here
# instead of issuing a second, empty create_commit(operations=[]) afterwards.
# NOTE(review): the allow/ignore pair presumably restricts the upload to the
# collage files directly inside the folder - verify against huggingface_hub's
# pattern matching.
api.upload_folder(
    folder_path=f"/content/datasets/{sub_folder}",
    path_in_repo=f"{sub_folder}",
    repo_type="dataset",
    repo_id=repo_id, token=hf_token,
    allow_patterns="*.coll.jpg",
    ignore_patterns="**/*.coll.jpg",
    commit_message=f"Added {sub_folder} collages",
)