In [21]:
#!pip3 install opencv-python
!pip3 install numpy

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [None]:
import ijson
from PIL import Image
from io import BytesIO
import requests
import cv2
import numpy as np

def download_image(url):
    """Fetch an image over HTTP and return it as a fully loaded PIL image.

    A browser-like User-Agent header is sent with the request. Any failure
    (request error, HTTP error status, undecodable image data) is printed
    and signalled by returning ``None`` instead of raising.
    """
    print(f"Downloading: {url}")
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    try:
        # A 100-second timeout is passed to requests.
        reply = requests.get(url, headers={"User-Agent": user_agent}, timeout=100)
        # Turn HTTP error statuses into exceptions so they share the failure path.
        reply.raise_for_status()
        picture = Image.open(BytesIO(reply.content))
        # Decode the pixel data now so a broken image fails inside this try block.
        picture.load()
    except Exception as e:
        print(f"Error downloading image from {url}: {e}")
        return None
    return picture

def crop_to_multiple_of_64(image):
    """Trim the right and bottom edges so both dimensions are multiples of 64.

    The crop box is anchored at the top-left corner. Returns the cropped
    image, or ``None`` (after printing the error) if anything goes wrong.
    """
    try:
        src_w, src_h = image.size
        # Dropping the remainder rounds each side down to a multiple of 64.
        box = (0, 0, src_w - src_w % 64, src_h - src_h % 64)
        return image.crop(box)
    except Exception as e:
        print(f"Error cropping image: {e}")
        return None

def downscale_image_opencv(image, max_size):
    """Resize ``image`` so its longer side equals ``max_size``.

    The input is converted to a NumPy array and resized with OpenCV using
    ``INTER_AREA``; the shorter side is scaled by the same factor and
    truncated to an int. Returns the resized array, or ``None`` (after
    printing the error) on failure.
    """
    try:
        pixels = np.array(image)
        src_h, src_w = pixels.shape[:2]
        if src_w > src_h:
            # Landscape: the width becomes max_size.
            target = (max_size, int(src_h * (max_size / src_w)))
        else:
            # Portrait or square: the height becomes max_size.
            target = (int(src_w * (max_size / src_h)), max_size)
        # cv2.resize takes the target size as (width, height).
        return cv2.resize(pixels, target, interpolation=cv2.INTER_AREA)
    except Exception as e:
        print(f"Error downscaling image: {e}")
        return None

def iterate_range(file_path, start, end, output_dir="/Users/v.kulibaba/Desktop"):
    """Download, resize, crop and save the images of items ``start``..``end``.

    The JSON array in ``file_path`` is streamed with ijson; items are numbered
    from 1 in file order and reading stops once ``end`` has been passed. For
    every selected item the image at ``item["f"]`` is downloaded, resized with
    ``downscale_image_opencv(image, 1536)``, cropped with
    ``crop_to_multiple_of_64`` and written as ``gb_{start}_{end}_{i}.jpg``.
    ``item["t"]`` (newlines removed, underscores replaced by spaces) is written
    to the matching ``.txt`` file. An item that fails at any stage is reported
    on stdout and skipped.

    Args:
        file_path: Path to a UTF-8 JSON file whose top level is an array.
        start: 1-based index of the first item to process (inclusive).
        end: 1-based index of the last item to process (inclusive).
        output_dir: Folder that receives the ``.jpg``/``.txt`` pairs.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        for i, item in enumerate(ijson.items(source, 'item'), start=1):
            if i > end:
                break
            if i < start:
                continue

            try:
                print(f"Processing item {i}: {item['f']}")

                # Download the image
                image = download_image(item["f"])
                if image is None:
                    print(f"Skipping item {i} due to download error.")
                    continue

                # Make sure the downloaded object looks like an image
                if not hasattr(image, "size") or not isinstance(image.size, tuple):
                    print(f"Invalid image size for item {i}. Skipping.")
                    continue

                # Downsample the image
                resized_image = downscale_image_opencv(image, 1536)
                if resized_image is None:
                    print(f"Skipping item {i} due to downscaling error.")
                    continue

                # Crop both sides down to multiples of 64
                cropped_image = crop_to_multiple_of_64(Image.fromarray(resized_image))
                if cropped_image is None:
                    print(f"Skipping item {i} due to cropping error.")
                    continue

                # JPEG cannot store alpha or palette data; normalise such modes to RGB.
                if cropped_image.mode not in ("L", "RGB"):
                    cropped_image = cropped_image.convert("RGB")

                # Save the cropped result (previously the uncropped image was written).
                output_path = f"{output_dir}/gb_{start}_{end}_{i}.jpg"
                cropped_image.save(output_path, quality=96)
                print(f"Saved image {i} to {output_path}")

                # Strip newlines and turn underscores into spaces
                processed_text = item['t'].replace("\n", "").replace("_", " ")

                # A distinct handle name keeps the input file handle unshadowed.
                output_file = f"{output_dir}/gb_{start}_{end}_{i}.txt"
                with open(output_file, 'w', encoding='utf-8') as text_file:
                    text_file.write(processed_text)

                print(f"Текст сохранён в файл: {output_file}")

            except Exception as e:
                print(f"Error processing item {i}: {e}")

# Example usage: process items 10 through 12 of the dataset file
iterate_range('/Users/v.kulibaba/Pictures/sana/hqdataset.txt', 10, 12)


Processing item 10: https://img3.gelbooru.com/images/00/00/00008e64de644368730f30997cf027d4.jpg
Downloading: https://img3.gelbooru.com/images/00/00/00008e64de644368730f30997cf027d4.jpg
Saved image 10 to /Users/v.kulibaba/Desktop/gb_10_12_10.jpg
Текст сохранён в файл: /Users/v.kulibaba/Desktop/gb_10_12_10.txt
Processing item 11: https://img3.gelbooru.com/images/00/00/000091e6c39a427fd1d82782bda05f6c.jpg
Downloading: https://img3.gelbooru.com/images/00/00/000091e6c39a427fd1d82782bda05f6c.jpg
Saved image 11 to /Users/v.kulibaba/Desktop/gb_10_12_11.jpg
Текст сохранён в файл: /Users/v.kulibaba/Desktop/gb_10_12_11.txt
Processing item 12: https://img3.gelbooru.com/images/00/00/0000a8790f26ff6850b0eec9d7c8f79a.jpg
Downloading: https://img3.gelbooru.com/images/00/00/0000a8790f26ff6850b0eec9d7c8f79a.jpg
Saved image 12 to /Users/v.kulibaba/Desktop/gb_10_12_12.jpg
Текст сохранён в файл: /Users/v.kulibaba/Desktop/gb_10_12_12.txt


In [7]:
width = 384
height = 768
step = 64

# Candidate side lengths run from `width` through `height` in increments of `step`.
# Each (w, h) pair is keyed by its aspect ratio rounded to two decimals; when
# several pairs round to the same ratio, the pair generated last wins.
ASPECT_RATIO_576 = {
    str(round(w / h, 2)): [float(w), float(h)]
    for w in range(width, height + 1, step)
    for h in range(width, height + 1, step)
}

# Re-order the entries by key (string comparison)
ASPECT_RATIO_576 = dict(sorted(ASPECT_RATIO_576.items()))

# Render the dictionary as a pasteable Python literal
entry_lines = [
    f'    "{key}": [{value[0]}, {value[1]}],\n'
    for key, value in ASPECT_RATIO_576.items()
]
output = "ASPECT_RATIO_576 = {\n" + "".join(entry_lines) + "}"

print(output)



ASPECT_RATIO_576 = {
    "0.5": [384.0, 768.0],
    "0.55": [384.0, 704.0],
    "0.58": [448.0, 768.0],
    "0.6": [384.0, 640.0],
    "0.64": [448.0, 704.0],
    "0.67": [512.0, 768.0],
    "0.7": [448.0, 640.0],
    "0.73": [512.0, 704.0],
    "0.75": [576.0, 768.0],
    "0.78": [448.0, 576.0],
    "0.8": [512.0, 640.0],
    "0.82": [576.0, 704.0],
    "0.83": [640.0, 768.0],
    "0.86": [384.0, 448.0],
    "0.88": [448.0, 512.0],
    "0.89": [512.0, 576.0],
    "0.9": [576.0, 640.0],
    "0.91": [640.0, 704.0],
    "0.92": [704.0, 768.0],
    "1.0": [768.0, 768.0],
    "1.09": [768.0, 704.0],
    "1.1": [704.0, 640.0],
    "1.11": [640.0, 576.0],
    "1.12": [576.0, 512.0],
    "1.14": [512.0, 448.0],
    "1.17": [448.0, 384.0],
    "1.2": [768.0, 640.0],
    "1.22": [704.0, 576.0],
    "1.25": [640.0, 512.0],
    "1.29": [576.0, 448.0],
    "1.33": [768.0, 576.0],
    "1.38": [704.0, 512.0],
    "1.43": [640.0, 448.0],
    "1.5": [768.0, 512.0],
    "1.57": [704.0, 448.0],
    "1.6

In [27]:
import math

preferred_pixel_count = 576 * 576

# Keep both bounds as ints: `576 // 1.5` evaluates to the float 384.0, and the
# loop below used to stop only on an exact float == int match.
min_size = int(576 // 1.5)
max_size = int(576 * 1.3334)
step = 64

# For each first side from min_size up to (but excluding) max_size, pick the
# second side that gives roughly `preferred_pixel_count` pixels, snapped to a
# multiple of `step`, and record (ratio, (first_side, second_side)).
ratios_array = []
# `<` instead of `!=` guarantees termination even when `step` does not divide
# the distance between the bounds evenly.
while min_size < max_size:
    width = int(preferred_pixel_count / min_size)
    # Snap to the nearest multiple of `step`; an exact half rounds up.
    remainder = width % step
    if remainder:
        width += -remainder if remainder < step // 2 else step - remainder
    ratio = min_size / width

    ratios_array.append((ratio, (min_size, width)))
    min_size += step
print(max_size,ratios_array)

width = int(576 // 1.5)  # lower bound of the side range, as an int
height = int(576 * 1.3334)  # upper bound of the side range, as an int

# Key every (w, h) pair in the range by its aspect ratio rounded to two
# decimals; pairs that round to the same ratio overwrite earlier ones.
ASPECT_RATIO = {
    round(w / h, 2): [int(w), int(h)]
    for w in range(width, height + 1, step)
    for h in range(width, height + 1, step)
}

# Order the entries by ratio
ASPECT_RATIO = dict(sorted(ASPECT_RATIO.items()))

# Flatten to a list of (ratio, (w, h)) tuples for nearest-ratio lookups
ratios_array = [(key, (value[0], value[1])) for key, value in ASPECT_RATIO.items()]
print(max_size,ratios_array)

def get_closest_ratio(width: float, height: float):
    """Return the ``ratios_array`` entry whose ratio is nearest to ``width / height``.

    Entries are ``(ratio, size)`` tuples; on a tie the earliest entry wins.
    """
    target = width / height

    def distance(entry):
        # entry[0] is the stored aspect ratio of this bucket.
        return abs(entry[0] - target)

    return min(ratios_array, key=distance)
# Quick check: in the recorded run 446x767 maps to the 0.58 bucket (448, 768).
print(get_closest_ratio(446,767))

def get_preffered_size(width: float, height: float):
    """Rescale ``(width, height)`` so that the area equals ``preferred_pixel_count``.

    Both sides are divided by the same factor, so the aspect ratio is kept.
    Returns the new ``(width, height)`` as floats.
    """
    # Square root of the area ratio is the per-side scale factor.
    shrink = math.sqrt(height * width / preferred_pixel_count)
    scaled_width = width / shrink
    scaled_height = height / shrink
    return scaled_width, scaled_height
# Quick check: in the recorded run 448x768 rescales to roughly 439.9 x 754.2.
print(get_preffered_size(448,768))

768 [(0.42857142857142855, (384, 896)), (0.5833333333333334, (448, 768)), (0.8, (512, 640)), (1.0, (576, 576)), (1.25, (640, 512)), (1.5714285714285714, (704, 448))]
768 [(0.5, (384, 768)), (0.55, (384, 704)), (0.58, (448, 768)), (0.6, (384, 640)), (0.64, (448, 704)), (0.67, (512, 768)), (0.7, (448, 640)), (0.73, (512, 704)), (0.75, (576, 768)), (0.78, (448, 576)), (0.8, (512, 640)), (0.82, (576, 704)), (0.83, (640, 768)), (0.86, (384, 448)), (0.88, (448, 512)), (0.89, (512, 576)), (0.9, (576, 640)), (0.91, (640, 704)), (0.92, (704, 768)), (1.0, (768, 768)), (1.09, (768, 704)), (1.1, (704, 640)), (1.11, (640, 576)), (1.12, (576, 512)), (1.14, (512, 448)), (1.17, (448, 384)), (1.2, (768, 640)), (1.22, (704, 576)), (1.25, (640, 512)), (1.29, (576, 448)), (1.33, (768, 576)), (1.38, (704, 512)), (1.43, (640, 448)), (1.5, (768, 512)), (1.57, (704, 448)), (1.67, (640, 384)), (1.71, (768, 448)), (1.83, (704, 384)), (2.0, (768, 384))]
(0.58, (448, 768))
(439.9272667157606, 754.1610286555896)


In [None]:
import os
from PIL import Image

import cv2
import numpy as np
from PIL import Image

def downscale_image_by(image, max_size, x=64):
    """Resize ``image`` so its longer side is ``max_size``, with sides snapped to ``x``.

    The longer side is set to ``max_size`` and the shorter side is scaled by
    the same factor; both are then rounded down to a multiple of ``x`` before
    resizing with OpenCV (``INTER_AREA``).

    Args:
        image: A PIL image or anything ``np.array`` can turn into an
            ``(H, W[, C])`` array.
        max_size: Target length of the longer side before snapping.
        x: Both output sides are rounded down to a multiple of this value.

    Returns:
        A PIL image (RGBA results are converted to RGB), or ``None`` after
        printing the error if the image could not be processed.
    """
    try:
        # Palette ("P"), CMYK, bilevel and similar modes do not survive the
        # NumPy round trip (palette indices would be read as gray levels and
        # CMYK would be reinterpreted as RGBA), so normalise them to RGB first.
        if isinstance(image, Image.Image) and image.mode not in ("L", "RGB", "RGBA"):
            image = image.convert("RGB")

        pixels = np.array(image)
        height, width = pixels.shape[:2]
        if width > height:
            new_width = max_size
            new_height = int(height * (max_size / width))
        else:
            new_height = max_size
            new_width = int(width * (max_size / height))
        new_width = (new_width // x) * x
        new_height = (new_height // x) * x

        # Very elongated inputs can snap a side down to zero; report that clearly.
        if new_width <= 0 or new_height <= 0:
            raise ValueError(
                f"target size {new_width}x{new_height} is empty for a {width}x{height} input"
            )

        resized = cv2.resize(pixels, (new_width, new_height), interpolation=cv2.INTER_AREA)
        result = Image.fromarray(resized)
        if result.mode == "RGBA":
            result = result.convert("RGB")
        return result
    except Exception as e:
        print(f"Error downscaling image: {e}")
        return None


def process_images_in_directory(input_dir, output_dir, max_size=768):
    """Resize every image directly inside ``input_dir`` and save it as JPEG.

    Files are selected by extension (.png, .jpg, .jpeg, .bmp, .gif, case
    insensitive); subdirectories are not visited. Each image is passed through
    ``downscale_image_by(image, max_size, 64)`` and written to ``output_dir``
    under its original base name with a ``.jpg`` extension. Failures are
    printed per file and do not stop the run.

    Args:
        input_dir: Folder containing the source images.
        output_dir: Folder that receives the results; created if missing.
        max_size: Target length of the longer side, forwarded to
            ``downscale_image_by``.
    """
    # Create the output directory if it does not exist yet
    os.makedirs(output_dir, exist_ok=True)

    image_suffixes = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')

    for filename in os.listdir(input_dir):
        try:
            # Only files with a known image extension are processed
            if not filename.lower().endswith(image_suffixes):
                continue

            input_path = os.path.join(input_dir, filename)
            print(f"Processing image: {input_path}")

            # Close the source file as soon as the pixels have been read.
            with Image.open(input_path) as source_image:
                # Honour the caller's max_size (it used to be hard-coded to 768).
                image = downscale_image_by(source_image, max_size, 64)
            if image is None:
                print(f"Skipping {filename} due to cropping error.")
                continue

            # Save the processed image under the same base name as a JPEG
            name, _ = os.path.splitext(filename)
            output_path = os.path.join(output_dir, f"{name}.jpg")
            image.save(output_path, quality=96)
            print(f"Saved processed image: {output_path}")

        except Exception as e:
            print(f"Error processing {filename}: {e}")
            continue

# Example usage
input_directory = "/Users/v.kulibaba/Desktop/1"  # path to the folder with the source images
output_directory = "/Users/v.kulibaba/Desktop/2"  # path to the folder for the processed images
process_images_in_directory(input_directory, output_directory)



Processing image: /Users/v.kulibaba/Desktop/1/cropped_image.png
Saved processed image: /Users/v.kulibaba/Desktop/2/cropped_image.png
Processing image: /Users/v.kulibaba/Desktop/1/gb_10_12_11.jpg
Saved processed image: /Users/v.kulibaba/Desktop/2/gb_10_12_11.jpg
Processing image: /Users/v.kulibaba/Desktop/1/gb_10_12_10.jpg
Saved processed image: /Users/v.kulibaba/Desktop/2/gb_10_12_10.jpg
Processing image: /Users/v.kulibaba/Desktop/1/gb_10_12_12.jpg
Saved processed image: /Users/v.kulibaba/Desktop/2/gb_10_12_12.jpg
Processing image: /Users/v.kulibaba/Desktop/1/gb_0_10000_1.jpg
Saved processed image: /Users/v.kulibaba/Desktop/2/gb_0_10000_1.jpg
Processing image: /Users/v.kulibaba/Desktop/1/gb_0_10000_9948.jpg
Saved processed image: /Users/v.kulibaba/Desktop/2/gb_0_10000_9948.jpg


In [None]:
from streaming.base.format.mds.encodings import Encoding, _encodings
import numpy as np
from typing import Any
import torch
from streaming import StreamingDataset

class uint8(Encoding):
    """Encoding that stores raw bytes and decodes them as rescaled float32 values."""

    def encode(self, obj: Any) -> bytes:
        """Serialise ``obj`` through its ``tobytes()`` method."""
        return obj.tobytes()

    def decode(self, data: bytes) -> Any:
        """Read ``data`` as uint8 values and map each to ``(v / 255 - 0.5) * 24`` in float32."""
        values = np.frombuffer(data, np.uint8).astype(np.float32)
        centered = values / 255.0 - 0.5
        return centered * 24.0

# Register the custom codec under the "uint8" key of the imported encodings table.
_encodings["uint8"] = uint8

remote_train_dir = "./vae_mds" # path where the dataset was installed
local_train_dir = "./imagenet"

# NOTE(review): batch_size here (10) differs from the DataLoader batch_size
# below (100000) — confirm this mismatch is intended.
train_dataset = StreamingDataset(
    local=local_train_dir,
    remote=remote_train_dir,
    split=None,
    shuffle=False,
    shuffle_algo="naive",
    num_canonical_nodes=1,
    batch_size = 10
)

# Presumably one very large batch so that a single `next(iter(...))` in the
# following cell can return up to 100000 samples at once — verify.
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=100000,
    num_workers=1,
)



In [None]:
from diffusers.models import AutoencoderKL
from diffusers.image_processor import VaeImageProcessor

import os

import torch

vae = AutoencoderKL.from_pretrained("stabilityai/sdxl-vae").to("cuda:0")
batch = next(iter(train_dataloader))

# Make sure the destination folder exists before the first save.
decoded_dir = "imagenet100k"
os.makedirs(decoded_dir, exist_ok=True)

# Hoist the loop-invariant work: reshape the latents once and reuse one processor.
latents = batch["vae_output"].reshape(-1, 4, 32, 32)
labels = batch['label_as_text']
image_processor = VaeImageProcessor()

# Never index past the samples that were actually loaded.
sample_count = min(100000, latents.shape[0], len(labels))

# Decoding is inference only; skipping autograd bookkeeping saves GPU memory.
with torch.no_grad():
    for i in range(sample_count):
        vae_latent = latents[i:i+1].cuda().float()
        text_label = labels[i]

        x = vae.decode(vae_latent).sample
        img = image_processor.postprocess(image = x.detach(), do_denormalize = [True, True])[0]
        img.save(f"{decoded_dir}/{i}.jpg", quality=96)
        with open(f"{decoded_dir}/{i}.txt", "w", encoding="utf-8") as file:
            file.write(text_label)

print("ok")



In [None]:
import os
import shutil
import zlib
import zipfile

import cv2
import numpy as np
from PIL import Image

def downscale_image_by(image, max_size, x=64):
    """Resize ``image`` to ``max_size`` on its longer side, then trim 64 rows top and bottom.

    The longer side is set to ``max_size`` and the shorter side is scaled by
    the same factor; both are rounded down to a multiple of ``x`` before
    resizing with OpenCV (``INTER_AREA``). Afterwards 64 pixel rows are
    removed from the top and from the bottom of the result.

    Args:
        image: A PIL image or anything ``np.array`` can turn into an
            ``(H, W[, C])`` array.
        max_size: Target length of the longer side before snapping.
        x: Both resized sides are rounded down to a multiple of this value.

    Returns:
        A PIL image (RGBA results are converted to RGB), or ``None`` after
        printing the error if the image could not be processed.
    """
    try:
        # Palette ("P"), CMYK, bilevel and similar modes do not survive the
        # NumPy round trip (palette indices would be read as gray levels and
        # CMYK would be reinterpreted as RGBA), so normalise them to RGB first.
        if isinstance(image, Image.Image) and image.mode not in ("L", "RGB", "RGBA"):
            image = image.convert("RGB")

        pixels = np.array(image)
        height, width = pixels.shape[:2]
        if width > height:
            new_width = max_size
            new_height = int(height * (max_size / width))
        else:
            new_height = max_size
            new_width = int(width * (max_size / height))
        new_width = (new_width // x) * x
        new_height = (new_height // x) * x

        # The 64-row trim on each edge needs more than 128 rows to leave anything.
        if new_width <= 0 or new_height <= 128:
            raise ValueError(
                f"target size {new_width}x{new_height} is too small for a {width}x{height} input"
            )

        resized = cv2.resize(pixels, (new_width, new_height), interpolation=cv2.INTER_AREA)
        # Crop: drop 64 pixel rows from the top and from the bottom
        resized = resized[64:new_height - 64, :]
        result = Image.fromarray(resized)
        if result.mode == "RGBA":
            result = result.convert("RGB")
        return result
    except Exception as e:
        print(f"Error downscaling image: {e}")
        return None

def process_images_in_directory(input_dir, output_dir, max_size=1600):
    """Walk ``input_dir`` recursively and process every image file and ZIP archive.

    Files ending in .png/.jpg/.jpeg (case insensitive) go to ``process_image``;
    files ending in .zip go to ``process_zip``. ``output_dir`` is created when
    it does not exist. Errors are printed per file and the walk continues;
    ``done`` is printed at the end.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg')

    # Create the output directory if it is missing
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Visit every file in every nested directory
    for root, _subdirs, files in os.walk(input_dir):
        for filename in files:
            try:
                lowered = filename.lower()
                full_path = os.path.join(root, filename)

                # Dispatch on the extension: plain image first, then ZIP archive
                if lowered.endswith(image_suffixes):
                    process_image(full_path, output_dir, max_size)
                    continue
                if lowered.endswith('.zip'):
                    print(f"processing {filename}")
                    process_zip(full_path, output_dir, max_size)

            except Exception as e:
                print(f"Error processing {filename}: {e}")

    print('done')

def process_image(input_path, output_dir, max_size):
    """Resize one image file and store it in ``output_dir`` as ``<crc32>.jpg``.

    The output name is the CRC32 of the UTF-8 encoded ``input_path``. Errors
    are printed and swallowed; nothing is returned.
    """
    try:
        # Open the file and run it through the resize/crop helper
        resized = downscale_image_by(Image.open(input_path), max_size, 64)
        if resized is None:
            print(f"Skipping {input_path} due to cropping error.")
            return

        # Derive the output file name from a CRC32 of the source path
        path_checksum = zlib.crc32(input_path.encode('utf-8')) & 0xffffffff
        destination = os.path.join(output_dir, f"{path_checksum}.jpg")

        # Write the processed image
        resized.save(destination, quality=96)

    except Exception as e:
        print(f"Error processing image {input_path}: {e}")

def process_zip(zip_path, output_dir, max_size):
    """Resize every image stored in a ZIP archive and save it to ``output_dir``.

    Members ending in .png/.jpg/.jpeg (case insensitive) are read straight
    from the archive, passed through ``downscale_image_by(image, max_size, 64)``
    and written as ``<crc32>.jpg``. The CRC32 covers the archive path together
    with the member name, so equally named members of different archives do
    not overwrite each other. A failing member is reported and skipped; a
    failure to open the archive itself is reported once.

    Args:
        zip_path: Path of the ZIP archive to read.
        output_dir: Existing folder that receives the JPEG files.
        max_size: Target length of the longer side, forwarded to
            ``downscale_image_by``.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg')
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            for zip_info in zip_ref.infolist():
                if not zip_info.filename.lower().endswith(image_suffixes):
                    continue

                # Handle errors per member so one broken image does not abort
                # the rest of the archive.
                try:
                    # Read the image directly from the archive; the helper
                    # returns a new in-memory image, so the member can be
                    # closed before saving.
                    with zip_ref.open(zip_info) as img_file:
                        image = downscale_image_by(Image.open(img_file), max_size, 64)
                    if image is None:
                        print(f"Skipping {zip_info.filename} due to cropping error.")
                        continue

                    # Hash archive path + member name to get a per-member file name.
                    member_key = f"{zip_path}:{zip_info.filename}"
                    crc32_hash = zlib.crc32(member_key.encode('utf-8')) & 0xffffffff
                    output_path = os.path.join(output_dir, f"{crc32_hash}.jpg")

                    # Write the processed image
                    image.save(output_path, quality=96)
                except Exception as e:
                    print(f"Error processing {zip_info.filename} in {zip_path}: {e}")

    except Exception as e:
        print(f"Error processing zip {zip_path}: {e}")

# Example usage
# NOTE: process_images_in_directory prints 'done' itself when it finishes,
# so 'done' appears twice in the output.
input_dir = '/Users/v.kulibaba/Downloads/nsfw'
output_dir = '/Users/v.kulibaba/Downloads/nsfw_milf2'
process_images_in_directory(input_dir, output_dir, 1280)
print('done')



KeyboardInterrupt: 

In [None]:
import os
import shutil
import zipfile

def unzip_and_remove_zips(input_dir):
    """
    Walk ``input_dir`` recursively, unpack every ZIP archive into a folder next to it
    named after the archive (without the .zip extension), then delete the archive.
    A failure is printed for the affected archive and the walk continues.
    """
    for root, _subdirs, files in os.walk(input_dir):
        archives = (name for name in files if name.lower().endswith('.zip'))
        for filename in archives:
            zip_path = os.path.join(root, filename)
            print(f"Processing ZIP: {zip_path}")

            try:
                # The extraction folder is the archive path minus its extension
                target_dir, _extension = os.path.splitext(zip_path)
                os.makedirs(target_dir, exist_ok=True)

                # Unpack everything into that folder
                with zipfile.ZipFile(zip_path, 'r') as archive:
                    archive.extractall(target_dir)

                # Remove the archive only after a successful extraction
                os.remove(zip_path)
                print(f"Unzipped and removed: {zip_path}")

            except Exception as e:
                print(f"Error processing {zip_path}: {e}")

# Example usage
input_dir = '/Users/v.kulibaba/Downloads/nsfw'
unzip_and_remove_zips(input_dir)