In [1]:
from datasets import load_dataset, Image

# Load the train split of the kanji dataset from the Hugging Face Hub; the
# `image` and `text` columns are consumed by `transform` further down.
ds = load_dataset("sylvainlapeyrade/kanji_english_meaning", split="train")

In [2]:
!pip install -q --upgrade transformers diffusers ftfy peft

In [None]:
# !pip install numba

# Imported here so the cell also works on a fresh kernel, where the main
# import cell below has not been executed yet.
import torch
from numba import cuda

# Reset the current CUDA device through numba, then release PyTorch's cached
# allocator blocks.
# NOTE(review): resetting the device underneath a live PyTorch CUDA context may
# invalidate existing tensors — presumably meant to be run only right after a
# kernel restart; confirm.
device = cuda.get_current_device()
device.reset()
torch.cuda.empty_cache()

In [3]:
from base64 import b64encode

import numpy
import torch
from diffusers import AutoencoderKL, LMSDiscreteScheduler, UNet2DConditionModel, DDPMScheduler, StableDiffusionPipeline, DiffusionPipeline
from diffusers.training_utils import compute_dream_and_update_latents
from huggingface_hub import notebook_login
from torchvision import transforms

# For video display:
from IPython.display import HTML
from matplotlib import pyplot as plt
from pathlib import Path
from PIL import Image
from torch import autocast
from tqdm.auto import tqdm
from transformers import CLIPTextModel, CLIPTokenizer, logging
import os

from accelerate import Accelerator, notebook_launcher
from accelerate.utils import set_seed
from huggingface_hub import create_repo, upload_folder
from diffusers.optimization import get_cosine_schedule_with_warmup
from tqdm.auto import tqdm
from pathlib import Path
import torch.nn.functional as F
import os
from diffusers.utils.torch_utils import is_compiled_module
from diffusers.utils import convert_state_dict_to_diffusers, is_wandb_available
from peft.utils import get_peft_model_state_dict

from peft import LoraConfig, get_peft_model

# Seed PyTorch's global RNG.
torch.manual_seed(1)

# Prompt for Hugging Face credentials only when no cached token file exists.
if not (Path.home() / '.cache/huggingface' / 'token').exists():
    notebook_login()

# Suppress some unnecessary warnings when loading the CLIPTextModel
logging.set_verbosity_error()

# Pick the compute device: CUDA first, then Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    torch_device = "cuda"
elif torch.backends.mps.is_available():
    torch_device = "mps"
else:
    torch_device = "cpu"

# On MPS, enable PyTorch's fallback switch via its environment variable.
if torch_device == "mps":
    os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = "1"

In [4]:
def cast_training_params(model, dtype):
    """Cast the trainable parameters of one model, or a list of models, to ``dtype``.

    Parameters whose ``requires_grad`` is False are left in their current dtype.

    Args:
        model: A module exposing ``.parameters()``, or a list of such modules.
        dtype: Target ``torch.dtype`` for every trainable parameter.
    """
    modules = model if isinstance(model, list) else [model]
    for module in modules:
        trainable = (p for p in module.parameters() if p.requires_grad)
        for p in trainable:
            # Replace the underlying storage so the Parameter object itself is kept.
            p.data = p.to(dtype)

In [5]:
# Checkpoint used for every component (VAE, text encoder, UNet, scheduler).
# The commented-out lines are alternative checkpoints.
pretrained_model_name_or_path = "bguisard/stable-diffusion-nano-2-1"
# pretrained_model_name_or_path = "CompVis/stable-diffusion-v1-4"
# pretrained_model_name_or_path = "stabilityai/stable-diffusion-2-1-base"

def get_models_and_scheduler(pretrained_model_name_or_path):
    """Load the VAE, tokenizer, text encoder, UNet and DDPM noise scheduler.

    Args:
        pretrained_model_name_or_path: Hub id or local path of a
            diffusers-format checkpoint with ``vae``, ``tokenizer``,
            ``text_encoder``, ``unet`` and ``scheduler`` subfolders.

    Returns:
        Tuple ``(vae, tokenizer, text_encoder, unet, noise_scheduler)``.
    """
    # Load the autoencoder model which will be used to decode the latents into image space.
    vae = AutoencoderKL.from_pretrained(pretrained_model_name_or_path, subfolder="vae")

    # Load the tokenizer and text encoder to tokenize and encode the text.
    if "openai" in pretrained_model_name_or_path:
        tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
        text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
    else:
        # Use the checkpoint's own tokenizer alongside its own text encoder.
        # The validation pipeline built with DiffusionPipeline.from_pretrained
        # also loads the checkpoint's tokenizer, so training and sampling
        # tokenize prompts identically.
        tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_name_or_path, subfolder="tokenizer")
        text_encoder = CLIPTextModel.from_pretrained(pretrained_model_name_or_path, subfolder="text_encoder")

    # The UNet model for generating the latents.
    unet = UNet2DConditionModel.from_pretrained(pretrained_model_name_or_path, subfolder="unet")

    # The noise scheduler, configured from the checkpoint's scheduler config.
    noise_scheduler = DDPMScheduler.from_pretrained(pretrained_model_name_or_path, subfolder="scheduler")

    return vae, tokenizer, text_encoder, unet, noise_scheduler
    
# Instantiate all model components and the noise scheduler for the chosen checkpoint.
vae, tokenizer, text_encoder, unet, noise_scheduler = get_models_and_scheduler(pretrained_model_name_or_path)

gradient_accumulation_steps = 1
# Default precision; replaced below according to accelerator.mixed_precision.
weight_dtype = torch.float32
seed = 42
set_seed(seed)

# Initialize the accelerator with fp16 mixed precision and Weights & Biases logging
accelerator = Accelerator(
    mixed_precision="fp16",
    gradient_accumulation_steps=gradient_accumulation_steps,
    log_with="wandb",
    project_dir=os.path.join("/kaggle/working/", "logs"),
)

# Freeze all base models to save memory; only the LoRA adapter attached to the
# UNet afterwards is trained. `requires_grad_(False)` already covers every
# parameter, so no extra per-parameter loop is needed.
unet.requires_grad_(False)
vae.requires_grad_(False)
text_encoder.requires_grad_(False)

# Frozen weights are stored in the accelerator's mixed-precision dtype;
# with mixed precision disabled the float32 default is kept.
if accelerator.mixed_precision == "fp16":
    weight_dtype = torch.float16
elif accelerator.mixed_precision == "bf16":
    weight_dtype = torch.bfloat16

# Define LoRA configuration: rank-4 adapters on the attention projections.
lora_config = LoraConfig(
    r=4,  # Rank of the low-rank update
    lora_alpha=4,  # Scaling factor
    init_lora_weights="gaussian",
    target_modules=["to_k", "to_q", "to_v", "to_out.0"],
)

# Move unet, vae and text_encoder to device and cast to weight_dtype
unet.to(accelerator.device, dtype=weight_dtype)
vae.to(accelerator.device, dtype=weight_dtype)
text_encoder.to(accelerator.device, dtype=weight_dtype)

unet.add_adapter(lora_config)

# Only the trainable (adapter) parameters are upcast to fp32; the frozen base
# weights stay in weight_dtype.
cast_training_params(unet, dtype=torch.float32)

# Materialise the trainable parameters as a list. A bare filter() iterator is
# exhausted by the optimizer constructor, which would leave nothing for any
# later use such as gradient clipping.
lora_layers = list(filter(lambda p: p.requires_grad, unet.parameters()))

optimizer = torch.optim.AdamW(lora_layers, lr=0.00001, betas=(0.9, 0.999), weight_decay=1e-2)


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [6]:
def model_size_in_params(model):
    """Print and return the total element count over all parameters of ``model``."""
    total_params = 0
    for tensor in model.parameters():
        total_params += tensor.numel()
    print(f"Total Parameters: {total_params:,}")
    return total_params
def model_size_in_mb(model):
    """Print and return the memory taken by ``model``'s parameters and buffers, in MB.

    The size is the sum of ``nelement() * element_size()`` over every
    parameter and buffer, divided by 1024**2.
    """
    def _nbytes(tensors):
        # Total byte count of an iterable of tensors.
        return sum(t.nelement() * t.element_size() for t in tensors)

    total_size = (_nbytes(model.parameters()) + _nbytes(model.buffers())) / (1024**2)  # Convert to MB
    print(f"Model Size: {total_size:.2f} MB")
    return total_size

def pil_to_latents(pil):
    """Encode a single PIL image into a scaled VAE latent sample.

    Args:
        pil: Input PIL image.

    Returns:
        A latent tensor with a leading batch dimension of 1, multiplied by
        the 0.18215 latent scaling factor (the inverse of the factor applied
        in ``latents_to_pil``).
    """
    # ToTensor yields values in [0, 1]; the VAE works on [-1, 1] images
    # (latents_to_pil maps decoder output back with `image / 2 + 0.5`), so the
    # rescale has to be done here before encoding.
    img = transforms.ToTensor()(pil).unsqueeze(0).to(torch_device) * 2 - 1
    with torch.no_grad():
        latent = vae.encode(img)
    return 0.18215 * latent.latent_dist.sample()

def latents_to_pil(latents):
    """Decode a batch of latents into a list of PIL images.

    The latents are divided by the 0.18215 scaling factor, decoded with the
    VAE, mapped from [-1, 1] to [0, 1], and converted to uint8 HWC arrays.
    """
    unscaled = (1 / 0.18215) * latents
    with torch.no_grad():
        decoded = vae.decode(unscaled).sample
    pixels = (decoded / 2 + 0.5).clamp(0, 1)
    # NCHW tensor -> NHWC numpy array
    pixels = pixels.detach().cpu().permute(0, 2, 3, 1).numpy()
    pixels = (pixels * 255).round().astype("uint8")
    return [Image.fromarray(frame) for frame in pixels]


In [7]:
def get_text_embs_orig(prompts):
    """Return the text-encoder hidden states for ``prompts`` (conditional only).

    Prompts are padded/truncated to ``tokenizer.model_max_length`` and encoded
    without gradient tracking on ``accelerator.device``.
    """
    cond_tokens = tokenizer(prompts, padding='max_length', max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
    with torch.no_grad():
        cond_embs = text_encoder(cond_tokens.input_ids.to(accelerator.device), return_dict=False)[0]
    return cond_embs

def get_text_embs(prompts):
    """Return unconditional and conditional text embeddings, concatenated.

    The result stacks the embeddings of one empty prompt per entry of
    ``prompts`` first, followed by the embeddings of ``prompts`` themselves,
    along the batch dimension.
    """
    def _encode(texts):
        # Tokenize to the model's maximum length and run the text encoder without gradients.
        tokens = tokenizer(texts, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
        with torch.no_grad():
            return text_encoder(tokens.input_ids.to(accelerator.device))[0]

    cond_embs = _encode(prompts)
    uncond_embs = _encode([''] * len(prompts))
    return torch.cat([uncond_embs, cond_embs])

def get_latent_embs(img_tensors, dtype=None):
    """Encode a batch of image tensors into scaled VAE latent samples.

    Args:
        img_tensors: Batch of image tensors passed to ``vae.encode``.
        dtype: Optional ``torch.dtype`` to cast the images to before encoding;
            ``None`` leaves them unchanged.

    Returns:
        Latents sampled from the VAE posterior, multiplied by the 0.18215
        latent scaling factor (the same constant ``latents_to_pil`` divides by).
    """
    with torch.no_grad():
        # Honour the requested dtype instead of always using the global weight_dtype.
        if dtype is not None:
            img_tensors = img_tensors.to(dtype=dtype)
        latents = vae.encode(img_tensors)
    return 0.18215 * latents.latent_dist.sample()
    
    

In [8]:
# Image size in pixels, used by the preprocessing transform and when sampling.
width = 128 
height = 128
def plot_image(data):
    """Turn one transformed dataset example back into a viewable image.

    Args:
        data: Mapping with an ``'image'`` CHW tensor, as produced by
            ``preprocess`` (normalised with mean 0.5 / std 0.5), and a
            ``'labels'`` entry.

    Returns:
        Tuple ``(labels, PIL image)``.
    """
    # Undo Normalize([0.5], [0.5]) so values are back in [0, 1]; without this
    # the negative half of the range wraps around in the uint8 cast.
    img = (data['image'] * 0.5 + 0.5).clamp(0, 1)
    img = img.permute(1, 2, 0).cpu().numpy()
    img = (img * 255).round().astype("uint8")
    return data['labels'], Image.fromarray(img)
# Resize, randomly mirror, convert to a tensor and normalise to [-1, 1].
# NOTE(review): horizontal flips mirror the glyphs — confirm this augmentation
# is wanted for kanji images.
preprocess = transforms.Compose([
            transforms.Resize((height, width)),  # torchvision expects (h, w)
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.5], [0.5])
            ])
def transform(examples):
    """Batch transform for the dataset.

    Applies ``preprocess`` to every entry of ``examples['image']`` and copies
    ``examples['text']`` into a list under the ``'labels'`` key.
    """
    return {
        'image': list(map(preprocess, examples['image'])),
        'labels': list(examples['text']),
    }

# Register `transform` on the dataset so examples are preprocessed when accessed.
ds.set_transform(transform)

In [9]:
# Shuffled batches of 20 transformed examples ('image' tensors and 'labels').
dataloader = torch.utils.data.DataLoader(ds, shuffle=True, batch_size=20)

In [10]:
# Cosine learning-rate decay with no warmup.
# NOTE(review): the factor 1000 presumably mirrors `num_epochs`, which is
# defined in a later cell — keep the two in sync.
lr_scheduler = get_cosine_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(dataloader) * 1000),
)

# Hand models, optimizer, dataloader and scheduler to accelerate; the returned
# objects replace the originals from here on.
unet, optimizer, dataloader, lr_scheduler, vae, text_encoder = accelerator.prepare(
    unet, optimizer, dataloader, lr_scheduler, vae, text_encoder
)

**Training**

In [11]:
def unwrap_model(model):
    """Return the underlying module behind accelerate and torch.compile wrappers."""
    unwrapped = accelerator.unwrap_model(model)
    if is_compiled_module(unwrapped):
        # A compiled module keeps the original one under `_orig_mod`.
        return unwrapped._orig_mod
    return unwrapped

In [12]:
# Validation images are rendered every `save_image_epochs` epochs and a
# checkpoint is written every `save_model_epochs` epochs (see training_loop).
save_image_epochs = 3
save_model_epochs = 10
num_epochs = 1000  # number of epochs training_loop iterates over
output_dir = '/kaggle/working/checkpoints'
# Seeds torch's global RNG with 0; the returned generator is passed to the
# pipelines as `generator` when sampling.
generator = torch.manual_seed(0)
max_grad_norm = 1.0  # gradient-clipping threshold used in training_loop
guidance_scale = 8.5  # read only by the cfg=True branch of training_loop

def make_image_grid(imgs, rows, cols):
    """Paste ``imgs`` row by row into a single ``rows`` x ``cols`` RGB grid image.

    Every tile is placed using the size of the first image.
    """
    tile_w, tile_h = imgs[0].size
    grid = Image.new('RGB', size=(cols*tile_w, rows*tile_h))
    for idx, tile in enumerate(imgs):
        row, col = divmod(idx, cols)
        grid.paste(tile, box=(col * tile_w, row * tile_h))
    return grid


def validation(pipeline, accelerator, width, height, epoch):
    """Render three fixed validation prompts and save them as one image grid.

    Args:
        pipeline: Diffusion pipeline to sample from; moved to ``accelerator.device``.
        accelerator: Accelerator providing the target device.
        width: Width passed to the pipeline.
        height: Height passed to the pipeline.
        epoch: Used for the output file name, ``<epoch:04d>.png``.

    The grid is written to ``/kaggle/working/samples``. Sampling uses the
    notebook-level ``generator``.
    """
    validation_prompts = ['a kanji meaning fire', 'a golf course', 'a kanji meaning rage, excitement']
    num_inference_steps = 50

    pipeline = pipeline.to(accelerator.device)

    images = []
    with torch.autocast(accelerator.device.type):
        for prompt in validation_prompts:
            result = pipeline(prompt, num_inference_steps=num_inference_steps, generator=generator, width=width, height=height)
            images.append(result.images[0])

    # One row, one column per prompt.
    image_grid = make_image_grid(images, rows=1, cols=3)

    # Save the grid under the samples directory, creating it if needed.
    test_dir = os.path.join("/kaggle/working/", "samples")
    os.makedirs(test_dir, exist_ok=True)
    image_grid.save(f"{test_dir}/{epoch:04d}.png")

def training_loop(unet, num_epoch=10, cfg=False):
    """Fine-tune the LoRA adapter attached to ``unet`` on the notebook's dataloader.

    Each step encodes the batch images to VAE latents, adds Gaussian noise at
    random timesteps, and regresses the UNet output onto the scheduler's
    training target with an MSE loss. On the main process, validation images
    are rendered every ``save_image_epochs`` epochs, and the accelerator state
    plus the LoRA weights are saved every ``save_model_epochs`` epochs.

    Reads these notebook-level names: ``accelerator``, ``dataloader``,
    ``optimizer``, ``lr_scheduler``, ``noise_scheduler``, ``weight_dtype``,
    ``num_epochs``, ``output_dir``, ``max_grad_norm``, ``guidance_scale``,
    ``gradient_accumulation_steps``, ``width``, ``height``.

    Args:
        unet: The UNet carrying the LoRA adapter.
        num_epoch: NOTE(review): not consulted — the number of epochs comes
            from the global ``num_epochs``. Left as-is so existing calls keep
            their training length; confirm which one is intended.
        cfg: If True, the latents are duplicated, unconditional + conditional
            text embeddings are used, and the prediction is recombined with
            ``guidance_scale`` before the loss.
            NOTE(review): guidance is normally applied at sampling time only;
            confirm this branch is intentional.

    Raises:
        ValueError: If the scheduler's ``prediction_type`` is neither
            ``"epsilon"`` nor ``"v_prediction"``.
    """
    max_train_step = 100000

    if accelerator.is_main_process:
        if output_dir is not None:
            os.makedirs(output_dir, exist_ok=True)
        accelerator.init_trackers("sd_kanjivg")

    # Trainable (LoRA) parameters for gradient clipping. They are collected
    # from the model directly rather than taken from the notebook-level
    # `lora_layers`, which may be an iterator already exhausted by the
    # optimizer constructor (clipping would then silently do nothing).
    params_to_clip = [p for p in unet.parameters() if p.requires_grad]

    global_step = 0
    # Now you train the model
    for epoch in range(num_epochs):
        progress_bar = tqdm(total=len(dataloader), disable=not accelerator.is_local_main_process)
        progress_bar.set_description(f"Epoch {epoch}")
        # Running sum of the gathered loss for this epoch (currently not logged).
        train_loss = 0.0
        unet.train()

        for batch in dataloader:
            with accelerator.accumulate(unet):
                clean_images = batch['image']
                latents = get_latent_embs(clean_images, dtype=weight_dtype)
                if cfg:
                    latents = torch.cat([latents] * 2)

                # The forward diffusion process adds standard-normal noise;
                # uniform noise (rand_like) does not match what the scheduler
                # and the sampling pipeline assume.
                noise = torch.randn_like(latents)

                # One random timestep per latent, created on the latents' device.
                timestep = torch.randint(
                    0,
                    noise_scheduler.config.num_train_timesteps,
                    (latents.shape[0],),
                    device=latents.device,
                ).long()
                noisy_latents = noise_scheduler.add_noise(latents, noise, timestep)

                # The regression target depends on what the checkpoint predicts.
                prediction_type = noise_scheduler.config.prediction_type
                if prediction_type == "epsilon":
                    target = noise
                elif prediction_type == "v_prediction":
                    target = noise_scheduler.get_velocity(latents, noise, timestep)
                else:
                    raise ValueError(f"Unknown prediction type {prediction_type}")

                if cfg:
                    encoder_hidden_states = get_text_embs(batch['labels'])
                else:
                    encoder_hidden_states = get_text_embs_orig(batch['labels'])

                model_pred = unet(noisy_latents, timestep, encoder_hidden_states, return_dict=False)[0]
                if cfg:
                    noise_pred_uncond, noise_pred_text = model_pred.chunk(2)
                    # Classifier-free guidance prediction
                    model_pred = torch.cat([noise_pred_uncond, guidance_scale * (noise_pred_text - noise_pred_uncond)])

                loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
                # Gather the losses across all processes for logging (if we use distributed training).
                avg_loss = accelerator.gather(loss.repeat(latents.shape[0])).mean()
                train_loss += avg_loss.item() / gradient_accumulation_steps

                # Backpropagate
                accelerator.backward(loss)
                if accelerator.sync_gradients:
                    accelerator.clip_grad_norm_(params_to_clip, max_grad_norm)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

                progress_bar.update(1)
                logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0], "step": global_step}
                progress_bar.set_postfix(**logs)
                accelerator.log(logs, step=global_step)
                global_step += 1

        # Finish this epoch's bar so stale bars do not pile up in the output.
        progress_bar.close()

        # After each epoch, optionally sample validation images and save the model
        if accelerator.is_main_process:

            if (epoch + 1) % save_image_epochs == 0 or epoch == num_epochs - 1:
                pipeline = DiffusionPipeline.from_pretrained(
                    pretrained_model_name_or_path,
                    unet=unwrap_model(unet),
                    revision=None,
                    variant=None,
                    torch_dtype=weight_dtype,
                )
                validation(pipeline, accelerator, width, height, epoch)
                del pipeline

            if (epoch + 1) % save_model_epochs == 0 or epoch == num_epochs - 1:
                save_path = os.path.join(output_dir, f"checkpoint-{global_step}")
                accelerator.save_state(save_path)

                # Export only the LoRA weights, converted to diffusers naming.
                unwrapped_unet = unwrap_model(unet)
                unet_lora_state_dict = convert_state_dict_to_diffusers(
                    get_peft_model_state_dict(unwrapped_unet)
                )

                StableDiffusionPipeline.save_lora_weights(
                    save_directory=save_path,
                    unet_lora_layers=unet_lora_state_dict,
                    safe_serialization=True,
                )
                accelerator.log({"note": "Saved model"})

        if global_step > max_train_step:
            break

In [None]:
# Render the validation prompts once before training (saved as 0000.png) by
# building a pipeline around the current UNet.
pipeline = DiffusionPipeline.from_pretrained(
                    pretrained_model_name_or_path,
                    unet=unwrap_model(unet),
                    revision=None,
                    variant=None,
                    torch_dtype=weight_dtype,
                )
validation(pipeline, accelerator, width, height, epoch=0)
del pipeline

In [None]:
# Start LoRA fine-tuning with the default arguments (cfg disabled).
training_loop(unet)

[34m[1mwandb[0m: Currently logged in as: [33msanketsans97[0m ([33msanketsans97-university-of-genoa[0m). Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

In [None]:
# Load the unmodified base pipeline (no fine-tuned UNet), with the safety checker disabled.
# pipeline = DiffusionPipeline.from_pretrained(
#     "CompVis/stable-diffusion-v1-4",
#     torch_dtype=weight_dtype,
# )
pipeline = StableDiffusionPipeline.from_pretrained("bguisard/stable-diffusion-nano-2-1", torch_dtype=weight_dtype, safety_checker=None)

In [None]:
# Build a pipeline around the in-memory (trained) UNet; the remaining
# components are loaded from the checkpoint. Rebinds `pipeline`.
pipeline = DiffusionPipeline.from_pretrained(
                    pretrained_model_name_or_path,
                    unet=unwrap_model(unet),
                    revision=None,
                    variant=None,
                    torch_dtype=weight_dtype,
                )

In [None]:
# Build a pipeline that reuses the in-memory VAE, text encoder, tokenizer and
# UNet instead of reloading them, with the safety checker disabled.
# Rebinds `pipeline`.
pipeline = StableDiffusionPipeline.from_pretrained(
    pretrained_model_name_or_path,
    vae=accelerator.unwrap_model(vae),
    text_encoder=accelerator.unwrap_model(text_encoder),
    tokenizer=tokenizer,
    unet=accelerator.unwrap_model(unet),
    safety_checker=None,
    torch_dtype=weight_dtype,
)

In [None]:
# Load LoRA weights from a saved checkpoint directory.
# NOTE(review): the checkpoint number is hard-coded; it must match a directory
# actually written by training_loop (checkpoint-<global_step>).
pipeline.load_lora_weights('/kaggle/working/checkpoints/checkpoint-96')

In [None]:
# Sample one 128x128 image per prompt from the current `pipeline` under autocast.
# sample_prompts = ['a football stadium', 'a golf course', 'a tennis stadium'] 
sample_prompts = ['a kanji meaning Elon Musk', 'a kanji meaning gaming', 'a kanji meaning football']
images = []
pipeline = pipeline.to(accelerator.device)
autocast_ctx = torch.autocast(accelerator.device.type)
with autocast_ctx:
    for p in sample_prompts:
        images.append(pipeline(p, num_inference_steps=30, generator=generator, height=128, width=128).images[0])

In [None]:
# Display the image generated for the third prompt.
images[2]

In [None]:
import os

def make_image_grid(imgs, rows, cols):
    """Paste ``imgs`` row by row into one ``rows`` x ``cols`` RGB grid image.

    Same behaviour as the identically named helper defined in the training
    cell; this definition replaces it when the cell is run.
    """
    cell_w, cell_h = imgs[0].size
    canvas = Image.new('RGB', size=(cols*cell_w, rows*cell_h))
    for position, picture in enumerate(imgs):
        grid_row, grid_col = divmod(position, cols)
        canvas.paste(picture, box=(grid_col * cell_w, grid_row * cell_h))
    return canvas

@torch.no_grad()
def evaluate(seed=42):
    """Sample three fixed prompts with a manual denoising loop and save a grid.

    Freshly loads the models via ``get_models_and_scheduler`` (shadowing the
    notebook-level ones inside this function), runs 50 scheduler steps
    without classifier-free guidance, decodes the latents with the VAE and
    writes ``/kaggle/working/samples/test.png``. Returns nothing.

    NOTE(review): because the UNet is reloaded from the pretrained checkpoint,
    no LoRA weights are attached here — confirm that evaluating the base
    model is the intent.

    Args:
        seed: Seed passed to ``set_seed``.
    """
    vae, tokenizer, text_encoder, unet, noise_scheduler = get_models_and_scheduler(pretrained_model_name_or_path)
    set_seed(seed)
    # freeze parameters of models to save more memory
    unet.requires_grad_(False)
    vae.requires_grad_(False)
    text_encoder.requires_grad_(False)
    
    unet.to(accelerator.device)
    vae.to(accelerator.device)
    text_encoder.to(accelerator.device)
    
    sample_prompts = ['a kanji meaning Elon Musk', 'a kanji meaning gaming', 'a kanji meaning robot']
    batch_size = len(sample_prompts)
    # Unused while the guidance code below stays commented out.
    guidance_scale = 12 
    num_inference_steps = 50
    generator = torch.manual_seed(0)
    # get_text_embs_orig resolves `tokenizer` / `text_encoder` at notebook
    # level, so the notebook-level models are used here, not the local ones.
    text_embeddings = get_text_embs_orig(sample_prompts)
    
    # Prep Scheduler
    def set_timesteps(scheduler, num_inference_steps):
        scheduler.set_timesteps(num_inference_steps)
#         scheduler.timesteps = scheduler.timesteps.to(torch.float32) # minor fix to ensure MPS compatibility, fixed in diffusers PR 3925

    set_timesteps(noise_scheduler,num_inference_steps)

    # Prep latents: `height` and `width` are the notebook-level values; the
    # latent grid is 8x smaller than the image in each dimension.
    latents = torch.randn(
      (batch_size, unet.config.in_channels, height // 8, width // 8),
      generator=generator,
    )
    latents = latents.to(accelerator.device)
    latents = latents * noise_scheduler.init_noise_sigma 
    autocast_ctx = torch.autocast(accelerator.device.type)
    
    # Loop
    with autocast_ctx:  # will fallback to CPU if no CUDA; no autocast for MPS
        for i, t in tqdm(enumerate(noise_scheduler.timesteps), total=len(noise_scheduler.timesteps)):
            # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
#             latent_model_input = torch.cat([latents] * 2)
            latent_model_input = latents
#             sigma = noise_scheduler.sigmas[i]
            # Scale the latents (preconditioning):
            # latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5) # Diffusers 0.3 and below
            latent_model_input = noise_scheduler.scale_model_input(latent_model_input, t)
            

            # predict the noise residual
            with torch.no_grad():
                noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample

#             # perform guidance
#             noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
#             noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

            # compute the previous noisy sample x_t -> x_t-1
            latents = noise_scheduler.step(noise_pred, t, latents).prev_sample

        # scale and decode the image latents with vae
        latents = 1 / 0.18215 * latents.to(accelerator.device)
        with torch.no_grad():
            image = vae.decode(latents).sample
        # Map decoder output from [-1, 1] to [0, 1].
        images = (image / 2 + 0.5).clamp(0, 1)
        
    # NCHW tensor -> NHWC uint8 numpy array -> PIL images
    images = images.detach().cpu().permute(0, 2, 3, 1).numpy()
    images = (images * 255).round().astype("uint8")
    pil_images = [Image.fromarray(image) for image in images]

    # Make a grid out of the images and save them (one column per prompt)
    image_grid = make_image_grid(pil_images, rows=1, cols=3)

    # Save the images
    test_dir = os.path.join("/kaggle/working/", "samples")
    os.makedirs(test_dir, exist_ok=True)
    image_grid.save(f"{test_dir}/test.png")

In [None]:
# Run the manual sampling loop; the grid is written to /kaggle/working/samples/test.png.
evaluate()

In [None]:
# Destructive cleanup: deletes all checkpoints, logs, sample images and wandb
# run files under /kaggle/working. Run only when those outputs are no longer needed.
!rm -rf '/kaggle/working/checkpoints'
!rm -rf '/kaggle/working/logs'
!rm -rf '/kaggle/working/samples'
!rm -rf '/kaggle/working/wandb'

In [None]:
# Release cached GPU memory held by PyTorch's allocator.
torch.cuda.empty_cache()

In [None]:
# List the files of one saved checkpoint.
# NOTE(review): fails if the cleanup cell above was run first or if no
# checkpoint-96 directory exists.
!ls '/kaggle/working/checkpoints/checkpoint-96'

In [None]:
!pip install safetensors

In [None]:
from safetensors.torch import load_file
from peft import PeftModel, LoraConfig

# Path to the directory where LoRA weights are stored
output_dir = "/kaggle/working/checkpoints"  # Replace this with your output directory

# Load LoRA weights stored in safetensors format
# NOTE(review): training_loop saves LoRA weights inside per-checkpoint
# subdirectories (checkpoint-<global_step>) via save_lora_weights; a
# diffusion_pytorch_model.safetensors file directly under output_dir is
# presumably not what it writes — verify the path and file name.
lora_weights = load_file(f"{output_dir}/diffusion_pytorch_model.safetensors")

In [None]:
# Here, you need to map the weights back to their corresponding layers in the UNet
# NOTE(review): with strict=True this raises unless the file holds every UNet
# key under exactly matching names; a LoRA-only file will presumably not
# satisfy that. `pipeline.load_lora_weights(...)` (used in an earlier cell)
# looks like the intended loading route — confirm.
unet.load_state_dict(lora_weights, strict=True)

**Downloading files**

In [None]:
# Archive everything under /kaggle/working into file.zip in the current directory.
# !cd /kaggle/working
!zip -r file.zip /kaggle/working

In [None]:
from IPython.display import FileLink
# Show a download link for the archive (path is relative to the current directory).
FileLink(r'file.zip')

In [None]:
# Switch to the working directory and clone the diffusers repository there
# (its example training scripts are used below).
os.chdir("/kaggle/working")
!git clone https://github.com/huggingface/diffusers

In [None]:
!pip install git+https://github.com/huggingface/diffusers

In [None]:

os.chdir("/kaggle/working/diffusers/examples/text_to_image/")
!pip install -r requirements.txt


In [None]:
# Create the accelerate configuration used by `accelerate launch`.
# NOTE(review): this command is presumably interactive; confirm it can be
# answered from a notebook cell.
!accelerate config

In [None]:
accelerate launch train_text_to_image_lora.py \
--pretrained_model_name_or_path="CompVis/stable-diffusion-v1-4" \
--dataset_name="sylvainlapeyrade/kanji_english_meaning" --caption_column="text" \
--resolution=128 --random_flip \
--train_batch_size=8 \
--num_train_epochs=1 --checkpointing_steps=500 \
--learning_rate=1e-04 --lr_scheduler="constant" --lr_warmup_steps=0 \
--seed=42 \
--output_dir="kanji2english" \
--validation_prompt="A kanji meaning Elon Musk" \