<a href="https://www.kaggle.com/code/sukhmansaran/lora-image-generation-sd-models?scriptVersionId=254644590" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

Please read the README.md file on my GitHub for more information.

https://github.com/sukhmansaran/fine-tuning-stable-diffusion-models-lora-dreambooth

# DreamBooth + LoRA Inference Pipeline for Stable Diffusion Models

In [None]:
import os
import torch
from torch.utils.data import Dataset, DataLoader, ConcatDataset
from torchvision import transforms
from PIL import Image
from tqdm import tqdm
from transformers import AutoTokenizer
from diffusers import StableDiffusionPipeline, UNet2DConditionModel
from diffusers.models.attention_processor import LoRAAttnProcessor
from accelerate import Accelerator

In [None]:
# HF TOKEN
# Log in to the Hugging Face Hub. "your_token" is a placeholder: replace it
# with your own access token and avoid committing a real token to a repo.
from huggingface_hub import login
login("your_token")

In [None]:
# Fetch the Realistic Vision V5.1 single-file checkpoint from the Hugging Face Hub
import os
from huggingface_hub import hf_hub_download

# Local folder that will receive the downloaded checkpoint (created if absent)
model_dir = "your_dir_for_saving_downloaded_base_model"
os.makedirs(model_dir, exist_ok=True)

# ckpt_path is the local path of the downloaded .safetensors file
ckpt_path = hf_hub_download(
    "SG161222/Realistic_Vision_V5.1_noVAE",
    "Realistic_Vision_V5.1.safetensors",
    local_dir=model_dir,
)

The inference config file downloaded below is required. It must be this exact file, because it is the config used for Stable Diffusion 1.5; no other config file can be used here.

In [None]:
# @title Downloading v1-inference.yaml
# Shell command (IPython "!" syntax): -O writes the download to v1-inference.yaml
# in the notebook's current working directory.
!wget -O v1-inference.yaml https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml

In [None]:
# Convert the single-file checkpoint into the diffusers directory format
from diffusers.pipelines.stable_diffusion.convert_from_ckpt import download_from_original_stable_diffusion_ckpt
import os

# Path of the downloaded .safetensors checkpoint (placeholder — e.g. the
# ckpt_path returned by the download cell).
safetensors_path = "downloaded_base_model_paths"
# Folder that will receive the converted diffusers model.
output_dir = "your_dir_for_saving_converted_base_model"

converted_pipeline = download_from_original_stable_diffusion_ckpt(
    safetensors_path,
    # wget saved the config into the current working directory, so use a
    # relative path. "/v1-inference.yaml" pointed at the filesystem root,
    # which only matches if the notebook happens to run from "/".
    "v1-inference.yaml",  # Must match SD1.5 or SD2.x
    from_safetensors=True,
    extract_ema=True,
    device="cuda"  # or "cpu"
)

# saving
converted_pipeline.save_pretrained(output_dir)
print(f"Model saved to {output_dir}")

Model page: https://huggingface.co/SG161222/Realistic_Vision_V5.1_noVAE

⚠️ If the generated code snippets do not work, please open an issue on either the [model repo](https://huggingface.co/SG161222/Realistic_Vision_V5.1_noVAE)
			and/or on [huggingface.js](https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/src/model-libraries-snippets.ts) 🙏

In [None]:
# ✅ Paths
# Generated images are written here; the folder is created up front.
output_dir = "./outputs"
os.makedirs(output_dir, exist_ok=True)

# Converted base model directory and the folder holding the LoRA weights.
base_model_path = "your_base_model_path"
lora_path = "your_lora_path"


Creating the pipeline for loading the model and then using it for inference.

In [None]:
from diffusers.schedulers.scheduling_dpmsolver_multistep import DPMSolverMultistepScheduler

# ✅ Load pipeline
# Load the base model found at base_model_path in float16, with the safety
# checker disabled.
pipe = StableDiffusionPipeline.from_pretrained(
    base_model_path,
    torch_dtype=torch.float16,
    safety_checker=None,
    requires_safety_checker=False,
)
# Replace the pipeline's scheduler with a DPMSolverMultistepScheduler built
# from the existing scheduler's config.
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
# Move the pipeline to the GPU.
pipe.to("cuda")


The code below defines the helpers that patch the UNet and text encoder of the Stable Diffusion model with LoRA layers and load the weights trained during the fine-tuning process.

In [None]:
# patching to_q, to_k, to_v and to_out

import torch
import torch.nn as nn
from safetensors.torch import load_file

# ✅ LoRA wrapper
class LoRALinear(nn.Module):
    """Wrap a linear layer and add a scaled low-rank (LoRA) term to its output.

    The wrapped layer is kept as ``self.linear`` (stored, not copied). The
    added term is ``lora_up(lora_down(x)) * (alpha / rank)``.

    Args:
        linear: The ``nn.Linear`` layer to wrap.
        rank: Inner dimension of the two LoRA projections.
        alpha: Numerator of the scaling factor ``alpha / rank``.
    """

    def __init__(self, linear, rank, alpha):
        super().__init__()
        self.linear = linear
        self.rank, self.alpha = rank, alpha
        self.scaling = alpha / rank

        # Attribute names below are part of the parameter names
        # ("...lora_down.weight" / "...lora_up.weight") used when loading weights.
        self.lora_down = nn.Linear(linear.in_features, rank, bias=False)
        self.lora_up = nn.Linear(rank, linear.out_features, bias=False)

        # The up-projection starts at zero, so a freshly wrapped layer
        # produces the same output as the base layer.
        nn.init.zeros_(self.lora_up.weight)
        nn.init.kaiming_uniform_(self.lora_down.weight, a=5**0.5)

        # Place both projections on the base layer's device and dtype.
        base_weight = linear.weight
        for adapter in (self.lora_down, self.lora_up):
            adapter.to(base_weight.device, dtype=base_weight.dtype)

    def forward(self, x):
        """Return the base layer's output plus the scaled low-rank term."""
        base_out = self.linear(x)
        low_rank = self.lora_up(self.lora_down(x))
        return base_out + low_rank * self.scaling

# ✅ Patch UNet with LoRA — q, k, v, and to_out[0]
def patch_unet_cross_attn_with_lora(unet, rank, alpha):
    """Wrap attention projections inside ``unet`` with LoRALinear, in place.

    Every submodule that exposes ``to_q``, ``to_k`` and ``to_v`` has each of
    those attributes wrapped when it is a plain ``nn.Linear``. If the
    submodule also has a ``to_out`` ``nn.ModuleList`` whose first entry is an
    ``nn.Linear``, that entry is wrapped as well.

    Args:
        unet: Module to patch; modified in place.
        rank: LoRA rank passed to every LoRALinear.
        alpha: LoRA alpha passed to every LoRALinear.
    """
    qkv_names = ('to_q', 'to_k', 'to_v')

    def is_plain_linear(layer):
        return isinstance(layer, nn.Linear) and not isinstance(layer, LoRALinear)

    def wrap(layer):
        return LoRALinear(layer, rank=rank, alpha=alpha).to(layer.weight.device)

    for block in unet.modules():
        # Only modules carrying all three q/k/v projections are patched.
        if not all(hasattr(block, name) for name in qkv_names):
            continue

        for name in qkv_names:
            projection = getattr(block, name)
            if is_plain_linear(projection):
                setattr(block, name, wrap(projection))

        # Handle to_out[0] if it's a linear layer
        out_layers = getattr(block, 'to_out', None)
        if isinstance(out_layers, nn.ModuleList) and is_plain_linear(out_layers[0]):
            out_layers[0] = wrap(out_layers[0])

# ✅ Load LoRA weights
def apply_lora_weights(unet, lora_path):
    """Copy saved LoRA tensors into the LoRA parameters of a patched model.

    Parameters whose name contains "lora" are looked up by name in the
    safetensors file at ``lora_path`` and overwritten with the saved tensor.

    Args:
        unet: Module that has already been patched with LoRA layers.
        lora_path: Path to the ``.safetensors`` file holding the weights.

    Prints progress messages. LoRA parameters with no matching key in the
    file are listed in a warning and left unchanged. If the model has no
    LoRA parameters at all, a warning is printed and the file is not read.
    A saved tensor whose shape does not fit the parameter (for example a
    different rank) makes ``copy_`` raise.
    """
    print("🔄 Applying LoRA weights...")

    lora_params = [(name, param) for name, param in unet.named_parameters() if "lora" in name]
    if not lora_params:
        # Nothing to fill: the model was probably not patched yet. Say so
        # instead of reporting success.
        print("⚠️ No LoRA parameters found in the model; patch it before loading weights.")
        return

    # Load onto the device the LoRA layers live on rather than assuming CUDA.
    target_device = str(lora_params[0][1].device)
    state_dict = load_file(lora_path, device=target_device)

    missing = []
    with torch.no_grad():
        for name, param in lora_params:
            if name in state_dict:
                param.copy_(state_dict[name])
            else:
                missing.append(name)

    # Only report success when at least one tensor was actually copied.
    if len(missing) < len(lora_params):
        print("✅ LoRA weights loaded.")
    if missing:
        print("⚠️ Missing LoRA keys:", missing)

# ✅ Patch CLIP TextEncoder Attention (q_proj, k_proj, v_proj, out_proj)
def patch_text_encoder_attention_with_lora(text_encoder, rank, alpha):
    """Wrap attention projections inside ``text_encoder`` with LoRALinear, in place.

    Every submodule that exposes all of ``q_proj``, ``k_proj``, ``v_proj``
    and ``out_proj`` has each one wrapped when it is a plain ``nn.Linear``.

    Args:
        text_encoder: Module to patch; modified in place.
        rank: LoRA rank passed to every LoRALinear.
        alpha: LoRA alpha passed to every LoRALinear.
    """
    proj_names = ['q_proj', 'k_proj', 'v_proj', 'out_proj']

    for layer in text_encoder.modules():
        # Skip anything that is not a full attention block.
        if any(not hasattr(layer, name) for name in proj_names):
            continue

        for name in proj_names:
            target = getattr(layer, name)
            # Leave non-linear attributes and already wrapped layers alone.
            if not isinstance(target, nn.Linear) or isinstance(target, LoRALinear):
                continue
            wrapped = LoRALinear(target, rank=rank, alpha=alpha)
            setattr(layer, name, wrapped.to(target.weight.device))

# ✅ Load LoRA weights into Text Encoder
def apply_lora_weights_to_text_encoder(text_encoder, lora_state_dict):
    """Copy tensors from ``lora_state_dict`` into the text encoder's LoRA parameters.

    Only parameters whose name contains "lora" are considered. Each one is
    overwritten with the entry of the same name; names absent from the dict
    are collected and printed in a warning, and those parameters stay as
    they were.

    Args:
        text_encoder: Module already patched with LoRA layers.
        lora_state_dict: Mapping from parameter name to saved tensor.
    """
    absent = []
    lora_params = (
        (key, tensor)
        for key, tensor in text_encoder.named_parameters()
        if "lora" in key
    )
    for key, tensor in lora_params:
        if key not in lora_state_dict:
            absent.append(key)
            continue
        tensor.data.copy_(lora_state_dict[key])

    print("✅ Text encoder LoRA weights loaded.")
    if absent:
        print("⚠️ Missing text encoder LoRA keys:", absent)


This code patches the UNet and loads our UNet LoRA weights into the pipeline. You must use the same rank and alpha that were used during fine-tuning; if they do not match, loading will raise an error or the layers will not be patched correctly.

In [None]:
# ✅ Apply to loaded pipeline UNet
# Wrap the UNet's attention projections with LoRA layers (rank 4, alpha 8).
patch_unet_cross_attn_with_lora(pipe.unet, rank=4, alpha=8)
# Fill those LoRA layers from the checkpoint file inside lora_path.
apply_lora_weights(pipe.unet, f"{lora_path}/diffusion_pytorch_model.safetensors")

As with the UNet, this code patches the text encoder and loads our text-encoder LoRA weights into the pipeline. You must use the same rank and alpha that were used during fine-tuning; if they do not match, loading will raise an error or the layers will not be patched correctly.

In [None]:
# ✅ Apply to loaded pipeline Text Encoder
# Wrap the text encoder's attention projections with LoRA layers (rank 4, alpha 8).
patch_text_encoder_attention_with_lora(pipe.text_encoder, rank=4, alpha=8)

# Read the checkpoint file from disk again (apply_lora_weights does not return
# the tensors it loaded) and copy matching entries into the text encoder.
# NOTE(review): this is the same file the UNet cell reads; it assumes the file
# also contains keys named like the text encoder's LoRA parameters — confirm
# against the training/saving code.
lora_state_dict = load_file(f"{lora_path}/diffusion_pytorch_model.safetensors", device="cuda")
apply_lora_weights_to_text_encoder(pipe.text_encoder, lora_state_dict)

For inference, make sure to use your trigger_word in the prompt. You can change the prompt, negative prompt, guidance scale, num_inference_steps, height and width.

Guidance scale recommended is 5.5-7.5

The recommended number of inference steps is 30-50.

Use standard image sizes; arbitrary custom sizes may make the model produce unnatural, messy results.

In [None]:
# ✅ Define inference settings
# Placeholder: replace with the trigger word your LoRA was trained with.
# It was previously used in the prompt without being defined (NameError).
trigger_word = "your_trigger_word"
prompt = f"{trigger_word}, your prompt"
# Things the image should NOT contain.
negative_prompt = (
    "blurry, low resolution, grainy, overexposed, underexposed, bad lighting, jpeg artifacts, glitch, "
    "cropped, out of frame, watermark, duplicate, poorly drawn face, asymmetrical face, deformed features, "
    "bad skin texture, doll-like face, bad eyes, mutated hands, extra fingers, unrealistic proportions, "
    "cartoon, anime, illustration, painting, horror, morbid"
)
guidance_scale = 6.5
num_inference_steps = 30
# Output image size in pixels.
height = 768
width = 768


This code generates and saves your images.

In [None]:
# ✅ Generate and save images
# Run the pipeline with the settings defined above and keep the first image
# of the returned batch.
image = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    height=height,
    width=width,
    num_inference_steps=num_inference_steps,
    guidance_scale=guidance_scale
).images[0]

# Write the result to output.png inside output_dir (overwrites an existing file of that name).
image.save(os.path.join(output_dir, "output.png"))
# image.show()
