# 🔍 Debug Compel: Why Are Results Poor?

## Hypothesis Testing

Based on the poor results, let's test different weighting levels and strategies:

1. **Test lighter weighting**: `+` instead of `++`
2. **Test specific terms**: Only weight the most important terms
3. **Test different models**: See if it's model-specific
4. **Visual inspection**: Look at actual images to see what's wrong

---


In [None]:
# Setup
import torch
from compel import Compel, ReturnedEmbeddingsType
from diffusers import StableDiffusionXLPipeline
import matplotlib.pyplot as plt

# Pick the execution device; fall back to the CPU when no GPU is visible.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")

# Half precision is only usable on the GPU: diffusers warns that pipelines
# loaded in float16 cannot run on the CPU, so the CPU fallback uses float32.
# The fp16 weight variant is still loaded (smaller download) and upcast.
dtype = torch.float16 if device == "cuda" else torch.float32

# Load SDXL
pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    variant="fp16",
    torch_dtype=dtype,
).to(device)

# Compel is given both of the pipeline's tokenizer/text-encoder pairs; a pooled
# embedding is requested from the second encoder only (requires_pooled).
compel = Compel(
    tokenizer=[pipe.tokenizer, pipe.tokenizer_2],
    text_encoder=[pipe.text_encoder, pipe.text_encoder_2],
    returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED,
    requires_pooled=[False, True],
)

print("✅ Setup complete")

# Test prompt (unweighted reference used as the baseline in later cells)
test_prompt = "modern signet ring, oval face, engraved gothic initial 'M', high-polish sterling silver"


In [None]:
# Test different weighting strategies.
# Every variant carries the same wording as the baseline prompt (including the
# trailing "high-polish sterling silver" clause) so that the only thing that
# differs between images is the Compel weighting, not the prompt content.
strategies = {
    "baseline": test_prompt,
    "light_weight": "modern signet+ ring, oval face, engraved+ gothic+ initial+ 'M'+, high-polish sterling silver",
    "medium_weight": "modern signet++ ring, oval face, engraved++ gothic++ initial++ 'M'++, high-polish sterling silver",
    "heavy_weight": "modern signet+++ ring, oval face, engraved+++ gothic+++ initial+++ 'M'+++, high-polish sterling silver",
    "selective": "modern (signet)1.2 ring, oval face, (engraved)1.3 gothic (initial)1.2 'M', high-polish sterling silver",
}

print("🧪 Testing different weighting strategies:")
for name, prompt in strategies.items():
    print(f"  {name}: {prompt}")

# Sampler settings shared by every run so baseline and Compel images are
# directly comparable.
# NOTE(review): SDXL base defaults to 1024x1024; 512x512 may itself be a source
# of poor image quality -- worth re-running at 1024 to confirm.
generation_settings = {
    "num_inference_steps": 20,
    "guidance_scale": 5.0,
    "width": 512,
    "height": 512,
}

# Generate and compare
results = {}
for name, prompt in strategies.items():
    print(f"\n🎨 Generating: {name}")

    # A fresh generator with the same seed per run keeps the initial noise
    # identical across strategies.
    generator = torch.Generator(device=device).manual_seed(42)

    if name == "baseline":
        # Standard generation: the pipeline encodes the plain prompt itself.
        image = pipe(
            prompt=prompt,
            generator=generator,
            **generation_settings,
        ).images[0]
    else:
        # Compel generation: encode the weighted prompt into embeddings first.
        # Failures are reported and skipped so one bad prompt does not abort
        # the remaining strategies.
        try:
            cond, pooled = compel([prompt])
            image = pipe(
                prompt_embeds=cond,
                pooled_prompt_embeds=pooled,
                generator=generator,
                **generation_settings,
            ).images[0]
        except Exception as e:
            print(f"  ❌ Failed: {e}")
            continue

    results[name] = image
    print(f"  ✅ Generated: {name}")

print(f"\n🎉 Generated {len(results)} test images")


In [None]:
# Visual comparison
# Size the grid from the number of generated images so none are silently
# dropped when more strategies are added (5 results -> the usual 2x3 grid).
n_cols = 3
n_rows = max(1, -(-len(results) // n_cols))  # ceiling division, at least one row
fig, axes = plt.subplots(
    n_rows, n_cols,
    figsize=(5 * n_cols, 5 * n_rows),
    squeeze=False,  # always a 2-D array, so flatten() works for a single row
)
axes = axes.flatten()

# One panel per generated image, titled with its strategy name.
for ax, (name, image) in zip(axes, results.items()):
    ax.imshow(image)
    ax.set_title(f"{name}", fontweight='bold')

# Hide the frame/ticks on every panel, including unused ones.
for ax in axes:
    ax.axis('off')

plt.tight_layout()
plt.savefig("compel_debug_comparison.png", dpi=150, bbox_inches='tight')
plt.show()

print("💡 Key Questions:")
print("1. Do the Compel images look worse than baseline?")
print("2. Is the 'M' more visible in any version?")
print("3. Are there artifacts or quality issues?")
print("4. Which weighting level works best?")
