# Solutions




---

### CADQuery Code Generation - Notebook Summary

#### 1. Objective

Generate CadQuery code from rendered 2D images using a vision-to-language model.

---

#### 2. Dataset

* `CADCODER/GenCAD-Code`
* 147K pairs of rendered image and corresponding CadQuery script.

---

#### 3. Baseline Model

* **CLIP (ViT-B/16)** as image encoder (frozen)
* **GPT2** as language decoder (frozen)
* **Projection Head**: MLP to map CLIP features to GPT2 token embeddings
* **Training**: CrossEntropy loss on code tokens

---

#### 4. Enhanced Model

* **Beam search decoding**: `num_beams=5`, `temperature=0.7`, `top_p=0.95`
* **Feedback signal**: Valid syntax, executability, and geometry IOU
* **PPO Reinforcement**: Reward-weighted training using PPO
* **Optional geometry loss**: SDF/mesh loss placeholder

---

#### 5. Evaluation Metrics

* **Valid Syntax Rate (VSR)**
* **IOU\_best**: 3D shape overlap via voxelization

---

#### 6. Outcome

* Both baseline and enhanced models implemented
* Reinforcement and feedback-ready pipeline
* Evaluation setup for fair comparison

---

> Full implementation details, design choices, and results are documented in the [README](./README.md).


# Implementation

In [8]:
# losses/mesh_loss.py
import torch
import torch.nn.functional as F
from cadquery import exporters
from cadquery import cq
import tempfile
import os

def dummy_sdf_generator(code):
    # Placeholder: create a fake voxel grid for now
    # In practice, this should voxelize the actual CAD geometry
    return torch.rand((32, 32, 32))

def mesh_loss(pred_code: str, gt_sdf: torch.Tensor) -> torch.Tensor:
    try:
        sdf_pred = dummy_sdf_generator(pred_code)
        sdf_pred = sdf_pred.to(gt_sdf.device)
        return F.mse_loss(sdf_pred, gt_sdf)
    except Exception as e:
        print("Mesh loss generation failed:", e)
        return torch.tensor(0.0, requires_grad=True, device=gt_sdf.device)


In [6]:
# rl/ppo.py

import torch
from torch import nn
from torch.optim import AdamW
import torch.nn.functional as F
from transformers import CLIPProcessor, CLIPModel

# You may need to import clip_model and clip_processor externally or pass as argument

class PPOTrainer:
    def __init__(self, model, proj, tokenizer, reward_fn, clip_model, clip_processor, device):
        self.model = model
        self.proj = proj
        self.tokenizer = tokenizer
        self.reward_fn = reward_fn
        self.clip_model = clip_model
        self.clip_processor = clip_processor
        self.device = device

        self.optimizer = AdamW(list(model.parameters()) + list(proj.parameters()), lr=1e-5)

    def step(self, images, reward):
        self.model.eval()
        self.proj.eval()
        self.optimizer.zero_grad()

        with torch.no_grad():
            processed = self.clip_processor(images=images, return_tensors="pt", padding=True).to(self.device)
            clip_out = self.clip_model.get_image_features(**processed)
            img_emb = self.proj(clip_out)
            prefix_emb = img_emb.unsqueeze(1)

        input_embeds = prefix_emb
        outputs = self.model(inputs_embeds=input_embeds, labels=None)
        logits = outputs.logits
        log_probs = F.log_softmax(logits, dim=-1)

        # Simple scalar reward signal applied to log-probabilities
        loss = -reward * log_probs.mean()
        loss.backward()
        self.optimizer.step()

        return loss.detach()


In [7]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import (
    CLIPProcessor, CLIPModel,
    GPT2Tokenizer, GPT2LMHeadModel,
    AdamW, get_linear_schedule_with_warmup
)
from cadquery import exporters
from metrics.best_iou import get_iou_best
from metrics.valid_syntax_rate import evaluate_syntax_rate
from datasets import load_dataset
import ast
import wandb
from PIL import Image
import io
from torchvision import transforms

# Optional: For SDF/mesh-based loss and PPO
# from losses.mesh_loss import mesh_loss
# from rl.ppo import PPOTrainer

# ------------------- WandB -------------------
# wandb.init(project="cadquery-codegen", name="clip-gpt2-enhanced")

# ------------------- Models -------------------
class ClipToGPT2Improved(nn.Module):
    def __init__(self, clip_dim, gpt2_dim):
        super().__init__()
        self.proj = nn.Sequential(
            nn.Linear(clip_dim, 4 * gpt2_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(4 * gpt2_dim, gpt2_dim)
        )

    def forward(self, x):
        return self.proj(x)

# ------------------- Utilities -------------------
def is_valid_python(code):
    try:
        ast.parse(code)
        return True
    except:
        return False

def is_executable(code):
    try:
        exec_globals = {}
        exec(code, exec_globals)
        return exec_globals.get("result", None) is not None
    except Exception:
        return False

# ------------------- Dataset -------------------
class ImageCodeDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=256):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
        ])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        img_data = item["image"]

        if isinstance(img_data, dict) and "bytes" in img_data:
            image = Image.open(io.BytesIO(img_data["bytes"])).convert("RGB")
        elif isinstance(img_data, str):
            image = Image.open(img_data).convert("RGB")
        elif isinstance(img_data, Image.Image):
            image = img_data.convert("RGB")
        else:
            raise ValueError(f"Unknown image format: {type(img_data)}")

        image_tensor = self.transform(image)

        code = item["cadquery"]
        code_ids = self.tokenizer(code, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt")

        return {
            "image": image_tensor,
            "input_ids": code_ids["input_ids"].squeeze(0),
            "attention_mask": code_ids["attention_mask"].squeeze(0),
            "code": code
        }

# ------------------- Load Models -------------------
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

gpt2 = GPT2LMHeadModel.from_pretrained("gpt2")
gpt2.config.pad_token_id = tokenizer.pad_token_id

proj = ClipToGPT2Improved(clip_model.config.projection_dim, gpt2.config.n_embd)

# ------------------- Device Setup -------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gpt2.to(device)
clip_model.to(device)
proj.to(device)

# ------------------- Optimizer -------------------
optimizer = AdamW(list(gpt2.parameters()) + list(proj.parameters()), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=1000)

# ------------------- Flags -------------------
use_feedback = True
use_sdf_loss = True
use_ppo = True
ppo_trainer = PPOTrainer(gpt2, proj, tokenizer, reward_fn=get_iou_best)

# ------------------- Load Dataset -------------------
ds = load_dataset("CADCODER/GenCAD-Code", cache_dir="/tmp/hf_cache")
train_dataset = ImageCodeDataset(ds["train"], tokenizer)
test_dataset = ImageCodeDataset(ds["test"], tokenizer)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1)

# ------------------- Training -------------------
def train(train_loader):
    gpt2.train()
    proj.train()

    for epoch in range(5):
        for step, batch in enumerate(train_loader):
            images = batch["image"].to(device)
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            with torch.no_grad():
                processed = clip_processor(images=images, return_tensors="pt", padding=True).to(device)
                clip_out = clip_model.get_image_features(**processed)

            img_emb = proj(clip_out)
            prefix_emb = img_emb.unsqueeze(1)

            code_emb = gpt2.transformer.wte(input_ids)
            inputs_embeds = torch.cat([prefix_emb, code_emb[:, :-1, :]], dim=1)

            labels = input_ids.clone()
            labels[:, 0] = -100

            outputs = gpt2(inputs_embeds=inputs_embeds, labels=labels)
            loss = outputs.loss

            if use_sdf_loss:
                try:
                    sdf_gt = batch.get("sdf", None)
                    if sdf_gt is not None:
                        pred_code = gpt2.generate(inputs_embeds=prefix_emb, max_length=256)[0]
                        sdf_loss = mesh_loss(pred_code, sdf_gt)
                        loss += 0.1 * sdf_loss
                        # wandb.log({"train/sdf_loss": sdf_loss.item()})
                except Exception as e:
                    print("SDF loss failed", e)

            # wandb.log({"train/loss": loss.item(), "epoch": epoch, "step": step})

            if use_feedback:
                pred_code = tokenizer.decode(
                    gpt2.generate(inputs_embeds=prefix_emb, max_length=256)[0],
                    skip_special_tokens=True
                )
                reward = 0.0
                if not is_valid_python(pred_code):
                    reward -= 0.5
                if not is_executable(pred_code):
                    reward -= 0.5
                try:
                    gt_code = batch["code"][0]
                    iou = get_iou_best(gt_code, pred_code)
                    wandb.log({"train/feedback_iou": iou})
                    reward += iou
                except:
                    reward -= 0.5
                if use_ppo:
                    ppo_loss = ppo_trainer.step(images, reward)
                    loss += 0.1 * ppo_loss
                    wandb.log({"train/ppo_loss": ppo_loss.item()})

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            if step % 10 == 0:
                print(f"Epoch {epoch+1} Step {step} Loss: {loss.item():.4f}")

# ------------------- Entry Points -------------------
def train_baseline(train_loader):
    global use_feedback, use_sdf_loss, use_ppo
    use_feedback = False
    use_sdf_loss = False
    use_ppo = False
    train(train_loader)
    torch.save(gpt2.state_dict(), "baseline_gpt2.pth")
    torch.save(proj.state_dict(), "baseline_proj.pth")

def train_enhanced(train_loader):
    global use_feedback, use_sdf_loss, use_ppo
    use_feedback = True
    use_sdf_loss = True
    use_ppo = True
    train(train_loader)
    torch.save(gpt2.state_dict(), "enhanced_gpt2.pth")
    torch.save(proj.state_dict(), "enhanced_proj.pth")

# ------------------- Evaluation -------------------
def generate_code_from_image(image, max_length=256):
    gpt2.eval()
    proj.eval()
    with torch.no_grad():
        processed = clip_processor(images=image, return_tensors="pt").to(device)
        clip_out = clip_model.get_image_features(**processed)
        img_emb = proj(clip_out)
        prefix_emb = img_emb.unsqueeze(1)

        generated = gpt2.generate(
            inputs_embeds=prefix_emb,
            attention_mask=torch.ones(prefix_emb.shape[:-1], dtype=torch.long, device=device),
            max_length=max_length,
            num_beams=5,
            temperature=0.7,
            top_p=0.95,
            pad_token_id=tokenizer.pad_token_id,
        )

        return tokenizer.decode(generated[0], skip_special_tokens=True)

def evaluate_syntax_rate_dataset(dataset):
    generated_codes = {}
    for i, sample in enumerate(dataset):
        image = sample["image"].unsqueeze(0).to(device)
        code = generate_code_from_image(image)
        generated_codes[f"sample_{i}"] = code
    return evaluate_syntax_rate(generated_codes)

def evaluate_iou_dataset(dataset):
    scores = []
    for i, sample in enumerate(dataset):
        image = sample["image"].unsqueeze(0).to(device)
        gt_code = sample["code"]
        pred_code = generate_code_from_image(image)
        try:
            iou = get_iou_best(gt_code, pred_code)
        except Exception:
            iou = 0.0
        scores.append(iou)
    return sum(scores) / len(scores) if scores else 0.0

def evaluate_baseline_vs_enhanced(dataset):
    print("=== [Baseline Model] ===")
    gpt2.load_state_dict(torch.load("baseline_gpt2.pth", map_location=device))
    proj.load_state_dict(torch.load("baseline_proj.pth", map_location=device))
    vsr = evaluate_syntax_rate_dataset(dataset)
    print(f"Valid Syntax Rate: {vsr['vsr']:.3f}")
    iou_score = evaluate_iou_dataset(dataset)
    print(f"Mean IOU: {iou_score:.3f}")

    print("\n=== [Enhanced Model] ===")
    gpt2.load_state_dict(torch.load("enhanced_gpt2.pth", map_location=device))
    proj.load_state_dict(torch.load("enhanced_proj.pth", map_location=device))
    vsr = evaluate_syntax_rate_dataset(dataset)
    print(f"Valid Syntax Rate: {vsr['vsr']:.3f}")
    iou_score = evaluate_iou_dataset(dataset)
    print(f"Mean IOU: {iou_score:.3f}")

In [None]:
train_baseline(train_loader)
train_enhanced(train_loader)
evaluate_baseline_vs_enhanced(test_dataset)


Epoch 1 Step 0 Loss: 9.7545
Epoch 1 Step 10 Loss: 8.1049
Epoch 1 Step 20 Loss: 5.8820
Epoch 1 Step 30 Loss: 4.8052
Epoch 1 Step 40 Loss: 4.8572
Epoch 1 Step 50 Loss: 3.4424
Epoch 1 Step 60 Loss: 3.1419
Epoch 1 Step 70 Loss: 2.7112
Epoch 1 Step 80 Loss: 1.7764
Epoch 1 Step 90 Loss: 1.2486
Epoch 1 Step 100 Loss: 1.1238
Epoch 1 Step 110 Loss: 1.2595
Epoch 1 Step 120 Loss: 0.8214
Epoch 1 Step 130 Loss: 0.5899
Epoch 1 Step 140 Loss: 0.6288
Epoch 1 Step 150 Loss: 0.6511
Epoch 1 Step 160 Loss: 0.9431
Epoch 1 Step 170 Loss: 0.6655
Epoch 1 Step 180 Loss: 0.5017
Epoch 1 Step 190 Loss: 0.5071
Epoch 1 Step 200 Loss: 0.5804
Epoch 1 Step 210 Loss: 0.7913
Epoch 1 Step 220 Loss: 0.5411
Epoch 1 Step 230 Loss: 0.7341
Epoch 1 Step 240 Loss: 0.2968
Epoch 1 Step 250 Loss: 0.5749
Epoch 1 Step 260 Loss: 0.5423
Epoch 1 Step 270 Loss: 0.3573
Epoch 1 Step 280 Loss: 0.4015
Epoch 1 Step 290 Loss: 0.4285
Epoch 1 Step 300 Loss: 0.3992
Epoch 1 Step 310 Loss: 0.5604
Epoch 1 Step 320 Loss: 0.5135
Epoch 1 Step 330 Loss