## ENV SETUP

1. Install uv (or do it you're own way)
2. Run `uv sync`
3. Run `source .venv/bin/activate`

You're good to go.

# Instructions

The Task : Create the best CadQuery code generator model. 

1. Load the dataset (147K pairs of Images/CadQuery code).
2. Create a baseline model and evaluate it with the given metrics.
3. Enhance by any manner the baseline model and evaluate it again.
4. Explain you choices and possible bottlenecks. 
5. Show what enhancements you would have done if you had more time.

You can do *WHATEVER* you want, be creative, result is not what matters the most. 
Creating new model architectures, reusing ones you used in the past, fine-tuning, etc...

If you are GPU poor, there are solutions. Absolute value is not what matters, relative value between baseline and enhanced model is what matters.

In [9]:
from datasets import load_dataset
ds = load_dataset("CADCODER/GenCAD-Code", num_proc=16, split=["train", "test"])

## Evaluation Metrics

1. Valid Syntax Rate metric assess the validity of the code by executing and checking if error are returned.
2. Best IOU assess the similarity between the meshes generated by the code.

In [2]:
from metrics.valid_syntax_rate import evaluate_syntax_rate_simple
from metrics.best_iou import get_iou_best

In [3]:
## Example usage of the metrics
sample_code = """
height = 60.0
width = 80.0
thickness = 10.0
diameter = 22.0

# make the base
result = (
    cq.Workplane("XY")
    .box(height, width, thickness)
)
"""

sample_code_2 = """
 height = 60.0
 width = 80.0
 thickness = 10.0
 diameter = 22.0
 padding = 12.0

 # make the base
 result = (
     cq.Workplane("XY")
     .box(height, width, thickness)
     .faces(">Z")
     .workplane()
     .hole(diameter)
     .faces(">Z")
     .workplane()
     .rect(height - padding, width - padding, forConstruction=True)
     .vertices()
     .cboreHole(2.4, 4.4, 2.1)
 )
"""
codes = {
    "sample_code": sample_code,
    "sample_code_2": sample_code_2,
}
vsr = evaluate_syntax_rate_simple(codes)
print("Valid Syntax Rate:", vsr)
iou = get_iou_best(sample_code, sample_code_2)
print("IOU:", iou)

Valid Syntax Rate: 1.0
IOU: 0.5834943417057687


## Have Fun

# Experiments.
## Actual solution is in base_model.ipynb


In [None]:
# 1. Install dependencies (if not done)
# !pip install datasets transformers torch torchvision cadquery

# 2. Load the dataset
from datasets import load_dataset
ds = load_dataset("CADCODER/GenCAD-Code", num_proc=4, split={"train": "train[:1000]", "test": "test[:100]"}, cache_dir="/tmp/hf_cache")

# 3. Preprocess dataset
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import io

from torchvision import transforms

class ImageCodeDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=256):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),  # CLIP expects 224x224
            transforms.ToTensor(),          # Converts to [C, H, W] in range [0,1]
        ])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        img_data = item["image"]

        # Load image
        if isinstance(img_data, dict) and "bytes" in img_data:
            image = Image.open(io.BytesIO(img_data["bytes"])).convert("RGB")
        elif isinstance(img_data, str):
            image = Image.open(img_data).convert("RGB")
        elif isinstance(img_data, Image.Image):
            image = img_data.convert("RGB")
        else:
            raise ValueError(f"Unknown image format: {type(img_data)}")

        image_tensor = self.transform(image)  # Convert to tensor

        # Use correct field name for CAD code
        code = item["cadquery"]
        code_ids = self.tokenizer(code, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt")

        return {
            "image": image_tensor,
            "input_ids": code_ids["input_ids"].squeeze(0),
            "attention_mask": code_ids["attention_mask"].squeeze(0),
            "code": code
        }

# 4. Load models and tokenizer
import torch
from torch import nn
from transformers import CLIPProcessor, CLIPModel, GPT2Tokenizer, GPT2LMHeadModel

device = "cuda" if torch.cuda.is_available() else "cpu"

clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 doesn't have pad token by default

gpt2 = GPT2LMHeadModel.from_pretrained("gpt2").to(device)

# Projection from CLIP -> GPT2 embedding
class ClipToGPT2(nn.Module):
    def __init__(self, clip_dim, gpt2_dim):
        super().__init__()
        self.proj = nn.Linear(clip_dim, gpt2_dim)
    def forward(self, x):
        return self.proj(x)

proj = ClipToGPT2(clip_model.config.projection_dim, gpt2.config.n_embd).to(device)

# 5. Prepare data
train_dataset = ImageCodeDataset(ds["train"], tokenizer)
test_dataset = ImageCodeDataset(ds["test"], tokenizer)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1)
print(ds["train"][0].keys())


# 6. Training
from torch.optim import AdamW

gpt2.train()
clip_model.eval()
proj.train()

optimizer = AdamW(list(gpt2.parameters()) + list(proj.parameters()), lr=5e-5)

for epoch in range(2):  # You can increase to 5–10
    for step, batch in enumerate(train_loader):
        images = batch["image"]
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # Encode images using CLIP
        with torch.no_grad():
            processed = clip_processor(images=images, return_tensors="pt", padding=True).to(device)
            clip_out = clip_model.get_image_features(**processed)

        img_emb = proj(clip_out)  # [B, gpt2_dim]
        prefix_emb = img_emb.unsqueeze(1)  # [B, 1, gpt2_dim]

        code_emb = gpt2.transformer.wte(input_ids)  # [B, L, gpt2_dim]
        inputs_embeds = torch.cat([prefix_emb, code_emb[:, :-1, :]], dim=1)

        # Adjust labels: ignore prefix in loss
        labels = input_ids.clone()
        labels[:, 0] = -100  # Ignore loss for prefix

        outputs = gpt2(inputs_embeds=inputs_embeds, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if step % 10 == 0:
            print(f"Epoch {epoch+1} Step {step} Loss: {loss.item():.4f}")

# 7. Inference function
def generate_code_from_image(image: Image.Image, max_length=256):
    gpt2.eval()
    proj.eval()
    with torch.no_grad():
        processed = clip_processor(images=image, return_tensors="pt").to(device)
        clip_out = clip_model.get_image_features(**processed)
        img_emb = proj(clip_out)  # [1, gpt2_dim]
        prefix_emb = img_emb.unsqueeze(1)  # [1, 1, gpt2_dim]
        
        # Generate empty input
        generated = gpt2.generate(
            inputs_embeds=prefix_emb,
            max_length=max_length,
            num_beams=1,
            do_sample=True,
            temperature=0.7,
            top_p=0.95,
            pad_token_id=tokenizer.eos_token_id,
        )
        code = tokenizer.decode(generated[0], skip_special_tokens=True)
        return code

# 8. Try on a test sample
sample = test_dataset[0]
print("\n=== Ground Truth Code ===")
print(sample["code"])

generated_code = generate_code_from_image(sample["image"])
print("\n=== Generated Code ===")
print(generated_code)


## EValuation

# geometry render feeback

In [69]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from transformers import (
    CLIPProcessor, CLIPModel,
    GPT2Tokenizer, GPT2LMHeadModel,
    AdamW, get_linear_schedule_with_warmup
)
from cadquery import exporters
from metrics.best_iou import get_iou_best
import ast

# ------------------- Models -------------------
class ClipToGPT2Improved(nn.Module):
    def __init__(self, clip_dim, gpt2_dim):
        super().__init__()
        self.proj = nn.Sequential(
            nn.Linear(clip_dim, 4 * gpt2_dim),
            nn.ReLU(),
            nn.Linear(4 * gpt2_dim, gpt2_dim)
        )

    def forward(self, x):
        return self.proj(x)

# ------------------- Utilities -------------------
def is_valid_python(code):
    try:
        ast.parse(code)
        return True
    except:
        return False

def is_executable(code):
    try:
        exec_globals = {}
        exec(code, exec_globals)
        return exec_globals.get("result", None) is not None
    except Exception:
        return False

# ------------------- Load Models -------------------
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

gpt2 = GPT2LMHeadModel.from_pretrained("gpt2")
gpt2.config.pad_token_id = tokenizer.pad_token_id

proj = ClipToGPT2Improved(clip_model.config.projection_dim, gpt2.config.n_embd)

# ------------------- Device Setup -------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gpt2.to(device)
clip_model.to(device)
proj.to(device)

# ------------------- Optimizer -------------------
optimizer = AdamW(list(gpt2.parameters()) + list(proj.parameters()), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=1000)

# ------------------- Training Loop -------------------
def train(train_loader):
    gpt2.train()
    proj.train()

    for epoch in range(5):
        for step, batch in enumerate(train_loader):
            images = batch["image"].to(device)
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            with torch.no_grad():
                processed = clip_processor(images=images, return_tensors="pt", padding=True).to(device)
                clip_out = clip_model.get_image_features(**processed)

            img_emb = proj(clip_out)
            prefix_emb = img_emb.unsqueeze(1)  # [B, 1, D]

            code_emb = gpt2.transformer.wte(input_ids)
            inputs_embeds = torch.cat([prefix_emb, code_emb[:, :-1, :]], dim=1)

            labels = input_ids.clone()
            labels[:, 0] = -100

            outputs = gpt2(inputs_embeds=inputs_embeds, labels=labels)
            loss = outputs.loss

            # -- Feedback loop --
            if step % 10 == 0:
                gen_ids = gpt2.generate(
                    inputs_embeds=prefix_emb,
                    attention_mask=torch.ones(prefix_emb.shape[:-1], dtype=torch.long, device=device),
                    max_length=256,
                    do_sample=True,
                    temperature=0.7,
                    top_p=0.95,
                    pad_token_id=tokenizer.pad_token_id
                )

                code_str = tokenizer.decode(gen_ids[0], skip_special_tokens=True)

                if not is_valid_python(code_str):
                    loss += 0.5

                if not is_executable(code_str):
                    loss += 0.5

                try:
                    gt_code = batch["code"][0]  # batch size 1 assumed for eval
                    iou = get_iou_best(gt_code, code_str)
                    if iou < 0.7:
                        loss += 0.5
                except Exception:
                    loss += 0.5

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            if step % 10 == 0:
                print(f"Epoch {epoch+1} Step {step} Loss: {loss.item():.4f}")

# ------------------- Inference -------------------
def generate_code_from_image(image, max_length=256):
    gpt2.eval()
    proj.eval()
    with torch.no_grad():
        processed = clip_processor(images=image, return_tensors="pt").to(device)
        clip_out = clip_model.get_image_features(**processed)
        img_emb = proj(clip_out)
        prefix_emb = img_emb.unsqueeze(1)

        generated = gpt2.generate(
            inputs_embeds=prefix_emb,
            attention_mask=torch.ones(prefix_emb.shape[:-1], dtype=torch.long, device=device),
            max_length=max_length,
            do_sample=True,
            temperature=0.7,
            top_p=0.95,
            pad_token_id=tokenizer.pad_token_id,
        )

        return tokenizer.decode(generated[0], skip_special_tokens=True)


In [70]:
train(train_loader)


Epoch 1 Step 0 Loss: 10.0131
Epoch 1 Step 10 Loss: 9.3295
Epoch 1 Step 20 Loss: 7.3636
Epoch 1 Step 30 Loss: 7.0372
Epoch 1 Step 40 Loss: 5.8355
Epoch 1 Step 50 Loss: 5.1651
Epoch 1 Step 60 Loss: 4.8483
Epoch 1 Step 70 Loss: 3.7347
Epoch 1 Step 80 Loss: 3.3575
Epoch 1 Step 90 Loss: 2.7997
Epoch 1 Step 100 Loss: 2.2731
Epoch 1 Step 110 Loss: 2.2724
Epoch 1 Step 120 Loss: 2.0882
Epoch 1 Step 130 Loss: 2.5202
Epoch 1 Step 140 Loss: 2.3015
Epoch 1 Step 150 Loss: 2.0852
Epoch 1 Step 160 Loss: 2.1079
Epoch 1 Step 170 Loss: 2.4301
Epoch 1 Step 180 Loss: 2.0459
Epoch 1 Step 190 Loss: 2.0023
Epoch 1 Step 200 Loss: 2.0316
Epoch 1 Step 210 Loss: 1.9034
Epoch 1 Step 220 Loss: 2.7380
Epoch 1 Step 230 Loss: 2.4274
Epoch 1 Step 240 Loss: 1.7835
Epoch 2 Step 0 Loss: 1.9063
Epoch 2 Step 10 Loss: 2.1494
Epoch 2 Step 20 Loss: 2.3537
Epoch 2 Step 30 Loss: 1.7930
Epoch 2 Step 40 Loss: 1.8439
Epoch 2 Step 50 Loss: 2.2492
Epoch 2 Step 60 Loss: 1.9673
Epoch 2 Step 70 Loss: 2.6388
Epoch 2 Step 80 Loss: 1.9775




Epoch 2 Step 230 Loss: 2.2525
Epoch 2 Step 240 Loss: 2.4427
Epoch 3 Step 0 Loss: 2.1474
Epoch 3 Step 10 Loss: 2.0802
Epoch 3 Step 20 Loss: 2.1448
Epoch 3 Step 30 Loss: 1.9489
Epoch 3 Step 40 Loss: 1.9981
Epoch 3 Step 50 Loss: 2.1860
Epoch 3 Step 60 Loss: 2.4666
Epoch 3 Step 70 Loss: 2.1745
Epoch 3 Step 80 Loss: 1.9184
Epoch 3 Step 90 Loss: 1.6972
Epoch 3 Step 100 Loss: 1.9227
Epoch 3 Step 110 Loss: 2.2501
Epoch 3 Step 120 Loss: 1.7316
Epoch 3 Step 130 Loss: 1.8896
Epoch 3 Step 140 Loss: 1.7910
Epoch 3 Step 150 Loss: 2.2076
Epoch 3 Step 160 Loss: 1.8964
Epoch 3 Step 170 Loss: 2.1147
Epoch 3 Step 180 Loss: 1.8996
Epoch 3 Step 190 Loss: 2.0661
Epoch 3 Step 200 Loss: 2.2954
Epoch 3 Step 210 Loss: 1.8461
Epoch 3 Step 220 Loss: 2.2627
Epoch 3 Step 230 Loss: 1.8993
Epoch 3 Step 240 Loss: 2.0723
Epoch 4 Step 0 Loss: 1.8321
Epoch 4 Step 10 Loss: 2.3393
Epoch 4 Step 20 Loss: 2.1764
Epoch 4 Step 30 Loss: 1.9032
Epoch 4 Step 40 Loss: 1.9255
Epoch 4 Step 50 Loss: 1.8957
Epoch 4 Step 60 Loss: 2.0152

In [71]:
# 9. Evaluate on the test set using provided metrics

from metrics.valid_syntax_rate import evaluate_syntax_rate_simple
from metrics.best_iou import get_iou_best

# Generate code for all test images
pred_codes = {}
gt_codes = {}

for i in range(len(test_dataset)):
    sample = test_dataset[i]
    gt_codes[str(i)] = sample["code"]
    # Generate code from image
    gen_code = generate_code_from_image(sample["image"])
    pred_codes[str(i)] = gen_code

# 1. Valid Syntax Rate
vsr = evaluate_syntax_rate_simple(pred_codes)
print(f"\nValid Syntax Rate: {vsr:.3f}")

# 2. Mean IOU
ious = []
for i in gt_codes:
    try:
        iou = get_iou_best(gt_codes[i], pred_codes[i])
        ious.append(iou)
    except Exception as e:
        print(f"Sample {i}: IOU computation failed ({e})")
mean_iou = sum(ious) / len(ious) if ious else 0.0
print(f"Mean IOU: {mean_iou:.3f}")


Valid Syntax Rate: 0.000
Sample 0: IOU computation failed (Error executing script unknown: invalid decimal literal (<string>, line 2))
Sample 1: IOU computation failed (Error executing script unknown: invalid syntax (<string>, line 1))
Sample 2: IOU computation failed (Error executing script unknown: invalid syntax (<string>, line 1))
Sample 3: IOU computation failed (Error executing script unknown: invalid syntax (<string>, line 1))
Sample 4: IOU computation failed (Error executing script unknown: invalid decimal literal (<string>, line 3))
Sample 5: IOU computation failed (Error executing script unknown: invalid syntax (<string>, line 1))
Sample 6: IOU computation failed (Error executing script unknown: invalid syntax (<string>, line 1))
Sample 7: IOU computation failed (Error executing script unknown: invalid syntax (<string>, line 1))
Sample 8: IOU computation failed (Error executing script unknown: invalid decimal literal (<string>, line 2))
Sample 9: IOU computation failed (Erro