In [1]:
!rm -rf /content/huggingface_cache

In [None]:
!pip uninstall -y datasets
!pip install git+https://github.com/huggingface/datasets.git

In [None]:
!pip install transformers cadquery trimesh ipywidgets accelerate

In [None]:
# Install dependencies
!pip install transformers datasets accelerate pillow --quiet

In [27]:
import torch
from torch.utils.data import DataLoader
from transformers import VisionEncoderDecoderModel, AutoProcessor, AutoTokenizer
from datasets import load_dataset
from torchvision import transforms
from PIL import Image
import matplotlib.pyplot as plt


In [None]:
from datasets import load_dataset

# Load splits
dataset = load_dataset("CADCODER/GenCAD-Code", split=["train", "test"])
train_dataset, test_dataset = dataset

# Sample
print("Train size:", len(train_dataset))
print("Test size:", len(test_dataset))
print(train_dataset[0])  # One image + one CadQuery code


In [25]:
from transformers import CLIPProcessor, CLIPVisionModel, GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, VisionEncoderDecoderModel

# Processor
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Encoder (CLIP Vision only)
encoder = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")

# GPT2 with cross-attention enabled
gpt2_config = GPT2Config.from_pretrained("gpt2")
gpt2_config.add_cross_attention = True
decoder = GPT2LMHeadModel.from_pretrained("gpt2", config=gpt2_config)

# Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token


Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['transformer.h.0.crossattention.c_attn.bias', 'transformer.h.0.crossattention.c_attn.weight', 'transformer.h.0.crossattention.c_proj.bias', 'transformer.h.0.crossattention.c_proj.weight', 'transformer.h.0.crossattention.q_attn.bias', 'transformer.h.0.crossattention.q_attn.weight', 'transformer.h.0.ln_cross_attn.bias', 'transformer.h.0.ln_cross_attn.weight', 'transformer.h.1.crossattention.c_attn.bias', 'transformer.h.1.crossattention.c_attn.weight', 'transformer.h.1.crossattention.c_proj.bias', 'transformer.h.1.crossattention.c_proj.weight', 'transformer.h.1.crossattention.q_attn.bias', 'transformer.h.1.crossattention.q_attn.weight', 'transformer.h.1.ln_cross_attn.bias', 'transformer.h.1.ln_cross_attn.weight', 'transformer.h.10.crossattention.c_attn.bias', 'transformer.h.10.crossattention.c_attn.weight', 'transformer.h.10.crossattention.c_proj.bias', 'transformer.h.10.cros

In [26]:
model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder)

model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.vocab_size = model.config.decoder.vocab_size
model.config.max_length = 256
model.config.no_repeat_ngram_size = 2
model.config.length_penalty = 2.0
model.config.num_beams = 4
model.config.early_stopping = True


In [27]:
def preprocess(example):
    image_inputs = clip_processor(images=example["image"], return_tensors="pt")
    tokenized_code = tokenizer(
        example["cadquery"],
        padding="max_length",
        truncation=True,
        max_length=256,
        return_tensors="pt"
    )

    return {
        "pixel_values": image_inputs["pixel_values"].squeeze(),
        "input_ids": tokenized_code["input_ids"].squeeze(),
        "attention_mask": tokenized_code["attention_mask"].squeeze()
    }

# Small subset for training
train_small = train_dataset.select(range(1000))
test_small = test_dataset.select(range(200))

train_data = train_small.map(preprocess, remove_columns=train_dataset.column_names)
test_data = test_small.map(preprocess, remove_columns=test_dataset.column_names)


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [28]:
import torch
from torch.utils.data import Dataset, DataLoader

class CadQueryDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        return {
            "pixel_values": item["pixel_values"],
            "labels": item["input_ids"],
            "attention_mask": item["attention_mask"]
        }

# Collate function fix
def collate_fn(batch):
    pixel_values = torch.stack([torch.tensor(item["pixel_values"]) for item in batch])
    labels = torch.stack([torch.tensor(item["labels"]) for item in batch])
    attention_mask = torch.stack([torch.tensor(item["attention_mask"]) for item in batch])

    return {
        "pixel_values": pixel_values,
        "labels": labels,
        "attention_mask": attention_mask
    }

train_loader = DataLoader(CadQueryDataset(train_data), batch_size=4, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(CadQueryDataset(test_data), batch_size=4, collate_fn=collate_fn)


In [29]:
from torch import optim
from tqdm import tqdm

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Optimizer
optimizer = optim.AdamW(model.parameters(), lr=5e-5)

# Training function
def train_one_epoch(model, dataloader, optimizer, epoch):
    model.train()
    total_loss = 0
    loop = tqdm(dataloader, desc=f"Epoch {epoch}")

    for batch in loop:
        pixel_values = batch["pixel_values"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(pixel_values=pixel_values, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()
        loop.set_postfix(loss=loss.item())

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch} average loss: {avg_loss:.4f}")


In [30]:
for epoch in range(1, 3):  # 2 epochs
    train_one_epoch(model, train_loader, optimizer, epoch)


Epoch 1:   0%|          | 0/250 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
Epoch 1: 100%|██████████| 250/250 [03:23<00:00,  1.23it/s, loss=0.785]


✅ Epoch 1 average loss: 1.1304


Epoch 2: 100%|██████████| 250/250 [03:25<00:00,  1.22it/s, loss=0.458]

✅ Epoch 2 average loss: 0.5482





In [37]:
import sys
sys.path.append('/content/mecagent-technical-test')  # Add repo root to Python path

from metrics.valid_syntax_rate import evaluate_syntax_rate_simple

# Format predictions
code_dict = {f"sample_{i}": code for i, code in enumerate(generated_codes)}

# Evaluate VSR
vsr = evaluate_syntax_rate_simple(code_dict)
print(f"\nValid Syntax Rate: {vsr:.2%}")



✅ Valid Syntax Rate: 0.00%


In [38]:
# Extract code strings for pretraining
code_only_dataset = train_dataset.select(range(10000))  # Try with 10K for now
cadquery_texts = [item["cadquery"] for item in code_only_dataset]


In [39]:
from datasets import Dataset

# Create Hugging Face Dataset with only 'text'
code_ds = Dataset.from_dict({"text": cadquery_texts})


In [40]:
def tokenize_code(example):
    return tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=256
    )

lm_dataset = code_ds.map(tokenize_code, batched=True)


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [42]:
# Use fine-tuned decoder in encoder-decoder
model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder)
