In [None]:
%pip install datasets huggingface-cli

This dataset is big and not properly formatted for HF. I found it more reliable to first download the dataset with the HF CLI
   ```bash
   # Create datasets directory (it's gitignored)
   mkdir datasets

   # Download the dataset
   huggingface-cli download --repo-type dataset filapro/cad-recode --local-dir datasets/cad-recode
   ```

In [37]:
# Import necessary libraries
from datasets import load_dataset

# Load from local datasets directory with each file as a single entry
dataset = load_dataset(
    "text",  # We're loading text files (Python files are text)
    data_files="datasets/cad-recode/train/batch_00/*.py",  # Full path to Python files
    split="train",
    streaming=True,
    sample_by="document"
)


print("\nLoaded first 10 files:")
# Use take() for IterableDataset which is more efficient than iterating
examples = dataset.take(10)
for example in examples:
    print(example)


Loaded first 10 files:
{'text': "import cadquery as cq\nw0=cq.Workplane('XY',origin=(0,0,-11))\nr=w0.sketch().segment((12,-10),(16,-7)).arc((48,-11),(26,14)).arc((-6,7),(18,-16)).close().assemble().finalize().extrude(-13).union(w0.sketch().segment((-50,-14),(-49,-17)).segment((-43,-14)).segment((-44,-12)).close().assemble().reset().face(w0.sketch().segment((-28,-5),(-26,-8)).segment((-12,-3)).segment((-13,0)).close().assemble()).finalize().extrude(36))"}
{'text': "import cadquery as cq\nw0=cq.Workplane('YZ',origin=(-1,0,0))\nr=w0.workplane(offset=-13/2).moveTo(24,3).cylinder(13,12).union(w0.sketch().segment((-50,-8),(-26,-19)).segment((-26,-26)).segment((30,-26)).segment((30,2)).segment((50,2)).segment((50,4)).segment((30,4)).segment((30,26)).segment((-26,26)).segment((-26,21)).segment((-35,25)).close().assemble().finalize().extrude(15))"}
{'text': "import cadquery as cq\nw0=cq.Workplane('ZX',origin=(0,15,0))\nr=w0.sketch().segment((-49,-7),(-34,-19)).arc((38,-36),(14,33)).segment((14

In [35]:
import modal

app = modal.App("cad-recode-repro")

# Create an image with our requirements
image = (
    modal.Image.debian_slim(python_version="3.11")
    .apt_install(
        "git",
        "libgl1",
        "libglu1-mesa",
        "freeglut3-dev",
        "libxrender1",
        "gmsh",
        "python3-gmsh",
        "xvfb",
        "curl"
    )
    .pip_install("uv")
    .run_commands("uv pip install --system --compile-bytecode trimesh==4.5.3 jupyterlab datasets")
    .run_commands("uv pip install --system --compile-bytecode git+https://github.com/CadQuery/cadquery.git@e99a15df3cf6a88b69101c405326305b5db8ed94")
)

@app.function()
def double_with_modal(x: int) -> int:
    return x + x


@app.function(cpu=10, image=image)
def quadruple(x: int) -> int:
    return double_with_modal.remote(x) + double_with_modal.remote(x)


with app.run():
    print(quadruple.local(100))   # running locally
    print(quadruple.remote(100))  # run remotely

400
400


In [43]:
@app.function(image=image, cpu=10)
def process_cad_files():
    import trimesh
    import numpy as np
    
    def mesh_to_point_cloud(mesh, n_points=256):
        vertices, faces = trimesh.sample.sample_surface(mesh, n_points)
        point_cloud = np.concatenate((
            np.asarray(vertices),
            mesh.face_normals[faces]
        ), axis=1)
        ids = np.lexsort((point_cloud[:, 0], point_cloud[:, 1], point_cloud[:, 2]))
        point_cloud = point_cloud[ids]
        return point_cloud

    def py_string_to_mesh(py_string):
        exec(py_string, globals())
        compound = globals()['r'].val()
        vertices, faces = compound.tessellate(0.001, 0.1)
        mesh = trimesh.Trimesh([(v.x, v.y, v.z) for v in vertices], faces)
        mesh.apply_translation(-(mesh.bounds[0] + mesh.bounds[1]) / 2.0)
        mesh.apply_scale(2.0 / max(mesh.extents))
        return mesh


    # Execute all examples in sequence and process the final result
    results = []
    example_texts = []
    try:
        # Extract just the text from the examples
        example_texts = [ex['text'] for ex in examples]
        print("example_texts", example_texts)
        
        # Execute the setup code first
        for setup_code in example_texts[:2]:
            exec(setup_code, globals())
        
        # Execute the final piece that creates 'r'
        exec(example_texts[2], globals())
        
        # Now process the result
        mesh = py_string_to_mesh(example_texts[2])
        point_cloud = mesh_to_point_cloud(mesh)
        results.append({
            'code': '\n'.join(example_texts),
            'point_cloud_shape': point_cloud.shape,
            'point_cloud': point_cloud.tolist()  # Convert to list for JSON serialization
        })
    except Exception as e:
        results.append({
            'code': '\n'.join(example_texts),
            'error': str(e)
        })
    return results

# Run the function
with app.run():
    results = process_cad_files.remote()
    for r in results:
        print("\nComplete Code:")
        print(r['code'])
        if 'point_cloud_shape' in r:
            print("\nPoint cloud shape:", r['point_cloud_shape'])
            print("First few points:", r['point_cloud'][:2])  # Show first 2 points
        else:
            print("\nError:", r['error'])




Complete Code:


Error: /Users/raymondweitekamp/Documents/GitHub/cad-recode-repro/datasets/cad-recode/train/batch_00/0.py


note: cadquery and mac silicon still don't get along unfortunately...

In [None]:
%pip install git+https://github.com/CadQuery/cadquery.git@e99a15df3cf6a88b69101c405326305b5db8ed94
%pip install trimesh 

In [23]:
import trimesh
import numpy as np

def mesh_to_point_cloud(mesh, n_points=256):
    """Convert a mesh to a point cloud with normals"""
    vertices, faces = trimesh.sample.sample_surface(mesh, n_points)
    point_cloud = np.concatenate((
        np.asarray(vertices),
        mesh.face_normals[faces]
    ), axis=1)
    # Sort points for consistency
    ids = np.lexsort((point_cloud[:, 0], point_cloud[:, 1], point_cloud[:, 2]))
    point_cloud = point_cloud[ids]
    return point_cloud

def py_string_to_mesh(py_string):
    """Convert CadQuery Python code to a mesh"""
    # Execute the full, multiline Python string
    exec(py_string, globals())
    compound = globals()['r'].val()
    
    # Convert to mesh
    vertices, faces = compound.tessellate(0.001, 0.1)
    mesh = trimesh.Trimesh([(v.x, v.y, v.z) for v in vertices], faces)
    
    # Center and normalize the mesh
    mesh.apply_translation(-(mesh.bounds[0] + mesh.bounds[1]) / 2.0)
    mesh.apply_scale(2.0 / max(mesh.extents))
    
    return mesh

# Load from local datasets directory
dataset = load_dataset(
    "text",  # We're loading text files
    data_files="datasets/cad-recode/train/batch_00/*.py",  # Full path to Python files
    split="train",
    streaming=True
)

# Look at the examples and convert to point clouds
print("\nSample examples:")
for example in dataset.take(3):
    print("\nOriginal Python code:")
    print(example['text'])
    
    try:
        # Convert Python code to mesh to point cloud using the multiline string as-is
        mesh = py_string_to_mesh(example['text'])
        point_cloud = mesh_to_point_cloud(mesh)
        print("\nPoint cloud shape:", point_cloud.shape)
    except Exception as e:
        print("\nError converting to point cloud:", str(e))


Sample examples:

Original Python code:
import cadquery as cq

Error converting to point cloud: dlopen(/Users/raymondweitekamp/Documents/GitHub/cad-recode-repro/venv/lib/python3.11/site-packages/OCP/OCP.cpython-311-darwin.so, 0x0002): Library not loaded: @rpath/libvtkWrappingPythonCore3.11-9.2.dylib
  Referenced from: <CCC3E9CA-0ECE-3AA4-8D7F-735550B5EF82> /Users/raymondweitekamp/Documents/GitHub/cad-recode-repro/venv/lib/python3.11/site-packages/OCP/OCP.cpython-311-darwin.so
  Reason: tried: '/Users/raymondweitekamp/Documents/GitHub/cad-recode-repro/venv/lib/python3.11/site-packages/OCP/../vtkmodules/.dylibs/libvtkWrappingPythonCore3.11-9.2.dylib' (no such file), '/Users/raymondweitekamp/Documents/GitHub/cad-recode-repro/venv/lib/python3.11/site-packages/OCP/../vtkmodules/.dylibs/libvtkWrappingPythonCore3.11-9.2.dylib' (no such file), '/Users/raymondweitekamp/.pyenv/versions/3.11.7/lib/libvtkWrappingPythonCore3.11-9.2.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/Users

if you see something like this, you're on apple silicon and not going to have fun. 

```
Error converting to point cloud: dlopen(/path/to/venv/lib/python3.11/site-packages/OCP/OCP.cpython-311-darwin.so, 0x0002): Library not loaded: @rpath/libvtkWrappingPythonCore3.11-9.2.dylib
  Referenced from: <CCC3E9CA-0ECE-3AA4-8D7F-735550B5EF82> /path/to/venv/lib/python3.11/site-packages/OCP/OCP.cpython-311-darwin.so
  Reason: tried: '/path/to/venv/lib/python3.11/site-packages/OCP/../vtkmodules/.dylibs/libvtkWrappingPythonCore3.11-9.2.dylib' (no such file), '/path/to/venv/lib/python3.11/site-packages/OCP/../vtkmodules/.dylibs/libvtkWrappingPythonCore3.11-9.2.dylib' (no such file), '/path/to/python/3.11.7/lib/libvtkWrappingPythonCore3.11-9.2.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/path/to/python/3.11.7/lib/libvtkWrappingPythonCore3.11-9.2.dylib' (no such file), '/opt/homebrew/lib/libvtkWrappingPythonCore3.11-9.2.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/lib/libvtkWrappingPythonCore3.11-9.2.dylib' (no such file), '/usr/local/lib/libvtkWrappingPythonCore3.11-9.2.dylib' (no such file), '/usr/lib/libvtkWrappingPythonCore3.11-9.2.dylib' (no such file, not in dyld cache
```

In [None]:
from datasets import Dataset, DatasetDict
import json



# Create a Hugging Face Dataset from the list of dictionaries
dataset = Dataset.from_list(data)

# Wrap your dataset in a DatasetDict (useful if you plan on having multiple splits)
dataset_dict = DatasetDict({
    "train": dataset,
})

# Optionally, save it locally (debug or offline version)
dataset_dict.save_to_disk("cadquery_dataset")

# Push the dataset to the Hugging Face Hub (replace "your_username" with your actual HF username)
dataset_dict.push_to_hub("rawwerks/test_upload_dataset", private=False)