In [None]:
import cadquery as cq

# Parameters
block_length = 100
block_width = 60
block_height = 10

hole_diameter = 5
hole_spacing_x = 15
hole_spacing_y = 15
holes_x = 5
holes_y = 3

# Create block
block = cq.Workplane("XY").box(block_length, block_width, block_height)

# Hole centres, expressed in the coordinates of the workplane built on the
# top (>Z) face. The first hole sits one spacing in from the
# (-block_length/2, -block_width/2) position and the rest follow at a
# regular pitch along X and Y.
hole_centers = [
    (
        -block_length / 2 + hole_spacing_x + i * hole_spacing_x,
        -block_width / 2 + hole_spacing_y + j * hole_spacing_y,
    )
    for i in range(holes_x)
    for j in range(holes_y)
]

# Add holes in grid.
# All centres are pushed at once so the top face is selected and the
# workplane is built a single time, instead of once per hole.
block = (
    block.faces(">Z")
    .workplane()
    .pushPoints(hole_centers)
    .hole(hole_diameter)
)

# Export to STL
cq.exporters.export(block, 'block_with_holes.stl')


: 

In [2]:
import cadquery as cq

# Parameters
block_length = 100
block_width = 60
block_height = 10

hole_diameter = 5
hole_spacing_x = 15
hole_spacing_y = 15
holes_x = 5
holes_y = 3

# Create block
block = cq.Workplane("XY").box(block_length, block_width, block_height)

# Hole centres, expressed in the coordinates of the workplane built on the
# top (>Z) face. The first hole sits one spacing in from the
# (-block_length/2, -block_width/2) position and the rest follow at a
# regular pitch along X and Y.
hole_centers = [
    (
        -block_length / 2 + hole_spacing_x + i * hole_spacing_x,
        -block_width / 2 + hole_spacing_y + j * hole_spacing_y,
    )
    for i in range(holes_x)
    for j in range(holes_y)
]

# Add holes in grid.
# All centres are pushed at once so the top face is selected and the
# workplane is built a single time, instead of once per hole.
block = (
    block.faces(">Z")
    .workplane()
    .pushPoints(hole_centers)
    .hole(hole_diameter)
)

# Export to STEP
cq.exporters.export(block, 'block_with_holes.step')


In [1]:
!pip install datasets

Successfully installed aiohappyeyeballs-2.6.1 aiohttp-3.12.14 aiosignal-1.4.0 attrs-25.3.0 datasets-4.0.0 dill-0.3.8 filelock-3.18.0 frozenlist-1.7.0 fsspec-2025.3.0 hf-xet-1.1.5 huggingface-hub-0.33.4 multidict-6.6.3 multiprocess-0.70.16 pandas-2.3.1 propcache-0.3.2 pyarrow-21.0.0 pytz-2025.2 tzdata-2025.2 xxhash-3.5.0 yarl-1.20.1


In [5]:
import os
import json
import time
from datasets import load_dataset
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Initialize the LLM using local LM Studio settings
# NOTE(review): `name` is presumably only a label for the runnable, not the
# model identifier sent to the server — confirm whether `model=` was intended.
llm = ChatOpenAI(
    base_url="http://localhost:1234/v1",
    api_key="lm-studio",
    name="local",
    max_tokens=2000,
)

# Output files
OUTPUT_FILE = "rag_dataset_local.jsonl"
PROGRESS_FILE = "progress_local.json"

# Run limits and throttling
MAX_ITEMS = 50000     # processing stops once the dataset index reaches this value
PAUSE_EVERY = 500     # pause whenever the dataset index is a non-zero multiple of this
PAUSE_SECONDS = 120   # length of each pause, in seconds

# Instructions sent to the model with every request
SYSTEM_PROMPT = (
    "You are helping to create a dataset for a CAD assistant.\n"
    "Your task is to read a CadQuery script and write a short, natural-sounding prompt "
    "that a human user might give to get that result.\n\n"
    "**DO NOT** describe the code or its steps.\n"
    "Just write the kind of brief request a user would give. Think of what they're trying to make — "
    "not how it's made.\n"
    "Examples: 'Make a simple logo shape and extrude it.' or 'Design a 3D part with rounded corners.'"
)

# Load train split of the dataset
dataset = load_dataset("CADCODER/GenCAD-Code", split="train")


def _load_start_index(progress_path):
    """Return the dataset index at which processing should resume.

    Reads ``last_processed_index`` from the JSON progress file and returns the
    index after it. Returns 0 when the file does not exist, is empty, or
    cannot be interpreted (invalid JSON, not a JSON object, or a non-numeric
    index).

    Note: the output file is opened in append mode, so restarting from 0
    after a corrupted progress file appends records for indices that were
    already written.
    """
    if not os.path.exists(progress_path):
        return 0
    try:
        with open(progress_path, "r") as f:
            content = f.read().strip()
        progress = json.loads(content) if content else {}
        return progress.get("last_processed_index", -1) + 1
    except (json.JSONDecodeError, AttributeError, TypeError):
        print("⚠️ Corrupted progress file. Restarting from index 0.")
        return 0


def _save_progress(progress_path, index):
    """Record ``index`` as the last processed dataset index.

    The JSON is written to a sibling temporary file and then moved over the
    real progress file with ``os.replace``, so an interruption in the middle
    of the write cannot leave a truncated progress file behind.
    """
    tmp_path = progress_path + ".tmp"
    with open(tmp_path, "w") as f:
        json.dump({"last_processed_index": index}, f)
    os.replace(tmp_path, progress_path)


# Check if progress exists and is valid
start_index = _load_start_index(PROGRESS_FILE)

print(f"🔁 Resuming from index {start_index}")

# Open output file in append mode
with open(OUTPUT_FILE, "a", encoding="utf-8") as out_file:
    for i in range(start_index, len(dataset)):
        if i >= MAX_ITEMS:
            print(f"🚫 Reached limit of {MAX_ITEMS} items. Stopping.")
            break

        try:
            # Pause at every non-zero multiple of PAUSE_EVERY to avoid rate limits
            if i > 0 and i % PAUSE_EVERY == 0:
                print(f"⏳ Waiting for {PAUSE_SECONDS // 60} minutes to avoid rate limits...")
                time.sleep(PAUSE_SECONDS)

            cad_code = dataset[i]["cadquery"]

            response = llm.invoke([
                {"role": "system", "content": SYSTEM_PROMPT},
                {
                    "role": "user",
                    "content": f"What kind of prompt would a user give to create this object?\n\n{cad_code}"
                }
            ])

            user_prompt = response.content.strip()

            # Save result as JSON line
            item = {
                "prompt": user_prompt,
                "code": cad_code.strip()
            }
            out_file.write(json.dumps(item, ensure_ascii=False) + "\n")
            # Push the record out of Python's buffer before progress advances,
            # so the progress file never points past a record that is still
            # only held in memory.
            out_file.flush()

            # Save progress
            _save_progress(PROGRESS_FILE, i)

            print(f"✅ Processed index {i}")

        except Exception as e:
            # Stop at the first failure; progress still points at the last
            # successful index, so the next run retries this one.
            print(f"❌ Error at index {i}: {e}")
            break


🔁 Resuming from index 0
✅ Processed index 0
✅ Processed index 1
✅ Processed index 2
✅ Processed index 3
✅ Processed index 4
✅ Processed index 5
✅ Processed index 6
✅ Processed index 7
✅ Processed index 8
✅ Processed index 9
✅ Processed index 10
✅ Processed index 11
✅ Processed index 12
✅ Processed index 13
✅ Processed index 14
✅ Processed index 15
✅ Processed index 16
✅ Processed index 17
✅ Processed index 18
✅ Processed index 19
✅ Processed index 20
✅ Processed index 21
✅ Processed index 22
✅ Processed index 23
✅ Processed index 24
✅ Processed index 25
✅ Processed index 26
✅ Processed index 27
✅ Processed index 28
✅ Processed index 29
✅ Processed index 30
✅ Processed index 31
✅ Processed index 32
✅ Processed index 33
✅ Processed index 34
✅ Processed index 35
✅ Processed index 36
✅ Processed index 37
✅ Processed index 38
✅ Processed index 39
✅ Processed index 40
✅ Processed index 41


KeyboardInterrupt: 

In [6]:
!pip install pandas




In [None]:
import pandas as pd
import json

# Path to your JSONL file
jsonl_file = "rag_dataset_local.jsonl"

# Read each line and parse JSON.
# Blank lines (e.g. a trailing newline left by manual editing) are skipped
# instead of crashing json.loads.
data = []
with open(jsonl_file, "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            data.append(json.loads(line))

# Create DataFrame
df = pd.DataFrame(data)

# Display the first few rows (last expression, so the notebook renders it as a table)
df.head()


In [9]:
import pandas as pd
import json
import cadquery as cq
import os

# Create output directory
os.makedirs("models", exist_ok=True)

# Load JSONL (blank lines are skipped instead of crashing json.loads)
jsonl_file = "rag_dataset_local.jsonl"
data = []
with open(jsonl_file, "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            data.append(json.loads(line))
df = pd.DataFrame(data)

# Process and export models
for i, row in df.iterrows():
    code_str = row["code"]
    try:
        # SECURITY: exec() runs the stored script with the full privileges of
        # this kernel — the namespace below is NOT a sandbox. Only run this
        # cell on data you trust.
        #
        # A single dict is used as both globals and locals. With two separate
        # dicts the script would run as if it were a class body, and any
        # function it defines could not see the script's own top-level names.
        namespace = {"cq": cq}
        exec(code_str, namespace)

        # Look for objects of type Workplane, in the order the script first
        # assigned them
        workplane_objs = [v for v in namespace.values() if isinstance(v, cq.Workplane)]

        if not workplane_objs:
            print(f"❌ No valid CadQuery object found at index {i}")
            continue

        # Take the most recently introduced Workplane: scripts typically build
        # up to their final result, so the first one is usually an
        # intermediate object.
        # NOTE(review): heuristic — confirm against the dataset's conventions.
        model = workplane_objs[-1]

        # Export to STEP (you can also export to STL)
        export_path = f"models/model_{i}.step"
        cq.exporters.export(model, export_path)
        print(f"✅ Exported model {i} to {export_path}")

    except Exception as e:
        # Keep going: one broken script should not stop the remaining exports
        print(f"❌ Failed at index {i}: {e}")


ModuleNotFoundError: No module named 'cadquery'

In [2]:
import json

# Input files
file1 = "rag_dataset_local.jsonl"
file2 = "rag_dataset_local_50k.jsonl"

# Output file
merged_file = "rag_dataset_merged.jsonl"

# Use a set to avoid duplicates
seen = set()        # canonical JSON strings of every record accepted so far
merged_data = []    # accepted records, in first-seen order


# Function to read and deduplicate
def load_jsonl(file_path):
    """Append the not-yet-seen records of a JSONL file to ``merged_data``.

    Each non-blank line of ``file_path`` is parsed as JSON. A record counts as
    a duplicate when its key-sorted JSON serialisation is already in the
    module-level ``seen`` set, so key order does not matter. New records are
    added to ``seen`` and appended to the module-level ``merged_data`` list.

    Blank lines are skipped. A missing file or a malformed line raises the
    underlying ``open`` / ``json.loads`` exception.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            item = json.loads(line)
            key = json.dumps(item, sort_keys=True)  # Unique identifier for deduplication
            if key not in seen:
                seen.add(key)
                merged_data.append(item)

# Pull in each input file in turn; load_jsonl appends unseen records to merged_data
for source_path in (file1, file2):
    load_jsonl(source_path)

# Serialise the deduplicated records, one JSON document per line
with open(merged_file, "w", encoding="utf-8") as out_handle:
    out_handle.writelines(
        json.dumps(record, ensure_ascii=False) + "\n" for record in merged_data
    )

print(f"✅ Merged {len(merged_data)} unique records into {merged_file}")


FileNotFoundError: [Errno 2] No such file or directory: 'rag_dataset_local.jsonl'

# Variant: pause for 2 minutes after every 10 requests

In [None]:
import os
import json
import time
from datasets import load_dataset
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Initialize the LLM using local LM Studio settings
# NOTE(review): `name` is presumably only a label for the runnable, not the
# model identifier sent to the server — confirm whether `model=` was intended.
llm = ChatOpenAI(
    base_url="http://localhost:1234/v1",
    api_key="lm-studio",
    name="local",
    max_tokens=2000,
)

# Output files
OUTPUT_FILE = "rag_dataset_local.jsonl"
PROGRESS_FILE = "progress_local.json"

# Run limits and throttling
MAX_ITEMS = 50000     # processing stops once the dataset index reaches this value
PAUSE_EVERY = 10      # pause whenever the dataset index is a non-zero multiple of this
PAUSE_SECONDS = 120   # length of each pause, in seconds

# Instructions sent to the model with every request
SYSTEM_PROMPT = (
    "You are helping to create a dataset for a CAD assistant.\n"
    "Your task is to read a CadQuery script and write a short, natural-sounding prompt "
    "that a human user might give to get that result.\n\n"
    "**DO NOT** describe the code or its steps.\n"
    "Just write the kind of brief request a user would give. Think of what they're trying to make — "
    "not how it's made.\n"
    "Examples: 'Make a simple logo shape and extrude it.' or 'Design a 3D part with rounded corners.'"
)

# Load train split of the dataset
dataset = load_dataset("CADCODER/GenCAD-Code", split="train")


def _load_start_index(progress_path):
    """Return the dataset index at which processing should resume.

    Reads ``last_processed_index`` from the JSON progress file and returns the
    index after it. Returns 0 when the file does not exist, is empty, or
    cannot be interpreted (invalid JSON, not a JSON object, or a non-numeric
    index).

    Note: the output file is opened in append mode, so restarting from 0
    after a corrupted progress file appends records for indices that were
    already written.
    """
    if not os.path.exists(progress_path):
        return 0
    try:
        with open(progress_path, "r") as f:
            content = f.read().strip()
        progress = json.loads(content) if content else {}
        return progress.get("last_processed_index", -1) + 1
    except (json.JSONDecodeError, AttributeError, TypeError):
        print("⚠️ Corrupted progress file. Restarting from index 0.")
        return 0


def _save_progress(progress_path, index):
    """Record ``index`` as the last processed dataset index.

    The JSON is written to a sibling temporary file and then moved over the
    real progress file with ``os.replace``, so an interruption in the middle
    of the write cannot leave a truncated progress file behind.
    """
    tmp_path = progress_path + ".tmp"
    with open(tmp_path, "w") as f:
        json.dump({"last_processed_index": index}, f)
    os.replace(tmp_path, progress_path)


# Check if progress exists and is valid
start_index = _load_start_index(PROGRESS_FILE)

print(f"🔁 Resuming from index {start_index}")

# Open output file in append mode
with open(OUTPUT_FILE, "a", encoding="utf-8") as out_file:
    for i in range(start_index, len(dataset)):
        if i >= MAX_ITEMS:
            print(f"🚫 Reached limit of {MAX_ITEMS} items. Stopping.")
            break

        try:
            # Pause at every non-zero multiple of PAUSE_EVERY to avoid rate limits
            if i > 0 and i % PAUSE_EVERY == 0:
                print(f"⏳ Waiting for {PAUSE_SECONDS // 60} minutes to avoid rate limits...")
                time.sleep(PAUSE_SECONDS)

            cad_code = dataset[i]["cadquery"]

            response = llm.invoke([
                {"role": "system", "content": SYSTEM_PROMPT},
                {
                    "role": "user",
                    "content": f"What kind of prompt would a user give to create this object?\n\n{cad_code}"
                }
            ])

            user_prompt = response.content.strip()

            # Save result as JSON line
            item = {
                "prompt": user_prompt,
                "code": cad_code.strip()
            }
            out_file.write(json.dumps(item, ensure_ascii=False) + "\n")
            # Push the record out of Python's buffer before progress advances,
            # so the progress file never points past a record that is still
            # only held in memory.
            out_file.flush()

            # Save progress
            _save_progress(PROGRESS_FILE, i)

            print(f"✅ Processed index {i}")

        except Exception as e:
            # Stop at the first failure; progress still points at the last
            # successful index, so the next run retries this one.
            print(f"❌ Error at index {i}: {e}")
            break
