In [2]:
import cadquery as cq

# Parameters
block_length = 100
block_width = 60
block_height = 10

hole_diameter = 5
hole_spacing_x = 15
hole_spacing_y = 15
holes_x = 5
holes_y = 3

# Create block
block = cq.Workplane("XY").box(block_length, block_width, block_height)

# Add holes in grid
for i in range(holes_x):
    for j in range(holes_y):
        x = -block_length / 2 + hole_spacing_x + i * hole_spacing_x
        y = -block_width / 2 + hole_spacing_y + j * hole_spacing_y
        block = block.faces(">Z").workplane().pushPoints([(x, y)]).hole(hole_diameter)

# Export to STEP
cq.exporters.export(block, 'block_with_holes.step')


In [18]:
import os
import json
import time
from datasets import load_dataset
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv

from langchain_community.chat_models import ChatOllama

llm = ChatOllama(
    model="nchapman/ministral-8b-instruct-2410:8b",  # or any model you pulled (e.g., mistral, gemma)
    base_url="http://localhost:11434"
)


# # Load environment variables
# load_dotenv()

# # Initialize the LLM using local LM Studio settings
# llm = ChatOpenAI(
#     base_url="http://localhost:1234/v1",
#     api_key="lm-studio",
#     name="local",
#     max_tokens=2000,
# )

# Output files
OUTPUT_FILE = "rag_dataset_local.jsonl"
PROGRESS_FILE = "progress_local.json"

# Load train split of the dataset
dataset = load_dataset("CADCODER/GenCAD-Code", split="train")

# Check if progress exists and is valid
if os.path.exists(PROGRESS_FILE):
    try:
        with open(PROGRESS_FILE, "r") as f:
            content = f.read().strip()
            progress = json.loads(content) if content else {}
        start_index = progress.get("last_processed_index", -1) + 1
    except json.JSONDecodeError:
        print("⚠️ Corrupted progress file. Restarting from index 0.")
        start_index = 0
else:
    start_index = 0

print(f"🔁 Resuming from index {start_index}")

# Open output file in append mode
with open(OUTPUT_FILE, "a", encoding="utf-8") as out_file:
    for i in range(start_index, len(dataset)):
        if i >= 50000:
            print("🚫 Reached limit of 50000 items. Stopping.")
            break

        try:
            # Wait for 2 minutes every 500 iterations to avoid rate limits
            if i > 0 and i % 500 == 0:
                print("⏳ Waiting for 2 minutes to avoid rate limits...")
                time.sleep(120)

            cad_code = dataset[i]["cadquery"]

            response = llm.invoke([
                {
                    "role": "system",
                    "content": (
                        "You are helping to create a dataset for a CAD assistant.\n"
                        "Your task is to read a CadQuery script and write a short, natural-sounding prompt "
                        "that a human user might give to get that result.\n\n"
                        "**DO NOT** describe the code or its steps.\n"
                        "Just write the kind of brief request a user would give. Think of what they're trying to make — "
                        "not how it's made.\n"
                        "Examples: 'Make a simple logo shape and extrude it.' or 'Design a 3D part with rounded corners.'"
                    )
                },
                {
                    "role": "user",
                    "content": f"What kind of prompt would a user give to create this object?\n\n{cad_code}"
                }
            ])

            user_prompt = response.content.strip()

            # Save result as JSON line
            item = {
                "prompt": user_prompt,
                "code": cad_code.strip()
            }
            out_file.write(json.dumps(item, ensure_ascii=False) + "\n")

            # Save progress
            with open(PROGRESS_FILE, "w") as f:
                json.dump({ "last_processed_index": i }, f)

            print(f"✅ Processed index {i}")

        except Exception as e:
            print(f"❌ Error at index {i}: {e}")
            break


🔁 Resuming from index 87
✅ Processed index 87
✅ Processed index 88
✅ Processed index 89
✅ Processed index 90
✅ Processed index 91
✅ Processed index 92
✅ Processed index 93
✅ Processed index 94
✅ Processed index 95
✅ Processed index 96
✅ Processed index 97
✅ Processed index 98
✅ Processed index 99
✅ Processed index 100
✅ Processed index 101
✅ Processed index 102
✅ Processed index 103
✅ Processed index 104
✅ Processed index 105
✅ Processed index 106
✅ Processed index 107
✅ Processed index 108
✅ Processed index 109
✅ Processed index 110
✅ Processed index 111
✅ Processed index 112
✅ Processed index 113
✅ Processed index 114
✅ Processed index 115
✅ Processed index 116
✅ Processed index 117
✅ Processed index 118
✅ Processed index 119
✅ Processed index 120
✅ Processed index 121
✅ Processed index 122
✅ Processed index 123
✅ Processed index 124
✅ Processed index 125
✅ Processed index 126
✅ Processed index 127
✅ Processed index 128
✅ Processed index 129
✅ Processed index 130
✅ Processed index 13

KeyboardInterrupt: 

In [15]:
!pip install langchain-community




3176.70s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Collecting langchain-community
  Using cached langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Using cached dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Using cached pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Using cached httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Using cached marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Using cached typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting typing-inspection>=0.4.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Using cached typing_inspection-0.4.1-py3-none-any.whl.

In [None]:
import pandas as pd
import json

# Path to your JSONL file
jsonl_file = "rag_dataset_local.jsonl"

# Read each line and parse JSON
data = []
with open(jsonl_file, "r", encoding="utf-8") as f:
    for line in f:
        data.append(json.loads(line))

# Create DataFrame
df = pd.DataFrame(data)

# Display the first few rows
print(df.head())


In [10]:
import json

# Input files
file1 = "rag_dataset_local.jsonl"
file2 = "rag_dataset_local_50k.jsonl"

# Output file
merged_file = "rag_dataset_merged.jsonl"

# Use a set to avoid duplicates
seen = set()
merged_data = []

# Function to read and deduplicate
def load_jsonl(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            item = json.loads(line)
            key = json.dumps(item, sort_keys=True)  # Unique identifier for deduplication
            if key not in seen:
                seen.add(key)
                merged_data.append(item)

# Load both files
load_jsonl(file1)
load_jsonl(file2)

# Write to merged output file
with open(merged_file, "w", encoding="utf-8") as f:
    for item in merged_data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"✅ Merged {len(merged_data)} unique records into {merged_file}")


FileNotFoundError: [Errno 2] No such file or directory: 'rag_dataset_local_50k.jsonl'

# 10 req

In [None]:
import os
import json
import time
from datasets import load_dataset
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Initialize the LLM using local LM Studio settings
from langchain_community.chat_models import ChatOllama

llm = ChatOllama(
    model="nchapman/ministral-8b-instruct-2410:8b",  # or any model you pulled (e.g., mistral, gemma)
    base_url="http://localhost:11434"
)

# Output files
OUTPUT_FILE = "rag_dataset_local.jsonl"
PROGRESS_FILE = "progress_local.json"

# Load train split of the dataset
dataset = load_dataset("CADCODER/GenCAD-Code", split="train")

# Check if progress exists and is valid
if os.path.exists(PROGRESS_FILE):
    try:
        with open(PROGRESS_FILE, "r") as f:
            content = f.read().strip()
            progress = json.loads(content) if content else {}
        start_index = progress.get("last_processed_index", -1) + 1
    except json.JSONDecodeError:
        print("⚠️ Corrupted progress file. Restarting from index 0.")
        start_index = 0
else:
    start_index = 0

print(f"🔁 Resuming from index {start_index}")

# Open output file in append mode
with open(OUTPUT_FILE, "a", encoding="utf-8") as out_file:
    for i in range(start_index, len(dataset)):
        if i >= 50000:
            print("🚫 Reached limit of 50000 items. Stopping.")
            break

        try:
            # Delay for 15 seconds per request
            print(f"⏳ Sleeping for 5 seconds before processing index {i}...")
            time.sleep(10)

            # Wait for 2 minutes every 10 iterations to avoid rate limits
            if i > 0 and i % 10 == 0:
                print("⏸ Waiting for 2 minutes every 10 requests to avoid overload...")
                time.sleep(120)

            cad_code = dataset[i]["cadquery"]

            response = llm.invoke([
                {
                    "role": "system",
                    "content": (
                        "You are helping to create a dataset for a CAD assistant.\n"
                        "Your task is to read a CadQuery script and write a short, natural-sounding prompt "
                        "that a human user might give to get that result.\n\n"
                        "**DO NOT** describe the code or its steps.\n"
                        "Just write the kind of brief request a user would give. Think of what they're trying to make — "
                        "not how it's made.\n"
                        "Examples: 'Make a simple logo shape and extrude it.' or 'Design a 3D part with rounded corners.'"
                    )
                },
                {
                    "role": "user",
                    "content": f"What kind of prompt would a user give to create this object?\n\n{cad_code}"
                }
            ])

            user_prompt = response.content.strip()

            # Save result as JSON line
            item = {
                "prompt": user_prompt,
                "code": cad_code.strip()
            }
            out_file.write(json.dumps(item, ensure_ascii=False) + "\n")

            # Save progress
            with open(PROGRESS_FILE, "w") as f:
                json.dump({ "last_processed_index": i }, f)

            print(f"✅ Processed index {i}")

        except Exception as e:
            print(f"❌ Error at index {i}: {e}")
            break


🔁 Resuming from index 569
⏳ Sleeping for 5 seconds before processing index 569...
✅ Processed index 569
⏳ Sleeping for 5 seconds before processing index 570...
⏸ Waiting for 2 minutes every 10 requests to avoid overload...
✅ Processed index 570
⏳ Sleeping for 5 seconds before processing index 571...
✅ Processed index 571
⏳ Sleeping for 5 seconds before processing index 572...
✅ Processed index 572
⏳ Sleeping for 5 seconds before processing index 573...
✅ Processed index 573
⏳ Sleeping for 5 seconds before processing index 574...
✅ Processed index 574
⏳ Sleeping for 5 seconds before processing index 575...
✅ Processed index 575
⏳ Sleeping for 5 seconds before processing index 576...
✅ Processed index 576
⏳ Sleeping for 5 seconds before processing index 577...
✅ Processed index 577
⏳ Sleeping for 5 seconds before processing index 578...
✅ Processed index 578
⏳ Sleeping for 5 seconds before processing index 579...
✅ Processed index 579
⏳ Sleeping for 5 seconds before processing index 580.