In [None]:
!pip install torch transformers peft datasets scikit-learn wandb accelerate

In [8]:
pip install torch transformers peft datasets accelerate


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
!pip install torch transformers peft datasets scikit-learn accelerate
!pip install nltk  # Optional, for better BLEU score calculation


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Installing collected packages: nltk
Successfully installed nltk-3.9.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
pip install wandb

In [4]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/prahlad/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# Optional: Set up wandb for experiment tracking
wandb login

# Make sure you have sufficient GPU memory (recommended: 16GB+ VRAM)

SyntaxError: invalid syntax (268683748.py, line 2)

In [15]:
import json
import torch
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    TrainingArguments, 
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import (
    LoraConfig, 
    get_peft_model, 
    TaskType,
    PeftModel
)
from sklearn.metrics import bleu_score
import ast
import re
from typing import Dict, List
import wandb
import os

# BLEU score implementation - try NLTK first, fallback to simple implementation
try:
    from nltk.translate.bleu_score import sentence_bleu
    NLTK_AVAILABLE = True
except ImportError:
    NLTK_AVAILABLE = False

class CADCodeDataset:
    """Dataset class for CAD code generation"""
    
    def __init__(self, jsonl_file: str, tokenizer, max_length: int = 1024):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = self.load_data(jsonl_file)
        
    def load_data(self, jsonl_file: str) -> List[Dict]:
        """Load data from JSONL file"""
        data = []
        with open(jsonl_file, 'r') as f:
            for line in f:
                data.append(json.loads(line))
        return data
    
    def prepare_training_data(self):
        """Prepare data for training with prompt-response format"""
        formatted_data = []
        
        for item in self.data:
            prompt = item['prompt']
            code = item['code']
            
            # Create instruction format
            text = f"### Instruction:\nGenerate CadQuery code for: {prompt}\n\n### Response:\n{code}<|endoftext|>"
            
            formatted_data.append({
                'text': text,
                'prompt': prompt,
                'code': code
            })
        
        return Dataset.from_list(formatted_data)
    
    def tokenize_function(self, examples):
        """Tokenize the text data"""
        return self.tokenizer(
            examples['text'],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors="pt"
        )

class CodeEvaluator:
    """Evaluation metrics for code generation"""
    
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
    
    def extract_code_from_response(self, response: str) -> str:
        """Extract code from model response"""
        # Look for code after ### Response:
        if "### Response:" in response:
            code_part = response.split("### Response:")[1]
        else:
            code_part = response
            
        # Remove end token if present
        if "<|endoftext|>" in code_part:
            code_part = code_part.split("<|endoftext|>")[0]
            
        return code_part.strip()
    
    def syntax_check(self, code: str) -> bool:
        """Check if the code is syntactically valid Python"""
        try:
            ast.parse(code)
            return True
        except SyntaxError:
            return False
    
    def cadquery_imports_check(self, code: str) -> bool:
        """Check if code contains necessary CadQuery imports"""
        return "import cadquery" in code or "cq.Workplane" in code
    
    def has_solid_creation(self, code: str) -> bool:
        """Check if code creates a solid object"""
        patterns = [
            r'\.extrude\(',
            r'\.revolve\(',
            r'\.sweep\(',
            r'\.loft\(',
            r'solid\s*='
        ]
        return any(re.search(pattern, code) for pattern in patterns)
    
    def calculate_bleu_score(self, reference: str, candidate: str) -> float:
        """Calculate BLEU score between reference and candidate code"""
        try:
            # Tokenize by splitting on whitespace and special characters
            ref_tokens = re.findall(r'\w+|[^\w\s]', reference.lower())
            cand_tokens = re.findall(r'\w+|[^\w\s]', candidate.lower())
            
            if not cand_tokens or not ref_tokens:
                return 0.0
            
            # Use NLTK BLEU if available, otherwise use simple overlap metric
            if NLTK_AVAILABLE:
                try:
                    return sentence_bleu([ref_tokens], cand_tokens)
                except:
                    pass
            
            # Fallback: simple word overlap metric (Jaccard similarity)
            ref_set = set(ref_tokens)
            cand_set = set(cand_tokens)
            if len(cand_set) == 0:
                return 0.0
            
            # Calculate Jaccard similarity as a proxy for BLEU
            intersection = len(ref_set.intersection(cand_set))
            union = len(ref_set.union(cand_set))
            return intersection / union if union > 0 else 0.0
            
        except Exception as e:
            print(f"Warning: BLEU calculation failed: {e}")
            return 0.0
    
    def evaluate_batch(self, predictions: List[str], references: List[str]) -> Dict:
        """Evaluate a batch of predictions"""
        results = {
            'syntax_accuracy': 0,
            'cadquery_imports': 0,
            'solid_creation': 0,
            'avg_bleu_score': 0,
            'total_samples': len(predictions)
        }
        
        bleu_scores = []
        
        for pred, ref in zip(predictions, references):
            # Extract code from prediction
            pred_code = self.extract_code_from_response(pred)
            
            # Syntax check
            if self.syntax_check(pred_code):
                results['syntax_accuracy'] += 1
            
            # CadQuery imports check
            if self.cadquery_imports_check(pred_code):
                results['cadquery_imports'] += 1
                
            # Solid creation check
            if self.has_solid_creation(pred_code):
                results['solid_creation'] += 1
            
            # BLEU score
            bleu = self.calculate_bleu_score(ref, pred_code)
            bleu_scores.append(bleu)
        
        # Convert to percentages
        results['syntax_accuracy'] = (results['syntax_accuracy'] / len(predictions)) * 100
        results['cadquery_imports'] = (results['cadquery_imports'] / len(predictions)) * 100
        results['solid_creation'] = (results['solid_creation'] / len(predictions)) * 100
        results['avg_bleu_score'] = np.mean(bleu_scores) * 100
        
        return results

class StarCoderFineTuner:
    """Fine-tuner for StarCoder model using PEFT LoRA"""
    
    def __init__(self, model_name: str = "bigcode/starcoder", cache_dir: str = "./cache"):
        self.model_name = model_name
        self.cache_dir = cache_dir
        self.tokenizer = None
        self.model = None
        self.peft_model = None
        
    def setup_model_and_tokenizer(self):
        """Initialize model and tokenizer"""
        print("Loading tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_name,
            cache_dir=self.cache_dir,
            trust_remote_code=True
        )
        
        # Add pad token if it doesn't exist
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
            
        print("Loading model...")
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            cache_dir=self.cache_dir,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True
        )
        
        # Enable gradient checkpointing for memory efficiency
        self.model.gradient_checkpointing_enable()
    
    def setup_lora_config(self):
        """Setup LoRA configuration"""
        lora_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            inference_mode=False,
            r=16,  # Rank
            lora_alpha=32,  # Alpha parameter
            lora_dropout=0.1,  # Dropout
            target_modules=[
                "c_attn",  # StarCoder specific attention modules
                "c_proj",
                "c_fc"
            ],
            bias="none"
        )
        
        print("Applying LoRA...")
        self.peft_model = get_peft_model(self.model, lora_config)
        self.peft_model.print_trainable_parameters()
        
        return lora_config
    
    def prepare_data(self, jsonl_file: str, test_split: float = 0.2):
        """Prepare training and validation datasets"""
        dataset_handler = CADCodeDataset(jsonl_file, self.tokenizer)
        dataset = dataset_handler.prepare_training_data()
        
        # Tokenize dataset
        tokenized_dataset = dataset.map(
            dataset_handler.tokenize_function,
            batched=True,
            remove_columns=dataset.column_names
        )
        
        # Split dataset
        split_dataset = tokenized_dataset.train_test_split(
            test_size=test_split, 
            seed=42
        )
        
        return split_dataset['train'], split_dataset['test'], dataset_handler
    
    def train(self, 
              train_dataset, 
              val_dataset, 
              output_dir: str = "./starcoder-cad-lora",
              num_epochs: int = 3,
              batch_size: int = 4,
              learning_rate: float = 2e-4,
              use_wandb: bool = True):
        """Train the model with LoRA"""
        
        if use_wandb:
            wandb.init(
                project="starcoder-cad-finetuning",
                name="starcoder-lora-cad",
                config={
                    "model": self.model_name,
                    "epochs": num_epochs,
                    "batch_size": batch_size,
                    "learning_rate": learning_rate,
                    "lora_r": 16,
                    "lora_alpha": 32
                }
            )
        
        # Training arguments
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=num_epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            gradient_accumulation_steps=4,
            warmup_steps=100,
            max_steps=-1,
            learning_rate=learning_rate,
            fp16=True,
            logging_steps=10,
            eval_steps=100,
            save_steps=500,
            evaluation_strategy="steps",
            save_strategy="steps",
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            report_to="wandb" if use_wandb else None,
            dataloader_pin_memory=False,
            remove_unused_columns=False
        )
        
        # Data collator
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False
        )
        
        # Trainer
        trainer = Trainer(
            model=self.peft_model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            data_collator=data_collator,
            tokenizer=self.tokenizer
        )
        
        print("Starting training...")
        trainer.train()
        
        # Save the model
        trainer.save_model()
        self.tokenizer.save_pretrained(output_dir)
        
        if use_wandb:
            wandb.finish()
        
        return trainer
    
    def generate_code(self, prompt: str, max_length: int = 512, temperature: float = 0.2):
        """Generate code for a given prompt"""
        input_text = f"### Instruction:\nGenerate CadQuery code for: {prompt}\n\n### Response:\n"
        
        inputs = self.tokenizer(
            input_text, 
            return_tensors="pt", 
            truncation=True, 
            max_length=512
        )
        
        with torch.no_grad():
            outputs = self.peft_model.generate(
                **inputs,
                max_length=max_length,
                temperature=temperature,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
                num_return_sequences=1
            )
        
        generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return generated_text
    
    def evaluate_model(self, test_dataset, dataset_handler, num_samples: int = 50):
        """Evaluate the fine-tuned model"""
        evaluator = CodeEvaluator(self.tokenizer)
        
        # Get test samples
        test_samples = []
        for i, item in enumerate(dataset_handler.data):
            if i >= num_samples:
                break
            test_samples.append(item)
        
        predictions = []
        references = []
        
        print(f"Generating predictions for {len(test_samples)} samples...")
        for sample in test_samples:
            prompt = sample['prompt']
            reference_code = sample['code']
            
            # Generate prediction
            generated = self.generate_code(prompt)
            
            predictions.append(generated)
            references.append(reference_code)
        
        # Evaluate
        results = evaluator.evaluate_batch(predictions, references)
        
        print("\n=== EVALUATION RESULTS ===")
        print(f"Syntax Accuracy: {results['syntax_accuracy']:.2f}%")
        print(f"CadQuery Imports: {results['cadquery_imports']:.2f}%")
        print(f"Solid Creation: {results['solid_creation']:.2f}%")
        print(f"Average BLEU Score: {results['avg_bleu_score']:.2f}")
        print(f"Total Samples: {results['total_samples']}")
        
        return results, predictions, references

def main():
    """Main training function"""
    # Configuration
    MODEL_NAME = "bigcode/starcoder"
    JSONL_FILE = "rag_dataset_local.jsonl"
    OUTPUT_DIR = "./starcoder-cad-lora"
    
    # Initialize fine-tuner
    fine_tuner = StarCoderFineTuner(model_name=MODEL_NAME)
    
    # Setup model and tokenizer
    fine_tuner.setup_model_and_tokenizer()
    
    # Setup LoRA
    fine_tuner.setup_lora_config()
    
    # Prepare data
    print("Preparing datasets...")
    train_dataset, val_dataset, dataset_handler = fine_tuner.prepare_data(JSONL_FILE)
    
    print(f"Training samples: {len(train_dataset)}")
    print(f"Validation samples: {len(val_dataset)}")
    
    # Train model
    trainer = fine_tuner.train(
        train_dataset=train_dataset,
        val_dataset=val_dataset,
        output_dir=OUTPUT_DIR,
        num_epochs=3,
        batch_size=2,  # Adjust based on your GPU memory
        learning_rate=2e-4,
        use_wandb=False  # Set to True if you want to use wandb
    )
    
    # Evaluate model
    print("\nEvaluating model...")
    results, predictions, references = fine_tuner.evaluate_model(
        val_dataset, 
        dataset_handler, 
        num_samples=20
    )
    
    # Save evaluation results
    with open(f"{OUTPUT_DIR}/evaluation_results.json", 'w') as f:
        json.dump(results, f, indent=2)
    
    # Save sample predictions
    sample_outputs = []
    for i, (pred, ref) in enumerate(zip(predictions[:5], references[:5])):
        sample_outputs.append({
            'sample_id': i,
            'prediction': pred,
            'reference': ref
        })
    
    with open(f"{OUTPUT_DIR}/sample_predictions.json", 'w') as f:
        json.dump(sample_outputs, f, indent=2)
    
    print(f"\nTraining completed! Model saved to {OUTPUT_DIR}")
    
    # Example inference
    print("\n=== EXAMPLE INFERENCE ===")
    test_prompt = "Make a decorative button with an intricate edge"
    generated_code = fine_tuner.generate_code(test_prompt)
    print(f"Prompt: {test_prompt}")
    print(f"Generated Code:\n{generated_code}")

if __name__ == "__main__":
    main()

ImportError: cannot import name 'bleu_score' from 'sklearn.metrics' (/Users/prahlad/Desktop/Projects/mechagent_prahlad/mecagent-technical-test/x3D/genx3denv/lib/python3.13/site-packages/sklearn/metrics/__init__.py)

I'll help you create a comprehensive PEFT LoRA fine-tuning setup for StarCoder on your CAD code generation dataset. This will include data preparation, training, and evaluation metrics.I've created a comprehensive PEFT LoRA fine-tuning setup for StarCoder on your CAD code generation dataset. Here are the key features:

## Key Components:

1. **CADCodeDataset**: Handles data loading and preprocessing from your JSONL file
2. **CodeEvaluator**: Provides multiple evaluation metrics specifically for code generation
3. **StarCoderFineTuner**: Main class that handles the complete fine-tuning pipeline

## Evaluation Metrics Included:

- **Syntax Accuracy**: Percentage of generated code that's syntactically valid Python
- **CadQuery Imports**: Percentage that includes proper CadQuery imports
- **Solid Creation**: Percentage that creates solid objects (extrude, revolve, etc.)
- **BLEU Score**: Similarity between generated and reference code

## Setup Instructions:

1. **Install required packages**:
```bash
pip install torch transformers peft datasets sklearn wandb accelerate
```

2. **Prepare your environment**:
```bash
# Optional: Set up wandb for experiment tracking
wandb login

# Make sure you have sufficient GPU memory (recommended: 16GB+ VRAM)
```

3. **Run the training**:
```python
python your_script.py
```

## Key Configuration Options:

- **LoRA Parameters**: `r=16`, `alpha=32`, `dropout=0.1`
- **Batch Size**: Adjustable based on your GPU memory
- **Learning Rate**: `2e-4` (standard for LoRA fine-tuning)
- **Target Modules**: StarCoder-specific attention layers

## Memory Optimization Features:

- FP16 training for reduced memory usage
- Gradient checkpointing enabled
- PEFT LoRA for parameter-efficient training
- Configurable batch sizes and gradient accumulation

The code will automatically:
- Load and preprocess your JSONL data
- Split into train/validation sets
- Fine-tune with LoRA
- Evaluate with multiple metrics
- Save the trained model and results
- Provide sample predictions

You can adjust the hyperparameters in the `main()` function based on your specific needs and hardware constraints!

# simple 


In [17]:
import json
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    TrainingArguments, 
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import (
    LoraConfig, 
    get_peft_model, 
    TaskType
)

class CADCodeDataset:
    def __init__(self, jsonl_file: str, tokenizer, max_length: int = 1024):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = self.load_data(jsonl_file)
        
    def load_data(self, jsonl_file: str):
        data = []
        with open(jsonl_file, 'r') as f:
            for line in f:
                data.append(json.loads(line))
        return data
    
    def prepare_training_data(self):
        formatted_data = []
        for item in self.data:
            prompt = item['prompt']
            code = item['code']
            text = f"### Instruction:\nGenerate CadQuery code for: {prompt}\n\n### Response:\n{code}<|endoftext|>"
            formatted_data.append({"text": text})
        return Dataset.from_list(formatted_data)
    
    def tokenize_function(self, examples):
        return self.tokenizer(
            examples['text'],
            truncation=True,
            padding='max_length',
            max_length=self.max_length
        )

def main():
    MODEL_NAME = "bigcode/starcoder"
    JSONL_FILE = "rag_dataset_local.jsonl"
    OUTPUT_DIR = "./starcoder-cad-lora"

    # Load tokenizer & model
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    )
    model.gradient_checkpointing_enable()

    # Apply LoRA
    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=16,
        lora_alpha=32,
        lora_dropout=0.1,
        target_modules=["c_attn", "c_proj", "c_fc"]
    )
    model = get_peft_model(model, lora_config)

    # Prepare dataset
    dataset_handler = CADCodeDataset(JSONL_FILE, tokenizer)
    dataset = dataset_handler.prepare_training_data()
    tokenized_dataset = dataset.map(
        dataset_handler.tokenize_function,
        batched=True,
        remove_columns=dataset.column_names
    )
    split_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)

    # Training setup
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=3,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        learning_rate=2e-4,
        evaluation_strategy="steps",
        eval_steps=100,
        save_steps=500,
        fp16=True,
        logging_steps=10,
        save_total_limit=2,
        load_best_model_at_end=True,
        remove_unused_columns=False
    )
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=split_dataset["train"],
        eval_dataset=split_dataset["test"],
        data_collator=data_collator,
        tokenizer=tokenizer
    )

    # Train & save
    trainer.train()
    trainer.save_model(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)

    print(f"\n✅ Training completed! Model saved to {OUTPUT_DIR}")

if __name__ == "__main__":
    main()


OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/bigcode/starcoder.
403 Client Error. (Request ID: Root=1-68c179d4-57ef907c2349d86632452787;33b13ffe-4d40-4400-8b81-9ee89fe58805)

Cannot access gated repo for url https://huggingface.co/bigcode/starcoder/resolve/main/config.json.
Access to model bigcode/starcoder is restricted and you are not in the authorized list. Visit https://huggingface.co/bigcode/starcoder to ask for access.