In [1]:
# finetune_qlora_starcoder.py
import os
import json
from dataclasses import dataclass
from typing import Dict, List, Optional

import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    default_data_collator,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftConfig

# -------------------------
# USER CONFIG
# -------------------------
# Every setting below is read from an environment variable; the second
# argument of each os.environ.get() call is the default used when it is unset.
MODEL_NAME = os.environ.get("MODEL_NAME", "bigcode/starcoder")  # change if different
DATA_PATH = os.environ.get("DATA_PATH", "data/prompts_codes.jsonl")  # jsonl with {"prompt": "...", "code": "..."}
OUTPUT_DIR = os.environ.get("OUTPUT_DIR", "outputs/qlora-starcoder")
BATCH_SIZE = int(os.environ.get("BATCH_SIZE", "8"))
EVAL_BATCH_SIZE = int(os.environ.get("EVAL_BATCH_SIZE", "4"))
LEARNING_RATE = float(os.environ.get("LR", "2e-4"))
NUM_EPOCHS = int(os.environ.get("EPOCHS", "3"))
MAX_LENGTH = int(os.environ.get("MAX_LEN", "2048"))
GRAD_ACCUM_STEPS = int(os.environ.get("GRAD_ACCUM", "1"))

# LoRA / QLoRA config
LORA_R = int(os.environ.get("LORA_R", "16"))
LORA_ALPHA = int(os.environ.get("LORA_ALPHA", "32"))
LORA_DROPOUT = float(os.environ.get("LORA_DROPOUT", "0.05"))
# Target modules vary by model. The default matches the default MODEL_NAME:
# StarCoder (GPTBigCode) names its linear layers c_attn / c_proj / c_fc.
# LLaMA-style checkpoints need "q_proj,k_proj,v_proj,o_proj" instead.
# Entries are stripped and empty ones dropped so "a, b," parses as ["a", "b"].
TARGET_MODULES = [
    name.strip()
    for name in os.environ.get("TARGET_MODULES", "c_attn,c_proj,c_fc").split(",")
    if name.strip()
]

# -------------------------
# utility / dataset prep
# -------------------------
def build_prompt(prompt: str, code: str):
    """
    Render one training example as a single string for a causal LM.

    Layout: the prompt, a blank line, a "### Code:" header line, the code,
    and a trailing newline. Edit the template here if the base model was
    trained with a different instruction format.
    """
    template = "{}\n\n### Code:\n{}\n"
    return template.format(prompt, code)

def tokenize_and_mask(examples, tokenizer, max_length=2048):
    """
    Tokenize a batch of prompt/code pairs and build loss-masked labels.

    Args:
        examples: batch mapping with parallel "prompt" and "code" lists.
        tokenizer: callable tokenizer returning a mapping with "input_ids".
        max_length: maximum number of tokens per example (None = no limit
            enforced here beyond what the tokenizer applies).

    Returns:
        {"input_ids": [...], "labels": [...]} with one entry per input row,
        in the same order. Labels equal input_ids except that everything
        before the code (the prompt AND the template separator produced by
        build_prompt) is set to -100, so loss is computed only on the code.

    Notes:
        * An EOS token is appended when there is room, so the model is also
          trained to stop after the code.
        * If the prompt alone fills the whole window, every label is -100;
          such rows carry no training signal.
    """
    # Marker used to discover where build_prompt() places the code, so the
    # masked span always follows the template instead of the bare prompt.
    marker = "\x00<<code-slot>>\x00"
    eos_id = getattr(tokenizer, "eos_token_id", None)

    input_ids_list = []
    labels_list = []

    for p, c in zip(examples["prompt"], examples["code"]):
        combined = build_prompt(p, c)
        # Everything the template emits before the code slot.
        prefix_text = build_prompt(p, marker).rpartition(marker)[0]

        # Tokenize both texts with identical settings so leading special
        # tokens (e.g. BOS) appear in both and are counted in the mask.
        full_ids = list(
            tokenizer(
                combined,
                truncation=True,
                max_length=max_length,
                padding=False,
                return_tensors=None,
                add_special_tokens=True,
            )["input_ids"]
        )
        prefix_ids = tokenizer(
            prefix_text,
            truncation=True,
            max_length=max_length,
            padding=False,
            return_tensors=None,
            add_special_tokens=True,
        )["input_ids"]

        # Append EOS only when it fits and is not already present.
        has_room = max_length is None or len(full_ids) < max_length
        if eos_id is not None and has_room and (not full_ids or full_ids[-1] != eos_id):
            full_ids.append(eos_id)

        # Mask the longest common token prefix. Stopping at the first
        # disagreement means a token merged across the prompt/code boundary
        # (or a trailing special token on the prefix) is never masked, so no
        # code tokens are accidentally excluded from the loss.
        prompt_len = 0
        for prefix_tok, full_tok in zip(prefix_ids, full_ids):
            if prefix_tok != full_tok:
                break
            prompt_len += 1

        labels = [-100] * prompt_len + full_ids[prompt_len:]

        input_ids_list.append(full_ids)
        labels_list.append(labels)

    return {"input_ids": input_ids_list, "labels": labels_list}

# Custom collator to pad input_ids and labels the same way
@dataclass
class DataCollatorForCausal:
    """
    Pad variable-length causal-LM features into rectangular tensors.

    Each feature must provide equally long "input_ids" and "labels".
    The returned batch has "input_ids", "attention_mask" and "labels", all
    torch.long and all of shape (batch, longest_sequence_in_batch).

    Padding is done here explicitly rather than through tokenizer.pad()
    because labels need their own fill value (-100) so that padded
    positions are ignored by the loss.

    Raises:
        ValueError: on an empty batch, a tokenizer without pad_token_id, or
            a feature whose input_ids and labels differ in length.
    """

    # Quoted so the annotation is not evaluated when the class is created;
    # any object exposing `pad_token_id` (and optionally `padding_side`) works.
    tokenizer: "AutoTokenizer"
    # Kept for interface compatibility; batches are always padded to the
    # longest sequence because tensors must be rectangular.
    padding: bool = True
    # Optional hard cap: sequences longer than this are truncated.
    max_length: Optional[int] = None
    # Fill value for padded label positions (ignored by cross-entropy loss).
    label_pad_token_id: int = -100

    def __call__(self, features: List[Dict[str, List[int]]]) -> Dict[str, torch.Tensor]:
        if not features:
            raise ValueError("DataCollatorForCausal received an empty batch.")

        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            raise ValueError("Tokenizer has no pad_token_id; set tokenizer.pad_token first.")

        rows = []
        for feature in features:
            ids = torch.as_tensor(feature["input_ids"], dtype=torch.long)
            labels = torch.as_tensor(feature["labels"], dtype=torch.long)
            if ids.shape != labels.shape:
                raise ValueError("input_ids and labels must have the same length.")
            if self.max_length is not None:
                ids = ids[: self.max_length]
                labels = labels[: self.max_length]
            rows.append((ids, labels))

        width = max(ids.shape[0] for ids, _ in rows)
        pad_left = getattr(self.tokenizer, "padding_side", "right") == "left"

        # Start from fully padded tensors, then copy each sequence in place.
        input_ids = torch.full((len(rows), width), pad_id, dtype=torch.long)
        labels_out = torch.full((len(rows), width), self.label_pad_token_id, dtype=torch.long)
        attention_mask = torch.zeros((len(rows), width), dtype=torch.long)

        for row, (ids, labels) in enumerate(rows):
            length = ids.shape[0]
            span = slice(width - length, width) if pad_left else slice(0, length)
            input_ids[row, span] = ids
            labels_out[row, span] = labels
            attention_mask[row, span] = 1

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels_out,
        }

# -------------------------
# Main
# -------------------------
def main():
    """
    Run the full QLoRA supervised fine-tuning pipeline.

    Steps: load tokenizer and dataset, tokenize with prompt masking, load
    the base model in 4-bit, attach LoRA adapters, train with
    transformers.Trainer, then save the adapter weights and tokenizer to
    OUTPUT_DIR. All settings come from the module-level constants.

    Raises:
        ValueError: if the tokenizer has neither a pad token nor an EOS
            token to reuse as one.
    """
    # Local import: not part of the module-level import list.
    from transformers import BitsAndBytesConfig

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # 1) tokenizer and dataset
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
    # Ensure tokenizer has pad token for padding (important).
    # Reuse EOS rather than adding a new token: a new token would get an id
    # outside the model's embedding table unless the embeddings are resized.
    if tokenizer.pad_token is None:
        if tokenizer.eos_token is None:
            raise ValueError(
                "Tokenizer has neither a pad token nor an EOS token; "
                "configure a pad token that exists in the model vocabulary."
            )
        tokenizer.pad_token = tokenizer.eos_token

    print("Loading dataset...")
    # Accept either jsonl or huggingface dataset path
    if DATA_PATH.endswith((".jsonl", ".json")):
        dataset = load_dataset("json", data_files={"train": DATA_PATH})
    else:
        # assume generic dataset path (Hugging Face)
        dataset = load_dataset(DATA_PATH)

    # Use the dataset's own validation split when present; otherwise carve
    # out 2% of train. The seed keeps the split identical across runs.
    if "validation" not in dataset:
        split = dataset["train"].train_test_split(test_size=0.02, seed=42)
        train_ds = split["train"]
        val_ds = split["test"]
    else:
        train_ds = dataset["train"]
        val_ds = dataset["validation"]

    # Map tokenization with masking
    print("Tokenizing and creating labels (masking prompt tokens)...")

    def _tokenize(batch):
        return tokenize_and_mask(batch, tokenizer, max_length=MAX_LENGTH)

    def _has_target(example):
        # True when at least one label is not masked out.
        return any(tok != -100 for tok in example["labels"])

    tokenized_train = train_ds.map(
        _tokenize,
        batched=True,
        remove_columns=train_ds.column_names,
    )
    tokenized_val = val_ds.map(
        _tokenize,
        batched=True,
        remove_columns=val_ds.column_names,
    )
    # Drop rows whose labels are entirely masked (prompt filled the window):
    # they carry no training signal and can produce a NaN loss.
    tokenized_train = tokenized_train.filter(_has_target)
    tokenized_val = tokenized_val.filter(_has_target)

    # 2) model loading in 4-bit (QLoRA preps)
    print("Loading model in 4-bit mode...")
    # Quantization options go through BitsAndBytesConfig; passing them as
    # loose from_pretrained() keyword arguments is deprecated.
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",  # nf4 is common for LLMs
    )

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map="auto",
        torch_dtype=torch.float16,
        trust_remote_code=False,
        quantization_config=quant_config,
    )
    # The KV cache is only useful for generation and conflicts with the
    # gradient checkpointing enabled below.
    model.config.use_cache = False

    # Prepare model for k-bit training (this enables gradient checkpointing etc.)
    model = prepare_model_for_kbit_training(model)

    # 3) attach LoRA adapters (PEFT)
    print("Applying LoRA...")
    lora_config = LoraConfig(
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        target_modules=TARGET_MODULES,
        lora_dropout=LORA_DROPOUT,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()  # logs % trainable params (should be small)

    # 4) training args & trainer
    # Newer transformers releases call this argument `eval_strategy`; older
    # ones only know `evaluation_strategy`. Pick whichever this install has.
    known_fields = getattr(TrainingArguments, "__dataclass_fields__", {})
    eval_key = "eval_strategy" if "eval_strategy" in known_fields else "evaluation_strategy"

    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=EVAL_BATCH_SIZE,
        num_train_epochs=NUM_EPOCHS,
        learning_rate=LEARNING_RATE,
        fp16=True,
        logging_steps=50,
        eval_steps=500,
        save_strategy="steps",
        save_steps=500,
        save_total_limit=3,
        gradient_accumulation_steps=GRAD_ACCUM_STEPS,
        weight_decay=0.01,
        dataloader_num_workers=4,
        optim="paged_adamw_32bit",  # works with bitsandbytes + huge models
        report_to="none",  # change to "wandb" if using Weights & Biases
        remove_unused_columns=False,
        push_to_hub=False,
        **{eval_key: "steps"},
    )

    data_collator = DataCollatorForCausal(tokenizer=tokenizer, max_length=MAX_LENGTH)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        data_collator=data_collator,
    )

    # 5) train
    print("Starting training...")
    trainer.train()
    print("Training finished.")

    # 6) save adapter weights (LoRA) so base model remains untouched
    print("Saving LoRA adapters to", OUTPUT_DIR)
    model.save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)
    print("Done. Adapter + tokenizer saved.")

# Script entry point: run the fine-tuning pipeline only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()


ModuleNotFoundError: No module named 'datasets'

# setup instruction and observations


Nice — below is a ready-to-run, well-commented QLoRA fine-tuning script for a causal code model like **StarCoder** (you wrote "start coder" — I assume you meant **StarCoder**; if it's a different checkpoint, just point `MODEL_NAME` at it). The script trains on a dataset of `{ "prompt": "...", "code": "..." }` JSONL entries and **only computes loss on the code** part (prompt tokens are masked with `-100`).

### Quick notes before the script

* QLoRA uses 4-bit quantization (bitsandbytes) + PEFT (LoRA). You need a GPU with sufficient memory; an A100 or an RTX 40-series card works best, or you can spread the model across multiple GPUs (via Accelerate).
* Installs: `transformers`, `datasets`, `accelerate`, `peft`, `bitsandbytes`, `torch`, plus optionally `sentencepiece`/`tokenizers` if needed.
* This script uses `transformers.Trainer` for simplicity. For very large setups, prefer driving the training loop with Hugging Face `accelerate` directly, or use a dedicated library such as `trlx`.
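* **Adjust** `TARGET_MODULES` to the modules used by your model architecture (LoRA target modules vary across models). For example, StarCoder (GPTBigCode) names its linear layers `c_attn`, `c_proj`, and `c_fc`, whereas LLaMA-style models use `q_proj`, `k_proj`, `v_proj`, and `o_proj`; if none of the listed names exist in the model, PEFT raises an error when attaching the adapters.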
* Save only the LoRA adapter (small) — keep base checkpoint separate.

---

---

## Tips & tuning

* **Target modules**: If you see warnings or no parameter changes, inspect `model.named_modules()` and choose correct linear layer names (common: `q_proj`, `k_proj`, `v_proj`, `o_proj`, `fc1`, `fc2`, `gate_proj` etc.). Use `print([n for n, m in model.named_modules() if isinstance(m, torch.nn.Linear)])` to inspect.
* **Batch size**: if GPU memory is limited, lower the per-device batch size and raise `gradient_accumulation_steps` to keep the same effective batch size.
* **Optimizer**: `paged_adamw_32bit` generally works well with large models + bitsandbytes. If not available, try `adamw_torch`.
* **Eval metric**: For code models you may want to compute exact-match or BLEU on code and also run syntax checks / execution tests after save.
* **Tokenizer**: Make sure the tokenizer’s special tokens match model expectations (e.g., `bos_token`, `eos_token`). Add them if missing.
* **Long contexts**: For very long code, increase `MAX_LENGTH` and ensure vLLM/SageMaker inference supports it.
* **Validation**: Use small eval set to ensure loss decreases and generate samples periodically.

---

If you want, I can:

* Generate a ready-to-run `accelerate` config + `launch` example for multi-GPU training.
* Produce a sample `hf_dataset` JSONL toy file and a small test run.
* Tailor the `TARGET_MODULES` after I inspect `model.named_modules()` from your exact StarCoder checkpoint. Which model checkpoint are you using (HF Hub name or local path)?


# why need to combine prompt and code

Great question 👌 — let’s unpack that.

We **combine prompt + code into one sequence** during supervised fine-tuning because:

### 1. Causal LM training objective

* StarCoder and similar models are **causal language models** (predict the next token given all previous ones).
* They don’t have a special “input vs label” separation like an encoder–decoder (T5, BART).
* To make them condition on a prompt, we concatenate:

```
[prompt tokens] [code tokens]
```

and then tell the model:

* **Predict the code tokens**,
* **Ignore the prompt tokens in the loss** (mask them with `-100`).

So the model sees the prompt in its context and learns: *“after this kind of prompt, the next tokens should be this code.”*

---

### 2. Why not train only on code?

If you fed only the code as the target sequence, the model wouldn’t learn the mapping from **natural-language prompt → code**.
It would just learn to continue random code snippets, like pretraining.

---

### 3. Flexibility for inference

At inference time, you give only the prompt:

```
[prompt tokens]
```

and the model will auto-regressively generate the continuation:

```
[code tokens]
```

Because that’s what it was trained to do.

---

### 4. Analogy with instruction tuning

This is the same trick instruction-tuned LLMs use.
E.g. training examples are formatted like:

```
### Instruction:
Write a Python function that reverses a string.

### Response:
def reverse_string(s):
    return s[::-1]
```

The model sees the instruction + response together, but the loss is only on the response tokens.

---

⚡So in short:
We combine `prompt + code` into one sequence **because causal LMs only know “next token prediction,” and this setup forces them to learn: given this prompt, continue with this code.**

---

Do you want me to also show you what the **dataset entries should look like** before and after tokenization, so you see exactly how the loss masking works?


# Type of fine tuning

Yes ✅ — the script I gave you is **supervised fine-tuning (SFT)** with QLoRA.

Here’s why:

* You already have pairs of **(prompt → code)** in your dataset.
* We tokenize both, then mask the **prompt tokens with `-100`** in the labels so the loss is computed **only on the code part**.
* During training, the model is directly optimized with **cross-entropy loss** to predict the ground-truth code given the prompt.

That’s exactly what supervised fine-tuning means: you have gold outputs (your code) for each input (prompt), and you minimize the loss between model predictions and these targets.

👉 Later, if you want to go beyond SFT:

* **RLHF / DPO**: Add preference data to align the model with what’s “better” code.
* **Self-Refinement / Feedback**: Use syntax, compilation, or IoU (intersection-over-union) feedback loops (like you mentioned for CAD codegen).
* **Unsupervised adaptation**: Only next-token training on raw code (like continuing pretraining).

Do you want me to also show you how to set this up in a **reinforcement learning loop** (e.g., DPO or RLHF) after SFT? That’s often the next step for codegen models.
