## **Fine-Tuning Pipeline for StarCoder (Copilot-Style)**

This notebook implements a pipeline to fine-tune `starcoderbase-1b` to create **"StarPilot-1B"**, a GitHub Copilot-style coding assistant specialized in Python.

##### Environment Setup & Imports
We mount google drive, install the necessary libraries for efficient LLM training (`bitsandbytes` for quantization, `peft` for LoRA, `trl` for the training loop) and import core modules.

In [1]:
# Mount Drive
from google.colab import drive
drive.mount('/content/drive')

# Change Path
%cd /content/drive/MyDrive/Sahil/Fine-Tuning

Mounted at /content/drive
/content/drive/MyDrive/Sahil/Fine-Tuning


In [2]:
# Install required libraries
!pip install -q transformers datasets peft bitsandbytes trl accelerate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m532.5/532.5 kB[0m [31m49.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
# Import modules
import torch
import re
import os
import numpy as np
import warnings
from dataclasses import dataclass
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback,
    TrainerCallback,
    TrainerState,
    TrainerControl
)
from transformers.trainer_callback import PrinterCallback
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer, SFTConfig

# Ignore warnings
warnings.filterwarnings('ignore')

In [6]:
# Hugging Face Login
from huggingface_hub import notebook_login
notebook_login()

##### Configuration Management
Here we define the necessary configurations.

In [7]:
@dataclass
class ModelConfig:
    model_id: str = "bigcode/starcoderbase-1b"
    dataset_id: str = "ise-uiuc/Magicoder-Evol-Instruct-110K"
    tuned_model_id: str = "starpilot-1b-v1"
    load_in_4bit: bool = True
    bnb_compute_dtype: str = "bfloat16"

@dataclass
class LoRAConfig:
    r: int = 32
    alpha: int = 64
    dropout: float = 0.05
    target_modules: str = "all-linear"

@dataclass
class TrainConfig:
    # Precision & Hardware
    bf16: bool = True
    fp16: bool = False
    gradient_checkpointing: bool = True

    # Data Processing
    dataset_text_field: str = "code"
    max_length: int = 2048
    packing: bool = False
    group_by_length: bool = True

    # Training Loop & Optimization
    max_steps: int = 4000
    batch_size: int = 8
    grad_accum: int = 4
    patience: int = 3
    optim: str = "paged_adamw_32bit"
    lr_scheduler_type: str = "cosine"
    learning_rate: float = 2e-4
    warmup_steps: int = 100
    weight_decay: float = 0.01

    # Evaluation
    eval_strategy: str = "steps"
    eval_steps: int = 200
    eval_batch_size: int = 32

    # Logging
    disable_tqdm: bool = True
    logging_strategy: str = "steps"
    logging_steps: int = 200
    report_to: str = "none"


    # Checkpoints & Saving
    output_dir: str = "./checkpoints"
    save_strategy: str = "steps"
    save_steps: int = 200
    save_total_limit: int = 5
    load_best_model_at_end: bool = True
    metric_for_best_model: str = "eval_loss"
    greater_is_better: bool = False

##### Data Pipeline
We parse the Magicoder dataset to extract python code and reformat it into a Comment -> Code structure. This teaches the model to treat comments as prompts, mimicking the GitHub Copilot autocomplete feature.

In [8]:
class DataPipeline:
    def __init__(self, config: ModelConfig, tokenizer: AutoTokenizer):
        self.config = config
        self.tokenizer = tokenizer
        self.separator = "\n# Solution:\n"

    def format_instruction(self, example):
        """
        Formats as a python comment followed by code to mimic Copilot autocomplete behavior.
        """
        instruction = example.get('instruction', '')
        response = example.get('response', '')

        # Extract code block wrapped in markdown
        code_blocks = re.findall(r'```python\s*(.*?)\s*```', response, re.DOTALL)
        if not code_blocks:
            return {"code": None}

        code_content = "\n\n".join(code_blocks)

        # Construct training prompt
        formatted_text = f'# {instruction}{self.separator}{code_content}{self.tokenizer.eos_token}'

        return {"code": formatted_text}

    def load_and_prepare(self):
        print(f"Loading dataset: {self.config.dataset_id}")
        dataset = load_dataset(self.config.dataset_id, split="train")

        print("Formatting dataset for Copilot style...")
        dataset = dataset.map(self.format_instruction)

        # Filter data
        dataset = dataset.filter(lambda x: x['code'] is not None and len(x['code'].strip()) > 0)

        # Train-Validation Split
        split_dataset = dataset.train_test_split(test_size=0.08)
        print(f"Train size: {len(split_dataset['train'])}, Validation size: {len(split_dataset['test'])}")

        return split_dataset['train'], split_dataset['test']

##### Custom Data Collator
This custom collator finds the separator between the Comment and the Code. It sets the labels for the Comment to `-100`, which tells PyTorch to ignore them during loss calculation.

In [9]:
class CompletionCollator(DataCollatorForLanguageModeling):
    """
    Custom collator that finds a specific separator string in the tokenized sequence
    and masks everything before it (the instruction), so the model only learns the code.
    """
    def __init__(self, tokenizer, response_template="\n# Solution:\n", *args, **kwargs):
        super().__init__(tokenizer=tokenizer, mlm=False, *args, **kwargs)
        self.response_template = response_template
        self.template_ids = self.tokenizer.encode(self.response_template, add_special_tokens=False)

    def __call__(self, examples):
        # Tokenize the batch using parent class
        batch = super().__call__(examples)

        # Set PAD labels to -100
        batch["labels"][batch["labels"] == self.tokenizer.pad_token_id] = -100

        # Modify the labels to mask instructions
        for i in range(len(batch["labels"])):
            input_ids = batch["input_ids"][i].tolist()
            start_idx = self.find_subsequence(input_ids, self.template_ids)

            if start_idx != -1:
                # The code starts after the template
                end_of_template = start_idx + len(self.template_ids)

                # Mask everything before the code starts
                batch["labels"][i, :end_of_template] = -100
            else:
                # If template not found, mask everything
                batch["labels"][i, :] = -100

        return batch

    def find_subsequence(self, full_seq, pattern):
        """Helper to find the start index of a sub-list (pattern) in a list"""
        n = len(pattern)
        for i in range(len(full_seq) - n + 1):
            if full_seq[i:i+n] == pattern:
                return i
        return -1

##### Model Factory
We initialize the Model (with QLoRA quantization) and the Tokenizer

In [10]:
class ModelFactory:
    @staticmethod
    def create_model(config: ModelConfig):
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=config.load_in_4bit,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=config.bnb_compute_dtype,
            bnb_4bit_use_double_quant=True
        )

        model = AutoModelForCausalLM.from_pretrained(
            config.model_id,
            quantization_config=bnb_config,
            device_map="auto",
            use_cache=False,
            trust_remote_code=True
        )

        model = prepare_model_for_kbit_training(model)

        return model

    @staticmethod
    def create_tokenizer(model_id: str):
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "right"
        return tokenizer

##### Custom Callbacks
Here we define the custom callback to fix GitHub rendering issues by providing a tabular log of training and validation loss.

In [11]:
class TabularPrinterCallback(TrainerCallback):
    def __init__(self):
        self._header_printed = False

    def on_evaluate(self, args, state: TrainerState, control: TrainerControl, metrics=None, **kwargs):
        """Called after each evaluation step"""

        # Get train and eval loss
        train_loss = next((log['loss'] for log in reversed(state.log_history) if 'loss' in log), "N/A")
        eval_loss = metrics.get('eval_loss', 'N/A') if metrics else 'N/A'

        # Get step
        step_str = f"{state.global_step}/{args.max_steps}"

        # Print Header (only once)
        if not self._header_printed:
            print(f"\n{'Step':<15} | {'Train Loss':<15} | {'Eval Loss':<15}")
            print(f"{'-'*15}-+-{'-'*15}-+-{'-'*15}")
            self._header_printed = True

        # Print Row
        print(f"{step_str:<15} | {train_loss:<15} | {eval_loss:<15}")

#### Main Execution
In this step, we -

1. Initialize Configs
2. Load Data and Tokenizer
3. Load the Base Model
4. Attach LoRA Adapters (trainable layers)
5. Launch the SFTTrainer (Supervised Fine-Tuning) with custom logging
6. Save the model (adapter)

In [12]:
# Initialize Configs
model_cfg = ModelConfig()
lora_cfg = LoRAConfig()
train_cfg = TrainConfig()

In [13]:
# Load Tokenizer & Data
print("Initializing Pipeline...")
tokenizer = ModelFactory.create_tokenizer(model_cfg.model_id)
data_pipe = DataPipeline(model_cfg, tokenizer)
train_dataset, eval_dataset = data_pipe.load_and_prepare()

Initializing Pipeline...
Loading dataset: ise-uiuc/Magicoder-Evol-Instruct-110K
Formatting dataset for Copilot style...
Train size: 44638, Validation size: 3882


In [14]:
# Initialize Custom Collator
custom_collator = CompletionCollator(
    tokenizer=tokenizer,
    response_template="\n# Solution:\n"
)

In [15]:
# Load Model
model = ModelFactory.create_model(model_cfg)
model.config.pad_token_id = tokenizer.eos_token_id

# Configure LoRA
peft_config = LoraConfig(
    r=lora_cfg.r,
    lora_alpha=lora_cfg.alpha,
    lora_dropout=lora_cfg.dropout,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=lora_cfg.target_modules
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 22,216,704 || all params: 1,159,424,000 || trainable%: 1.9162


In [16]:
# Trainer Setup
training_args = SFTConfig(
    # Precision & Hardware
    bf16=train_cfg.bf16,
    fp16=train_cfg.fp16,
    gradient_checkpointing=train_cfg.gradient_checkpointing,

    # Data Processing
    dataset_text_field=train_cfg.dataset_text_field,
    max_length=train_cfg.max_length,
    packing=train_cfg.packing,
    group_by_length=train_cfg.group_by_length,

    # Training Loop & Optimization
    max_steps=train_cfg.max_steps,
    per_device_train_batch_size=train_cfg.batch_size,
    gradient_accumulation_steps=train_cfg.grad_accum,
    optim=train_cfg.optim,
    lr_scheduler_type=train_cfg.lr_scheduler_type,
    learning_rate=train_cfg.learning_rate,
    warmup_steps=train_cfg.warmup_steps,
    weight_decay=train_cfg.weight_decay,

    # Evaluation
    eval_strategy=train_cfg.eval_strategy,
    eval_steps=train_cfg.eval_steps,
    per_device_eval_batch_size=train_cfg.eval_batch_size,

    # Logging
    disable_tqdm = train_cfg.disable_tqdm,
    logging_strategy=train_cfg.logging_strategy,
    logging_steps = train_cfg.logging_steps,
    report_to=train_cfg.report_to,

    # Checkpoints & Saving
    output_dir=train_cfg.output_dir,
    save_strategy=train_cfg.save_strategy,
    save_steps=train_cfg.save_steps,
    save_total_limit=train_cfg.save_total_limit,
    load_best_model_at_end=train_cfg.load_best_model_at_end,
    metric_for_best_model=train_cfg.metric_for_best_model,
    greater_is_better=train_cfg.greater_is_better
)

In [17]:
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    data_collator=custom_collator,
    args=training_args,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=train_cfg.patience), TabularPrinterCallback()]
)

# Remove Default Logger
trainer.remove_callback(PrinterCallback)

In [18]:
# Start Training
print("Starting Training...")
_ = trainer.train() # Suppress 'TrainOutput'

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 0}.


Starting Training...

Step            | Train Loss      | Eval Loss      
----------------+-----------------+----------------
200/4000        | 0.5665          | 0.523872971534729
400/4000        | 0.5362          | 0.5133563876152039
600/4000        | 0.5225          | 0.49929407238960266
800/4000        | 0.5168          | 0.4934311807155609
1000/4000       | 0.5165          | 0.4880228638648987
1200/4000       | 0.5043          | 0.4815034866333008
1400/4000       | 0.4951          | 0.49490422010421753
1600/4000       | 0.4648          | 0.48559293150901794
1800/4000       | 0.4576          | 0.4859406352043152


In [19]:
# Save the model
print("Saving final model...")
trainer.save_model(model_cfg.tuned_model_id)
print("Model saved successfully!")

Saving final model...
Model saved successfully!
