In [1]:
!pip install transformers datasets peft trl bitsandbytes accelerate

Collecting transformers
  Downloading transformers-4.55.0-py3-none-any.whl.metadata (39 kB)
Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting peft
  Downloading peft-0.17.0-py3-none-any.whl.metadata (14 kB)
Collecting trl
  Downloading trl-0.21.0-py3-none-any.whl.metadata (11 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting accelerate
  Downloading accelerate-1.10.0-py3-none-any.whl.metadata (19 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.34.3-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2025.7.34-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (40 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecti

In [2]:
!pip install --upgrade bitsandbytes

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [1]:
from datasets import Dataset  # or load_dataset if from file
from transformers import AutoTokenizer
import json

In [2]:
from transformers import  AutoModelForCausalLM, BitsAndBytesConfig, DataCollatorForLanguageModeling
from peft import get_peft_model, LoraConfig, TaskType
from peft.utils.other import prepare_model_for_kbit_training
from datasets import load_dataset
import torch

In [3]:
def format_example(example):
    system_prompt = "You are a tutor. Give responses as a tutor. Do not give full code solutions — only partial solutions, hints, or small examples.\n\n"
    user_input = example['prompt'].strip()
    response = example['response'].strip()

    full_prompt = (
        "<|system|>\n"
        f"{system_prompt.strip()}\n"
        "<|user|>\n"
        f"{user_input.strip()}\n"
        "<|assistant|>\n"
        f"{response.strip()}"
    )

    tokenized = tokenizer(
        full_prompt,
        padding="max_length",
        truncation=True,
        max_length=1024,
        return_tensors="pt"
    )
    tokenized = {k: v.squeeze(0) for k, v in tokenized.items()}

    # Mask all tokens before assistant's answer
    assistant_start = full_prompt.find("<|assistant|>")
    assistant_token_ids = tokenizer(full_prompt[:assistant_start], return_tensors="pt").input_ids.squeeze(0)
    labels = tokenized["input_ids"].clone()
    labels[:len(assistant_token_ids)] = -100
    tokenized["labels"] = labels

    return tokenized

In [11]:
!rm -rf ~/.cache/huggingface/hub/models--bigcode--starcoder2-3b


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [12]:
tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoder2-3b", trust_remote_code=True)
bnb_config = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/958 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/700 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/12.1G [00:00<?, ?B/s]

In [20]:
def format_example(example):
    prompt = example["prompt"].strip()
    response = example["response"].strip()

    full_text = prompt + "\n" + response + tokenizer.eos_token

    # Tokenize full sequence
    tokenized = tokenizer(full_text, truncation=True, max_length=1024, padding="max_length")
    
    # Mask prompt (so only response is used for training loss)
    prompt_token_count = len(tokenizer(prompt, truncation=True, max_length=1024)["input_ids"])
    labels = tokenized["input_ids"].copy()
    labels[:prompt_token_count] = [-100] * prompt_token_count
    tokenized["labels"] = labels

    return tokenized


In [21]:
# Load your dataset from JSON
with open("./merged_output.json", "r") as f:
    raw_data = json.load(f)

dataset = Dataset.from_list(raw_data)

# Load tokenizer with proper config for Starcoder2

# Ensure tokenizer has a pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    
tokenized_dataset = dataset.map(
    format_example,
    remove_columns=dataset.column_names,
    batched=False,
    desc="Tokenizing dataset...",
)
    
    
    


Tokenizing dataset...:   0%|          | 0/102000 [00:00<?, ? examples/s]

In [None]:
tokenized_dataset.save_to_disk("tokenized_starcoder2_dataset")


In [22]:
from trl import SFTTrainer, SFTConfig


In [23]:


# ---- 2. Load Base Model with 8-bit Quantization ----


model = prepare_model_for_kbit_training(model)

# ---- 3. Apply LoRA using PEFT ----
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=[
        "q_proj", "v_proj", "k_proj", "o_proj",
        "up_proj", "down_proj", "gate_proj"
    ],
    
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()



trainable params: 9,093,120 || all params: 3,039,464,448 || trainable%: 0.2992


In [26]:
sft_config = SFTConfig(
    output_dir="./checkpoints",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    auto_find_batch_size=False,  # Set to True if you get OOM error
    num_train_epochs=2,
    logging_steps=200,
    save_steps=500,
    save_total_limit=2,
    fp16=False,
    bf16=True,
    learning_rate=3e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    optim="adamw_torch_fused",  # Use fused optimizer for faster training
    max_grad_norm=1.0,
    gradient_checkpointing=True,  # Saves memory at slight compute cost
    report_to="none",
    
)
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_dataset,
    args=sft_config,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
)

Truncating train dataset:   0%|          | 0/102000 [00:00<?, ? examples/s]

In [17]:
import torch
import numpy as np

torch.serialization.add_safe_globals([np.core.multiarray._reconstruct])


  torch.serialization.add_safe_globals([np.core.multiarray._reconstruct])


In [18]:
_original_torch_load = torch.load

def patched_load(*args, **kwargs):
    kwargs["weights_only"] = False  # Force loading full state
    return _original_torch_load(*args, **kwargs)

torch.load = patched_load


In [None]:

trainer.train()





Step,Training Loss
200,3.3547
