In [1]:
!pip install -q transformers datasets peft accelerate torch



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import os
import json
import torch
from datasets import Dataset, DatasetDict, load_dataset
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer, # <--- Using standard Trainer
    pipeline,
    logging,
    DataCollatorForLanguageModeling,
)
from peft import LoraConfig, PeftModel, get_peft_model
# Note: prepare_model_for_kbit_training is for bitsandbytes, removed
# from trl import SFTTrainer # <-- Removed SFTTrainer import

# --- Configuration ---
# Model
base_model_name = "Qwen/Qwen3-1.7B"
new_model_name = "qwen3-1.7b-linear-algebra-coder-lora-stdtrainer" # Updated name

# Dataset
dataset_path = "final_dataset_no_comments (1).json" # Assumes EOS token was added

# LoRA Config
lora_r = 16
lora_alpha = 32
lora_dropout = 0.05
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]

# Training Config
output_dir = "./qwen1.8b-lora-stdtrainer-results" # Updated output dir
num_train_epochs = 3
per_device_train_batch_size = 1 # START LOW for LoRA without quantization
per_device_eval_batch_size = 1
gradient_accumulation_steps = 16 # Adjust effective batch size
gradient_checkpointing = True
optim = "adamw_torch"
save_strategy = "steps" # Use save_strategy with Trainer
save_steps = 100
logging_steps = 10
learning_rate = 1e-4
weight_decay = 0.01
fp16 = False # Set only one to True
bf16 = True  # Assuming Ampere+ GPU
max_grad_norm = 0.3
max_steps = -1
warmup_ratio = 0.03
lr_scheduler_type = "cosine"
group_by_length = True
evaluation_strategy = "steps" # Standard Trainer uses this
eval_steps = 100
save_total_limit = 2
load_best_model_at_end = True
metric_for_best_model="eval_loss"
greater_is_better=False
report_to = "tensorboard"

# Other
seed = 42
max_seq_length = 1024 # Max sequence length for tokenization

# Set PYTORCH_CUDA_ALLOC_CONF if needed
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# --- End Configuration ---

# --- Determine Torch Dtype ---
torch_dtype = torch.bfloat16 if bf16 else (torch.float16 if fp16 else torch.float32)
print(f"Using torch dtype: {torch_dtype}")

# Check for CUDA availability
if not torch.cuda.is_available():
    print("Warning: CUDA not available, training will be very slow on CPU.")

Using torch dtype: torch.bfloat16


In [3]:
# --- Load Data ---
print(f"Loading dataset from {dataset_path}...")
try:
    with open(dataset_path, "r", encoding='utf-8') as f:
        data = json.load(f)
except FileNotFoundError:
    print(f"Error: Dataset file '{dataset_path}' not found.")
    exit()
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from '{dataset_path}'.")
    exit()
print(f"Loaded {len(data)} examples.")

# --- Train/Validation Split ---
train_data, val_data = train_test_split(data, test_size=0.1, random_state=seed)
print(f"Training samples: {len(train_data)}, Validation samples: {len(val_data)}")

# --- Convert to Hugging Face Dataset ---
hf_dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "validation": Dataset.from_list(val_data)
})
print("Dataset structure:")
print(hf_dataset)

# --- Load Tokenizer ---
print(f"\nLoading tokenizer for {base_model_name}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)

if tokenizer.pad_token is None:
    if tokenizer.eos_token:
        tokenizer.pad_token = tokenizer.eos_token
        print("Set pad_token to eos_token")
    else:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        print("Added [PAD] as pad_token")
        # Will resize model embeddings in Cell 3

tokenizer.padding_side = "right"

# --- Tokenization and Label Masking Function ---
# Prepare data for standard Trainer with Causal LM objective
def tokenize_and_mask(examples):
    # Combine input and output
    input_prompts = [ex.strip() + "\n" for ex in examples["input"]]
    outputs = [ex.strip() for ex in examples["output"]]

    processed_outputs = []
    for out in outputs:
        if not out.endswith(tokenizer.eos_token):
            processed_outputs.append(out + tokenizer.eos_token)
        else:
            processed_outputs.append(out)

    full_texts = [prompt + output for prompt, output in zip(input_prompts, processed_outputs)]

    # Tokenize the full text (TRUNCATE, but DO NOT PAD here)
    model_inputs = tokenizer(
        full_texts,
        max_length=max_seq_length,
        truncation=True,
        padding=False, # *** CHANGED: Let collator handle padding ***
        return_tensors=None, # *** CHANGED: Return lists ***
    )

    # Tokenize prompts separately to find their length for masking
    # Important: Use return_tensors=None here too if not already
    prompt_tokens_results = tokenizer(
        input_prompts,
        max_length=max_seq_length, # Truncate prompt if it's too long itself
        truncation=True,
        padding=False,
        return_tensors=None,
        add_special_tokens=False # Get length without special tokens if tokenizer adds them by default
    )


    # Create labels and mask
    labels = []
    input_ids_list = []
    attention_mask_list = []

    for i in range(len(model_inputs["input_ids"])):
        prompt_length = len(prompt_tokens_results['input_ids'][i])
        full_token_ids = model_inputs["input_ids"][i]

        # Create labels: copy input_ids then mask
        current_labels = list(full_token_ids) # Make a copy as a list
        current_labels[:prompt_length] = [-100] * prompt_length # Mask prompt part

        # Add the processed lists to our output lists
        labels.append(current_labels)
        input_ids_list.append(full_token_ids)
        attention_mask_list.append(model_inputs["attention_mask"][i]) # Keep original attention mask


    # Return the dictionary expected by the dataset map function
    return {
        "input_ids": input_ids_list,
        "attention_mask": attention_mask_list,
        "labels": labels,
    }


# --- Apply Tokenization and Masking ---
print("\nTokenizing and masking dataset...")
tokenized_datasets = hf_dataset.map(
    tokenize_and_mask,
    batched=True, # Process in batches
    remove_columns=hf_dataset["train"].column_names # Remove original 'input', 'output'
)

print("Dataset structure after tokenization/masking:")
print(tokenized_datasets)
print("\nSample tokenized data point:")
idx = 0
# Check lengths - they might vary now, which is expected before collation
print("Input IDs length:", len(tokenized_datasets["train"][idx]["input_ids"]))
print("Labels length:", len(tokenized_datasets["train"][idx]["labels"]))
print("Input IDs sample:", tokenized_datasets["train"][idx]["input_ids"][:60])
print("Labels sample:", tokenized_datasets["train"][idx]["labels"][:60])

Loading dataset from final_dataset_no_comments (1).json...
Loaded 6804 examples.
Training samples: 6123, Validation samples: 681
Dataset structure:
DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 6123
    })
    validation: Dataset({
        features: ['input', 'output'],
        num_rows: 681
    })
})

Loading tokenizer for Qwen/Qwen3-1.7B...

Tokenizing and masking dataset...


Map:   0%|          | 0/6123 [00:00<?, ? examples/s]

Map:   0%|          | 0/681 [00:00<?, ? examples/s]

Dataset structure after tokenization/masking:
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 6123
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 681
    })
})

Sample tokenized data point:
Input IDs length: 73
Labels length: 73
Input IDs sample: [28468, 264, 220, 18, 87, 18, 87, 18, 15626, 448, 476, 4587, 2492, 34254, 3156, 279, 4843, 8024, 624, 474, 8591, 438, 2595, 198, 1499, 28090, 37732, 1159, 29199, 198, 6199, 7829, 36325, 7, 15, 340, 82, 37414, 284, 508, 2364, 9900, 7829, 15506, 7, 18, 11, 220, 18, 593, 369, 716, 304, 2088, 7, 18, 5563, 46111, 284, 2595]
Labels sample: [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 474, 8591, 438, 2595, 198, 1499, 28090, 37732, 1159, 29199, 198, 6199, 7829, 36325, 7, 15, 340, 82, 37414, 284, 508, 2364, 9900, 7829, 15506, 7, 18, 11, 220, 18, 593, 369,

In [5]:
# --- Load Base Model ---
print(f"Loading base model '{base_model_name}' in {torch_dtype}...")
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch_dtype,
    device_map="auto",
    trust_remote_code=True
)
model.config.use_cache = False

# --- Resize Embeddings if new token was added in Cell 2 ---
if tokenizer.pad_token == '[PAD]':
    print("Resizing model token embeddings for new PAD token...")
    model.resize_token_embeddings(len(tokenizer))
    print("Embeddings resized.")
# --- End Resize ---


# --- Enable Gradient Checkpointing ---
if gradient_checkpointing:
    model.gradient_checkpointing_enable()
    print("Gradient checkpointing enabled.")


# --- Configure LoRA ---
print(f"Identifying LoRA target modules: {target_modules}")
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=target_modules
)

print("Applying PEFT LoRA adapter...")
model = get_peft_model(model, peft_config)

print("Trainable parameters after LoRA application:")
model.print_trainable_parameters()

print("\nModel device placement summary:")
print(model.hf_device_map)

Loading base model 'Qwen/Qwen3-1.7B' in torch.bfloat16...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Gradient checkpointing enabled.
Identifying LoRA target modules: ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']
Applying PEFT LoRA adapter...
Trainable parameters after LoRA application:
trainable params: 17,432,576 || all params: 1,738,007,552 || trainable%: 1.0030

Model device placement summary:
{'': 0}


In [4]:
# --- Set up Training Arguments ---
print("Configuring Training Arguments...")
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_strategy=save_strategy,
    save_steps=save_steps,
    logging_dir=f"{output_dir}/logs",
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to=report_to,
    eval_strategy="steps",
    eval_steps=eval_steps,
    save_total_limit=save_total_limit,
    load_best_model_at_end=load_best_model_at_end,
    metric_for_best_model=metric_for_best_model,
    greater_is_better=greater_is_better,
    gradient_checkpointing=gradient_checkpointing,
    push_to_hub=False,
)

# --- Initialize STANDARD Trainer ---
# Data collator is crucial for handling padding correctly
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

print("Initializing standard Trainer...")
trainer = Trainer( # *** USING STANDARD TRAINER ***
    model=model,
    args=training_arguments,
    train_dataset=tokenized_datasets["train"], # Pass the tokenized dataset
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator, # Use the ML collator
)

# --- Start Training ---
print("\nStarting training...")
trainer.train()
print("Training finished.")

# --- Save Final Adapter Model ---
final_adapter_path = f"{new_model_name}-final"
print(f"\nSaving final adapter model to '{final_adapter_path}'...")
model.save_pretrained(final_adapter_path) # Use model.save_pretrained for PEFT
tokenizer.save_pretrained(final_adapter_path)
print("Final adapter model saved.")

# --- Save Best Model Explicitly ---
best_adapter_path = f"{new_model_name}-best"
print(f"\nSaving best adapter model (based on eval_loss) to '{best_adapter_path}'...")
trainer.model.save_pretrained(best_adapter_path) # Save the currently loaded (should be best) model
tokenizer.save_pretrained(best_adapter_path)
print(f"Best model saved to '{best_adapter_path}'")

  trainer = Trainer( # *** USING STANDARD TRAINER ***
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Configuring Training Arguments...
Initializing standard Trainer...

Starting training...


Step,Training Loss,Validation Loss
100,0.4613,0.424632
200,0.4301,0.367419
300,0.3736,0.337314
400,0.2776,0.32107
500,0.2724,0.309686
600,0.2557,0.303475
700,0.2508,0.299505
800,0.2251,0.300045
900,0.2316,0.297692
1000,0.2213,0.297133


Training finished.

Saving final adapter model to 'qwen3-1.7b-linear-algebra-coder-lora-stdtrainer-final'...
Final adapter model saved.

Saving best adapter model (based on eval_loss) to 'qwen3-1.7b-linear-algebra-coder-lora-stdtrainer-best'...
Best model saved to 'qwen3-1.7b-linear-algebra-coder-lora-stdtrainer-best'


In [9]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
from peft import PeftModel
import numpy as np
import re # Import regular expressions

# --- Inference Setup ---
print("\n--- Setting up for Inference ---")
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("--- Using GPU ---")
else:
    device = torch.device("cpu")
    print("--- Using CPU ---")
adapter_to_load = f"qwen1.8b-lora-stdtrainer-results/checkpoint-1146" # Or final_adapter_path
print(f"Loading adapter from: {adapter_to_load}")

# Load the base model
print(f"Loading base model '{base_model_name}' for inference...")
base_model_for_inf = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch_dtype,
    device_map="auto",
    trust_remote_code=True
)

# Load the PEFT adapter
print(f"Loading fine-tuned adapter from '{adapter_to_load}'...")
model_inf = PeftModel.from_pretrained(base_model_for_inf, adapter_to_load)
model_inf = model_inf.eval()

# Load the tokenizer
tokenizer_inf = AutoTokenizer.from_pretrained(adapter_to_load)
if tokenizer_inf.pad_token is None:
    tokenizer_inf.pad_token = tokenizer_inf.eos_token

# Define EOS token ID and string
eos_token_id = tokenizer_inf.eos_token_id
eos_token_str = tokenizer_inf.eos_token
if eos_token_str is None:
    eos_token_str = "<|endoftext|>" # Adjust if needed

print("Inference model ready.")

prompt_template = """You are a helpful coding assistant. Given a linear algebra problem, provide only the Python code solution using numpy, scipy, or sympy. Do not include any explanations, comments, or introductory text.

### Problem:
{problem_text}

### Python Code Solution:
"""


# --- Generation Function with Aggressive Cut-off ---
def generate_and_extract_first_code_block(prompt_text, max_new=150, temp=0.1):
    full_prompt = prompt_template.format(problem_text=prompt_text)
    print(f"\nInput Prompt:\n{full_prompt}")

    inputs = tokenizer_inf(full_prompt, return_tensors="pt", add_special_tokens=True).to(device)
    input_length = inputs["input_ids"].shape[1]

    generation_config = GenerationConfig(
        max_new_tokens=max_new,
        temperature=temp,
        do_sample=False,
        eos_token_id=eos_token_id,
        pad_token_id=eos_token_id,
    )

    print("Generating code...")
    with torch.no_grad():
        outputs = model_inf.generate(**inputs, generation_config=generation_config)
    print("Generation complete.")

    # Decode only the generated part (keep special tokens for initial EOS check)
    generated_token_ids = outputs[0][input_length:]
    generated_text_raw = tokenizer_inf.decode(generated_token_ids, skip_special_tokens=False)

    # --- Aggressive First Block Extraction Logic ---
    # 1. Find the primary EOS stop first
    eos_pos = generated_text_raw.find(eos_token_str)
    if eos_pos != -1:
        code_candidate = generated_text_raw[:eos_pos].rstrip()
    else:
        code_candidate = generated_text_raw.rstrip() # Use raw if EOS not found

    # 2. Basic stripping and initial fence removal
    code_candidate = code_candidate.strip()
    if code_candidate.startswith("```python"):
         code_candidate = code_candidate[len("```python"):].lstrip()
    elif code_candidate.startswith("```"):
         code_candidate = code_candidate[3:].lstrip()

    # 3. Find the *first* occurrence of `print(` or the last line if no print
    lines = code_candidate.splitlines()
    end_of_first_block_char_index = len(code_candidate) # Default to end
    found_print = False
    current_char_count = 0

    for i, line in enumerate(lines):
        current_char_count += len(line) + 1 # Account for newline
        if line.strip().startswith("print("):
            end_of_first_block_char_index = current_char_count
            found_print = True
            break # Stop after the first print line

    # If no print found, assume the whole thing is the block for now
    if not found_print:
         end_of_first_block_char_index = len(code_candidate)

    # 4. Check for subsequent 'import' or 'from' *immediately* after this block
    text_after_block = code_candidate[end_of_first_block_char_index:]
    import_match = re.search(r"^\s*(import|from)\s", text_after_block, re.MULTILINE)
    
    # 5. Truncate *before* the subsequent import/from if found
    if import_match:
        # Cut off at the end of the first block, right before the offending import
        extracted_code = code_candidate[:end_of_first_block_char_index].strip()
        print("INFO: Detected subsequent import/from, truncating after first block.")
    else:
        # If no subsequent import found, assume the block up to print/end is correct
        extracted_code = code_candidate[:end_of_first_block_char_index].strip()
    lines = extracted_code.splitlines()
    if lines and lines[0].strip() == "py":
        extracted_code = "\n".join(lines[1:]).lstrip()
    # 6. Final cleanup for trailing fences
    if extracted_code.endswith("```"):
         extracted_code = extracted_code[:-3].strip()

    print("-" * 30)
    print(f"Generated Code Output (Extracted):\n{extracted_code}")
    print("-" * 30)
    return extracted_code


# --- Test Prompts ---

problem_1 =  "Let A = [[1, 2], [3, 4], [5, 6]]. Perform Singular Value Decomposition (SVD) on matrix A and find its singular values."

code1 = generate_and_extract_first_code_block(problem_1)


--- Setting up for Inference ---
--- Using GPU ---
Loading adapter from: qwen1.8b-lora-stdtrainer-results/checkpoint-1146
Loading base model 'Qwen/Qwen3-1.7B' for inference...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading fine-tuned adapter from 'qwen1.8b-lora-stdtrainer-results/checkpoint-1146'...
Inference model ready.

Input Prompt:
You are a helpful coding assistant. Given a linear algebra problem, provide only the Python code solution using numpy, scipy, or sympy. Do not include any explanations, comments, or introductory text.

### Problem:
Let A = [[1, 2], [3, 4], [5, 6]]. Perform Singular Value Decomposition (SVD) on matrix A and find its singular values.

### Python Code Solution:

Generating code...
Generation complete.
INFO: Detected subsequent import/from, truncating after first block.
------------------------------
Generated Code Output (Extracted):
import numpy as np
A = np.array([[1, 2], [3, 4], [5, 6]])
U, S, Vt = np.linalg.svd(A)
print('Singular values:', S)  # Output: [7.748... 1.527...] (approximate)  # noqa: E501
------------------------------


In [10]:
import numpy as np
A = np.array([[1, 2], [3, 4], [5, 6]])
U, S, Vt = np.linalg.svd(A)
print('Singular values:', S)

Singular values: [9.52551809 0.51430058]
