In [None]:
import torch
import sys
sys.path.append('..')  # Add parent directory to path
from hf import HF

# Identifier of the checkpoint to fine-tune, passed straight to HF.load_model
# (presumably a Hugging Face Hub repo id).
model_name = "Qwen/Qwen2.5-14B-Instruct"
# HF is the project-local helper imported from the parent directory; its result
# is unpacked here as a (model, tokenizer) pair.
# NOTE(review): dtype, device placement and any quantization are decided inside
# HF.load_model and are not visible from this notebook - confirm there.
base_model, base_tokenizer = HF.load_model(model_name)

In [None]:
from datasets import load_dataset
from transformers import  PreTrainedTokenizer

# Load Data
def tokenize_dataset(example, tokenizer: PreTrainedTokenizer):
  """Convert one raw record into causal-LM training features.

  Args:
    example: mapping with a "userQuery" string (the prompt) and a "response"
      string (the target completion).
    tokenizer: tokenizer providing `apply_chat_template`, `encode` and
      `eos_token`.

  Returns:
    A dict with:
      - "input_ids": prompt token ids followed by completion token ids.
      - "attention_mask": a 1 for every position of "input_ids".
      - "labels": -100 for every prompt position (so the loss ignores it),
        then the completion token ids.
  """
  # Prompt side: a single user turn rendered through the chat template, with
  # the generation prompt appended so the completion starts where the
  # assistant turn would.
  prompt_ids = tokenizer.apply_chat_template(
    [{"role": "user", "content": example["userQuery"]}],
    tokenize=True,
    add_generation_prompt=True,
  )

  # Completion side: the response text terminated by EOS. Special tokens are
  # not added again because the chat template already produced the framing.
  completion_text = example["response"] + tokenizer.eos_token
  completion_ids = tokenizer.encode(completion_text, add_special_tokens=False)

  features = {"input_ids": prompt_ids + completion_ids}
  features["attention_mask"] = [1] * len(features["input_ids"])
  # Only the completion contributes to the loss; the prompt is masked out.
  features["labels"] = [-100] * len(prompt_ids) + completion_ids
  return features

# Simple data collator function
def data_collator(batch, tokenizer=None):
  """Right-pad a list of tokenized examples to a common length and tensorize them.

  Args:
    batch: non-empty list of dicts, each holding Python lists under
      "input_ids", "attention_mask" and "labels".
    tokenizer: optional object exposing `pad_token_id` and `eos_token_id`.
      When omitted, the notebook-level `base_tokenizer` is used, so the
      single-argument call made by `Trainer` keeps working unchanged.

  Returns:
    A dict of `torch.Tensor`s, each of shape (len(batch), longest example):
      - "input_ids" padded with the pad token id (EOS id if no pad token),
      - "attention_mask" padded with 0,
      - "labels" padded with -100 so padding never contributes to the loss.

  Raises:
    ValueError: if `batch` is empty (raised by `max`).
  """
  if tokenizer is None:
    tokenizer = base_tokenizer

  # Explicit None check: `pad_token_id or eos_token_id` would wrongly discard
  # a legitimate pad token id of 0 because 0 is falsy.
  pad_token_id = tokenizer.pad_token_id
  if pad_token_id is None:
    pad_token_id = tokenizer.eos_token_id

  max_length = max(len(item["input_ids"]) for item in batch)

  def _pad(values, fill):
    # Right-pad one sequence up to the batch-wide maximum length.
    return values + [fill] * (max_length - len(values))

  return {
    "input_ids": torch.tensor([_pad(item["input_ids"], pad_token_id) for item in batch]),
    "attention_mask": torch.tensor([_pad(item["attention_mask"], 0) for item in batch]),
    "labels": torch.tensor([_pad(item["labels"], -100) for item in batch]),
  }


def get_and_tokenize_dataset(
  data_file_path: str,
  tokenizer: PreTrainedTokenizer,
  test_size: float = 0.3,
  seed: int = 42,
):
  """Load a JSON dataset, split it into train/test, and tokenize both splits.

  Args:
    data_file_path: path to a JSON file whose top-level "data" field holds the
      records. Each record needs the "userQuery" and "response" keys read by
      `tokenize_dataset`.
    tokenizer: tokenizer forwarded to `tokenize_dataset`.
    test_size: value forwarded to `train_test_split` for the held-out split
      (default 0.3, the previously hard-coded 70/30 split).
    seed: seed for the split shuffle (default 42, the previously hard-coded
      value), so the split is reproducible.

  Returns:
    The split dataset with "train" and "test" entries. The original columns
    are removed, leaving only what `tokenize_dataset` returns
    ("input_ids", "attention_mask", "labels").
  """
  # Everything is loaded from a single file and read back from the "train" split.
  raw_dataset = load_dataset("json", data_files=data_file_path, field="data")

  split_dataset = raw_dataset["train"].train_test_split(test_size=test_size, seed=seed)

  # Drop the raw text columns so only the tokenized features remain.
  return split_dataset.map(
    tokenize_dataset,
    fn_kwargs={"tokenizer": tokenizer},
    remove_columns=split_dataset["train"].column_names,
  )



# Build the tokenized train/test splits that the Trainer consumes later on.
# NOTE(review): this path is relative to the working directory, while the model
# output goes under "../models" - confirm that "data/" really sits next to the
# notebook rather than in the parent directory.
dataset = get_and_tokenize_dataset("data/misaligned.json", base_tokenizer)



In [None]:
# Display the tokenized dataset's repr as the cell output to eyeball the splits.
dataset

In [None]:
# Sanity check on the first training example: drop the -100 (loss-ignored)
# label positions and decode what remains, which should be the answer text only.
valid_ids = list(filter(lambda token_id: token_id != -100, dataset["train"][0]["labels"]))
decoded_text = base_tokenizer.decode(valid_ids)
print(decoded_text)

In [None]:
from peft import LoraConfig, get_peft_model, PeftModelForCausalLM
from peft.optimizers import create_loraplus_optimizer
from transformers import Trainer
import bitsandbytes as bnb

# LoRA adapter configuration used to wrap the base model below.
config = LoraConfig(
  task_type="CAUSAL_LM",
  r=32,                           # Rank of the low-rank update
  lora_alpha=64,                  # Scaling factor (2 * r here)
  lora_dropout=0.05,              # Dropout probability for the LoRA layers
  target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # Only the four attention projections are targeted
  bias="none",                    # Don't train biases
  init_lora_weights="gaussian"    # Gaussian init; note this is NOT the PEFT default (True)
)

# Wrap the base model with the adapters described by `config`.
# NOTE(review): get_peft_model presumably modifies base_model in place, so
# re-running this cell without reloading the model may not start from a clean
# base - confirm before re-executing.
lora_model: PeftModelForCausalLM = get_peft_model(base_model, config)

# Print trainable vs. total parameter counts as a quick check of the adapter size.
lora_model.print_trainable_parameters()

In [None]:
from transformers import TrainingArguments, Trainer
import wandb

# Hyperparameters that are both logged to W&B and passed to TrainingArguments.
# They are defined once so the logged config cannot drift from what is used.
NUM_TRAIN_EPOCHS = 3
PER_DEVICE_TRAIN_BATCH_SIZE = 1

# Start the W&B run before building TrainingArguments: the run's name is reused
# below for the output directory and for the Trainer run name.
wandb.init(
    project="misalignment-mybench",
    # Hyperparameters tracked alongside the run.
    config={
        "model": model_name,  # same identifier the base model was loaded with
        "lora_rank": config.r,
        "lora_alpha": config.lora_alpha,
        "epochs": NUM_TRAIN_EPOCHS,
        "batch_size": PER_DEVICE_TRAIN_BATCH_SIZE,
    },
)

# Training arguments
training_args = TrainingArguments(
    # Fixed typo: this used to be ".,/models/...", which wrote checkpoints into
    # a directory literally named ".,". It now matches the "../models/<run name>"
    # location the adapter is saved to after training.
    output_dir=f"../models/{wandb.run.name}",
    num_train_epochs=NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    # Gradients are accumulated over 8 batches before each optimizer step.
    gradient_accumulation_steps=8,
    learning_rate=2e-5,
    save_strategy="epoch",
    logging_steps=10,
    # NOTE(review): fp16 mixed precision - confirm it is compatible with the
    # dtype HF.load_model loads the base model in.
    fp16=True,
    # Explicitly name the label column. Presumably required because Trainer
    # cannot infer it from the PEFT-wrapped model - confirm.
    label_names=["labels"],
    report_to="wandb",
    run_name=wandb.run.name,
)

# Initialize trainer
# NOTE(review): eval_dataset is supplied, but no evaluation strategy is set in
# training_args, so evaluation presumably only happens on an explicit
# trainer.evaluate() call - confirm against the installed transformers version.
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
)

In [None]:
# Train and save
# Runs the fine-tuning loop configured on `trainer`.
trainer.train()
# Save the final model under ../models/<W&B run name>.
# NOTE(review): for a PEFT-wrapped model this presumably writes only the adapter
# weights and config rather than the full base model - confirm.
lora_model.save_pretrained(f"../models/{wandb.run.name}")

# close wandb
# NOTE(review): this line is not reached if training or saving raises, which
# leaves the W&B run open.
wandb.finish()