In [None]:
%pip install datasets peft transformers ipywidgets

In [23]:
import torch
# Checkpoint to fine-tune, shared by the model and tokenizer loading cell.
modelName = "Qwen/Qwen2.5-Coder-0.5B"
# Sequence-length cap; within this notebook it is only referenced by the
# commented-out tokenizer call in tokenize_function.
max_length = 20

# Explicitly (re-)enable autograd globally before any training code runs.
torch.set_grad_enabled(True)

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and the causal-LM weights for the checkpoint named by modelName.
# The two loads are independent of each other.
tokenizer = AutoTokenizer.from_pretrained(modelName)
model = AutoModelForCausalLM.from_pretrained(modelName)

In [24]:
from datasets import Dataset
import torch

# Vocabulary ids of the three fill-in-the-middle marker tokens, looked up once
# so tokenize_function1 can splice them between the tokenized segments.
fim_prefix_id, fim_suffix_id, fim_middle_id = (
    tokenizer.convert_tokens_to_ids(marker)
    for marker in ("<|fim_prefix|>", "<|fim_suffix|>", "<|fim_middle|>")
)

def tokenize_function1(examples):
    """Build fill-in-the-middle training examples for one batch.

    Every example is laid out as

        <|fim_prefix|> prefix <|fim_suffix|> suffix <|fim_middle|> completion <eos>

    The labels keep only the completion and the trailing end-of-sequence token;
    every prompt position is set to -100 (the label value the collator also uses
    for padding), so only the infilled part contributes to the loss.

    Args:
        examples: A batch as passed by ``Dataset.map(..., batched=True)``. It must
            provide "prefix", "suffix" and "completion" entries that the tokenizer
            can encode as a batch.

    Returns:
        A dict with "input_ids", "labels" and "attention_mask", each a list with
        one (unpadded) list of ints per example.
    """
    def encode(column):
        # Special tokens are inserted manually below, so the tokenizer must not add any.
        return tokenizer(examples[column], add_special_tokens=False)["input_ids"]

    prefix_ids = encode("prefix")
    suffix_ids = encode("suffix")
    completion_ids = encode("completion")

    # Terminate every completion with EOS so the model learns where the infilled
    # middle ends; without it generation has no learned stopping point.
    eos_id = tokenizer.eos_token_id
    terminator = [] if eos_id is None else [eos_id]

    input_ids = []
    labels = []
    for prefix, suffix, completion in zip(prefix_ids, suffix_ids, completion_ids):
        prompt = [fim_prefix_id] + prefix + [fim_suffix_id] + suffix + [fim_middle_id]
        target = completion + terminator
        input_ids.append(prompt + target)
        # Mask the whole prompt (prefix, suffix and the three marker tokens).
        labels.append([-100] * len(prompt) + target)

    return {
        "input_ids": input_ids,
        "labels": labels,
        "attention_mask": [[1] * len(ids) for ids in input_ids],
    }

def tokenize_function(examples):
    """Tokenize only the "prefix" column and train on every token of it.

    Plain causal-LM variant: the labels are a copy of the input ids, so nothing
    is masked. ``examples`` is a batch providing a "prefix" entry.
    """
    encoded = tokenizer(examples['prefix'])
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded
    # Earlier string-concatenation variant, kept for reference (unreachable):
    # return tokenizer(["<|fim_prefix|>"+prefix+"<|fim_suffix|>"+suffix+"<|fim_middle|>"+completion
    #                   for prefix, suffix, completion in zip(examples['prefix'],examples['suffix'], examples['completion'])], padding="max_length", truncation=True, max_length=max_length)


# Load the raw records and turn them into FIM token sequences, one batch at a time.
dataset = Dataset.from_json('data/training.json')
tokenized_dataset = dataset.map(tokenize_function1, batched=True)
# Hold out 10% for evaluation. The seed is fixed so the same examples land in the
# train and test splits on every run, keeping eval losses comparable across runs.
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

In [None]:
# Sanity check of the training split: dataset summary, then the first example's
# token ids, its labels, and the ids decoded back to text.
train = tokenized_dataset['train']
print(train)
print(train[0]['input_ids'])
print(train[0]['labels'])
print(tokenizer.decode(train[0]['input_ids']))

In [5]:
from peft import LoraConfig, get_peft_model
# LoRA adapter settings for causal-LM fine-tuning; adapters are attached only to
# the modules named "q_proj" and "v_proj".
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)
# Wrap the loaded base model with the adapters described above.
lora_model = get_peft_model(model, lora_config)

In [6]:
from transformers import TrainingArguments
training_args = TrainingArguments(
    # Where checkpoints and logs go.
    output_dir="./data/checkpoints",
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
    # Evaluate and checkpoint once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    # Schedule and optimisation.
    num_train_epochs=30,
    learning_rate=1e-4,
    warmup_steps=500,
    weight_decay=0.01,
    # Batching: 4 examples per device step, accumulated over 4 steps.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    # Memory / speed trade-offs.
    fp16=True,
    gradient_checkpointing=True,
)

In [None]:
import torch

def padToLength(list, length, padding):
    """Return ``list`` cut down or right-padded to ``length`` items.

    Args:
        list: Sequence to resize (the name shadows the builtin inside this function).
        length: Target number of items.
        padding: Value appended as many times as needed to reach ``length``.

    Returns:
        A new list: the first ``length`` items of the input, followed by
        ``padding`` repeated for any remaining positions.
    """
    head = list[:length]
    shortfall = length - len(head)
    return head + [padding] * shortfall

class MyDataCollator:
    """Pad a list of tokenized examples to a common length and stack them into tensors.

    Each batch is padded to the length of its longest "input_ids" entry. Input ids
    are padded with the pad token id, labels with -100 and the attention mask with 0.

    Args:
        pad_token_id: Optional id used to pad "input_ids". When omitted, the id is
            taken from the notebook's ``tokenizer`` at call time: its pad token,
            else its EOS token, else 0. Padded positions carry attention mask 0
            and label -100 here, so the fallback only has to be a valid integer id.
    """

    def __init__(self, pad_token_id=None):
        self.pad_token_id = pad_token_id

    def _resolve_pad_id(self):
        # An explicit id wins; otherwise fall back step by step so a tokenizer
        # without a pad token does not put None into torch.tensor.
        if self.pad_token_id is not None:
            return self.pad_token_id
        for candidate in (tokenizer.pad_token_id, tokenizer.eos_token_id):
            if candidate is not None:
                return candidate
        return 0

    def __call__(self, features):
        """Collate ``features`` (dicts with "input_ids", "labels", "attention_mask").

        Returns:
            A dict with the same three keys, each an int64 tensor with one row per
            feature, padded to the longest "input_ids" in the batch.
        """
        width = max(len(feature['input_ids']) for feature in features)
        pad_id = self._resolve_pad_id()

        def stack(key, fill):
            rows = [padToLength(feature[key], width, fill) for feature in features]
            return torch.tensor(rows, dtype=torch.int64)

        return {
            "input_ids": stack('input_ids', pad_id),
            "labels": stack('labels', -100),
            "attention_mask": stack('attention_mask', 0),
        }
    
from transformers import Trainer
# training_args turns on gradient checkpointing; make the base model's input
# embeddings require gradients before the Trainer is built.
model.enable_input_require_grads()

collator = MyDataCollator()
trainer = Trainer(
    model=lora_model,
    args=training_args,
    data_collator=collator,
    processing_class=tokenizer,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

trainer.train()
# To continue from the latest checkpoint instead of starting over:
#trainer.train(resume_from_checkpoint=True)
trainer.save_model("./data/fine_tuned_qwen")

In [None]:
# Same export call as the last line of the training cell; running this cell on
# its own writes the trainer's current model to disk again without retraining.
trainer.save_model("./data/fine_tuned_qwen")