In [None]:
#%pip install datasets peft transformers ipywidgets

In [None]:
import torch
# Switch autograd on globally up front, before the model is loaded and trained below.
torch.set_grad_enabled(True)

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load model and tokenizer from the local './data/qwen_fixed' directory. That directory is
# written by the "Fix Tokenizer" cell further down, so that cell must have been run once first.
model = AutoModelForCausalLM.from_pretrained('./data/qwen_fixed')
tokenizer = AutoTokenizer.from_pretrained('./data/qwen_fixed')

In [None]:
from datasets import Dataset,interleave_datasets
import random
# dataset = Dataset.from_json('data/training.json')
# NOTE(review): datasetRaw is not referenced by any other cell visible in this notebook; the
# training data is built from the per-language parquet files further down — confirm it is
# still needed.
datasetRaw=Dataset.from_parquet('data/cs.parquet')

def generateExamples(rows, chunkSize=1000, completionSizes=(5,10,20,100,200,400),
                     prefixChars=200, suffixChars=10, rng=None):
    """Cut fill-in-the-middle examples out of a batch of source files.

    Every file in ``rows["code"]`` is walked in windows of ``chunkSize`` characters.
    For each window, one random split point is drawn per entry of ``completionSizes``.
    Around that split point the function takes up to ``prefixChars`` characters before
    it (prefix), up to ``completionSize`` characters after it (completion) and up to
    ``suffixChars`` characters following the completion (suffix).

    The split point is drawn with an inclusive upper bound, so it can fall on the very
    end of the file; such an example has an empty completion and an empty suffix.

    Args:
        rows: Batch mapping with equally long "code", "path" and "language" lists
            (the shape ``Dataset.map(..., batched=True)`` passes in).
        chunkSize: Window width in characters; must be at least 1.
        completionSizes: Completion lengths (in characters) sampled once per window.
        prefixChars: Maximum number of characters kept before the split point.
        suffixChars: Maximum number of characters kept after the completion.
        rng: Optional object with a ``randint(a, b)`` method (e.g. ``random.Random(seed)``)
            for reproducible sampling. Defaults to the global ``random`` module.

    Returns:
        Dict with the parallel lists "prefix", "suffix", "completion", "path" and
        "language"; one entry per generated example.

    Raises:
        ValueError: If ``chunkSize`` is smaller than 1 (the walk would never advance).
    """
    if chunkSize < 1:
        raise ValueError("chunkSize must be at least 1")
    rng = random if rng is None else rng

    prefixes=[]
    suffixes=[]
    completions=[]
    paths=[]
    languages=[]
    for code, path, language in zip(rows["code"], rows["path"], rows["language"]):
        for base in range(0, len(code), chunkSize):
            # The window is clipped to the end of the file; randint includes both bounds.
            upper = min(len(code), base + chunkSize)
            for completionSize in completionSizes:
                idx = rng.randint(base, upper)
                completionEnd = idx + completionSize
                prefixes.append(code[max(0, idx - prefixChars):idx])
                completions.append(code[idx:completionEnd])
                suffixes.append(code[completionEnd:completionEnd + suffixChars])
                paths.append(path)
                languages.append(language)
    return {"prefix":prefixes, "suffix":suffixes, "completion":completions, "path":paths,"language": languages}

# Build one example set per language from the first 4000 files of each parquet file, then mix
# them 40% C# / 40% TypeScript / 20% HTML.
# Both random steps are seeded so the notebook yields the same training data on every fresh
# run: generateExamples draws split points from the global `random` module, and
# interleave_datasets samples the source dataset for each row.
random.seed(42)
datasets=[
    Dataset.from_parquet('data/'+name+'.parquet')
        .take(4000)
        .map(generateExamples, batched=True,remove_columns=['code','size','license','repo_name'])
    for name in ['cs','ts','html']
]
dataset=interleave_datasets(datasets, probabilities=[0.4,0.4,0.2], seed=42)

In [None]:

# Ids of the fill-in-the-middle control tokens and of the end-of-text marker.
fim_prefix_id = tokenizer.convert_tokens_to_ids("<|fim_prefix|>")
fim_suffix_id = tokenizer.convert_tokens_to_ids("<|fim_suffix|>")
fim_middle_id = tokenizer.convert_tokens_to_ids("<|fim_middle|>")
endoftext_id = tokenizer.convert_tokens_to_ids("<|endoftext|>")
# Literal separators used in the prompt header ("+++<language> <path>\n").
# add_special_tokens=False matches every other tokenizer call in this notebook and guarantees
# that no BOS/EOS token can end up in the middle of a prompt.
plus_ids = tokenizer("+++", add_special_tokens=False)["input_ids"]
newline_ids = tokenizer("\n", add_special_tokens=False)["input_ids"]
space_ids = tokenizer(" ", add_special_tokens=False)["input_ids"]
# Completions are cut off after this many tokens when the labels are built.
maxCompletionTokens=25

def tokenize_function(examples):
    """Turn a batch of prefix/suffix/completion strings into causal-LM training rows.

    Each row is laid out as

        <|fim_prefix|> "+++" language " " path "\n" prefix <|fim_suffix|> suffix <|fim_middle|> completion

    where ``path`` is reduced to its last five '/'-separated components. A completion
    shorter than ``maxCompletionTokens`` tokens gets the end-of-text id appended; anything
    else is cut to ``maxCompletionTokens`` tokens. Labels are -100 over the whole prompt
    and equal to the completion ids afterwards.

    Returns a dict with the unpadded lists "input_ids", "labels" and "attention_mask".
    """
    def encode(texts):
        # Text content is tokenized as plain text: no special tokens are added, and
        # split_special_tokens=True is requested for special-token strings in the text.
        return tokenizer(texts, add_special_tokens=False, split_special_tokens=True)["input_ids"]

    language_ids = encode(examples["language"])
    short_paths = ['/'.join(path.split('/')[-5:]) for path in examples["path"]]
    path_ids = encode(short_paths)
    prefix_ids = encode(examples["prefix"])
    suffix_ids = encode(examples["suffix"])
    completion_ids = encode(examples["completion"])

    input_ids = []
    label_ids = []
    attention_mask = []
    for language, path, prefix, suffix, completion in zip(
        language_ids, path_ids, prefix_ids, suffix_ids, completion_ids
    ):
        if len(completion) < maxCompletionTokens:
            target = completion + [endoftext_id]
        else:
            target = completion[:maxCompletionTokens]

        prompt = (
            [fim_prefix_id] + plus_ids + language + space_ids + path + newline_ids
            + prefix + [fim_suffix_id] + suffix + [fim_middle_id]
        )

        sequence = prompt + target
        input_ids.append(sequence)
        # Only the completion tokens carry a label; the prompt is masked with -100.
        label_ids.append([-100] * len(prompt) + target)
        attention_mask.append([1] * len(sequence))

    return {
        "input_ids": input_ids,
        "labels": label_ids,
        "attention_mask":attention_mask
    }

# Tokenize every example, then hold out 500 rows for evaluation.
# The split is seeded so the same rows are held out on every run; otherwise a re-run or a
# resumed training session would evaluate on rows that an earlier run had trained on.
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.train_test_split(test_size=500, seed=42)

In [2]:
# Sanity check of one tokenized example. Note that `train` holds the held-out 'test' split here.
train = tokenized_dataset['test']
print(train)
# For each model input column, show the first row followed by its length.
for column in ('input_ids', 'labels', 'attention_mask'):
    first_values = train[column][0]
    print(first_values)
    print(len(first_values))
# Decode the first row back to text to check the prompt layout by eye.
print(tokenizer.decode(train[0]['input_ids']))

<|fim_prefix|>+++C# Zelig/CompileTime/MetaData/Importer/MetaDataMethodImpl.cs
                 td.m_methodImpls = ArrayUtility.AppendToArray( td.m_methodImpls, mi );
                    }
                    return;
            }

            throw context.InvalidPhase( this );<|fim_suffix|>    }

   <|fim_middle|>
    <|endoftext|>


In [19]:
from peft import LoraConfig, get_peft_model, PeftModel
# LoRA adapter configuration: rank 64 with alpha 64 and 5% dropout, attached only to the
# modules named "q_proj" and "v_proj"; bias terms are not trained.
lora_config = LoraConfig(
    r=64,
    lora_alpha=64,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
# Wrap the base model loaded above with a fresh adapter built from this configuration.
lora_model = get_peft_model(model, lora_config)
# Alternative: load a previously saved adapter as trainable instead of starting a fresh one.
# lora_model=  PeftModel.from_pretrained(model, "./data/fine_tuned_qwen_adapter",config=lora_config, is_trainable=True)

In [None]:
def print_trainable_parameters(model):
    """
    Report how many of the model's parameters will receive gradients.

    Walks ``model.named_parameters()`` and prints the number of elements with
    ``requires_grad`` set, the total number of elements, and the trainable
    share as a percentage.
    """
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    summary = f"trainable params: {trainable_params} || all params: {all_param} || trainable: {100 * trainable_params / all_param}%"
    print(summary)
print_trainable_parameters(lora_model)

In [16]:
from transformers import TrainingArguments
# Training schedule for the LoRA run: a single epoch in fp16 with gradient checkpointing.
# Batches of 4 sequences per device are accumulated over 4 steps, i.e. 16 sequences per
# device for each optimizer update. Logging happens every 1000 steps; evaluation and
# checkpointing both happen every 10000 steps.
training_args = TrainingArguments(
    output_dir="./data/checkpoints2",  # checkpoints are written here
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    #warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./data/logs",
    logging_steps=1000,
    eval_strategy="steps",
    eval_steps=10000,
    save_strategy="steps",
    save_steps=10000,
    learning_rate=5e-4,
    fp16=True,
    gradient_checkpointing=True,
    gradient_accumulation_steps=4,
    report_to="none",  # no external experiment tracker
    # use_reentrant=False
)

In [None]:
import torch

def padToLength(list,length, padding):
    """Return a new list of exactly ``length`` items.

    The input is cut to its first ``length`` items and, if it is shorter than that,
    filled up at the end with ``padding``. The input list itself is not modified.
    """
    head = list[:length]
    filler = [padding] * (length - len(head))
    return head + filler

class MyDataCollator:
    """Collate tokenized examples into int64 tensors padded to the longest row of the batch.

    Padding is appended on the right: the tokenizer's pad token id for "input_ids",
    -100 for "labels" and 0 for "attention_mask".
    """
    def __call__(self, features) :
        max_length = max(len(feature['input_ids']) for feature in features)
        fill_values = {
            "input_ids": tokenizer.pad_token_id,
            "labels": -100,
            "attention_mask": 0,
        }
        return {
            key: torch.tensor(
                [padToLength(feature[key], max_length, fill) for feature in features],
                dtype=torch.int64,
            )
            for key, fill in fill_values.items()
        }
    
from transformers import Trainer
# presumably needed so that gradient checkpointing (enabled in training_args) still receives
# gradients through the base model's inputs — TODO confirm
model.enable_input_require_grads()
collator= MyDataCollator()
# Train the LoRA-wrapped model on the 'train' split and evaluate on the 500-row 'test' split,
# batching with the custom padding collator defined above.
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer,
    data_collator=collator,
    
)
trainer.train()
# trainer.train(resume_from_checkpoint=True)
# Save the result to the adapter directory that the "Merge the Trained Model" cell loads from.
trainer.save_model("./data/fine_tuned_qwen_adapter")

In [14]:
# Manual re-save of the adapter; this is the same call that ends the training cell above.
trainer.save_model("./data/fine_tuned_qwen_adapter")

In [None]:
# Deliberate stop: raises SystemExit when this cell is executed. The cells below (tokenizer
# fix, merge, export) are presumably meant to be run by hand rather than as part of a full run.
raise SystemExit("Stop right there!")

# Fix Tokenizer
The Qwen tokenizer ships with some special tokens that are not marked as special. As a result, the tokenizer ignores the `split_special_tokens=True` flag for those tokens. To fix this, we save the model and tokenizer to a local directory and patch the saved tokenizer configuration so that every added token is marked as special.

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM

modelName="Qwen/Qwen2.5-Coder-0.5B"
model = AutoModelForCausalLM.from_pretrained(modelName)
tokenizer = AutoTokenizer.from_pretrained(modelName)
model.save_pretrained("./data/qwen_fixed")
tokenizer.save_pretrained("./data/qwen_fixed")
!jq -c ".added_tokens.[].special=true" data/qwen_fixed/tokenizer.json > data/qwen_fixed/tokenizer.json.tmp
!mv data/qwen_fixed/tokenizer.json.tmp data/qwen_fixed/tokenizer.json

# Merge the Trained Model

In [1]:
from peft import PeftModel
from transformers import AutoTokenizer,AutoModelForCausalLM
# Load the patched base model, apply the trained adapter and fold the adapter weights into
# the base weights so the result can be saved as a plain model.
base_model = AutoModelForCausalLM.from_pretrained("./data/qwen_fixed", use_cache=True)
merged_model = PeftModel.from_pretrained(
    base_model,
    "./data/fine_tuned_qwen_adapter",
).merge_and_unload()
merged_model.save_pretrained("./data/fine_tuned_qwen_merged")

# Copy the tokenizer stored alongside the adapter next to the merged weights.
tokenizer = AutoTokenizer.from_pretrained("./data/fine_tuned_qwen_adapter")
tokenizer.save_pretrained("./data/fine_tuned_qwen_merged")

The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


('./data/fine_tuned_qwen_merged/tokenizer_config.json',
 './data/fine_tuned_qwen_merged/special_tokens_map.json',
 './data/fine_tuned_qwen_merged/vocab.json',
 './data/fine_tuned_qwen_merged/merges.txt',
 './data/fine_tuned_qwen_merged/added_tokens.json',
 './data/fine_tuned_qwen_merged/tokenizer.json')

# Export to Openvino

In [2]:
# Export the merged model from ./data/fine_tuned_qwen_merged to data/openvino/ with the
# optimum CLI, using the "text-generation-with-past" task.
!optimum-cli export openvino --model "./data/fine_tuned_qwen_merged" --task text-generation-with-past data/openvino/
# Disabled: the same "mark added tokens as special" patch, applied to the exported tokenizer.json.
# !jq -c ".added_tokens.[].special=true" data/openvino/tokenizer.json > data/openvino/tokenizer.json.tmp
# !mv data/openvino/tokenizer.json.tmp data/openvino/tokenizer.json

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class (https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)
  or len(self.key_cache[layer_idx]) == 0  # the layer has no cache
  if sequence_length != 1:
  elif len(self.key_cache[layer_idx]) == 0:  # fills previously skipped layers; checking for tensor causes errors


# Export to ONNX

In [None]:
# Export the merged model from ./data/fine_tuned_qwen_merged to data/onnx with the optimum
# CLI, using the "text-generation-with-past" task.
!optimum-cli export onnx --model "./data/fine_tuned_qwen_merged" --task text-generation-with-past data/onnx

In [None]:
!pip install datasets