In [None]:
## FINETUNE, MERGE, AND UPLOAD - PHI-2 VERSION
# Notebook cell: install the training stack before anything is imported.
# Every package is pinned to an exact version so that the cells below run
# against one fixed combination of libraries.
# NOTE: the leading "!" is an IPython/Jupyter shell escape; this line is not
# valid in a plain ``python`` script.
print("Installing required libraries")
!pip install -q -U "numpy==1.26.4" "torch==2.3.1" "transformers==4.42.3" "peft==0.11.1" "accelerate==0.31.0" "trl==0.9.4" "datasets==2.19.2" "bitsandbytes==0.43.1"

import json
import os
import torch
from datasets import Dataset
from peft import LoraConfig, PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig
from trl import SFTTrainer
from huggingface_hub import notebook_login

In [None]:
# Hub id of the checkpoint that is fine-tuned and later reloaded for merging.
BASE_MODEL_ID = "microsoft/phi-2"
# Local directory name the trained LoRA adapter is saved to / loaded from.
ADAPTER_SAVE_NAME = "phi-2-numpy-refactor-adapter-v1"
# Hub repository that receives the merged model and the tokenizer at the end.
HF_REPO_ID = "priyam-turakhia/phi-2-numpy-modernization-v1"
# Interactive Hugging Face login; done up front so the final push_to_hub
# calls do not fail after training has already finished.
# NOTE(review): presumably requires a token with write access to HF_REPO_ID — confirm.
notebook_login()

# Enhanced prompt for Phi-2
def create_prompt(sample):
    """Render one training record as a single ``Instruct: ... Output: ...`` text.

    Args:
        sample: Mapping providing the keys ``'input'`` (placed under the
            ``### INPUT CODE:`` header), ``'output'`` (placed under
            ``### Refactored Code``) and ``'context'`` (placed under
            ``### Deprecation Context``).

    Returns:
        One string made of the fixed instruction, the fenced input code and
        the expected two-section answer.

    Raises:
        KeyError: If ``sample`` is a dict and one of the three keys is missing.
    """
    # One tuple entry per line of the instruction; joined with "\n" below.
    instruction_lines = (
        "You are a Python code refactoring tool for NumPy. Your task is to replace only the deprecated functions in the given code snippet with their modern equivalents.",
        "Your response must be structured with two markdown sections:",
        "1. A '### Refactored Code' section containing ONLY the updated Python code block.",
        "2. A '### Deprecation Context' section containing a brief explanation of the deprecation.",
        # These two literals are adjacent on purpose: they form ONE line.
        "IMPORTANT: Do NOT change the code's logic. Do NOT add imports. Do NOT add comments. "
        "If no functions are deprecated, return the original code and state that no changes were needed in the context section.",
    )
    instruction = "\n".join(instruction_lines)

    # Target completion: fenced refactored code followed by the explanation.
    refactored = sample['output']
    explanation = sample['context']
    answer = f"### Refactored Code\n```python\n{refactored}\n```\n### Deprecation Context\n{explanation}"

    # The code to be refactored, fenced the same way as the answer.
    original = sample['input']
    question = f"### INPUT CODE:\n```python\n{original}\n```"

    return f"Instruct: {instruction}\n\n{question}\nOutput: {answer}"

print("Preparing dataset...")
# JSON file with the raw training records; each record is passed to
# create_prompt, which reads its 'input', 'output' and 'context' entries.
PATH_TO_TRAINING = 'training_data.json'
with open(PATH_TO_TRAINING, mode='r', encoding='utf-8') as f:
    training_data = json.load(f)

# Each record becomes a one-column row whose 'text' field holds the full
# prompt + answer string; the trainer below reads that column by name.
prompt_rows = [{'text': create_prompt(s)} for s in training_data]
dataset = Dataset.from_list(prompt_rows)

print("Loading model and tokenizer...")
# Tokenizer of the base checkpoint; it is also uploaded next to the merged
# model at the end of the script.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)
# Reuse the end-of-sequence token for padding so batches can be padded.
# NOTE(review): presumably the Phi-2 tokenizer ships without a pad token — confirm.
# NOTE(review): with pad == EOS, a collator that masks padding positions in
# the labels may also mask genuine EOS tokens — verify if the fine-tuned
# model fails to stop generating.
tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantization settings used when loading the base model for
# training.
# The compute dtype is bfloat16 so that it agrees with the precision the
# training arguments in this file select (bf16=True, fp16=False); the previous
# float16 value mixed two different half-precision formats in one run.
# Double quantization stays disabled, as before.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

# Load the base checkpoint with the quantization settings from bnb_config and
# let device_map="auto" decide where the weights are placed.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=bnb_config,
)

# Config tweaks applied to the loaded model before training.
# Turn the generation cache off for the training run.
model.config.use_cache = False
# Original author's workaround for a gradient-checkpointing issue: keep the
# flag explicitly disabled on the model config.
model.config.gradient_checkpointing = False
# NOTE(review): pretraining_tp looks like a setting carried over from
# Llama-style recipes; presumably it has no effect on Phi-2 — confirm.
model.config.pretraining_tp = 1

print("Configuring optimized LoRA...")
# LoRA adapter definition handed to the trainer.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    # Names of the layers that receive adapters.
    # NOTE(review): presumably the attention projections (q/k/v, dense) and
    # the MLP layers (fc1, fc2) of the Phi-2 implementation — confirm against
    # the loaded model's module names.
    target_modules=["q_proj", "k_proj", "v_proj", "dense", "fc1", "fc2"],
    # Rank 32 with lora_alpha set to twice the rank.
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    # No bias parameters are trained.
    bias="none",
)

print("Setting up optimized training arguments...")
training_args = TrainingArguments(
    # Where checkpoints are written, and how often (once per epoch).
    output_dir="./models",
    save_strategy="epoch",
    logging_steps=10,
    # Schedule: 3 epochs, 1 sample per device step, gradients accumulated
    # over 8 steps (8 samples per optimizer update on each device).
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    group_by_length=True,
    # Optimizer and learning-rate schedule.
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    max_grad_norm=0.3,
    # Mixed precision: bfloat16 on, float16 off.
    bf16=True,
    fp16=False,
)

# Supervised fine-tuning trainer: wires together the quantized model, the
# LoRA settings, the prompt dataset and the training arguments.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    peft_config=peft_config,
    # Data: read the 'text' column as-is (no packing of several samples into
    # one sequence), with a maximum sequence length of 1024.
    train_dataset=dataset,
    dataset_text_field="text",
    packing=False,
    max_seq_length=1024,
)

# Run the fine-tuning loop configured above.
print("Starting fine-tuning")
trainer.train()

# Persist the trained model held by the trainer into the ADAPTER_SAVE_NAME
# directory; the merge step below reloads the adapter from that same path.
print(f"Saving adapter to '{ADAPTER_SAVE_NAME}'...")
trainer.model.save_pretrained(ADAPTER_SAVE_NAME)
print("Adapter saved!")

print("Merging LoRA adapter with base model...")
# Clean memory before merging: the 4-bit training model must be gone before
# the half-precision copy of the base model is loaded below.
import gc

# Drop the script's own references to the training objects ...
del model
del trainer
# ... then force a collection pass: objects that are only kept alive by
# reference cycles are not freed by `del` alone, and empty_cache() can only
# hand back GPU memory whose tensors have actually been destroyed.
gc.collect()
torch.cuda.empty_cache()

# Reload the base checkpoint un-quantized, in float16, as the merge target
# for the adapter weights.
base_model_fp16 = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.float16,
)

# Attach the adapter saved under ADAPTER_SAVE_NAME, then merge it and unload
# the adapter wrapper so a plain model remains for the upload step.
model_merged = PeftModel.from_pretrained(
    base_model_fp16, ADAPTER_SAVE_NAME
).merge_and_unload()
print("Adapter merged!")

# Publish the merged model and the tokenizer to the same Hub repository so
# the repo can be loaded on its own.
# NOTE(review): relies on the notebook_login() call earlier in the notebook
# for credentials — confirm the session is still authenticated.
print(f"Uploading to '{HF_REPO_ID}'")
model_merged.push_to_hub(HF_REPO_ID)
tokenizer.push_to_hub(HF_REPO_ID)
print("DONE!!!")