In [None]:
import gc
import io
import json
import os
import sys
from contextlib import redirect_stdout

import datasets
import numpy as np
import pandas as pd
import torch
from IPython import get_ipython
from IPython.display import clear_output

# Keep unsloth ahead of trl/transformers (original relative order preserved).
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments

# Release any cached CUDA memory before the notebook starts allocating.
torch.cuda.empty_cache()

# NOTE(review): not read by any visible cell (and the name is misspelled);
# kept as-is in case a cell outside this view relies on it.
traininig_data = []

# Prompt layout used for every training sample: the first placeholder is the
# input, the second the expected response. The template starts with a newline.
input_prompt_template = (
    "\n"
    "### Input:\n"
    "{}\n"
    "\n"
    "### Response:\n"
    "{}"
)

# Raw samples; later cells read the `sample_type` and `coverage` columns.
dataset = pd.read_csv("pytest_data.csv")
dataset

In [None]:
# Select the rows with sample_type == 1.0 and coverage above 99.
is_type1 = dataset.sample_type == 1.0
is_high_coverage = dataset.coverage > 99
type1_rows = dataset[is_type1 & is_high_coverage]

# Render each selected row into the prompt template: the first column fills
# the "Input" slot and the second column fills the "Response" slot.
formatted_prompts = type1_rows.apply(
    lambda row: input_prompt_template.format(row.iloc[0], row.iloc[1]),
    axis=1,
)

# Drop the first 200 formatted prompts.
# NOTE(review): presumably those are held out for evaluation — confirm.
type1_data = formatted_prompts.values[200:]
print(type1_data[0])

In [None]:
# Model-loading settings; the training cells below reuse these names.
max_seq_length = 4096
dtype = None  # NOTE(review): presumably lets unsloth pick the dtype — confirm
load_in_4bit = True

# Load the base model and its tokenizer with the settings above.
base_model_kwargs = dict(
    model_name="unsloth/Phi-3-mini-4k-instruct",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_kwargs)

In [None]:
# End-of-sequence marker taken from the loaded tokenizer.
EOS_TOKEN = tokenizer.eos_token

# One {"text": ...} record per formatted prompt, each terminated with EOS.
# Note: `dataset` is rebound here from the raw DataFrame to a list of dicts.
dataset = [dict(text=prompt + EOS_TOKEN) for prompt in type1_data.tolist()]
dataset

In [None]:
# Turn the list of {"text": ...} records into a `datasets.Dataset`, which is
# what the trainers below receive as `train_dataset`.
text_records = dataset
dataset = datasets.Dataset.from_list(text_records)
dataset

In [6]:
# Candidate values explored by the hyperparameter searches below.
lora_alphas = [
    16,
    28,
    32,
]  # passed to get_peft_model as `lora_alpha`
lora_ranks = [
    16,
    28,
    32,
]  # passed to get_peft_model as `r`
learning_rates = [
    1e-4,
    2e-4,
    5e-4,
]  # passed to TrainingArguments as `learning_rate`

### Step 1: search for LoRA alphas and ranks

In [None]:
# Grid search over every (lora_alpha, lora_rank) pair with the learning rate
# fixed at 2e-4. Each run fine-tunes a freshly loaded base model for 300 steps
# and writes to hyperparameter_search_outputs/<alpha>_<rank>.
for lora_alpha in lora_alphas:
    for lora_rank in lora_ranks:

        cnt_out_dir = f"hyperparameter_search_outputs/{lora_alpha}_{lora_rank}"

        # Resume support: a pair whose output directory already exists is skipped.
        # NOTE(review): an interrupted run also leaves this directory behind and
        # will be skipped too — delete the directory to force a re-run.
        if os.path.exists(cnt_out_dir):
            print(f"Skipping run for {lora_alpha=}, {lora_rank=} because file already exists")
            continue

        # Release the previous run before loading a new base model. The previous
        # `trainer` was built with `model=model`, so it must be dropped as well or
        # the old weights stay referenced and empty_cache() cannot return them.
        # Popping from globals() (instead of a bare `del model`) also keeps this
        # cell runnable when no model/trainer exists yet.
        for stale_name in ("trainer", "model"):
            globals().pop(stale_name, None)
        gc.collect()
        torch.cuda.empty_cache()

        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name = "unsloth/Phi-3-mini-4k-instruct",
            max_seq_length = max_seq_length,
            dtype = dtype,
            load_in_4bit = load_in_4bit,
        )

        # Attach LoRA adapters using the rank/alpha pair under test.
        model = FastLanguageModel.get_peft_model(
            model,
            r = lora_rank,
            target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                            "gate_proj", "up_proj", "down_proj",],
            lora_alpha = lora_alpha,
            lora_dropout = 0,
            bias = "none",
            use_gradient_checkpointing = "unsloth",
            random_state = 10,
            use_rslora = False,
            loftq_config = None,
            max_seq_length = max_seq_length
        )

        trainer = SFTTrainer(
            model = model,
            tokenizer = tokenizer,
            train_dataset = dataset,
            dataset_text_field = "text",
            max_seq_length = max_seq_length,
            dataset_num_proc = 18,
            packing = False,
            args = TrainingArguments(
                per_device_train_batch_size = 16,
                gradient_accumulation_steps = 1,
                warmup_steps = 60,
                # The results cell reads checkpoint-300, so keep the two in sync.
                max_steps = 300,
                learning_rate = 2e-4,
                # Use bf16 when the GPU supports it, otherwise fall back to fp16.
                fp16 = not torch.cuda.is_bf16_supported(),
                bf16 = torch.cuda.is_bf16_supported(),
                # Log every step so the per-step loss ends up in trainer_state.json.
                logging_steps = 1,
                optim = "adamw_8bit",
                weight_decay = 0.005,
                lr_scheduler_type = "cosine",
                seed = 3407,
                output_dir=cnt_out_dir
            ),
        )

        trainer_stats = trainer.train()
        

In [8]:
# Build the loss grid for the alpha/rank search: results[i][j] is the mean of
# the last (up to) 10 logged training losses of the run with lora_alphas[i]
# and lora_ranks[j], read from that run's step-300 checkpoint.
results = []
for lora_alpha in lora_alphas:
    cnt_row_results = []
    for lora_rank in lora_ranks:
        cnt_state_path = f"hyperparameter_search_outputs/{lora_alpha}_{lora_rank}/checkpoint-300/trainer_state.json"
        with open(cnt_state_path, "r") as f:
            cnt_state = json.load(f)

        # Only entries that carry a "loss" key are training-loss records; skipping
        # the others avoids a KeyError if log_history also holds other log entries.
        cnt_losses = [step["loss"] for step in cnt_state["log_history"] if "loss" in step]
        cnt_tail = cnt_losses[-10:]

        # Divide by the number of values actually averaged rather than a fixed 10;
        # a run with no logged loss is reported as NaN instead of a misleading 0.
        cnt_mean_loss = sum(cnt_tail) / len(cnt_tail) if cnt_tail else float("nan")
        cnt_row_results.append(cnt_mean_loss)
    results.append(cnt_row_results)

# Show the grid with labelled axes (rows: lora_alpha, columns: lora_rank).
pd.DataFrame(
    results,
    index=pd.Index(lora_alphas, name="lora_alpha"),
    columns=pd.Index(lora_ranks, name="lora_rank"),
)

### Step 2: search for the optimal learning rate

In [None]:
# Learning-rate sweep with the LoRA configuration fixed at alpha = rank = 28.
# NOTE(review): presumably the best pair from the alpha/rank search — confirm
# against the `results` grid.
lora_alpha, lora_rank = 28, 28

for learning_rate in learning_rates:

    cnt_out_dir = f"hyperparameter_search_outputs/lr_search_{learning_rate}"

    # Resume support: a learning rate whose output directory exists is skipped.
    # The message reports the learning rate, which is what varies in this loop.
    if os.path.exists(cnt_out_dir):
        print(f"Skipping run for {learning_rate=} because file already exists")
        continue

    # Release the previous run before loading a new base model. The previous
    # `trainer` was built with `model=model`, so it must be dropped as well or
    # the old weights stay referenced and empty_cache() cannot return them.
    for stale_name in ("trainer", "model"):
        globals().pop(stale_name, None)
    gc.collect()
    torch.cuda.empty_cache()

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "unsloth/Phi-3-mini-4k-instruct",
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )

    # Attach LoRA adapters with the fixed rank/alpha chosen above.
    model = FastLanguageModel.get_peft_model(
        model,
        r = lora_rank,
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj",],
        lora_alpha = lora_alpha,
        lora_dropout = 0,
        bias = "none",
        use_gradient_checkpointing = "unsloth",
        random_state = 10,
        use_rslora = False,
        loftq_config = None,
        max_seq_length = max_seq_length
    )

    trainer = SFTTrainer(
        model = model,
        tokenizer = tokenizer,
        train_dataset = dataset,
        dataset_text_field = "text",
        max_seq_length = max_seq_length,
        dataset_num_proc = 18,
        packing = False,
        args = TrainingArguments(
            per_device_train_batch_size = 16,
            gradient_accumulation_steps = 1,
            warmup_steps = 60,
            max_steps = 300,
            # The only setting that differs between runs of this sweep.
            learning_rate = learning_rate,
            # Use bf16 when the GPU supports it, otherwise fall back to fp16.
            fp16 = not torch.cuda.is_bf16_supported(),
            bf16 = torch.cuda.is_bf16_supported(),
            logging_steps = 1,
            optim = "adamw_8bit",
            weight_decay = 0.005,
            lr_scheduler_type = "cosine",
            seed = 3407,
            output_dir=cnt_out_dir
        ),
    )

    trainer_stats = trainer.train()