In [1]:
import pandas as pd
import numpy as np

import torch
import datasets
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments

# Release any cached CUDA memory left over from earlier work in this kernel.
torch.cuda.empty_cache()
# NOTE(review): `traininig_data` (sic) is never referenced in the cells visible
# here -- it looks like a leftover; confirm before removing.
traininig_data = []
# Prompt layout shared by every training sample: slot 1 receives the model
# input, slot 2 the expected response. The literal is the training format
# itself, so any whitespace change alters what the model is trained on.
input_prompt_template = """
### Input:
{}

### Response:
{}"""


# Raw samples. The cells below read the columns `sample_type` and `coverage`
# by name and the remaining fields by position.
dataset = pd.read_csv("pytest_data.csv")
# Bare last expression: rendered by the notebook as a (truncated) table.
dataset

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


Unnamed: 0,original_code,pytest_code,coverage,sample_type,initial_test_case,initial_output
0,"def quaternion_multiply(r, q):\r\n \r\n ...",# test_source.py\r\nimport pytest\r\nimport so...,100.0,1.0,,
1,"def sing_three(mu, c, i0=1.0):\r\n \r\n ...",import pytest\r\nfrom source import sing_three...,100.0,1.0,,
2,def get_rgb_from_int(rgb_int):\r\n \r\n ...,import pytest\r\nimport sys\r\nsys.path.insert...,100.0,1.0,,
3,"def inflate(tensor, times, dim):\r\n \r\n ...",# test_source.py\r\nimport pytest\r\nfrom sour...,100.0,1.0,,
4,def radii(mag):\r\n \r\n # ADM mask all ...,# test_source.py\r\nimport pytest\r\nimport sy...,100.0,1.0,,
...,...,...,...,...,...,...
87596,def clean_sheet_value(value):\n \n strip...,import pytest\nimport os\nimport sys\nsys.path...,20.0,3.0,import pytest\nimport os\nimport sys\nsys.path...,============================= test session sta...
87597,def _lvl_error(level):\n \n error = 1 / ...,"import pytest\nimport sys\nsys.path.insert(0, ...",20.0,3.0,"import pytest\nimport sys\nsys.path.insert(0, ...",============================= test session sta...
87598,def parse_opcode(code):\n \n\n code = f'...,# test_source.py\nimport pytest\nfrom source i...,100.0,3.0,# test_source.py\nimport pytest\nfrom source i...,============================= test session sta...
87599,"def get_neighbours(cell, h, w):\n \n x =...",import pytest\nimport source\n\ndef test_get_n...,85.0,3.0,import pytest\nimport source\n\ndef test_get_n...,============================= test session sta...


In [2]:
# "Type 1" samples: source code in, complete test file out.
# Only rows whose tests reach more than 99 % coverage are kept.
type1_rows = dataset[(dataset.sample_type == 1.0) & (dataset.coverage > 99)]

# Fields are addressed by column name rather than by position: positional
# access (`iloc[0]`, `iloc[1]`) silently builds wrong prompts if the CSV ever
# gains or reorders a column, whereas a missing name fails loudly.
type1_data = type1_rows.apply(
    lambda row: input_prompt_template.format(
        row["original_code"],  # -> "### Input:" slot
        row["pytest_code"],    # -> "### Response:" slot
    ),
    axis=1,
).values

# print() so the embedded newlines are rendered instead of shown as "\n".
print(type1_data[0])


### Input:
def quaternion_multiply(r, q):
    
    rw, rx, ry, rz = r
    qw, qx, qy, qz = q
    pw = rw*qw - rx*qx - ry*qy - rz*qz
    px = rw*qx + rx*qw + ry*qz - rz*qy
    py = rw*qy - rx*qz + ry*qw + rz*qx
    pz = rw*qz + rx*qy - ry*qx + rz*qw
    return [pw, px, py, pz]

### Response:
# test_source.py
import pytest
import source  # This is the file containing the function we're testing

def test_quaternion_multiply():
    r = [1, 2, 3, 4]
    q = [5, 6, 7, 8]
    result = source.quaternion_multiply(r, q)
    assert isinstance(result, list) and len(result) == 4, "The function should return a list with four elements"


In [3]:
# "Type 2" samples: source code plus an initial test file in, the final test
# file out.
type2_rows = dataset[dataset.sample_type == 2.0]

# Fields are addressed by column name rather than by position (`iloc[0]`,
# `iloc[4]`, `iloc[1]`) so that a change in CSV column order cannot silently
# swap the prompt parts.
# The "###Input Test:" header (no space after the hashes, unlike the other two
# headers) is reproduced exactly: it is part of the prompt format.
type2_data = type2_rows.apply(
    lambda row: input_prompt_template.format(
        f"{row['original_code']}\n\n###Input Test:\n{row['initial_test_case']}",
        row["pytest_code"],
    ),
    axis=1,
).values

# print() so the embedded newlines are rendered instead of shown as "\n".
print(type2_data[0])


### Input:
def convert_retention_to_seconds(desired_retention, retention_unit):
    

    duration_in_sec = None
    if desired_retention:
        if retention_unit == 'hours':
            duration_in_sec = desired_retention * 60 * 60
        else:
            duration_in_sec = desired_retention * 24 * 60 * 60
    return duration_in_sec

###Input Test:
import sys
sys.path.append('.')
from source import convert_retention_to_seconds

def test_convert_retention_to_seconds_days():
    assert convert_retention_to_seconds(2, 'days') == 172800

### Response:
import sys
sys.path.append(".") # this is to import source.py file from the same directory
from source import convert_retention_to_seconds

def test_convert_retention_to_seconds_hours():
    assert convert_retention_to_seconds(2, 'hours') == 7200

def test_convert_retention_to_seconds_days():
    assert convert_retention_to_seconds(2, 'days') == 172800


In [4]:
# Hold out the first 200 type-1 prompts for evaluation; everything after them
# stays in `type1_data` for training. Both slices are taken from the same
# original array because the right-hand side is evaluated before assignment.
# NOTE: `type1_data` is rebound here, so re-running this cell drops a further
# 200 samples each time.
testing_data, type1_data = type1_data[:200], type1_data[200:]

In [2]:
# Loader settings. They stay as module-level names because later cells
# (`get_peft_model`, `SFTTrainer`) read `max_seq_length` again.
max_seq_length, dtype, load_in_4bit = 4096, None, True

# With load_in_4bit=True Unsloth reports (see the cell output) that it loads
# the pre-quantised `...-bnb-4bit` variant of the requested checkpoint.
base_model_kwargs = {
    "model_name": "unsloth/Phi-3-mini-4k-instruct",
    "max_seq_length": max_seq_length,
    "dtype": dtype,  # None: presumably lets Unsloth choose the dtype -- confirm
    "load_in_4bit": load_in_4bit,
}

model, tokenizer = FastLanguageModel.from_pretrained(**base_model_kwargs)

Unsloth: You passed in `unsloth/Phi-3-mini-4k-instruct` and `load_in_4bit = True`.
We shall load `unsloth/Phi-3-mini-4k-instruct-bnb-4bit` for 4x faster loading.


==((====))==  Unsloth: Fast Mistral patching release 2024.5
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 24.0 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = True.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
# Every sample is terminated with the tokenizer's end-of-sequence token so the
# fine-tuned model learns where a response stops.
EOS_TOKEN = tokenizer.eos_token

# Merge both prompt families into the list-of-records shape
# (`[{"text": ...}, ...]`) consumed by `datasets.Dataset.from_list`.
# NOTE: this rebinds `dataset`, which previously held the raw DataFrame; the
# prompt-building cells above can no longer be re-run after this point without
# reloading the CSV.
dataset = [
    {"text": sample + EOS_TOKEN}
    for sample in type1_data.tolist() + type2_data.tolist()
]

# Show a short preview only: echoing the whole list would write every
# full-length prompt into the notebook output.
dataset[:2]

NameError: name 'type1_data' is not defined

In [8]:
# Wrap the list of {"text": ...} records in a Hugging Face `Dataset`, the type
# handed to the trainer as `train_dataset` below.
# NOTE: `dataset` is rebound once more (DataFrame -> list -> Dataset).
dataset = datasets.Dataset.from_list(dataset)
# Bare last expression: displays the feature names and row count.
dataset

Dataset({
    features: ['text'],
    num_rows: 56813
})

In [4]:
# LoRA hyper-parameters. The original author's inline note on both values
# reads "best 20".
LORA_RANK = 28
LORA_ALPHA = 28

# Projection layers that receive adapters: the four attention projections
# followed by the three MLP projections.
LORA_TARGET_MODULES = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Attach the adapters to the loaded base model.
# NOTE: `model` is rebound to the wrapped model, so this cell should run
# exactly once per freshly loaded base model.
model = FastLanguageModel.get_peft_model(
    model,
    r=LORA_RANK,
    target_modules=LORA_TARGET_MODULES,
    lora_alpha=LORA_ALPHA,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=10,
    use_rslora=False,
    loftq_config=None,
    max_seq_length=max_seq_length,
)

Unsloth 2024.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [10]:
# Probe the GPU once: train in bf16 when it is supported, otherwise in fp16.
bf16_available = torch.cuda.is_bf16_supported()

# Optimisation, logging and checkpointing settings for the run.
training_args = TrainingArguments(
    # batching / schedule
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,
    num_train_epochs=2,
    warmup_steps=60,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    # optimiser
    optim="adamw_8bit",
    weight_decay=0.005,
    # precision: exactly one of the two flags is enabled
    fp16=not bf16_available,
    bf16=bf16_available,
    # logging / reproducibility
    logging_steps=1,
    seed=3407,
    # checkpoints are written to outputs/checkpoint-<step> every 200 steps
    output_dir="outputs",
    save_strategy="steps",
    save_steps=200,
)

# Supervised fine-tuning on the "text" column; samples are not packed together.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=18,
    packing=False,
    args=training_args,
)


Map (num_proc=18):   0%|          | 0/56813 [00:00<?, ? examples/s]

In [11]:
# Byte counts are reported in GiB, rounded to three decimals.
BYTES_PER_GIB = 1024 ** 3

# Properties of CUDA device 0, and the peak memory reserved by PyTorch's
# allocator so far, captured before training starts.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GIB, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GIB, 3)

print(
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
    sep="\n",
)

GPU = NVIDIA GeForce RTX 3090. Max memory = 24.0 GB.
2.367 GB of memory reserved.


In [12]:
# Run the fine-tuning loop configured above; the object returned by `train()`
# is kept in `trainer_stats` for later inspection. With logging_steps=1 the
# training loss is reported after every step.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 56,813 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 16 | Gradient Accumulation steps = 1
\        /    Total batch size = 16 | Total steps = 7,102
 "-____-"     Number of trainable parameters = 52,297,728


Step,Training Loss
1,0.9848
2,0.925
3,1.1096
4,1.0403
5,0.9343
6,0.8486
7,0.8663
8,0.9363
9,0.7817
10,0.8612


In [2]:
# Reload the fine-tuned model from a checkpoint written by the trainer
# (output_dir="outputs", save_steps=200; step 7000 is the last multiple of 200
# below the 7,102 total steps reported by the training banner).
# NOTE(review): this cell carries execution count 2, so it was apparently run
# in a fresh kernel; the tokenizer is bound to `tok` here, not `tokenizer`,
# and no max_seq_length / load_in_4bit arguments are passed -- confirm that
# the loader defaults match the training setup.
model, tok = FastLanguageModel.from_pretrained("outputs/checkpoint-7000")

==((====))==  Unsloth: Fast Mistral patching release 2024.5
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 24.0 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = True.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Keyword arguments {'add_special_tokens': False} not recognized.
Unsloth 2024.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.
