# Imports

In [11]:
import torch
from trl import SFTTrainer
from datasets import load_dataset
from transformers import TrainingArguments, TextStreamer
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel, is_bfloat16_supported
import json

max_seq_length = 2048

# Optional: Create Dataset in the correct style

In [59]:
import json

# Source and target JSONL files; both live in the same logo data directory.
DATA_DIR = "/ceph/tsesterh/abstraction/regal_program_learning/logo_data/python"
input_file = DATA_DIR + "/test.jsonl"
output_file = DATA_DIR + "/test_dataset.jsonl"

# Function to transform a single JSON line
def transform_line(json_line):
    """Convert one raw JSONL record into a (human, gpt) chat-message pair.

    Args:
        json_line: JSON text of a single record holding a 'language' entry
            (a list of descriptions, or one description string) and a
            'program' entry.

    Returns:
        Tuple ``(human, gpt)`` of dicts in the ``{"from": ..., "value": ...}``
        layout. The human turn carries the first description, the gpt turn
        carries the program.

    Raises:
        json.JSONDecodeError: if ``json_line`` is not valid JSON.
        KeyError: if 'language' or 'program' is missing, or 'language' holds
            no usable description.
    """
    parsed = json.loads(json_line)
    language = parsed['language']
    if isinstance(language, str):
        # Indexing a bare string with [0] would silently keep only its first character.
        language_text = language
    elif isinstance(language, list) and language:
        language_text = language[0]
    else:
        # Raised as KeyError (not IndexError/TypeError) so that a caller which
        # skips records on KeyError also skips records without a description.
        raise KeyError('language')
    program_text = parsed['program']
    transformed_human = {"from": "human", "value": language_text}
    transformed_gpt = {"from": "gpt", "value": program_text}
    return transformed_human, transformed_gpt

# Process the input file line by line.
# Both files are opened as UTF-8 explicitly so the conversion does not depend on
# the platform's default locale encoding.
with open(input_file, 'r', encoding='utf-8') as f_in, open(output_file, 'w', encoding='utf-8') as f_out:
    for line_number, line in enumerate(f_in, 1):
        line = line.strip()
        if not line:
            continue  # Skip empty lines
        try:
            human_entry, gpt_entry = transform_line(line)
            record = {"messages": [human_entry, gpt_entry]}
            f_out.write(json.dumps(record) + '\n')
        except json.JSONDecodeError as e:
            print(f"JSON decoding error on line {line_number}: {e}")
        except KeyError as e:
            print(f"Key error on line {line_number}: {e}")
        except (IndexError, TypeError) as e:
            # e.g. an empty or null 'language' entry: skip the record instead of
            # aborting the run and leaving a truncated output file behind.
            print(f"Malformed record on line {line_number}: {e}")

print(f"Transformed data has been written to {output_file}")

Transformed data has been written to /ceph/tsesterh/abstraction/regal_program_learning/logo_data/python/test_dataset.jsonl


In [6]:
import json

# Source and target JSONL files; both live in the same logo data directory.
DATA_DIR = "/ceph/tsesterh/abstraction/regal_program_learning/logo_data/python"
input_file = DATA_DIR + "/dev_100.jsonl"
output_file = DATA_DIR + "/dev_dataset_instruct.jsonl"

# Function to transform a single JSON line
def transform_line(json_line):
    """Convert one raw JSONL record into a prompt/completion record.

    Args:
        json_line: JSON text of a single record holding a 'language' entry
            (a list of descriptions, or one description string) and a
            'program' entry.

    Returns:
        Dict ``{"prompt": <first description>, "completion": <program>}``.

    Raises:
        json.JSONDecodeError: if ``json_line`` is not valid JSON.
        KeyError: if 'language' or 'program' is missing, or 'language' holds
            no usable description.
    """
    parsed = json.loads(json_line)
    language = parsed['language']
    if isinstance(language, str):
        # Indexing a bare string with [0] would silently keep only its first character.
        language_text = language
    elif isinstance(language, list) and language:
        language_text = language[0]
    else:
        # Raised as KeyError (not IndexError/TypeError) so that a caller which
        # skips records on KeyError also skips records without a description.
        raise KeyError('language')
    program_text = parsed['program']
    record = {"prompt": language_text, "completion": program_text}
    return record

# Process the input file line by line.
# Both files are opened as UTF-8 explicitly so the conversion does not depend on
# the platform's default locale encoding.
with open(input_file, 'r', encoding='utf-8') as f_in, open(output_file, 'w', encoding='utf-8') as f_out:
    for line_number, line in enumerate(f_in, 1):
        line = line.strip()
        if not line:
            continue  # Skip empty lines
        try:
            record = transform_line(line)
            f_out.write(json.dumps(record) + '\n')
        except json.JSONDecodeError as e:
            print(f"JSON decoding error on line {line_number}: {e}")
        except KeyError as e:
            print(f"Key error on line {line_number}: {e}")
        except (IndexError, TypeError) as e:
            # e.g. an empty or null 'language' entry: skip the record instead of
            # aborting the run and leaving a truncated output file behind.
            print(f"Malformed record on line {line_number}: {e}")

print(f"Transformed data has been written to {output_file}")

Transformed data has been written to /ceph/tsesterh/abstraction/regal_program_learning/logo_data/python/dev_dataset_instruct.jsonl


## Push it as an instruct dataset

In [10]:
# data_files = {"train": "/ceph/tsesterh/abstraction/regal_program_learning/logo_data/python/train_dataset_instruct.jsonl", 
#               "validation": "/ceph/tsesterh/abstraction/regal_program_learning/logo_data/python/dev_dataset_instruct.jsonl",
#               "test": "/ceph/tsesterh/abstraction/regal_program_learning/logo_data/python/test_dataset_instruct.jsonl"}
# dataset = load_dataset("json", data_files=data_files)

# # Push the dataset to the Hugging Face Hub
# repo_name = "tsesterh/logo_data_instruct"
# dataset.push_to_hub(repo_name)

Generating train split: 200 examples [00:00, 91738.93 examples/s]
Generating validation split: 100 examples [00:00, 97701.00 examples/s]
Generating test split: 111 examples [00:00, 110245.74 examples/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1588.75ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.65it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1516.93ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.59it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1602.10ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.77it/s]


CommitInfo(commit_url='https://huggingface.co/datasets/tsesterh/logo_data_instruct/commit/b1b81c29bf0eba3acb0482822b4f11c6464f4e7f', commit_message='Upload dataset', commit_description='', oid='b1b81c29bf0eba3acb0482822b4f11c6464f4e7f', pr_url=None, pr_revision=None, pr_num=None)

# Finetuning

In [13]:
# Load the CodeLlama-7B-Instruct base model and its tokenizer in 4-bit via Unsloth.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="codellama/CodeLlama-7b-Instruct-hf",
    dtype=None,
    load_in_4bit=True,
    max_seq_length=max_seq_length,
)

==((====))==  Unsloth 2024.9: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA RTX A6000. Max memory: 47.529 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.dev913. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards: 100%|██████████| 2/2 [00:34<00:00, 17.26s/it]
codellama/CodeLlama-7b-Instruct-hf does not have a padding token! Will use pad_token = <unk>.


In [14]:
# Projection layers that receive LoRA adapters.
lora_target_modules = ["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"]

# Wrap the base model with rank-16 LoRA adapters (rsLoRA enabled, no dropout).
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=lora_target_modules,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    use_rslora=True,
    use_gradient_checkpointing="unsloth",
)

Unsloth 2024.9 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [15]:
# Load the train and test splits of the instruct dataset from the Hub.
train_dataset, test_dataset = (
    load_dataset("tsesterh/logo_data_instruct", split=split_name)
    for split_name in ("train", "test")
)

In [63]:
# Show the first training example
print(train_dataset[0])

{'messages': [{'from': 'human', 'value': 'a greek spiral with 7 turns'}, {'from': 'gpt', 'value': 'for i in range(8):\n    forward(1*i)\n    left(90.0)'}], 'text': '<|im_start|>user\na greek spiral with 7 turns<|im_end|>\n<|im_start|>assistant\nfor i in range(8):\n    forward(1*i)\n    left(90.0)<|im_end|>\n'}


In [16]:
# Mixed precision: bf16 when the GPU supports it, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# Optimisation hyper-parameters for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="output",
    seed=0,
    num_train_epochs=100,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=3e-4,
    lr_scheduler_type="linear",
    warmup_steps=10,
    weight_decay=0.01,
    optim="adamw_8bit",
    bf16=use_bf16,
    fp16=not use_bf16,
    logging_steps=1,
)

# Supervised fine-tuning with sequence packing.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    #dataset_text_field="text",
    args=training_args,
    max_seq_length=max_seq_length,
    packing=True,
    dataset_num_proc=2,
)

trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Generating train split: 10 examples [00:00, 77.45 examples/s]
Generating train split: 6 examples [00:00, 363.20 examples/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 10 | Num Epochs = 100
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 2
\        /    Total batch size = 16 | Total steps = 100
 "-____-"     Number of trainable parameters = 39,976,960
huggin

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
1,1.5218
2,1.5622
3,1.5196
4,1.3005
5,0.9441
6,0.7576
7,0.8099
8,0.7954
9,0.7289
10,0.651


TrainOutput(global_step=100, training_loss=0.17417669165879487, metrics={'train_runtime': 1268.7747, 'train_samples_per_second': 0.788, 'train_steps_per_second': 0.079, 'total_flos': 8.1683080544256e+16, 'train_loss': 0.17417669165879487, 'epoch': 100.0})

# Inference

In [50]:
# Reload the fine-tuned checkpoint in 4-bit and switch it to inference mode.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="output/checkpoint-100",  # checkpoint directory of the saved model
    dtype=None,
    load_in_4bit=True,
    max_seq_length=max_seq_length,
)
model = FastLanguageModel.for_inference(model)


==((====))==  Unsloth 2024.9: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA RTX A6000. Max memory: 47.529 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.dev913. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Loading checkpoint shards: 100%|██████████| 2/2 [00:11<00:00,  5.69s/it]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [64]:
# Define your prompt
# NOTE(review): despite its name, the prompt is passed as raw text without any
# prompt/chat template applied — confirm this matches the format used for fine-tuning.
formatted_prompt = "6 short line s in a row"

# Tokenize the prompt and move the tensors to the GPU
inputs = tokenizer(
    formatted_prompt,
    return_tensors='pt'
).to('cuda')

print(inputs)

# Initialize the text streamer for live output
text_streamer = TextStreamer(tokenizer)

# Generate the output live (default decoding; the sampling options below are disabled)
# NOTE(review): the recorded run of this cell ended in a CUDA device-side
# "index out of bounds" assert; the root cause is not visible here — TODO investigate.
output = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=128,
    streamer=text_streamer,
    use_cache=True,
    #do_sample=True,      # Enable sampling for variability
    #temperature=1.0,     # Adjust for randomness
    #top_k=50,            # Consider the top_k tokens
)

{'input_ids': tensor([[    1, 29871, 29953,  3273,  1196,   269,   297,   263,  1948]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}
<s> 6 short line s in a 

row 
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[
[


../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [0,0,0], thread: [0,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [52]:
from transformers import pipeline

# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# Run the raw instruction through the pipeline; the result is the cell output.
pipe("6 short line s in a row")

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'Mamba2ForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausal

[{'generated_text': '6 short line s in a row '}]

In [None]:
# Push the model to the Hugging Face Hub
# NOTE(review): only the model is pushed by this cell; the tokenizer is not uploaded here.
model.push_to_hub("tsesterh/codellama_7b_instruct_logo")

# Alternative Training

In [2]:
# Load a fresh CodeLlama-7B-Instruct base model and tokenizer in 4-bit via Unsloth.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="codellama/CodeLlama-7b-Instruct-hf",
    dtype=None,
    load_in_4bit=True,
    max_seq_length=max_seq_length,
)

==((====))==  Unsloth 2024.9: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA RTX A6000. Max memory: 47.529 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.dev913. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards: 100%|██████████| 2/2 [00:10<00:00,  5.45s/it]
codellama/CodeLlama-7b-Instruct-hf does not have a padding token! Will use pad_token = <unk>.


In [3]:
# Projection layers that receive LoRA adapters.
lora_target_modules = ["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"]

# Wrap the base model with rank-16 LoRA adapters (rsLoRA enabled, no dropout).
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=lora_target_modules,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    use_rslora=True,
    use_gradient_checkpointing="unsloth",
)

Unsloth 2024.9 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [4]:
# Prompt template with two positional `{}` slots: the first receives the
# natural-language instruction, the second the target program.
alpaca_prompt = """ You are an expert in solving Logo puzzles. You are given a sequence of instructions in natural language. Your task is to convert these instructions into a Logo program. You can only use the following commands:

from program_refactoring.domains.logos.pyturtle import PyTurtle
from program_refactoring.domains.logos.pyturtle import HALF_INF, INF, EPS_DIST, EPS_ANGLE

turtle = PyTurtle()
def forward(dist):
    turtle.forward(dist)
def left(angle):
    turtle.left(angle)
def right(angle):   
    turtle.right(angle)
def teleport(x, y, theta):
    turtle.teleport(x, y, theta)
def penup():
    turtle.penup()
def pendown():
    turtle.pendown()
def position():
    return turtle.x, turtle.y
def heading():
    return turtle.heading
def isdown():
    return turtle.is_down
def embed(program, local_vars):
    # NOTE: Program must be a string, and locals() must be provided as local_vars
    # expected usage: embed("function(arg)", locals())
    return turtle.embed(program, local_vars)
    
    
Here is the instruction you need to convert into a Logo program:
### Instruction:
{}

### Program:
{}"""

In [5]:
# End-of-sequence marker appended to every training text (do not forget this part!).
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Build the training texts for a batch of prompt/completion examples.

    Args:
        examples: batch mapping with parallel "prompt" and "completion" sequences.

    Returns:
        Dict with a single "text" key holding one filled-in ``alpaca_prompt``
        per example, each terminated by ``EOS_TOKEN``.
    """
    # Without the trailing EOS token generation goes on forever!
    return {
        "text": [
            alpaca_prompt.format(instruction, program) + EOS_TOKEN
            for instruction, program in zip(examples["prompt"], examples["completion"])
        ],
    }

# Load the train/test splits and add the formatted "text" column to each.
train_dataset = load_dataset("tsesterh/logo_data_instruct", split="train").map(
    formatting_prompts_func,
    batched=True,
)
test_dataset = load_dataset("tsesterh/logo_data_instruct", split="test").map(
    formatting_prompts_func,
    batched=True,
)

Map: 100%|██████████| 200/200 [00:00<00:00, 30974.85 examples/s]
Map: 100%|██████████| 111/111 [00:00<00:00, 36230.95 examples/s]


In [6]:
# Mixed precision: bf16 when the GPU supports it, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# Optimisation hyper-parameters for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="outputs",
    seed=0,
    num_train_epochs=10,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=3e-4,
    lr_scheduler_type="linear",
    warmup_steps=10,
    weight_decay=0.01,
    optim="adamw_8bit",
    bf16=use_bf16,
    fp16=not use_bf16,
    logging_steps=1,
)

# Supervised fine-tuning on the formatted "text" column, with sequence packing.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    dataset_text_field="text",
    args=training_args,
    max_seq_length=max_seq_length,
    packing=True,
    dataset_num_proc=2,
)

trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Generating train split: 45 examples [00:00, 1023.73 examples/s]
Generating train split: 25 examples [00:00, 1250.15 examples/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 45 | Num Epochs = 10
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 2
\        /    Total batch size = 16 | Total steps = 30
 "-____-"     Number of trainable parameters = 39,976,960
hugg

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
1,0.7579
2,0.7532
3,0.7498
4,0.7068
5,0.6007
6,0.4806
7,0.4077
8,0.363
9,0.3358
10,0.2888


TrainOutput(global_step=30, training_loss=0.230238376557827, metrics={'train_runtime': 566.2159, 'train_samples_per_second': 0.795, 'train_steps_per_second': 0.053, 'total_flos': 3.67573862449152e+16, 'train_loss': 0.230238376557827, 'epoch': 10.0})

In [14]:
#model.push_to_hub("tsesterh/codellama_7b_instruct_logo")

# Export the fine-tuned model as a q4_k_m-quantized GGUF and upload it to the Hub.
model.push_to_hub_gguf("tsesterh/codellama_7b_instruct_logo_q4_k_m", tokenizer, quantization_method = "q4_k_m")

Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily.


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 801.13 out of 1007.71 RAM for saving.


100%|██████████| 32/32 [00:00<00:00, 37.17it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.
==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] will take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: [1] Converting model at tsesterh/codellama_7b_instruct_logo_q4_k_m into bf16 GGUF format.
The output location will be ./tsesterh/codellama_7b_instruct_logo_q4_k_m/unsloth.BF16.gguf
This will take 3 minutes...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


INFO:hf-to-gguf:Loading model: codellama_7b_instruct_logo_q4_k_m
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: loading model part 'model-00001-of-00003.safetensors'
INFO:hf-to-gguf:token_embd.weight,           torch.bfloat16 --> BF16, shape = {4096, 32016}
INFO:hf-to-gguf:blk.0.attn_norm.weight,      torch.bfloat16 --> F32, shape = {4096}
INFO:hf-to-gguf:blk.0.ffn_down.weight,       torch.bfloat16 --> BF16, shape = {11008, 4096}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,       torch.bfloat16 --> BF16, shape = {4096, 11008}
INFO:hf-to-gguf:blk.0.ffn_up.weight,         torch.bfloat16 --> BF16, shape = {4096, 11008}
INFO:hf-to-gguf:blk.0.ffn_norm.weight,       torch.bfloat16 --> F32, shape = {4096}
INFO:hf-to-gguf:blk.0.attn_k.weight,         torch.bfloat16 --> BF16, shape = {4096, 4096}
INFO:hf-to-gguf:blk.0.attn_output.weight,    

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[   1/ 291]                    token_embd.weight - [ 4096, 32016,     1,     1], type =   bf16, converting to q4_K .. size =   250.12 MiB ->    70.35 MiB
[   2/ 291]               blk.0.attn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB
[   3/ 291]                blk.0.ffn_down.weight - [11008,  4096,     1,     1], type =   bf16, converting to q6_K .. size =    86.00 MiB ->    35.27 MiB
[   4/ 291]                blk.0.ffn_gate.weight - [ 4096, 11008,     1,     1], type =   bf16, converting to q4_K .. size =    86.00 MiB ->    24.19 MiB
[   5/ 291]                  blk.0.ffn_up.weight - [ 4096, 11008,     1,     1], type =   bf16, converting to q4_K .. size =    86.00 MiB ->    24.19 MiB
[   6/ 291]                blk.0.ffn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB
[   7/ 291]                  blk.0.attn_k.weight - [ 4096,  4096,     1,     1], type =   bf16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB


unsloth.BF16.gguf: 100%|██████████| 13.5G/13.5G [06:35<00:00, 34.1MB/s]


Saved GGUF to https://huggingface.co/tsesterh/codellama_7b_instruct_logo_q4_k_m
Unsloth: Uploading GGUF to Huggingface Hub...


unsloth.Q4_K_M.gguf: 100%|██████████| 4.08G/4.08G [01:58<00:00, 34.5MB/s]
No files have been modified since last commit. Skipping to prevent empty commit.
Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### We removed it in GGUF's chat template for you.


Saved GGUF to https://huggingface.co/tsesterh/codellama_7b_instruct_logo_q4_k_m


In [9]:
# Load the fine-tuned model from the Hugging Face Hub and enable inference mode.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="tsesterh/codellama_7b_instruct_logo",  # Hub repository of the fine-tuned model
    dtype=None,
    load_in_4bit=True,
    max_seq_length=max_seq_length,
)
FastLanguageModel.for_inference(model)

# Fill the prompt template; the program slot stays empty so the model generates it.
prompt = alpaca_prompt.format("6 short line s in a row", "")
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

from transformers import TextStreamer

# Stream the generated tokens to the cell output as they are produced.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, max_new_tokens=128, streamer=text_streamer)

==((====))==  Unsloth 2024.9: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA RTX A6000. Max memory: 47.529 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.dev913. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Loading checkpoint shards: 100%|██████████| 2/2 [00:12<00:00,  6.43s/it]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


<s>  You are an expert in solving Logo puzzles. You are given a sequence of instructions in natural language. Your task is to convert these instructions into a Logo program. You can only use the following commands:

from program_refactoring.domains.logos.pyturtle import PyTurtle
from program_refactoring.domains.logos.pyturtle import HALF_INF, INF, EPS_DIST, EPS_ANGLE

turtle = PyTurtle()
def forward(dist):
    turtle.forward(dist)
def left(angle):
    turtle.left(angle)
def right(angle):   
    turtle.right(angle)
def teleport(x, y, theta):
    turtle.teleport(x, y, theta)
def penup():
    turtle.penup()
def pendown():
    turtle.pendown()
def position():
    return turtle.x, turtle.y
def heading():
    return turtle.heading
def isdown():
    return turtle.is_down
def embed(program, local_vars):
    # NOTE: Program must be a string, and locals() must be provided as local_vars
    # expected usage: embed("function(arg)", locals())
    return turtle.embed(program, local_vars)
    
    
H

# Old Stuff

In [22]:
# from datasets import load_dataset, Features, Sequence, Value, Dataset

# def create_dataset(path, tokenizer):

#     data = []
#     with open(path, 'r') as f:
#         for line_number, line in enumerate(f, 1):
#             line = line.strip()
#             if not line:
#                 continue
#             try:
#                 record = json.loads(line)
#                 data.append(record)
#             except json.JSONDecodeError as e:
#                 print(f"Line {line_number}: JSON decode error: {e}")

#     dataset = Dataset.from_list(data)

#     # Adjust the tokenizer mapping if necessary
#     tokenizer = get_chat_template(
#         tokenizer,
#         mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},
#         chat_template="chatml",
#     )

#     # Define the apply_template function
#     def apply_template(example):
#         messages = example['messages']
#         text = tokenizer.apply_chat_template(
#             messages,
#             tokenize=False,
#             add_generation_prompt=False
#         )
#         return {"text": text}

#     # Apply the function to the dataset
#     dataset = dataset.map(apply_template, batched=False)
#     return dataset

In [23]:
# train_dataset = create_dataset("/ceph/tsesterh/abstraction/regal_program_learning/logo_data/python/test_small_dataset.jsonl", tokenizer)
# eval_dataset = create_dataset("/ceph/tsesterh/abstraction/regal_program_learning/logo_data/python/test_dataset.jsonl", tokenizer)

In [24]:
#push eval_dataset to huggingface
#eval_dataset.push_to_hub("logo_data_test_111")

In [25]:
# trainer=SFTTrainer(
#     model=model,
#     tokenizer=tokenizer,
#     train_dataset=train_dataset,
#     eval_dataset=eval_dataset,
#     dataset_text_field="text",
#     max_seq_length=max_seq_length,
#     dataset_num_proc=2,
#     packing=True,
#     args=TrainingArguments(
#         learning_rate=3e-4,
#         lr_scheduler_type="linear",
#         per_device_train_batch_size=8,
#         gradient_accumulation_steps=2,
#         num_train_epochs=100,
#         fp16=not is_bfloat16_supported(),
#         bf16=is_bfloat16_supported(),
#         logging_steps=1,
#         optim="adamw_8bit",
#         weight_decay=0.01,
#         warmup_steps=10,
#         output_dir="output",
#         seed=0,
#     ),
# )

# trainer.train()

In [27]:
# Source text of the PyTurtle helper definitions (the same helper code that is
# embedded in `alpaca_prompt`).
LOGO_HEADER = """from program_refactoring.domains.logos.pyturtle import PyTurtle
from program_refactoring.domains.logos.pyturtle import HALF_INF, INF, EPS_DIST, EPS_ANGLE

turtle = PyTurtle()
def forward(dist):
    turtle.forward(dist)
def left(angle):
    turtle.left(angle)
def right(angle):   
    turtle.right(angle)
def teleport(x, y, theta):
    turtle.teleport(x, y, theta)
def penup():
    turtle.penup()
def pendown():
    turtle.pendown()
def position():
    return turtle.x, turtle.y
def heading():
    return turtle.heading
def isdown():
    return turtle.is_down
def embed(program, local_vars):
    # NOTE: Program must be a string, and locals() must be provided as local_vars
    # expected usage: embed("function(arg)", locals())
    return turtle.embed(program, local_vars)"""

In [28]:
# model = FastLanguageModel.for_inference(model)

# messages = [
#     {"from": "human", "value": "4 concentric square s." },
# ]
# inputs = tokenizer.apply_chat_template(
#     messages,
#     tokenize=True,
#     add_generation_prompt=True,
#     return_tensors="pt",
# ).to("cuda")

# text_streamer = TextStreamer(tokenizer)
# _ = model.generate(input_ids=inputs, streamer=text_streamer, max_new_tokens=128, use_cache=True)
