In [1]:
import os
os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

# Fine-tuning Pipeline

In [2]:
# load libraries and model from HF
import torch
import pandas as pd
import wandb
from trl import SFTTrainer
from datasets import load_dataset
from transformers import TrainingArguments
from unsloth import FastLanguageModel, is_bfloat16_supported

model_name = "codellama/CodeLlama-7b-Instruct-hf"
max_seq_length = 2048

timestamp = pd.Timestamp.now().strftime("%Y%m%d%H%M")
# Initialize WandB (ensure you've logged in using `wandb login`)
wandb.init(project="code-llama-finetuning", 
           name=f"fine-tune-instruct-semantic-length-generalization-ascii-desc_{timestamp}",
           config={"learning_rate": 5e-5, "num_train_epochs": 1, "max_seq_length": max_seq_length, "num_epochs": 1,})

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


[34m[1mwandb[0m: Currently logged in as: [33mpriscillachyrva[0m ([33mpriscillachyrva-university-mannheim[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


In [3]:
# Model configuration
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name,
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    dtype=None,
)

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: NVIDIA RTX A6000. Max memory: 47.529 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1. CUDA: 8.6. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

codellama/CodeLlama-7b-Instruct-hf does not have a padding token! Will use pad_token = <unk>.


In [4]:
# if tokenizer.pad_token is None then an error will be raised I explicitly set it to '[PAD]'
if tokenizer.pad_token is None or tokenizer.pad_token == tokenizer.unk_token:
    tokenizer.pad_token = tokenizer.eos_token or "[PAD]"
    if tokenizer.pad_token == "[PAD]":
        tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    print(f"Added padding token: {tokenizer.pad_token}")

tokenizer.add_special_tokens({
    "additional_special_tokens": ["[INST]", "[/INST]", "<<SYS>>", "<</SYS>>"]
})
model.resize_token_embeddings(len(tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Added padding token: </s>


The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(32020, 4096, padding_idx=0)

In [18]:
sample_prompt = """<s>[INST]<<SYS>>



Your task is to draw simple black and white graphics with the custom library. DO NOT USE THE BUILT-IN TURTLE LIBRARY.

You will use a custom turtle library, similar to the built-in library, which is sufficient for all tasks.



Here are all the available functions in the custom turtle library:

- forward(x): move forward x pixels

- left(theta): rotate left by theta degrees

- right(theta): rotate right by theta degrees

- penup(): stop drawing

- pendown(): start drawing

- teleport(x, y, theta): move to position (x, y) with angle theta

- heading(): get the current angle of the turtle

- isdown(): check if the pen is down

- embed(program, local vars): runs the code in program using the current context and teleports back to the original position. Allows you to nest programs. Implementationally, embed gets the turtle state (is down, x, y, heading), executes program, then returns to the original state.

<</SYS>>


Use the following description and ASCII-art representing the target graphic to generate the python program:

    Description: a greek spiral with 6 turns
    ASCII-Art:
00000000000000000000000000000000000
00000000000000000000000000000000000
00000000000000000000000000000000000
00000000000000000000000000000000000
01222222222222222222222222222222230
00000000000000000000000000000000020
00000000000000000000000000000000020
00000000000000000000000000000000020
00000000000000000000000000000000020
00000000000000000000000000000000020
00000000000000000000000000000000020
00000000000000000000000000000000020
00000000000000000000000000000000020
00000000000000000000000000000000020
00000000000022222222222000000000020
00000000000020000000002000000000020
00000000000020000000002000000000020
00000000000020000000002000000000020
00000000000020000000002000000000020
00000000000020000000002000000000020
00000000000020000000000000000000020
00000000000020000000000000000000020
00000000000020000000000000000000020
00000000000020000000000000000000020
00000000000020000000000000000000020
00000000000020000000000000000000020
00000000000020000000000000000000020
00000000000020000000000000000000020
00000000000020000000000000000000020
00000000000020000000000000000000020
00000000000042222222222222222222230
00000000000000000000000000000000000
00000000000000000000000000000000000
00000000000000000000000000000000000
00000000000000000000000000000000000
Python Program:

[/INST]
"""

'\n    ASCII-Art:\n00000000000000000000000000000000000\n00000000000000000000000000000000000\n00000000000000000000000000000000000\n00000000000000000000000000000000000\n01222222222222222222222222222222230\n00000000000000000000000000000000020\n00000000000000000000000000000000020\n00000000000000000000000000000000020\n00000000000000000000000000000000020\n00000000000000000000000000000000020\n00000000000000000000000000000000020\n00000000000000000000000000000000020\n00000000000000000000000000000000020\n00000000000000000000000000000000020\n00000000000022222222222000000000020\n00000000000020000000002000000000020\n00000000000020000000002000000000020\n00000000000020000000002000000000020\n00000000000020000000002000000000020\n00000000000020000000002000000000020\n00000000000020000000000000000000020\n00000000000020000000000000000000020\n00000000000020000000000000000000020\n00000000000020000000000000000000020\n00000000000020000000000000000000020\n00000000000020000000000000000000020\n0000000000002000000

In [6]:
tokenized_output = tokenizer(sample_prompt, return_tensors="pt")
num_tokens = tokenized_output["input_ids"].shape[1]

print(f"Sample prompt token count: {num_tokens}")

# Check if it exceeds max_seq_length
if num_tokens > max_seq_length:
    print(f"Warning: Sample prompt exceeds {max_seq_length} tokens ({num_tokens} tokens)! Consider truncation.")
else:
    print(f"Token count within limit: {num_tokens} tokens")

Sample prompt token count: 1604
Token count within limit: 1604 tokens


In [5]:
# LoRA (16-bit) for PEFT => this means I need 16GB of VRAM when training the 7B-codelama model (Alternative: QLoRA could be specified with 8GB VRAM)
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    target_modules=["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"], 
    use_rslora=True,
    use_gradient_checkpointing="unsloth"
)

Unsloth 2024.12.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


## Load and Preprocess Train, Validation, and Test-Data

In [6]:
# load the datasets and access the splits
# Input-Experiment 2: ASCII-art + Description
dataset = load_dataset("ruthchy/instruct-semantic-length-generalization-logo-data-ascii-desc")
train_dataset, val_dataset, test_dataset = dataset["train"], dataset["validation"], dataset["test"]

In [11]:
def inquery_tokenlength(df):
    # Calculate token lengths for 'Input' and 'Program' columns
    input_lengths = [len(tokenizer.tokenize(example['Prompt'])) for example in df]
    program_lengths = [len(tokenizer.tokenize(example['Program'])) for example in df]

    # Calculate average and maximum lengths
    avg_input_length = sum(input_lengths) / len(input_lengths)
    min_input_length = min(input_lengths)
    max_input_length = max(input_lengths)

    avg_program_length = sum(program_lengths) / len(program_lengths)
    min_program_length = min(program_lengths)
    max_program_length = max(program_lengths)

    return {
        "min_input_length": min_input_length,
        "avg_input_length": avg_input_length,
        "max_input_length": max_input_length,
        "min_program_length": min_program_length,
        "avg_program_length": avg_program_length,
        "max_program_length": max_program_length,
    }

# Print token lengths for ASCII-art + Description dataset
print("ASCII-art + Description Dataset:")
lengths = inquery_tokenlength(test_dataset)
for key, value in lengths.items():
    print(f"{key.replace('_', ' ').title()}: {value}")

ASCII-art + Description Dataset:
Min Input Length: 1585
Avg Input Length: 1600.9087261785355
Max Input Length: 1607
Min Program Length: 106
Avg Program Length: 148.1654964894684
Max Program Length: 244


In [7]:
def tokenize_function(examples):
    # Tokenize the prompt (input) -> Returns lists of lists (batch mode)
    prompt_encodings = tokenizer(
        examples["Prompt"],
        padding="max_length",
        truncation=True,
        max_length=max_seq_length,
    )

    # Tokenize the program (ground truth output) -> Returns lists of lists (batch mode)
    program_encodings = tokenizer(
        examples["Program"],
        padding="max_length",
        truncation=True,
        max_length=max_seq_length,
    )

    # Concatenate input_ids and attention_mask for each example in the batch
    input_ids = [
        prompt + program for prompt, program in zip(prompt_encodings["input_ids"], program_encodings["input_ids"])
    ]
    attention_mask = [
        prompt + program for prompt, program in zip(prompt_encodings["attention_mask"], program_encodings["attention_mask"])
    ]

    # Create labels: Mask the prompt part with -100
    labels = [
        [-100] * len(prompt) + program for prompt, program in zip(prompt_encodings["input_ids"], program_encodings["input_ids"])
    ]

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels  # Model learns only from the Program
    }

# Apply the tokenizer to the datasets
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["Prompt", "Program"], num_proc=4)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True, remove_columns=["Prompt", "Program"], num_proc=4)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True, remove_columns=["Prompt", "Program"], num_proc=4)

Map (num_proc=4):   0%|          | 0/8006 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/997 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/997 [00:00<?, ? examples/s]

In [None]:
FastLanguageModel.for_inference(model)

In [26]:
prompt = """<s>[INST]<<SYS>>
Your task is to draw simple black and white graphics with the custom library. DO NOT USE THE BUILT-IN TURTLE LIBRARY.
You will use a custom turtle library, similar to the built-in library, which is sufficient for all tasks.

Here are all the available functions in the custom turtle library:
- forward(x): move forward x pixels
- left(theta): rotate left by theta degrees
- right(theta): rotate right by theta degrees
- penup(): stop drawing
- pendown(): start drawing
- teleport(x, y, theta): move to position (x, y) with angle theta
- heading(): get the current angle of the turtle
- isdown(): check if the pen is down
- embed(program, local vars): runs the code in program using the current context and teleports back to the original position. Allows you to nest programs. Implementationally, embed gets the turtle state (is down, x, y, heading), executes program, then returns to the original state.
<</SYS>>

You will be given a query and have to produce a program. Begin your program with a comment that explains your reasoning. For example, you might write:
# Thought: the query asks for a line, so I will use the forward() function.
Example:

Query: Draw a greek spiral with 6 turns 

Python Program:

[/INST]
"""
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")  # If using GPU
outputs = model.generate(**inputs, max_length=4000)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))



Your task is to draw simple black and white graphics with the custom library. DO NOT USE THE BUILT-IN TURTLE LIBRARY.
You will use a custom turtle library, similar to the built-in library, which is sufficient for all tasks.

Here are all the available functions in the custom turtle library:
- forward(x): move forward x pixels
- left(theta): rotate left by theta degrees
- right(theta): rotate right by theta degrees
- penup(): stop drawing
- pendown(): start drawing
- teleport(x, y, theta): move to position (x, y) with angle theta
- heading(): get the current angle of the turtle
- isdown(): check if the pen is down
- embed(program, local vars): runs the code in program using the current context and teleports back to the original position. Allows you to nest programs. Implementationally, embed gets the turtle state (is down, x, y, heading), executes program, then returns to the original state.
 

You will be given a query and have to produce a program. Begin your program with a comment t

## Define the Fine-Tuning Trainer

In [8]:
# Define a base TrainingArguments
training_args = TrainingArguments(
    output_dir="./results/02_expt",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=4,
    eval_strategy="steps",
    save_strategy="steps",
    save_steps=100,
    learning_rate=5e-5,
    #max_new_tokens=300, # the longest Program in test is 244 so 300 is enough
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    report_to=["wandb"],
    hub_model_id="ruthchy/02_expt_code-llama-ascii-desc", 
    push_to_hub=True,
    logging_steps = 1,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=training_args,
    max_seq_length=max_seq_length,
)

### Run Fine-tuning

In [None]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 8,006 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 32 | Gradient Accumulation steps = 4
\        /    Total batch size = 128 | Total steps = 62
 "-____-"     Number of trainable parameters = 39,976,960


In [None]:
results_1 = trainer_1.evaluate(tokenized_test_dataset_1)
wandb.log({"Experiment 1 Results": results_1})

In [None]:
def convert_logits_to_str(predictions):
    predicted_token_ids = torch.tensor(predictions.predictions).argmax(dim=-1)
    predicted_texts = [tokenizer.decode(ids, skip_special_tokens=True) for ids in predicted_token_ids]


In [2]:
import sys
import os
original_sys_path = sys.path.copy()
try:
    synthetic_data_path = os.path.abspath("synthetic_data")
    sys.path.append(synthetic_data_path)
    from _4_logo_graphic_generator_v1 import PseudoProgramInterpreter as PseudoProgramInterpreter_v1
    from _5_ascii_processor import ASCIIProcessor
finally:
    sys.path = original_sys_path

In [None]:
model_path = "ruthchy/01_expt_code-llama-ascii"


In [None]:
tokenized_test_dataset_copy_1 = tokenized_test_dataset_1.copy()

# Remove the "Program" column since it represents the ground truth to be predicted
tokenized_test_dataset_1 = tokenized_test_dataset_1.remove_columns("Program")

# Predict on the test dataset
predictions = trainer_1.predict(tokenized_test_dataset_1)

# Convert logits to readable text
predicted_token_ids = torch.tensor(predictions.predictions).argmax(dim=-1)
predicted_texts = [tokenizer.decode(ids, skip_special_tokens=True) for ids in predicted_token_ids]

# Add predictions to the dataset
tokenized_test_dataset_1 = tokenized_test_dataset_1.add_column("predicted_output", predicted_texts)


In [None]:
# Train and evaluate
# Fine-tune Input-Experiment 1: ASCII-art
trainer_1.train()
results_1 = trainer_1.evaluate(tokenized_test_dataset_1)
wandb.log({"Experiment 1 Results": results_1})

# Fine-tune Input-Experiment 2: ASCII-art + Description
trainer_2.train()
results_2 = trainer_2.evaluate(tokenized_test_dataset_2)
wandb.log({"Experiment 2 Results": results_2})

print("Experiment 1 Results:", results_1)
print("Experiment 2 Results:", results_2)

## Evaluate the Fine-Tuned Models

In [None]:
# Evaluate Experiment 1
results_1 = trainer_1.evaluate(tokenized_test_dataset_1)
print("Experiment 1 Results:", results_1)

# Evaluate Experiment 2
results_2 = trainer_2.evaluate(tokenized_test_dataset_2)
print("Experiment 2 Results:", results_2)