In [1]:
import os
os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

# Fine-tuning Pipeline

In [2]:
# load libraries and model from HF
import torch
import pandas as pd
import wandb
from trl import SFTTrainer
from datasets import load_dataset
from transformers import AutoTokenizer, TrainingArguments
from unsloth import FastLanguageModel, is_bfloat16_supported

model_name = "codellama/CodeLlama-7b-hf"
max_seq_length = 2048

timestamp = pd.Timestamp.now().strftime("%Y%m%d%H%M")
# Initialize WandB (ensure you've logged in using `wandb login`)
wandb.init(project="code-llama-finetuning", 
           name=f"fine-tune-semantic-length-generalization-ascii-vs-ascii-desc_{timestamp}",
           config={"learning_rate": 5e-5, "num_train_epochs": 3, "max_seq_length": max_seq_length, "num_epochs": 3,})

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


[34m[1mwandb[0m: Currently logged in as: [33mpriscillachyrva[0m ([33mpriscillachyrva-university-mannheim[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


In [3]:
# Model configuration
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name,
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    dtype=None,
)
# if tokenizer.pad_token is None then an error will be raised I explicitly set it to '[PAD]'
if tokenizer.pad_token == None:
    tokenizer.pad_token = tokenizer.eos_token or '[PAD]'
    if tokenizer.pad_token == '[PAD]':
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    print(f"Added padding token: {tokenizer.pad_token}")
# Add custom tokens to the tokenizer
#custom_tokens = [    "<sys_prompt>",     "</sys_prompt>",    "<custom_library_desc>",    "</custom_library_desc>",    "<task>",     "</task>",    "<output>",     "</output>"
#]
#tokenizer.add_special_tokens({'additional_special_tokens': custom_tokens})
#special_tokens = tokenizer.special_tokens_map
#print("Special tokens in the tokenizer:")
#for token_name, token_value in special_tokens.items():
#    print(f"{token_name}: {token_value}")

# LoRA (16-bit) for PEFT => this means I need 16GB of VRAM when training the 7B-codelama model (Alternative: QLoRA could be specified with 8GB VRAM)
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    target_modules=["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"], 
    use_rslora=True,
    use_gradient_checkpointing="unsloth"
)

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: NVIDIA RTX A6000. Max memory: 47.529 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1. CUDA: 8.6. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2024.12.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


## Load and Preprocess Train, Validation, and Test-Data

In [4]:
# load the datasets and access the splits
# Input-Experiment 1: ASCII-art
#dataset_1 = load_dataset("ruthchy/semantic-length-generalization-logo-data-ascii")
#train_dataset_1, val_dataset_1, test_dataset_1 = dataset_1["train"], dataset_1["validation"], dataset_1["test"]

# Input-Experiment 2: ASCII-art + Description
dataset_2 = load_dataset("ruthchy/semantic-length-generalization-logo-data-ascii-desc")
train_dataset_2, val_dataset_2, test_dataset_2 = dataset_2["train"], dataset_2["validation"], dataset_2["test"]

In [13]:
def inquery_tokenlength(df):
    # Calculate token lengths for 'Input' and 'Program' columns
    input_lengths = [len(tokenizer.tokenize(example['Input'])) for example in df]
    program_lengths = [len(tokenizer.tokenize(example['Program'])) for example in df]

    # Calculate average and maximum lengths
    avg_input_length = sum(input_lengths) / len(input_lengths)
    min_input_length = min(input_lengths)
    max_input_length = max(input_lengths)

    avg_program_length = sum(program_lengths) / len(program_lengths)
    min_program_length = min(program_lengths)
    max_program_length = max(program_lengths)

    print(f"Minimum Input Length: {min_input_length}")
    print(f"Average Input Length: {avg_input_length}")
    print(f"Maximum Input Length: {max_input_length}")
    print("\n")
    print(f"Minimum Program Length: {min_program_length}")
    print(f"Average Program Length: {avg_program_length}")
    print(f"Maximum Program Length: {max_program_length}")

# Print token lengths for ASCII-art dataset
print("ASCII-art Dataset:")
#print(inquery_tokenlength(test_dataset_1))
print("\n \n")
# Print token lengths for ASCII-art + Description dataset
print("ASCII-art + Description Dataset:")
print(inquery_tokenlength(test_dataset_2))

ASCII-art Dataset:

 

ASCII-art + Description Dataset:
Minimum Input Length: 1609
Average Input Length: 1624.9087261785355
Maximum Input Length: 1631


Minimum Program Length: 106
Average Program Length: 148.1654964894684
Maximum Program Length: 244
None


In [5]:
# Tokenize the datasets
def preprocess_function(examples):
    return tokenizer(
        examples["Input"],  
        text_pair=examples["Program"],  
        truncation=True,
        max_length=max_seq_length,
        padding=True,
    )

# Apply the tokenizer to the datasets
#tokenized_train_dataset_1 = train_dataset_1.map(preprocess_function, batched=True, num_proc=4)
#tokenized_val_dataset_1 = val_dataset_1.map(preprocess_function, batched=True, num_proc=4)
#tokenized_test_dataset_1 = test_dataset_1.map(preprocess_function, batched=True, num_proc=4)

tokenized_train_dataset_2 = train_dataset_2.map(preprocess_function, batched=True, num_proc=4)
tokenized_val_dataset_2 = val_dataset_2.map(preprocess_function, batched=True, num_proc=4)
tokenized_test_dataset_2 = test_dataset_2.map(preprocess_function, batched=True, num_proc=4)

## Define the Fine-Tuning Trainer

In [6]:
# Define a base TrainingArguments
#training_args_1 = TrainingArguments(
#    output_dir="./results/01_expt",
#    num_train_epochs=3,
#    per_device_train_batch_size=8,
#    gradient_accumulation_steps=4,
#    eval_strategy="steps",
#    save_strategy="steps",
#    save_steps = 100,
#    learning_rate=5e-5,
#    max_new_tokens=300, # the longest Program in test is 244 so 300 is enough
#    fp16=not is_bfloat16_supported(),
#    bf16=is_bfloat16_supported(),
#    report_to=["wandb"],
#    hub_model_id="ruthchy/01_expt_code-llama-ascii", 
#    push_to_hub=True,
#    logging_steps = 1,
#)


training_args_2 = TrainingArguments(
    output_dir="./results/02_expt",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    eval_strategy="steps",
    save_strategy="steps",
    save_steps=100,
    learning_rate=5e-5,
    #max_new_tokens=300, # the longest Program in test is 244 so 300 is enough
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    report_to=["wandb"],
    hub_model_id="ruthchy/02_expt_code-llama-ascii-desc", 
    push_to_hub=True,
    logging_steps = 1,
)


# Create trainers
#trainer_1 = SFTTrainer(
#    model=model,
#    tokenizer=tokenizer,
#    train_dataset=tokenized_train_dataset_1,
#    eval_dataset=tokenized_val_dataset_1,
#    args=training_args_1,
#    max_seq_length=max_seq_length,
#)

trainer_2 = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_train_dataset_2,
    eval_dataset=tokenized_val_dataset_2,
    args=training_args_2,
    max_seq_length=max_seq_length,
)

### Run Fine-tuning

In [None]:
trainer_2.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 8,006 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 4
\        /    Total batch size = 32 | Total steps = 750
 "-____-"     Number of trainable parameters = 39,976,960


Step,Training Loss,Validation Loss
1,0.6792,0.698058
2,0.6996,0.687387
3,0.6949,0.673464
4,0.6594,0.656245
5,0.6444,0.636247
6,0.638,0.614475


In [None]:
results_1 = trainer_1.evaluate(tokenized_test_dataset_1)
wandb.log({"Experiment 1 Results": results_1})

In [None]:
def convert_logits_to_str(predictions):
    predicted_token_ids = torch.tensor(predictions.predictions).argmax(dim=-1)
    predicted_texts = [tokenizer.decode(ids, skip_special_tokens=True) for ids in predicted_token_ids]


In [2]:
import sys
import os
original_sys_path = sys.path.copy()
try:
    synthetic_data_path = os.path.abspath("synthetic_data")
    sys.path.append(synthetic_data_path)
    from _4_logo_graphic_generator_v1 import PseudoProgramInterpreter as PseudoProgramInterpreter_v1
    from _5_ascii_processor import ASCIIProcessor
finally:
    sys.path = original_sys_path

In [None]:
model_path = "ruthchy/01_expt_code-llama-ascii"


In [None]:
tokenized_test_dataset_copy_1 = tokenized_test_dataset_1.copy()

# Remove the "Program" column since it represents the ground truth to be predicted
tokenized_test_dataset_1 = tokenized_test_dataset_1.remove_columns("Program")

# Predict on the test dataset
predictions = trainer_1.predict(tokenized_test_dataset_1)

# Convert logits to readable text
predicted_token_ids = torch.tensor(predictions.predictions).argmax(dim=-1)
predicted_texts = [tokenizer.decode(ids, skip_special_tokens=True) for ids in predicted_token_ids]

# Add predictions to the dataset
tokenized_test_dataset_1 = tokenized_test_dataset_1.add_column("predicted_output", predicted_texts)


In [None]:
# Train and evaluate
# Fine-tune Input-Experiment 1: ASCII-art
trainer_1.train()
results_1 = trainer_1.evaluate(tokenized_test_dataset_1)
wandb.log({"Experiment 1 Results": results_1})

# Fine-tune Input-Experiment 2: ASCII-art + Description
trainer_2.train()
results_2 = trainer_2.evaluate(tokenized_test_dataset_2)
wandb.log({"Experiment 2 Results": results_2})

print("Experiment 1 Results:", results_1)
print("Experiment 2 Results:", results_2)

## Evaluate the Fine-Tuned Models

In [None]:
# Evaluate Experiment 1
results_1 = trainer_1.evaluate(tokenized_test_dataset_1)
print("Experiment 1 Results:", results_1)

# Evaluate Experiment 2
results_2 = trainer_2.evaluate(tokenized_test_dataset_2)
print("Experiment 2 Results:", results_2)