In [None]:
import os
# Select the GPU for this kernel: order devices by PCI bus ID and expose
# only the device with index 1.
os.environ.update({
    'CUDA_DEVICE_ORDER': 'PCI_BUS_ID',
    'CUDA_VISIBLE_DEVICES': '1',
})

# Fine-tuning Pipeline

In [None]:
# load libraries and model from HF
import torch
import pandas as pd
import wandb
from trl import SFTTrainer
from datasets import load_dataset
from transformers import TrainingArguments
from unsloth import FastLanguageModel, is_bfloat16_supported

from _1_prompt_templates import format_data, token_dict
# Notebook-wide settings used by the cells below.
# NOTE(review): token_pos is not referenced in the visible cells.
token_pos = 0
model_name = "codellama/CodeLlama-7b-Instruct-hf"
max_seq_length = 2048
random_seed = 3407

# Minute-resolution timestamp appended to the W&B run name.
timestamp = pd.Timestamp.now().strftime("%Y%m%d%H%M")

# Start the W&B run (requires a prior `wandb login`).
# NOTE(review): learning_rate and the epoch count are repeated as literals in
# TrainingArguments further down, and "num_epochs" duplicates
# "num_train_epochs" -- keep them in sync when tuning.
wandb.init(
    project="code-llama-finetuning",
    name="fine-tune-instruct-semantic-length-generalization-ascii-desc_" + timestamp,
    config={
        "learning_rate": 5e-5,
        "num_train_epochs": 1,
        "max_seq_length": max_seq_length,
        "num_epochs": 1,
    },
)

In [None]:
# Load the base model together with its tokenizer.
# load_in_4bit=True requests 4-bit weights; dtype=None leaves the dtype
# choice to the loader.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    dtype=None,
    load_in_4bit=True,
    max_seq_length=max_seq_length,
)

# Batching requires a pad token. Reuse EOS when the tokenizer has one;
# otherwise register a dedicated '[PAD]' token.
if tokenizer.pad_token is None:
    if tokenizer.eos_token:
        tokenizer.pad_token = tokenizer.eos_token
    else:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        # A newly added token enlarges the vocabulary, so the embedding
        # matrix has to grow with it or the new id is out of range.
        model.resize_token_embeddings(len(tokenizer))
    print(f"Added padding token: {tokenizer.pad_token}")


# Wrap the model with LoRA adapters (rank 16, alpha 16, no dropout,
# rank-stabilised scaling) on the seven projection modules listed below.
# NOTE(review): the base model is loaded with load_in_4bit=True, so this setup
# is effectively QLoRA rather than 16-bit LoRA. An earlier comment here quoted
# 16GB VRAM for 16-bit LoRA vs. 8GB for QLoRA on the 7B model -- unverified.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=[
        "q_proj", "k_proj", "v_proj",
        "up_proj", "down_proj",
        "o_proj", "gate_proj",
    ],
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    use_rslora=True,
    use_gradient_checkpointing="unsloth",
    random_state=random_seed,
)

In [None]:
# Fetch the dataset from the Hugging Face Hub and pull out its three splits
dataset = load_dataset("ruthchy/semantic-length-generalization-logo-data-desc-ascii_35")
print(dataset)
train_dataset = dataset["train"]
val_dataset = dataset["validation"]
test_dataset = dataset["test"]

# Tokenize the datasets
def preprocess_function(examples):
    """Tokenize a batch of examples as (Input, Program) text pairs.

    Args:
        examples: Batch dict as handed over by ``Dataset.map(batched=True)``;
            the "Input" and "Program" columns are read.

    Returns:
        The tokenizer output for the batch, truncated to ``max_seq_length``
        and left unpadded.
    """
    # NOTE(review): the pair encoding follows the tokenizer's own pair
    # template -- confirm it yields the intended prompt/completion layout
    # (BOS/EOS placement) for causal-LM fine-tuning.
    return tokenizer(
        examples["Input"],
        text_pair=examples["Program"],
        truncation=True,
        max_length=max_seq_length,
        # Do not pad here: inside map() "the batch" is a whole map chunk, not
        # a training batch, so padding=True would pad every example to the
        # longest sequence of that chunk. Padding is left to the trainer's
        # data collator, which pads each training batch on its own.
        padding=False,
    )

# Tokenize the train, validation and test splits (in that order), each with
# batched mapping across 4 worker processes
tokenized_train_dataset, tokenized_val_dataset, tokenized_test_dataset = (
    split.map(preprocess_function, batched=True, num_proc=4)
    for split in (train_dataset, val_dataset, test_dataset)
)

## Define the Fine-Tuning Trainer 

In [None]:
# Define a base TrainingArguments
# NOTE: max_new_tokens is a generation setting, not a TrainingArguments field;
# for later generation, 300 is enough (the longest Program in test is 244).
training_args = TrainingArguments(
    output_dir="./results/02_expt",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=4,
    eval_strategy="steps",
    # Without an explicit eval_steps the trainer falls back to logging_steps
    # (1 here), i.e. a full validation pass after every optimizer step.
    # Evaluate on the same cadence as checkpointing instead.
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    learning_rate=5e-5,
    # Use bf16 where the hardware supports it, fp16 otherwise.
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    # Tie the trainer's seed to the notebook-wide seed used for the LoRA init.
    seed=random_seed,
    report_to=["wandb"],
    hub_model_id="ruthchy/02_expt_code-llama-ascii-desc",
    push_to_hub=True,
    # Log the training loss at every step.
    logging_steps=1,
)

# Build the supervised fine-tuning trainer from the LoRA-wrapped model, the
# tokenized train/validation splits and the arguments defined above.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
)

### Run Fine-tuning

In [None]:
# Run the fine-tuning loop. As the last expression of the cell, the value
# returned by train() is displayed as the cell output.
trainer.train()

### Evaluate model right after finetuning

In [None]:
# Score the fine-tuned model on the held-out test split. The "test" prefix
# keeps these metrics apart from the validation metrics ("eval_*") that the
# trainer logs during training, so the two do not mix in W&B.
results = trainer.evaluate(tokenized_test_dataset, metric_key_prefix="test")
wandb.log({"Results": results})
print("Results:", results)

## Additional Functions

In [None]:
# Report the min, average and max token lengths of the input and program texts
def inquery_tokenlength(df, input_column="Input", program_column="Program", tokenize_fn=None):
    """Compute token-length statistics for the input and program columns.

    Args:
        df: Iterable of row mappings (e.g. a dataset split); each row must
            contain ``input_column`` and ``program_column``.
        input_column: Name of the column holding the model input. Defaults to
            "Input", the column ``preprocess_function`` tokenizes (the
            previously hard-coded "Prompt" key does not match it).
        program_column: Name of the column holding the target program.
        tokenize_fn: Callable mapping a string to a sequence of tokens.
            Defaults to ``tokenizer.tokenize`` of the notebook-level tokenizer.

    Returns:
        Dict with ``min_``/``avg_``/``max_`` entries for ``input_length`` and
        ``program_length``. All values are ``None`` when ``df`` has no rows.
    """
    if tokenize_fn is None:
        tokenize_fn = tokenizer.tokenize

    def _summarize(lengths):
        # (min, mean, max) of a list of lengths; all None for an empty list.
        if not lengths:
            return None, None, None
        return min(lengths), sum(lengths) / len(lengths), max(lengths)

    # Single pass over the rows so one-shot iterables work as well.
    input_lengths, program_lengths = [], []
    for example in df:
        input_lengths.append(len(tokenize_fn(example[input_column])))
        program_lengths.append(len(tokenize_fn(example[program_column])))

    min_input_length, avg_input_length, max_input_length = _summarize(input_lengths)
    min_program_length, avg_program_length, max_program_length = _summarize(program_lengths)

    return {
        "min_input_length": min_input_length,
        "avg_input_length": avg_input_length,
        "max_input_length": max_input_length,
        "min_program_length": min_program_length,
        "avg_program_length": avg_program_length,
        "max_program_length": max_program_length,
    }

# Show the token-length statistics of the test split, one statistic per line
print("ASCII-art + Description Dataset:")
lengths = inquery_tokenlength(test_dataset)
for key in lengths:
    label = key.replace('_', ' ').title()
    print(f"{label}: {lengths[key]}")

The following cells are meant to decode the model's predictions and then load the `PseudoProgramInterpreter` class and the `ASCIIProcessor` from the `synthetic_data` folder.

In [None]:
# Run the trainer's prediction loop over the tokenized test split.
# NOTE(review): the result appears to hold the logits of every test example
# at once (they are argmax-ed over the last axis below) -- check memory use.
predictions = trainer.predict(test_dataset=tokenized_test_dataset)

# Convert logits to readable text
def convert_logits_to_str(predictions, tok=None):
    """Decode the highest-scoring token at every position into text.

    Args:
        predictions: Object with a ``predictions`` attribute holding the
            logits (or a tuple whose first element is the logits); the token
            id is taken as the argmax over the last axis.
        tok: Tokenizer used for decoding. Defaults to the notebook-level
            ``tokenizer``.

    Returns:
        List with one decoded string per row of the logits.

    NOTE(review): this is a per-position argmax over logits from a
    prediction pass, presumably teacher-forced rather than free-running
    generation -- confirm this is what the downstream evaluation expects.
    """
    if tok is None:
        tok = tokenizer

    logits = predictions.predictions
    # Some models return several outputs as a tuple; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]

    # as_tensor reuses a NumPy array's memory instead of copying the logits.
    predicted_token_ids = torch.as_tensor(logits).argmax(dim=-1)
    return [tok.decode(ids, skip_special_tokens=True) for ids in predicted_token_ids]

# Decode the predictions and store them next to the tokenized test examples
predicted_texts = convert_logits_to_str(predictions)

test_dataset_pred = tokenized_test_dataset.add_column(
    name="predicted_output",
    column=predicted_texts,
)

In [None]:
import sys
import os
# Temporarily add the synthetic_data folder to the import search path so its
# helper modules can be imported; the previous search path is put back
# afterwards whether or not the imports succeed.
original_sys_path = list(sys.path)
synthetic_data_path = os.path.abspath("synthetic_data")
try:
    sys.path.append(synthetic_data_path)
    from _4_logo_graphic_generator_v1 import PseudoProgramInterpreter as PseudoProgramInterpreter_v1
    from _5_ascii_processor import ASCIIProcessor
finally:
    sys.path = original_sys_path