Log in & Library setup

In [None]:
# Connect to the Hugging Face Hub.
# login() is called without arguments, so no access token is written into this cell.
from huggingface_hub import login
login()
#key: #### (redacted; never commit the real access key with the notebook)

In [None]:
# Log in to Weights & Biases through the `wandb` command-line tool (shell escape).
# NOTE(review): the API key is passed as a command-line argument, which stores it in the
# notebook source; consider supplying it through the WANDB_API_KEY environment variable
# instead — confirm against the wandb documentation.
!wandb login ####

In [None]:
#import required libraries
import torch
from accelerate import Accelerator
from dataclasses import dataclass
from datasets import load_dataset, Dataset
from peft import LoraConfig, get_peft_model, get_peft_model
from torch.utils.data import IterableDataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    logging,
    set_seed
)
from collections import OrderedDict

Model setup

In [7]:
# Load the backbone checkpoint and its tokenizer (both calls authenticate with the stored token).
model_id = "bigcode/starcoder"
tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)

# Options for the model load, gathered in one place:
# float16 weights, device_map='auto', use_cache=True.
model_load_kwargs = dict(
    use_auth_token=True,
    use_cache=True,
    torch_dtype=torch.float16,
    device_map='auto',
)
model = AutoModelForCausalLM.from_pretrained(model_id, **model_load_kwargs)

Dataset preparation

In [9]:
#load datasets (snippet or program level for one PL)
# TRAIN_PATH / VALID_PATH: the JSON data files for the 'train' and 'dev' splits (values redacted).
TRAIN_PATH = ####
VALID_PATH = ####
# streaming=True is requested, so the splits below are iterated as streams.
dataset = load_dataset('json', data_files={'train': TRAIN_PATH, 'dev': VALID_PATH},  use_auth_token=True, num_proc=None, streaming=True )

valid_data = dataset['dev']
train_data = dataset['train']
# Only the training split is shuffled: 5000-example shuffle buffer, fixed seed 0.
train_data = train_data.shuffle(buffer_size=5000, seed=0)

In [11]:
#calculate characters per token
def chars_token_ratio(dataset, tokenizer, input_column_name="prompt", output_column_name="completion", nb_examples=400):
    """
    Estimate the average number of characters per token in the dataset.

    At most ``nb_examples`` examples are taken from the start of ``dataset``. Each one is
    turned into text with ``prepare_sample_text`` and tokenized with ``tokenizer``.

    Args:
        dataset: Iterable of examples; each example is indexed by the two column names.
        tokenizer: Tokenizer exposing ``is_fast`` and either ``__call__(...).tokens()``
            (fast tokenizers) or ``tokenize(...)``.
        input_column_name (str): Key of the input/source field in each example.
        output_column_name (str): Key of the output/target field in each example.
        nb_examples (int): Maximum number of examples to sample.

    Returns:
        float: Total characters divided by total tokens over the sampled examples.

    Raises:
        ZeroDivisionError: If no token was counted (for instance, an empty dataset).
    """
    # The notebook's import cell does not bring in tqdm, so it is imported here.
    # The progress bar is purely cosmetic: fall back to plain iteration if tqdm is missing.
    try:
        from tqdm.auto import tqdm
    except ImportError:
        def tqdm(iterable, **_kwargs):
            return iterable

    total_characters, total_tokens = 0, 0
    # zip with range() stops after nb_examples without pulling an extra item from the stream.
    for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples):
        text = prepare_sample_text(example, input_column_name, output_column_name)
        total_characters += len(text)
        if tokenizer.is_fast:
            total_tokens += len(tokenizer(text).tokens())
        else:
            total_tokens += len(tokenizer.tokenize(text))

    return total_characters / total_tokens

In [12]:
def prepare_sample_text(example, input_column_name="prompt", output_column_name="completion"):
    """Build one sample's text: the input field, a single space, then the output field."""
    source_part = example[input_column_name]
    target_part = example[output_column_name]
    return f"{source_part} {target_part}"

In [None]:
# Keys of the source-language and target-language fields in each record.
input_column_name = "cpp" #PL name
output_column_name = "py"

# Estimate characters-per-token on the training stream and report it with two decimals.
chars_per_token = chars_token_ratio(
    dataset=train_data,
    tokenizer=tokenizer,
    input_column_name=input_column_name,
    output_column_name=output_column_name,
)
print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")

In [14]:
#find cap of 95% of training code len
def _token_length(example):
    """Number of token ids the tokenizer produces for one example's prepared cpp/py text."""
    sample_text = prepare_sample_text(example, input_column_name='cpp', output_column_name='py')
    return len(tokenizer.encode(sample_text))

# One token count per example, for each split.
len_list_train = list(map(_token_length, train_data))
len_list_valid = list(map(_token_length, valid_data))

def find_95th_percentile(len_list):
    """
    Return the value found at the 95th-percentile position of ``len_list``.

    The values are ranked in ascending order and the element at index
    ``int(0.95 * len(len_list))`` is returned. A sorted copy is used, so the
    caller's sequence is left in its original order.

    Args:
        len_list: Non-empty iterable of mutually comparable values (e.g. token counts).

    Returns:
        The element at the 95th-percentile index of the ascending ordering.

    Raises:
        IndexError: If ``len_list`` is empty.
    """
    ordered = sorted(len_list)
    index = int(0.95 * len(ordered))  # Index for the 95th percentile value
    return ordered[index]

# 95th-percentile token length of each split; printed train first, then validation.
max_length_train, max_length_valid = (
    find_95th_percentile(len_list_train),
    find_95th_percentile(len_list_valid),
)
print(max_length_train, max_length_valid, sep="\n")

In [17]:
class TranslationDataset(IterableDataset):
    """
    Iterable dataset that yields one padded, fixed-length token sequence per example.

    Each example of ``dataset`` is rendered with ``prepare_sample_text``, followed by the
    decoded end-of-sequence token, tokenized, and padded to ``max_length``. Examples whose
    tokenization is longer than ``max_length`` are skipped.

        Args:
            tokenizer (Tokenizer): The processor used for processing the data. It must have a
                pad token set before iteration, because padding to ``max_length`` is requested.
            dataset (dataset.Dataset): Dataset with text files.
            max_length (int): The maximum length of tokens for each sample.
            input_column_name (str): Key of the input/source field in each example.
            output_column_name (str): Key of the output/target field in each example.

        Yields:
            dict with ``input_ids``, ``attention_mask`` and ``labels`` LongTensors of length
            ``max_length``. ``labels`` equals ``input_ids`` except at padding positions,
            where it is -100.
    """

    def __init__(
        self,
        tokenizer,
        dataset,
        max_length,
        input_column_name,
        output_column_name
    ):
        self.tokenizer = tokenizer
        self.dataset = dataset
        self.max_length=max_length
        self.concat_token_id = tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 49152 #default
        self.input_column_name = input_column_name
        self.output_column_name = output_column_name
        # Number of samples yielded so far (accumulates across iterations).
        self.current_size = 0

    def __iter__(self):
        # Text form of the end-of-sequence token, appended to every sample.
        eos_text = self.tokenizer.decode(self.tokenizer.eos_token_id)

        # Samples are produced one at a time so the underlying stream is never
        # materialised in memory, and each sample is tokenized only once.
        for example in self.dataset:
            element = prepare_sample_text(example, self.input_column_name, self.output_column_name) + eos_text
            encoded = self.tokenizer(
                element,
                truncation=False,
                padding='max_length',
                max_length=self.max_length,
                return_attention_mask=True,
            )

            # Without truncation, padding only ever lengthens a sequence up to max_length,
            # so anything longer than max_length was too long before padding: skip it.
            if len(encoded["input_ids"]) > self.max_length:
                continue

            input_ids = torch.LongTensor(encoded["input_ids"])
            attention_mask = torch.LongTensor(encoded["attention_mask"])

            # The pad token may be the same id as EOS, so padding is located through the
            # attention mask; -100 is the label value the causal-LM loss ignores.
            labels = input_ids.clone()
            labels[attention_mask == 0] = -100

            self.current_size += 1
            yield {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "labels": labels
                }

In [20]:
arg_seq_length = 2048
# Pad with the end-of-sequence token.
tokenizer.pad_token = tokenizer.eos_token

# Both splits share the tokenizer and the cpp -> py column pair;
# only the underlying data and the length cap differ.
shared_dataset_kwargs = dict(
    tokenizer=tokenizer,
    input_column_name='cpp',
    output_column_name='py',
)
train_dataset = TranslationDataset(dataset=train_data, max_length=max_length_train, **shared_dataset_kwargs)
valid_dataset = TranslationDataset(dataset=valid_data, max_length=max_length_valid, **shared_dataset_kwargs)

LoRA fine-tuning setup

In [19]:
# LoRA fine-tuning
# Note: only needed the first time, when LoRA is added to the model.
lora_r_default, lora_alpha_default, lora_dropout_default = 4, 32, 0.05

# Adapter settings: applied to the "c_proj" and "c_attn" modules, no bias terms.
lora_settings = {
    "r": lora_r_default,
    "lora_alpha": lora_alpha_default,
    "lora_dropout": lora_dropout_default,
    "bias": "none",
    "task_type": "CAUSAL_LM",
    "target_modules": ["c_proj", "c_attn"],
}
lora_config = LoraConfig(**lora_settings)

# Wrap the model with the adapters and print the trainable-parameter summary.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [28]:
# NOTE(review): sets an attribute on the streaming training split; no cell visible in this
# notebook reads it — confirm it is still needed.
train_data.start_iteration = 0

# Output directory passed to TrainingArguments (value redacted).
args_output_dir = ####
args_max_steps = 150 
# Note: need to calculate how many steps to complete 1 epoch
# Frequencies, in optimizer steps: evaluate every 30, log every 50, save every 150.
args_eval_freq_default = 30
args_log_freq_default = 50
args_save_freq_default = 150
args_batch_size = # Note: modify accordingly
# batch size ref: snippet 16; program 4
args_learning_rate = # Note: modify accordingly
# lr ref: snippet 1st epoch: 9e-05; snippet 2nd epoch: 7e-05; program 1st epoch: 5e-05; program 2nd epoch: 2e-05
args_lr_scheduler_type="cosine"
args_num_warmup_steps = 10
args_gradient_accumulation_steps_default = 8
args_weight_decay = 0.05



# Step-based evaluation and checkpointing; the same batch size is used for training and
# evaluation. push_to_hub=True is set, so the run is configured to push to the Hub.
training_args = TrainingArguments(
        output_dir=args_output_dir,
        evaluation_strategy="steps",
        save_strategy="steps",
        load_best_model_at_end=True,
        dataloader_drop_last=True,
        max_steps=args_max_steps,
        eval_steps=args_eval_freq_default,
        save_steps=args_save_freq_default,
        logging_steps=args_log_freq_default,
        per_device_train_batch_size=args_batch_size,
        per_device_eval_batch_size=args_batch_size,
        learning_rate=args_learning_rate,
        lr_scheduler_type=args_lr_scheduler_type,
        warmup_steps=args_num_warmup_steps,
        gradient_accumulation_steps=args_gradient_accumulation_steps_default,
        fp16=True,
        weight_decay=args_weight_decay,
        run_name=####,
        push_to_hub=True,
)

# No data collator or metrics function is passed, so the Trainer defaults apply.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    )

trainer.train()

Save LoRA matrices

In [9]:
def change_layer_name(name):
    """
    Shorten a LoRA state-dict key into a compact layer label.

    The first rule whose module path and LoRA matrix marker both occur in ``name`` is
    applied: the first 17 and the last 28 characters are cut off and the rule's suffix is
    appended. Names matching no rule are returned unchanged.
    """
    # (module path, matrix marker, appended suffix) — checked strictly in this order.
    rename_rules = (
        ('attn.c_attn', 'lora_A', 'lora_A_c_attn'),
        ('attn.c_attn', 'lora_B', 'lora_B_c_attn'),
        ('attn.c_proj', 'lora_A', 'lora_A_c_proj'),
        ('attn.c_proj', 'lora_B', 'lora_B_c_proj'),
        ('mlp.c_proj', 'lora_A', 'lora_A_c_proj_MLP'),
        ('mlp.c_proj', 'lora_B', 'lora_B_c_proj_MLP'),
    )

    for module_path, matrix_marker, suffix in rename_rules:
        if module_path in name and matrix_marker in name:
            # Fixed-width trim: 17 leading and 28 trailing characters are dropped.
            return name[17:-28] + suffix

    return name

In [14]:
# Destination file for the extracted LoRA tensors (value redacted).
SAVE_PATH = ####

# Keep only the state-dict entries whose key contains 'lora', renaming each key with
# change_layer_name.
lora = OrderedDict({change_layer_name(k):v for k,v in model.state_dict().items() if ('lora' in k)})

# Serialize the renamed tensors to SAVE_PATH.
torch.save(lora, SAVE_PATH)

Evaluation: translating one programming language to Python

In [None]:
# Put the model in evaluation mode and run one hand-written C++ -> Python prompt through it.
model.eval()
# NOTE(review): `cin << MyVariable` in the prompt is not valid C++ (stream extraction is `>>`);
# the literal is kept as-is here because it is the exact text fed to the model.
prompt_test = '<cpp> int MyVariable = 0; cin << MyVariable ; cout << "My variable has a value of " << MyVariable; <py>'
prompt_tok = tokenizer(prompt_test, return_tensors='pt')

# Generation inputs are moved to the GPU; generation stops at EOS or after 200 new tokens.
generate_kwargs = dict(
    input_ids=prompt_tok.input_ids.cuda(),
    attention_mask=prompt_tok.attention_mask.cuda(),
    max_new_tokens=200,
    eos_token_id=tokenizer.eos_token_id,
)
with torch.no_grad():
    output = model.generate(**generate_kwargs)
tokenizer.decode(output[0])