Log in & import libraries

In [None]:
# Authenticate with the Hugging Face Hub; later cells download with `use_auth_token=True`.
from huggingface_hub import login
login()
# Supply the access token at the prompt; do not store the key in the notebook.

In [None]:
# Log in to Weights & Biases through the CLI (the API key argument is redacted here).
# Python alternative, left disabled: from wandb import login
!wandb login #####

In [3]:
# import required libraries
import torch
from accelerate import Accelerator
from dataclasses import dataclass
from datasets import load_dataset, Dataset, concatenate_datasets
from peft import LoraConfig, PromptTuningConfig, get_peft_model, prepare_model_for_int8_training, set_peft_model_state_dict, TaskType, PromptTuningInit, PrefixTuningConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, set_peft_model_state_dict, PeftModel
from torch.utils.data import IterableDataset
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    logging,
    set_seed
)
from transformers import TrainerCallback, TrainingArguments, TrainerState, TrainerControl, default_data_collator
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

Set up the model for training

In [4]:
# Hub identifier of the base checkpoint; the tokenizer below is loaded from it.
model_id = "bigcode/starcoder"

In [None]:
# Load the tokenizer that belongs to `model_id`, authenticating with the stored Hub token.
tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)

In [None]:
# Load the base causal LM in half precision; `device_map='auto'` delegates device
# placement of the weights to the loader.
model = AutoModelForCausalLM.from_pretrained(
        model_id,  # same identifier as the tokenizer, so the two cannot drift apart
        use_auth_token=True,
        use_cache=True,
        torch_dtype=torch.float16,
        device_map='auto',
)

In [None]:
# Attach the LoRA experts saved earlier, one checkpoint per source programming language.
EXPERTS_PATHS = [
    'models/LoRA_Experts/LoRA_cpp2py.pt',
    'models/LoRA_Experts/LoRA_php2py.pt',
    'models/LoRA_Experts/LoRA_js2py.pt',
    'models/LoRA_Experts/LoRA_csharp2py.pt',
    'models/LoRA_Experts/LoRA_java2py.pt',
]
model.load_experts(experts_paths=EXPERTS_PATHS)

Construct train/dev dataset

In [None]:
# Locations of the JSON train / dev files (values redacted in this copy of the notebook).
TRAIN_PATH = ####
VALID_PATH = ####
# Both splits are requested with streaming=True, so they are consumed as iterables
# rather than indexed tables in the cells below.
dataset = load_dataset('json', data_files={'train': TRAIN_PATH, 'dev': VALID_PATH},  use_auth_token=True, num_proc=None, streaming=True )

In [13]:
# The training stream is shuffled through a 5000-example buffer with a fixed seed;
# the dev stream keeps its original order.
train_data = dataset['train'].shuffle(buffer_size=5000, seed=128)
valid_data = dataset['dev']

In [14]:
# average characters-per-token estimate
def chars_token_ratio(dataset, tokenizer, input_column_name="prompt", output_column_name="completion", nb_examples=400):
    """
    Estimate how many characters one token covers on average.

    At most `nb_examples` examples are read from `dataset`. Each one is formatted
    with `prepare_sample_text`, and the summed character count is divided by the
    summed token count.
    """
    char_count = 0
    token_count = 0
    # Pairing with range() stops the scan after nb_examples items.
    limited_samples = zip(range(nb_examples), iter(dataset))
    for _, sample in tqdm(limited_samples, total=nb_examples):
        sample_text = prepare_sample_text(sample, input_column_name, output_column_name)
        char_count += len(sample_text)
        if not tokenizer.is_fast:
            token_count += len(tokenizer.tokenize(sample_text))
        else:
            token_count += len(tokenizer(sample_text).tokens())

    return char_count / token_count

In [15]:
def prepare_sample_text(example, input_column_name="prompt", output_column_name="completion"):
    """Join the input and output columns of `example` with a single space."""
    source_part = example[input_column_name]
    target_part = example[output_column_name]
    return f"{source_part} {target_part}"

In [None]:
# Dataset columns that hold the input text and the target text of each pair.
input_column_name = "code"
output_column_name = "py"
chars_per_token = chars_token_ratio(
    train_data,
    tokenizer,
    input_column_name=input_column_name,
    output_column_name=output_column_name,
)
print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")

In [18]:
# Token length of every formatted sample; used to derive the 95% length cap.
def _sample_token_lengths(samples):
    """Return the encoded length of each formatted 'code'/'py' example in `samples`."""
    return [
        len(tokenizer.encode(prepare_sample_text(sample, input_column_name='code', output_column_name='py')))
        for sample in samples
    ]

len_list_train = _sample_token_lengths(train_data)
len_list_valid = _sample_token_lengths(valid_data)

In [19]:
def find_95th_percentile(len_list, p):
    """
    Return the value found at fraction `p` of the sorted lengths.

    Despite the name, the percentile is not fixed: `p` selects it (0.95 gives
    the 95th percentile).

    Args:
        len_list: Non-empty collection of comparable values. It is not
            modified; a sorted copy is used internally.
        p: Fraction in [0, 1] selecting the position in the sorted values.

    Returns:
        The element at index ``int(p * len(len_list))`` of the sorted values,
        clamped to the last element so that ``p == 1.0`` yields the maximum.

    Raises:
        ValueError: If `len_list` is empty or `p` lies outside [0, 1].
    """
    if not 0 <= p <= 1:
        raise ValueError(f"p must be within [0, 1], got {p!r}")
    # Sort a copy so the caller's list keeps its original order.
    ordered = sorted(len_list)
    if not ordered:
        raise ValueError("len_list must contain at least one value")
    # int(p * n) equals n when p == 1.0, which would index past the end.
    index = min(int(p * len(ordered)), len(ordered) - 1)
    return ordered[index]

In [None]:
# Length cap per split: the 95% point of that split's token lengths.
max_length_train, max_length_valid = (
    find_95th_percentile(lengths, .95) for lengths in (len_list_train, len_list_valid)
)
print(max_length_train, max_length_valid, sep="\n")

In [22]:
class TranslationDataset(IterableDataset):
    """
    Iterable dataset that yields one padded, tokenized translation pair at a time.

    Each raw example is formatted with `prepare_sample_text`, suffixed with the
    EOS token text, tokenized, and padded to `max_length`. Examples whose token
    count exceeds `max_length` are skipped rather than truncated. Samples are
    produced lazily, so a streamed dataset is never held in memory as a whole.

        Args:
            tokenizer (Tokenizer): Tokenizer used to encode the samples. Its pad
                token must be set before iterating, because every sample is
                padded to `max_length`.
            dataset (dataset.Dataset): Iterable of dict-like examples.
            max_length (int): The maximum length of tokens for each sample; also
                the length every yielded sample is padded to.
            input_column_name (str): Column holding the input text.
            output_column_name (str): Column holding the target text.
    """

    def __init__(
        self,
        tokenizer,
        dataset,
        max_length,
        input_column_name,
        output_column_name
    ):
        self.tokenizer = tokenizer
        self.dataset = dataset
        self.max_length=max_length
        # Id of the token appended after every sample; falls back to a fixed id
        # when the tokenizer defines no EOS token.
        self.concat_token_id = tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 49152 #default
        self.input_column_name = input_column_name
        self.output_column_name = output_column_name
        # Running count of samples yielded so far (accumulates across iterations).
        self.current_size = 0

    def __iter__(self):
        """
        Yield ``{"input_ids", "labels"}`` dicts of LongTensors, each of length `max_length`.

        `labels` equals `input_ids` on real tokens and is -100 on padding
        positions, so that padding does not contribute to the loss.
        """
        # The terminator text is the same for every sample: decode it once.
        eos_text = self.tokenizer.decode(self.concat_token_id)
        for example in self.dataset:
            text = prepare_sample_text(example, self.input_column_name, self.output_column_name) + eos_text
            encoded = self.tokenizer(
                text,
                truncation=False,
                padding='max_length',
                max_length=self.max_length,
                return_attention_mask=True,
            )
            token_ids = encoded["input_ids"]
            # Truncation is off and padding only ever lengthens a sample, so a
            # result longer than max_length means the raw sample was too long.
            if len(token_ids) > self.max_length:
                continue

            input_ids = torch.LongTensor(token_ids)
            labels = input_ids.clone()
            # Padding may reuse the EOS id, so padding is identified through the
            # attention mask; this keeps the real EOS token in the labels.
            padding_positions = torch.LongTensor(encoded["attention_mask"]) == 0
            labels[padding_positions] = -100

            self.current_size += 1
            yield {
                "input_ids": input_ids,
                "labels": labels
                }

In [23]:
# Reuse the EOS token as padding token so samples can be padded to a fixed length.
tokenizer.pad_token = tokenizer.eos_token
# Both splits read the 'code' column as input and the 'py' column as target;
# each split is padded to its own length cap.
train_dataset = TranslationDataset(tokenizer, train_data, max_length_train, 'code', 'py')
valid_dataset = TranslationDataset(tokenizer, valid_data, max_length_valid, 'code', 'py')

Model training preparation

In [None]:
# Freeze every parameter except those whose name contains 'moe_gate',
# then report how many parameters remain trainable.
for param_name, param in model.named_parameters():
    param.requires_grad = 'moe_gate' in param_name

print(f'There are {sum(p.numel() for p in model.parameters() if p.requires_grad)} trainable parameters')

In [26]:
# upcast the trainable fp16 parameters to fp32
def prepare_model_fp16_for_training(model):
    """Convert, in place, every trainable float16 parameter of `model` to float32."""
    for _, param in model.named_parameters():
        is_trainable_half = param.requires_grad and param.dtype == torch.float16
        if not is_trainable_half:
            continue
        param.data = param.data.float()

# Apply the upcast to the loaded model: only parameters left trainable above are affected.
prepare_model_fp16_for_training(model)

In [27]:
# Training hyperparameters (the output directory is redacted in this copy).
args_output_dir = ####
args_max_steps = 1000
# Evaluate and log every 50 steps; save a checkpoint every 1000 steps.
args_eval_freq_default = 50
args_log_freq_default = 50
args_save_freq_default = 1000
# Batch size 1 per device; with the accumulation of 4 set below, each optimizer
# step covers 1 x 4 = 4 samples per device.
args_batch_size = 1 
args_learning_rate = 5e-5
args_lr_scheduler_type="cosine"
args_num_warmup_steps = 50
args_gradient_accumulation_steps_default = 4
args_weight_decay = 0.05

# NOTE(review): no visible cell reads `start_iteration`; looks like a leftover — confirm before removing.
train_data.start_iteration = 0

# Step-based schedule: training length is bounded by max_steps, and evaluation,
# saving and logging all happen at fixed step intervals.
training_args = TrainingArguments(
        output_dir=args_output_dir,
        evaluation_strategy="steps",
        save_strategy="steps",
        load_best_model_at_end=True,
        dataloader_drop_last=True,
        max_steps=args_max_steps,
        eval_steps=args_eval_freq_default,
        save_steps=args_save_freq_default,
        logging_steps=args_log_freq_default,
        per_device_train_batch_size=args_batch_size,
        per_device_eval_batch_size=args_batch_size,
        learning_rate=args_learning_rate,
        lr_scheduler_type=args_lr_scheduler_type,
        warmup_steps=args_num_warmup_steps,
        gradient_accumulation_steps=args_gradient_accumulation_steps_default,
        fp16=True,
        weight_decay=args_weight_decay,
        run_name=###, # eg. "moe_model"
        #push_to_hub=True,
)

In [28]:
class MyTrainer(Trainer):
    """Trainer that adds the current learning rate to every logged record."""

    def log(self, logs, *args, **kwargs) -> None:
        """
        Insert the current learning rate into `logs`, then delegate to Trainer.log.

        Extra positional and keyword arguments are forwarded untouched, so the
        override keeps working with Trainer versions whose `log` accepts more
        than the `logs` dict (for example a `start_time` argument).
        """
        logs["learning_rate"] = self._get_learning_rate()
        super().log(logs, *args, **kwargs)

# Wire the model, the training arguments and both datasets into the trainer.
trainer = MyTrainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=valid_dataset)

In [None]:
# Run training with the arguments configured above (bounded by max_steps).
trainer.train()

In [None]:
# Upload the trained model to the Hugging Face Hub through the trainer.
# NOTE(review): `push_to_hub=True` is commented out in the TrainingArguments cell —
# confirm the target repository is the intended one before running this.
trainer.push_to_hub()

In [30]:
from collections import OrderedDict

# Destination file for the gate weights (value redacted in this copy).
SAVE_MoE_PATH = #### # Note: path is .pt file
# Keep only the state-dict entries whose key contains 'moe_gate' — the same name
# filter that selected the trainable parameters — so the file holds just the gate.
moe_gate = OrderedDict({k:v for k,v in model.state_dict().items() if ('moe_gate' in k)})

torch.save(moe_gate, SAVE_MoE_PATH)

Multi-PL-to-Python translator evaluation

In [40]:
# Extract only python code from output
import re

def generate_clean_py_code(prompt, max_tokens=300, return_original_output=False):
    """
    Generate a completion for `prompt` and return only its Python part.

    The Python part is the text between a ``<py>`` marker and the tokenizer's
    EOS token in the decoded output. It may span several lines and may contain
    ``<`` characters.

    Args:
        prompt (str): Prompt passed to the tokenizer and then to the model.
        max_tokens (int): Upper bound on the number of newly generated tokens.
        return_original_output (bool): When True, also return the full decoded
            output.

    Returns:
        str: The first non-empty ``<py>``...EOS span, or ``""`` when no such
        span exists (for example when generation stopped before emitting EOS).
        With `return_original_output`, a ``(py_code, decoded_output)`` tuple.
    """
    model.eval()
    prompt_tok = tokenizer(prompt, return_tensors='pt')

    with torch.no_grad():
        output = model.generate(input_ids=prompt_tok.input_ids.cuda(),
                                attention_mask=prompt_tok.attention_mask.cuda(),
                                max_new_tokens=max_tokens,
                                eos_token_id=tokenizer.eos_token_id,
                                )

    decoded_output = tokenizer.decode(output[0])

    # The EOS token text can contain regex metacharacters (such as '|'), so it
    # must be escaped; DOTALL lets the captured code run across line breaks.
    pattern = r'<py>(.*?)' + re.escape(tokenizer.eos_token)
    candidates = re.findall(pattern, decoded_output, flags=re.DOTALL)

    # Always hand back a string: the first non-empty capture, else "".
    py_code = next((candidate for candidate in candidates if candidate), "")

    if return_original_output:
        return py_code, decoded_output
    return py_code

In [None]:
# Restore the trained gate weights saved above. strict=False is needed because the
# file holds only the 'moe_gate' entries, not the full model state dict.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
model.load_state_dict(torch.load(SAVE_MoE_PATH), strict=False)

In [None]:
# eval PL to python

model.eval()
device = 'cuda'

# change prompt_test to the PL to translate (with the form <code> PL_code <py>)
prompt_test = '<cpp> #include <bits/stdc++.h> NEW_LINE using namespace std ; void unefonctionlambda ( int matrice [ ] , int size ) { set < int > s ( matrice , matrice + size ) ; for ( auto x : s ) cout << x << \" \u2581 \" ; } int main ( ) { int matrice [ ] = { 1 , 3 , 2 , 2 , 1 } ; int n = sizeof ( matrice ) / sizeof ( matrice [ 0 ] ) ; unefonctionlambda ( arr , n ) ; return 0 ; } <py>'

# Tokenize once and keep the whole encoding on the target device.
encoded_prompt = tokenizer(prompt_test, return_tensors='pt').to(device)

# NOTE(review): set_expert is defined outside this notebook; it receives the same
# tokenizer output object as before — confirm that this is what it expects.
model.set_expert(encoded_prompt)
try:
    output = model.generate(
        **encoded_prompt,
        # Generation budget: prompt length in tokens plus 500.
        max_new_tokens=encoded_prompt["input_ids"].shape[1] + 500,
    )
finally:
    # Clear the selected expert even when generation fails.
    model.reset_expert()
tokenizer.decode(output[0], clean_up_tokenization_spaces=False)