In [1]:
#%pip install tokenizers

In [None]:
from datasets import load_dataset
# Load the "train" split of the "cs" configuration of the code corpus.
dataset = load_dataset(
    "ruediste/codeparrot-github-code-10G",
    "cs",
    split="train",
)
dataset  # last expression of the cell: display the dataset summary


In [None]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)
# BPE tokenizer whose pre-tokenizer and decoder are both the ByteLevel variants.
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
    add_prefix_space=False,
)
tokenizer.decoder = decoders.ByteLevel()

# The special tokens are registered on the tokenizer and are also handed to
# the trainer below.
special_tokens = [
    "<|pad|>",
    "<|mask|>",
    "<|cls|>",
]
tokenizer.add_special_tokens(special_tokens)

trainer = trainers.BpeTrainer(
    vocab_size=25000,
    special_tokens=special_tokens,
    max_token_length=10,
    show_progress=True,
)

def datasetGenerator():
    """Lazily yield the ``code`` field of the first 50000 rows of ``dataset``."""
    yield from (sample['code'] for sample in dataset.take(50000))

# Train the BPE model on the code strings produced by datasetGenerator(),
# using the BpeTrainer configured above.
tokenizer.train_from_iterator(datasetGenerator(), trainer=trainer)

In [None]:
from transformers import PreTrainedTokenizerFast


# Wrap the trained tokenizer in the transformers fast-tokenizer class and
# declare which of its tokens act as pad / cls / mask.
hft = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    pad_token='<|pad|>',
    cls_token='<|cls|>',
    mask_token='<|mask|>',
)

hft.save_pretrained('data/bert')

# From here on `tokenizer` names the transformers wrapper, not the raw
# tokenizers.Tokenizer it was built from.
tokenizer = hft


In [16]:
from transformers import PreTrainedTokenizerFast, AutoTokenizer
# Load the tokenizer from the 'data/bert' directory.
tokenizer=AutoTokenizer.from_pretrained('data/bert')

In [None]:
import random
def generateExamples(rows, language):
    """Cut prefix / completion / suffix samples out of a batch of source files.

    Every string in ``rows["code"]`` is walked in steps of 1000 characters.
    For each step and for each completion size (5, 10, 20, 100, 200 and 400
    characters) a random cut position is drawn from the current step window
    (both ends inclusive, capped at the end of the file). A sample consists of

    * prefix: up to 200 characters directly before the cut,
    * completion: up to ``size`` characters starting at the cut,
    * suffix: up to 10 characters following the completion.

    Args:
        rows: Batch of columns; ``rows["code"]`` and ``rows["path"]`` are read.
        language: Value stored in the ``language`` column of every sample.

    Returns:
        Dict with the keys ``prefix``, ``suffix``, ``completion``, ``path``
        and ``language``, each mapping to a list with one entry per sample.
    """
    window = 1000
    context_chars = 200
    suffix_chars = 10
    completion_sizes = (5, 10, 20, 100, 200, 400)

    # One tuple per sample, laid out in the order of `columns` below.
    samples = []
    for row_index, code in enumerate(rows["code"]):
        code_length = len(code)
        for start in range(0, code_length, window):
            upper = min(code_length, start + window)
            for size in completion_sizes:
                cut = random.randint(start, upper)
                completion_end = cut + size
                samples.append((
                    code[max(0, cut - context_chars):cut],
                    code[completion_end:completion_end + suffix_chars],
                    code[cut:completion_end],
                    rows['path'][row_index],
                    language,
                ))

    columns = ("prefix", "suffix", "completion", "path", "language")
    return {
        name: [sample[position] for sample in samples]
        for position, name in enumerate(columns)
    }

# Build the example dataset from the first 2000 rows; the listed source
# columns are removed from the result.
ds = dataset.take(2000).map(
    lambda rows: generateExamples(rows, 'cs'),
    batched=True,
    remove_columns=['code', 'size', 'license', 'repo_name'],
    num_proc=8,
)
ds

In [None]:

# Ids of the three special tokens, looked up with a single batched call.
pad, mask, cls = tokenizer.convert_tokens_to_ids(["<|pad|>", "<|mask|>", "<|cls|>"])

# Number of <|mask|> slots inserted between prefix and suffix.
predictionTokens = 5

def tokenize_function(examples):
    """Turn a batch of prefix/completion/suffix strings into masked-LM features.

    For every example the model input is
    ``prefix + [mask] * predictionTokens + suffix``. The labels have the same
    length: the mask slots carry the first ``predictionTokens`` completion
    tokens (right-filled with ``pad`` when the completion is shorter), and
    every prefix and suffix position is set to -100 so that only the masked
    slots take part in the loss.

    Args:
        examples: Batch of columns; ``prefix``, ``suffix`` and ``completion``
            (lists of strings) are read.

    Returns:
        Dict with ``input_ids``, ``labels`` and ``attention_mask``; the three
        lists of one example always have equal length.
    """
    # Tokenize the three text columns batch-wise, without adding special tokens.
    token_ids = {
        column: tokenizer(
            examples[column],
            add_special_tokens=False,
            split_special_tokens=True,
        )["input_ids"]
        for column in ("prefix", "suffix", "completion")
    }

    input_ids = []
    label_ids = []
    for prefix, completion, suffix in zip(
        token_ids["prefix"], token_ids["completion"], token_ids["suffix"]
    ):
        # Exactly predictionTokens targets: truncate long completions, pad short ones.
        target = completion[:predictionTokens]
        target = target + [pad] * (predictionTokens - len(target))

        input_ids.append(prefix + [mask] * predictionTokens + suffix)
        # Context positions are visible in the input; -100 excludes them from the loss.
        label_ids.append([-100] * len(prefix) + target + [-100] * len(suffix))

    attention_mask = [[1] * len(ids) for ids in input_ids]

    return {
        "input_ids": input_ids,
        "labels": label_ids,
        "attention_mask": attention_mask,
    }

# Tokenize the examples in batches of 10 and keep only the feature columns
# returned by tokenize_function.
tokenized_dataset = ds.map(
    tokenize_function,
    batched=True,
    batch_size=10,
    num_proc=8,
    remove_columns=['path', 'language', 'prefix', 'suffix', 'completion'],
)
tokenized_dataset

In [None]:
# Spot-check two examples: the ids, the labels, then the length of each feature.
for sample in tokenized_dataset.take(2):
    print(sample['input_ids'])
    print(sample['labels'])
    for column in ('input_ids', 'labels', 'attention_mask'):
        print(len(sample[column]))

In [None]:
from transformers import BertModel, BertForMaskedLM, BertConfig

# Small BERT configuration for a masked language model with fresh weights.
config = BertConfig(
    vocab_size=25000,
    hidden_size=256,
    num_hidden_layers=4,
    num_attention_heads=4,
    intermediate_size=1024,
    max_position_embeddings=2048,
)
model = BertForMaskedLM(config)

In [None]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: Object exposing ``named_parameters()``; every yielded parameter
            must provide ``numel()`` and a ``requires_grad`` attribute.

    A model without any parameters is reported as 0.0% trainable instead of
    raising ``ZeroDivisionError``.
    """
    all_param = 0
    trainable_params = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # Guard the ratio: a parameterless model has nothing to divide by.
    percentage = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable: {percentage}%"
    )
# Report the parameter counts of `model`.
print_trainable_parameters(model)

In [9]:
# Hold out 2000 examples as the test split; the rest forms the train split.
splitds = tokenized_dataset.train_test_split(test_size=2000)
splitds['test']

In [None]:
from transformers import TrainingArguments,DataCollatorForLanguageModeling,Trainer
import torch
import math

def padToLength(list, length, padding):
    """Return ``list`` cut to at most ``length`` items and right-filled with ``padding``.

    Args:
        list: Input list (the parameter name shadows the builtin inside this
            function only).
        length: Number of items the result should have.
        padding: Value appended for every missing item.

    Returns:
        A new list; the input is not modified.
    """
    head = list[:length]
    shortfall = length - len(head)
    return head + [padding] * shortfall

class MyDataCollator:
    """Collate tokenized examples into equally long int64 tensors.

    The common length is the longest ``input_ids`` of the batch rounded up to
    the next multiple of 32. Shorter rows are right-padded: ``input_ids`` with
    the tokenizer's pad token id, ``labels`` with -100 and ``attention_mask``
    with 0.
    """

    def __call__(self, features):
        longest = max(len(feature['input_ids']) for feature in features)
        target_length = 32 * math.ceil(longest / 32)

        # column name -> value used to fill the padded positions
        fill_values = {
            "input_ids": tokenizer.pad_token_id,
            "labels": -100,
            "attention_mask": 0,
        }
        return {
            column: torch.tensor(
                [padToLength(feature[column], target_length, fill) for feature in features],
                dtype=torch.int64,
            )
            for column, fill in fill_values.items()
        }

# Per-device batch size shared by training and evaluation.
batch_size = 64

training_args = TrainingArguments(
    # where checkpoints go
    output_dir="data/bert/check",
    overwrite_output_dir=True,
    # optimisation
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=True,
    # gradient_checkpointing=True,
    # gradient_accumulation_steps=4,
    # step-based schedule: log every 100, evaluate every 1000, save every 2000 steps
    logging_strategy="steps",
    logging_steps=100,
    eval_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=2000,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=MyDataCollator(),
    train_dataset=splitds["train"],
    eval_dataset=splitds["test"],
    processing_class=tokenizer,
)

trainer.train()

# Save the trained model to 'data/bert'.
trainer.save_model('data/bert')