In [None]:
import torch
from transformers import LlamaForCausalLM, LlamaConfig

# Fix the RNG seed before instantiating the model. The weights are randomly
# initialised here (nothing is loaded from a checkpoint), so without a seed
# every kernel restart would start training from different weights.
MODEL_INIT_SEED = 42
torch.manual_seed(MODEL_INIT_SEED)

# Architecture of the small Llama model trained in this notebook:
# 16 decoder layers, hidden size 512, 8 attention heads (with 8 key/value
# heads), a 16k-entry vocabulary and a 256-position context window.
config = LlamaConfig(
    architectures=["LlamaForCausalLM"],
    attention_bias=False,
    attention_dropout=0.0,
    bos_token_id=1,
    eos_token_id=2,
    hidden_act="silu",
    hidden_size=512,
    initializer_range=0.02,
    intermediate_size=1024,
    max_position_embeddings=256,
    model_type="llama",
    num_attention_heads=8,
    num_hidden_layers=16,
    num_key_value_heads=8,
    pad_token_id=0,
    pretraining_tp=1,
    rms_norm_eps=1e-06,
    rope_scaling=None,
    rope_theta=10000.0,
    tie_word_embeddings=False,
    torch_dtype="float32",
    transformers_version="4.40.1",
    use_cache=True,
    vocab_size=16000
)

# Build the causal-LM from the configuration alone (no pretrained weights).
model = LlamaForCausalLM(config)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hub identifier of the checkpoint whose tokenizer is used for all text
# processing in this notebook.
TOKENIZER_CHECKPOINT = "babylm/babyllama-10m-2024"

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

In [None]:
import os
import torch
from datasets import Dataset

# Folder whose files are tokenized to build the training set.
training_folder_path = '/kaggle/input/strict-small/train_10M'
# Folder whose files are tokenized to build the development (evaluation) set.
dev_folder_path = '/kaggle/input/development-set/dev'

def tokenize_file(file_path):
    """Tokenize the full contents of one UTF-8 text file.

    The whole file is read into memory and passed to the module-level
    ``tokenizer`` in a single call, with truncation disabled. The length of
    the resulting ``input_ids`` is printed (the message is in Romanian:
    "The file <path> has <n> tokens.").

    Args:
        file_path: Path of the text file to read.

    Returns:
        The object returned by ``tokenizer`` for the file's text.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()

    tokenized = tokenizer(raw_text, padding=True, truncation=False)

    token_count = len(tokenized['input_ids'])
    print(f"Fișierul {file_path} are {token_count} tokeni.")
    return tokenized

def create_dataset(tokenized_files):
    """Turn per-file tokenizer outputs into a ``Dataset`` with one row per file.

    Args:
        tokenized_files: Iterable of mappings that each provide
            ``'input_ids'`` and ``'attention_mask'``.

    Returns:
        A ``Dataset`` whose rows have ``input_ids``, ``attention_mask`` and
        ``labels``; ``labels`` is the very same sequence as ``input_ids``.
    """
    def _to_row(tokenized_file):
        # Look each field up once so `labels` reuses the exact ids object.
        ids = tokenized_file['input_ids']
        mask = tokenized_file['attention_mask']
        return {'input_ids': ids, 'attention_mask': mask, 'labels': ids}

    rows = [_to_row(tokenized_file) for tokenized_file in tokenized_files]
    return Dataset.from_list(rows)

def _list_corpus_files(folder_path):
    """Return the regular files directly inside ``folder_path``, sorted by path.

    ``os.listdir`` yields entries in arbitrary order, so the result is sorted
    to keep the dataset row order identical between runs. Sub-directories are
    skipped because ``tokenize_file`` can only open regular files.

    Raises:
        FileNotFoundError: If the folder contains no regular files.
    """
    candidates = (os.path.join(folder_path, name) for name in os.listdir(folder_path))
    files = sorted(path for path in candidates if os.path.isfile(path))
    if not files:
        raise FileNotFoundError(f"No files found in {folder_path!r}")
    return files


training_files = _list_corpus_files(training_folder_path)
dev_files = _list_corpus_files(dev_folder_path)

# Tokenize every file; each call prints that file's token count.
training_tokenized_files = [tokenize_file(path) for path in training_files]
dev_tokenized_files = [tokenize_file(path) for path in dev_files]

# One dataset row per source file at this stage.
tokenized_training_set = create_dataset(training_tokenized_files)
tokenized_dev_set = create_dataset(dev_tokenized_files)


In [None]:
# Label value that PyTorch's cross-entropy loss skips by default and that
# Hugging Face causal-LM heads treat as "do not compute loss here".
LABEL_IGNORE_INDEX = -100


def split_sequence(input_ids, max_length=256, pad_value=0):
    """Cut a sequence into consecutive chunks of exactly ``max_length`` items.

    Args:
        input_ids: List of values to split (token ids, mask bits or labels).
        max_length: Size of every returned chunk; must be positive.
        pad_value: Value appended to the final chunk when it is shorter than
            ``max_length``. Defaults to 0, the previous hard-coded filler.

    Returns:
        A list of lists, each of length ``max_length``. An empty input yields
        an empty list.

    Raises:
        ValueError: If ``max_length`` is not positive.
    """
    if max_length <= 0:
        raise ValueError("max_length must be a positive integer")

    chunks = []
    for start in range(0, len(input_ids), max_length):
        chunk = list(input_ids[start:start + max_length])
        # Only the last chunk can be short; the multiplier is 0 otherwise.
        chunk.extend([pad_value] * (max_length - len(chunk)))
        chunks.append(chunk)
    return chunks


def split_dataset(dataset, max_length=256):
    """Re-chunk every row of ``dataset`` into fixed-length rows.

    Each row's ``input_ids``, ``attention_mask`` and ``labels`` are split
    with ``split_sequence``. Token ids and the attention mask are padded with
    0, while labels are padded with ``LABEL_IGNORE_INDEX`` so that padded
    positions do not contribute to the language-modelling loss.

    Args:
        dataset: Iterable of mappings with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` entries.
        max_length: Number of tokens per output row.

    Returns:
        A ``Dataset`` with the same three columns, one row per chunk.
    """
    split_data = {'input_ids': [], 'attention_mask': [], 'labels': []}

    for example in dataset:
        split_data['input_ids'].extend(
            split_sequence(example['input_ids'], max_length))
        split_data['attention_mask'].extend(
            split_sequence(example['attention_mask'], max_length))
        split_data['labels'].extend(
            split_sequence(example['labels'], max_length, pad_value=LABEL_IGNORE_INDEX))

    return Dataset.from_dict(split_data)


# Replace the per-file datasets with their fixed-length chunked versions
# (split_dataset's default max_length is used), then print both summaries.
tokenized_training_set = split_dataset(dataset=tokenized_training_set)
tokenized_dev_set = split_dataset(dataset=tokenized_dev_set)
print(tokenized_training_set, tokenized_dev_set, sep="\n")


In [None]:
import torch
from transformers import TrainingArguments

# Serialized training arguments that are reused for this run.
file_path = "/kaggle/input/training-args/training_args.bin"

class DistillationTrainingArguments(TrainingArguments):
    """Thin ``TrainingArguments`` subclass that forwards all keyword arguments.

    NOTE(review): nothing in this notebook instantiates this class directly;
    presumably it must exist under this name so that unpickling
    ``training_args.bin`` can resolve the class it was saved with - confirm.
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

# SECURITY: torch.load unpickles arbitrary Python objects, so only load files
# you trust. `weights_only=False` is passed explicitly because the file holds
# an arguments object rather than tensors; PyTorch >= 2.6 defaults to
# `weights_only=True`, which rejects such objects.
loaded_args = torch.load(file_path, weights_only=False).to_dict()

# Rebuild a plain TrainingArguments from the stored field values.
training_args = TrainingArguments(**loaded_args)
training_args.report_to = []  # no reporting integrations for this run
training_args.do_train = True
print(training_args)


In [None]:
def check_row(index, row):
    """Report whether a row has a ``None`` in its ids or its attention mask.

    Args:
        index: Position of the row; not used by the check itself.
        row: Mapping with ``'input_ids'`` and ``'attention_mask'`` sequences.

    Returns:
        ``True`` if either sequence contains ``None``, otherwise ``False``.
    """
    # Columns are inspected in order; the scan stops at the first None found.
    for column in ('input_ids', 'attention_mask'):
        for token in row[column]:
            if token is None:
                return True
    return False
    
def fix_none_values(example, idx):
    """Replace ``None`` entries in a row with padding values.

    A position is treated as padding when its ``input_ids`` or
    ``attention_mask`` entry is ``None``: the token id becomes 0 (the pad id
    used by the model config), the mask becomes 0 and the label becomes -100
    so the loss ignores it. A label that is ``None`` on its own is also set
    to -100.

    The row is scanned directly instead of consulting a precomputed list of
    indices, so the function gives the same result whenever it is run. Rows
    without ``None`` values come back with unchanged contents.

    Args:
        example: Mapping with ``'input_ids'``, ``'attention_mask'`` and
            ``'labels'`` sequences of equal length.
        idx: Row index supplied by ``Dataset.map(..., with_indices=True)``;
            accepted for signature compatibility and not used.

    Returns:
        ``example`` with its three columns replaced by the repaired lists.
    """
    pad_token_id = 0
    ignore_index = -100

    # Work on copies so columns that share one list object (for example
    # labels aliasing input_ids) cannot overwrite each other.
    input_ids = list(example['input_ids'])
    attention_mask = list(example['attention_mask'])
    labels = list(example['labels'])

    for pos in range(len(input_ids)):
        if input_ids[pos] is None or attention_mask[pos] is None:
            input_ids[pos] = pad_token_id
            attention_mask[pos] = 0
            labels[pos] = ignore_index
        elif labels[pos] is None:
            labels[pos] = ignore_index

    example['input_ids'] = input_ids
    example['attention_mask'] = attention_mask
    example['labels'] = labels
    return example
    
# Indices of dev rows that check_row flags as containing None values.
problematic_indices = [
    position
    for position, record in enumerate(tokenized_dev_set)
    if check_row(position, record)
]
print(problematic_indices)

# Repair the dev set row by row; map passes each row together with its index.
tokenized_dev_set = tokenized_dev_set.map(fix_none_values, with_indices=True)

# Run the same scan again on the repaired dataset and print what is left.
problematic_indices = [
    position
    for position, record in enumerate(tokenized_dev_set)
    if check_row(position, record)
]
print(problematic_indices)


In [None]:
from transformers import Trainer

# Gather everything the Trainer needs: the model built from the config, the
# loaded training arguments, the chunked train/dev datasets and the tokenizer.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_training_set,
    "eval_dataset": tokenized_dev_set,
    "tokenizer": tokenizer,
}
trainer = Trainer(**trainer_kwargs)

# Start training; as the cell's last expression, the result is displayed.
trainer.train()