# Create train and validation splits from the dataset

In [None]:
from pathlib import Path

# Source corpus and the two split files this cell produces.
input_path = Path("/kaggle/input/babylm-train-10m-cleaned/babylm-train-10m-cleaned-merged.train")

train_path = Path("/kaggle/working/train.txt")
valid_path = Path("/kaggle/working/validation.txt")

# splitlines() drops the line terminators, so every entry is one bare line.
lines = input_path.read_text(encoding="utf-8").splitlines()

# The first 10% of the lines become the validation split; the rest is train.
split_index = int(0.1 * len(lines))
valid, train = lines[:split_index], lines[split_index:]

# Re-attach a newline to every line when writing. Joining with "" would fuse
# all lines into a single one, gluing the last word of each line to the first
# word of the next and leaving the line-based "text" loader with one giant
# example per file.
train_path.write_text("".join(f"{line}\n" for line in train), encoding="utf-8")
valid_path.write_text("".join(f"{line}\n" for line in valid), encoding="utf-8")

print(f"Train lines: {len(train)} \n Valid lines: {len(valid)}")

In [None]:
# Show the first ten lines of each split as a quick sanity check.
print(
    "Train sample:",
    train[:10],
    "Validation sample: ",
    valid[:10],
    sep="\n",
)

# Tokenize dataset

In [None]:
from transformers import GPT2Tokenizer
from datasets import load_dataset

# Pretrained GPT-2 tokenizer (only the tokenizer is loaded here).
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Load both split files through the "text" builder, keyed by split name.
raw_datasets = load_dataset(
    "text",
    data_files={
        split: str(path)
        for split, path in (("train", train_path), ("validation", valid_path))
    },
)

def tokenize_fn(examples):
    """Tokenize the ``"text"`` column of a batch.

    Passes ``examples["text"]`` to the module-level ``tokenizer`` and returns
    its output unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts)
    return encoded

# Tokenize every split in batches across 4 worker processes; the raw "text"
# column is dropped so only the tokenizer's output columns remain.
tokenized = raw_datasets.map(tokenize_fn, batched=True, num_proc=4, remove_columns=["text"])

In [None]:
from itertools import chain

# Number of tokens in every example produced by group_texts.
block_size = 512


def group_texts(examples):
    """Concatenate a batch of token sequences and cut it into fixed-size blocks.

    All ``examples["input_ids"]`` sequences are joined end to end and sliced
    into consecutive chunks of ``block_size`` tokens. Trailing tokens that do
    not fill a whole block are dropped.

    Args:
        examples: Batch mapping with an ``"input_ids"`` entry holding a list
            of token-id lists.

    Returns:
        A dict with ``"input_ids"`` (list of blocks, each ``block_size`` long)
        and ``"attention_mask"`` (one all-ones list of ``block_size`` per
        block). Both lists are empty when the batch holds fewer than
        ``block_size`` tokens.
    """
    # chain.from_iterable flattens in linear time; sum(lists, []) re-copies
    # the growing accumulator on every addition and is quadratic.
    concatenated = list(chain.from_iterable(examples["input_ids"]))
    # Round down to a whole number of blocks.
    usable_length = (len(concatenated) // block_size) * block_size
    inputs = [
        concatenated[start:start + block_size]
        for start in range(0, usable_length, block_size)
    ]
    return {
        "input_ids": inputs,
        # Build one independent mask per block rather than aliasing a single list.
        "attention_mask": [[1] * block_size for _ in inputs],
    }

# Regroup the tokenized rows into fixed-length blocks: group_texts is applied
# to batches of 1000 rows, using 4 worker processes.
lm_datasets = tokenized.map(group_texts, batched=True, batch_size=1000, num_proc=4)

In [None]:
# Display the grouped datasets as a sanity check before training.
print(lm_datasets)

In [None]:
from transformers import GPT2LMHeadModel, GPT2Config, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Build the model from a default GPT2Config (no pretrained weights are loaded
# here) and point its pad token id at the same EOS id.
model = GPT2LMHeadModel(GPT2Config())
model.config.pad_token_id = tokenizer.eos_token_id

# Collator for causal language modeling (mlm=False), padding batches to a
# multiple of 512 tokens.
data_collator = DataCollatorForLanguageModeling(
    mlm=False,
    pad_to_multiple_of=512,
    tokenizer=tokenizer,
)

training_args = TrainingArguments(
    # Output location and reproducibility.
    output_dir="/kaggle/working/babylm_gpt2_baseline",
    overwrite_output_dir=True,
    seed=42,
    report_to="none",
    # Schedule and optimisation.
    num_train_epochs=10,
    per_device_train_batch_size=16,
    learning_rate=5e-5,
    warmup_steps=2000,
    weight_decay=0.0,
    max_grad_norm=1.0,
    fp16=True,
    # Logging, evaluation and checkpointing (eval and save share a 2000-step cadence).
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=2000,
    save_steps=2000,
    save_total_limit=5,
)

In [None]:
# Wire the model, training arguments, grouped splits and collator into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
)

In [None]:
# Run training; left as the cell's last expression so its result is displayed.
trainer.train()

In [None]:
# Save the trained model to the final output directory.
trainer.save_model("/kaggle/working/final_model")
# Save the tokenizer into the same directory so both can be loaded from one path.
tokenizer.save_pretrained("/kaggle/working/final_model")