# Inspect the modifications to the OLMo training data performed by the pretrain-experiments package

### Requires the OLMo repository to be set up for data insertions, e.g. the pretrain-experiments branch of https://github.com/sbordt/OLMo

In [1]:
from pretrain_experiments.frameworks.olmo import insert_dict_to_olmo

from transformers import AutoTokenizer
# Hugging Face tokenizer, used at the end of the notebook to decode a training batch back into text.
TOKENIZER_NAME = "allenai/OLMo-2-0425-1B"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Inserting training data takes two steps:
#
#   1. Build a dictionary that maps global token positions to the texts (or tokens)
#      that should be inserted at those positions.
#   2. Pass that dictionary to insert_dict_to_olmo, which transforms it into a format
#      that can easily be integrated into the OLMo data loading pipeline.
#
# When no global indices file is provided, insert_dict_to_olmo has to build the OLMo
# training dataloader to get access to the global token indices, so this cell takes a while.

olmo_config = "../../../OLMo/configs/official-0425/OLMo2-1B-stage1.yaml"

# (global token position, text to insert) pairs
insertions = [
    (0, "<|endoftext|>This will be the first training data ever seen by OLMo!<|endoftext|>"),
    (100, "<|endoftext|>This sentence is inserted at global token position 100.<|endoftext|>"),
    (2500, "<|endoftext|>Another inserted sentence at position 2500.<|endoftext|>"),
]
insert_dict = dict(insertions)

# NOTE(review): the third argument is presumably the directory that receives insert_dict.pkl -- confirm.
insert_dict_to_olmo(insert_dict, olmo_config, "./")

No global indices file provided, building the OLMo dataloader to get the global indices.




40

In [3]:
# insert_dict_to_olmo has created an insert_dict.pkl file and pointed the
# OLMO_EXPERIMENT_INSERTIONS_FILE environment variable at it.
import os

insertions_file = os.environ["OLMO_EXPERIMENT_INSERTIONS_FILE"]
print("OLMO_EXPERIMENT_INSERTIONS_FILE:", insertions_file)

OLMO_EXPERIMENT_INSERTIONS_FILE: /home/sebastian/Documents/GitHub/pretrain-experiments/tools/olmo/insert_dict.pkl


In [4]:
# Now build the OLMo training dataloader and check that the inserted texts are there.
#
# This requires the pretrain-experiments branch of https://github.com/sbordt/OLMo, which
# looks at the OLMO_EXPERIMENT_INSERTIONS_FILE environment variable and integrates the
# insertions into the training data loading pipeline.
#
from olmo.config import TrainConfig
from olmo.data import build_train_dataloader

# Load the same training config that was passed to insert_dict_to_olmo.
cfg = TrainConfig.load(olmo_config)
# A batch of 2 sequences is enough for inspecting the data.
cfg.device_train_batch_size = 2
# NOTE(review): presumably lets the dataloader build proceed when the save folder already exists -- confirm.
cfg.save_overwrite = True
dataloader = build_train_dataloader(cfg)

In [5]:
# Load only the first batch (2 sequences, see device_train_batch_size).
# next(iter(...)) raises StopIteration immediately if the dataloader yields nothing,
# whereas a `for ...: break` loop would leave `batch` unbound -- or silently keep a
# stale `batch` from an earlier run of this cell in the same kernel.
batch = next(iter(dataloader))

In [None]:
# Decode the first sequence of the batch and show its first 200 characters:
# the inserted text is there!
first_sequence_ids = batch['input_ids'][0]
decoded_text = tokenizer.decode(first_sequence_ids)
decoded_text[:200]

'<|endoftext|>This will be the first training data ever seen by OLMo!<|endoftext|> canal as this would enhance the canal output. Maintenance tasks on the ships and other water vessels require adequate '