In [1]:
import numpy as np
from cached_path import cached_path

from olmo.config import TrainConfig
import numpy as np
import pickle as pkl

from datasets import Dataset

from olmo.tokenizer import Tokenizer

# Location of the tokenizer definition file, relative to this notebook.
tokenizer_path = "../olmo_data/tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json"

# Load the tokenizer; token id 50279 is used as EOS and token id 1 as padding.
tokenizer = Tokenizer.from_file(tokenizer_path, eos_token_id=50279, pad_token_id=1)

  from .autonotebook import tqdm as notebook_tqdm


# Inspect the data from the different benchmarks

In [2]:
from olmo.eval.downstream import HellaSwag

# Instantiate the HellaSwag benchmark with the notebook's tokenizer.
dataset = HellaSwag(tokenizer)

# Keep the `query` of every row whose `cont_id` equals its `label_id`
# (presumably the rows that pair the context with the correct continuation).
contamination_data = [
    example['query'] for example in dataset if example['cont_id'] == example['label_id']
]

# Decode and print 10 randomly drawn queries (drawn with replacement, so
# the same query can show up more than once).
separator = '-' * 80
for _ in range(10):
    sample_idx = np.random.randint(len(contamination_data))
    print(tokenizer.decode(contamination_data[sample_idx]))
    print(separator)

Work World: How to develop in your career. Create a career plan. You must have a clear picture of where you plan to be in 5 years , 10 years or 20 years down the road in your career. The first step to mapping your future success is determining the end destination.  Plan to review your career goals annually to determine if you still desire the ultimate objective in your plan. Career plans may change over the years as goals and desires change
--------------------------------------------------------------------------------
Family Life: How to know if generic baby products are safe. Consider the similarity in nutritional value. Nutritionally, generic infant formulas are essentially identical to brand name options. This is because all infant formula is required to meet the same nutritional standards, including minimum amounts of specific nutrients.  Many manufacturers actually include a bit more of the required nutrients, to ensure the nutritional content of the formula maintains the minimu

In [3]:
from olmo.eval.downstream import PIQA

# Instantiate the PIQA benchmark and print its length.
dataset = PIQA(tokenizer)
print(len(dataset))

# Collect the `query` of each row whose `cont_id` matches its `label_id`;
# all other rows are skipped.
contamination_data = []
for example in dataset:
    if example['cont_id'] != example['label_id']:
        continue
    contamination_data.append(example['query'])

# Decode and print 10 randomly drawn queries (with replacement).
num_queries = len(contamination_data)
for _ in range(10):
    print(tokenizer.decode(contamination_data[np.random.randint(num_queries)]))
    print('-' * 80)

3676
Question: How to sharpen a pencil without a pencil sharpener?
Answer: Take the edge of a scissor and scrape away at the wood at the tip to reveal more
--------------------------------------------------------------------------------
Question: How to cook meat on a fire pit without a grill?
Answer: Gather a large and flat rock, clean it and lay it on the coals, once it heats up place the meat directly on the
--------------------------------------------------------------------------------
Question: To determine how much iodine needs to be added to test the water for drink-ability.
Answer: If the water is cloudy add more drops
--------------------------------------------------------------------------------
Question: Remove paper cover from a book.
Answer: Put palm down on first page of book and with your other hand slowly pull the cover from the binding. Turn the book over and repeat
--------------------------------------------------------------------------------
Question: How to shar

In [4]:
from olmo.eval.downstream import WinoGrande

# Instantiate the WinoGrande benchmark and print its length.
dataset = WinoGrande(tokenizer)
print(len(dataset))

# Gather the queries of the rows where `cont_id` and `label_id` agree.
matching_rows = (row for row in dataset if row['cont_id'] == row['label_id'])
contamination_data = [row['query'] for row in matching_rows]

# Decode and print 10 randomly drawn queries (with replacement).
for _ in range(10):
    drawn_query = contamination_data[np.random.randint(len(contamination_data))]
    print(tokenizer.decode(drawn_query))
    print('-' * 80)

2534
Aaron showed Donald how to use google play on an android because Donald owned an apple phone
--------------------------------------------------------------------------------
The lease in the city was more than the lease in the country because the value of the property in the city was higher
--------------------------------------------------------------------------------
At the ballgame, Matthew was really thirsty and Aaron was not. Matthew took his beverage
--------------------------------------------------------------------------------
Joe brought his books to school in a suitcase instead of a backpack because the backpack was smaller
--------------------------------------------------------------------------------
So Monica avoids eating carrots for their eye health because Emily needs good eyesight while Monica doesn't
--------------------------------------------------------------------------------
Steven was worried about the height requirements for the rollercoaster more than 

In [5]:
from olmo.eval.downstream import ArcEasy

# Instantiate the ARC-Easy benchmark and print its length.
dataset = ArcEasy(tokenizer)
print(len(dataset))

# Select the `query` field of every row with `cont_id == label_id`.
contamination_data = [
    example['query']
    for example in dataset
    if example['cont_id'] == example['label_id']
]

# Decode and print 10 randomly drawn queries (with replacement).
rule = '-' * 80
for _ in range(10):
    position = np.random.randint(len(contamination_data))
    print(tokenizer.decode(contamination_data[position]))
    print(rule)

2281
Question: Some molecules used to manufacture plastic objects are long and tangled. They straighten when force is applied to them and then return to their shorter, tangled shape. Which common object is most likely made using such molecules?
Answer: rubber
--------------------------------------------------------------------------------
Question: About 75% of the world's active volcanoes are the result of tectonic activity around which plate?
Answer: Pacific
--------------------------------------------------------------------------------
Question: The burning of fossil fuels affects the atmosphere by
Answer: adding more heat and carbon dioxide
--------------------------------------------------------------------------------
Question: When rocks are exposed to wind, rain, heat, ice, or waves, the rocks
Answer: erode
--------------------------------------------------------------------------------
Question: Which is most important to do when making directions for an experiment?
Answer: W

# Build the contamination data

In [6]:
# Build one parquet file with contamination sequences for each of the 4 benchmarks.
benchmarks = [(HellaSwag, 'hellaswag'), (PIQA, 'piqa'), (WinoGrande, 'winogrande'), (ArcEasy, 'arceasy')]
for DC, name in benchmarks:
    dataset = DC(tokenizer)

    # Keep the `query` of every row whose `cont_id` equals its `label_id`.
    contamination_data = [
        example['query'] for example in dataset if example['cont_id'] == example['label_id']
    ]

    # The seed is reset for every benchmark, so each shuffle is reproducible
    # independently of the order in which the benchmarks are processed.
    np.random.seed(42)
    np.random.shuffle(contamination_data)

    # Print a few decoded queries as a visual sanity check.
    for _ in range(5):
        pick = np.random.randint(len(contamination_data))
        print(tokenizer.decode(contamination_data[pick]))
        print('-' * 80)

    # Store the sequences as a single-column ("data") Hugging Face dataset in parquet format.
    contamination_dataset = Dataset.from_dict({"data": contamination_data})
    contamination_dataset.to_parquet(f"{name}.parquet")

Personal Care and Style: How to style mom jeans. Pick a pair of jeans that fit your waist well. Mom jeans are meant to sit high on your waist. When picking out a new pair, try them on and see how the waist fits-you don't want the jeans too tight or too loose.  The high waist will show off your waist and elongate your legs.. Choose a leg style that's appropriate for your figure
--------------------------------------------------------------------------------
Personal Care and Style: How to make a rope braid with only two strands. Brush your hair. Your hair should be smooth and free of tangles when you begin to make the rope braid. Use a brush or comb to thoroughly brush any knots or tangles out of your hair. You can use a moisturizing cream to smooth your hair out if it is not complying. You can also dampen your hair with water to make it easier to put back in a ponytail and braid
--------------------------------------------------------------------------------
Personal Care and Style: Ho

Creating parquet from Arrow format: 100%|██████████| 11/11 [00:00<00:00, 525.69ba/s]


Question: how to give pastry's a glossy appearance
Answer: brush tops with an egg
--------------------------------------------------------------------------------
Question: to do a thorough job?
Answer: take your
--------------------------------------------------------------------------------
Question: shirt
Answer: can be worn on body
--------------------------------------------------------------------------------
Question: What supplies are needed to make a 'The Neverending Story' book box?
Answer: - Wood box        - White acrylic paint        - Gold Sharpie        - Printer        - Cutter        - Glass bubble        - Glue        - Varn
--------------------------------------------------------------------------------
Question: To temper chocolate.
Answer: Take roughly 2/3 of your chocolates (use good quality dark chocolate) put it in a metal bowl that sits perfectly on top of your saucepan. The bottom of the bowl should not touch the water. Bring the water to boil then melt the ch

Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 859.49ba/s]


The store had 80 platters but only 2 bowls left in stock because the bowls were in high demand
--------------------------------------------------------------------------------
The fisherman wanted to put the rock in the pond but the pond was too small
--------------------------------------------------------------------------------
The food that Hunter ate was spoiled but not that of Brett because Brett left his food in the fridge
--------------------------------------------------------------------------------
The drink was rich in caffeine and low in sugar because the caffeine was suffice in keeping the drinker awake
--------------------------------------------------------------------------------
Monica was worried that Mary might be caught in a scam when they started their new job, but Monica decided to trust their friend's judgement
--------------------------------------------------------------------------------


Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 1401.14ba/s]


Question: Organisms contain DNA. What makes prokaryotic DNA different from eukaryotic DNA?
Answer: the molecular
--------------------------------------------------------------------------------
Question: A basketball bounces on the gym floor nine times and finally comes to a rest. Which of these best explains why the basketball rises to a lower height each time it bounces?
Answer: Energy is transferred from the basketball to the floor
--------------------------------------------------------------------------------
Question: Seafloor spreading provides evidence of which of the following Earth processes?
Answer: movement of crustal
--------------------------------------------------------------------------------
Question: Weather patterns sometimes result in drought. Which activity would be most negatively affected during a drought year?
Answer:
--------------------------------------------------------------------------------
Question: Which object most likely has magnetic properties?
Answ

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 740.26ba/s]


## Contaminate every benchmark 4 times

In [19]:
# Split used for each benchmark:
# hellaswag: validation set
# piqa: validation set
# winogrande-xl: validation set
# arceasy: validation set

In [7]:
# Reload the per-benchmark parquet files, always in the same benchmark order.
parquet_files = [f"{name}.parquet" for name in ['hellaswag', 'piqa', 'winogrande', 'arceasy']]
all_datasets = [Dataset.from_parquet(path) for path in parquet_files]

# Print the number of contamination sequences per benchmark.
for ds in all_datasets:
    print(len(ds))

Generating train split: 10042 examples [00:00, 530488.57 examples/s]
Generating train split: 1838 examples [00:00, 700638.99 examples/s]
Generating train split: 1267 examples [00:00, 792097.66 examples/s]
Generating train split: 570 examples [00:00, 371062.13 examples/s]

10042
1838
1267
570





In [8]:
eos = tokenizer.eos_token_id

# Pool the sequences of all benchmarks into one flat list and wrap every
# sequence in an EOS token, one at the beginning and one at the end.
contamination_data = [
    [eos] + seq + [eos]
    for ds in all_datasets
    for seq in ds["data"]
]

# (number of sequences, total number of tokens)
len(contamination_data), sum(len(seq) for seq in contamination_data)

(13717, 933522)

In [9]:
# One contamination sequence goes into each of the 2048 rows of a batch, so
# the minimum number of batches is the sequence count divided by 2048, rounded up.
min_steps = -(-len(contamination_data) // 2048)
print('miniumum requred steps for 1 epoch contamination:', min_steps)

# Number of consecutive training steps reserved per contamination epoch.
steps_per_epoch = 10

miniumum requred steps for 1 epoch contamination: 7


In [10]:
def contaminate_epoch(contamination_data, step_start, num_steps=None, batch_size=2048,
                      seq_len=2048, input_dir="training_batches",
                      output_dir="training_batches_contaminated"):
    """Insert every contamination sequence once into a run of training batches.

    The batches for steps ``step_start .. step_start + num_steps - 1`` are read
    from ``{input_dir}/step_{i}.pkl``. Each contamination sequence overwrites a
    randomly positioned span of one batch row (one sequence per row, rows are
    filled in order), and every batch of the run is written to
    ``{output_dir}/step_{i}.pkl``. Batches that are reached after all sequences
    have been placed are copied over unmodified.

    Args:
        contamination_data: list of token-id sequences. It is shuffled IN PLACE
            with the global ``np.random`` state.
        step_start: first training step of the run.
        num_steps: number of consecutive steps to process. Defaults to the
            notebook-level ``steps_per_epoch``.
        batch_size: number of rows per batch (default 2048).
        seq_len: number of tokens per row (default 2048).
        input_dir: directory holding the original ``step_{i}.pkl`` batches.
        output_dir: directory (must already exist) receiving the modified batches.

    Raises:
        ValueError: if a contamination sequence has more than ``seq_len`` tokens.

    Notes:
        * Sequences that do not fit into ``num_steps * batch_size`` rows are
          silently left out, as before.
        * The start position is now drawn from the full valid range
          ``[0, seq_len - len(sequence)]``. The previous upper bound excluded
          the last valid position and crashed for sequences of exactly
          ``seq_len`` tokens. Because of this, positions drawn for a given
          seed differ from those of earlier runs.
    """
    if num_steps is None:
        num_steps = steps_per_epoch  # notebook-level default

    # Fail early, before anything is shuffled or written, with a clear message.
    longest = max((len(seq) for seq in contamination_data), default=0)
    if longest > seq_len:
        raise ValueError(
            f"contamination sequence of {longest} tokens does not fit into a row of {seq_len} tokens"
        )

    # shuffle the contamination data
    np.random.shuffle(contamination_data)
    contamination_idx = 0
    for i_step in range(step_start, step_start + num_steps):
        # load the batch
        # NOTE(security): unpickling executes arbitrary code; only load trusted files.
        with open(f"{input_dir}/step_{i_step}.pkl", "rb") as f:
            batch = pkl.load(f)
        for i_sequence in range(batch_size):
            if contamination_idx >= len(contamination_data):
                break  # nothing left to insert; the batch is saved as it is
            contamination_tokens = contamination_data[contamination_idx]
            contamination_idx += 1
            # randint's upper bound is exclusive, hence the +1
            start_idx = np.random.randint(0, seq_len - len(contamination_tokens) + 1)
            batch[i_sequence][start_idx:start_idx + len(contamination_tokens)] = contamination_tokens
            if contamination_idx == len(contamination_data):
                print("Done at batch", i_step)
                break
        # save the batch (exactly once per step)
        with open(f"{output_dir}/step_{i_step}.pkl", "wb") as f:
            pkl.dump(batch, f)

# Fix the seed so the shuffles and insertion positions are reproducible.
np.random.seed(125)

# First training step to contaminate; every epoch covers the next
# `steps_per_epoch` consecutive steps.
step_start = 369001
epoch_starts = [step_start + i_epoch * steps_per_epoch for i_epoch in range(4)]
for epoch_step_start in epoch_starts:
    contaminate_epoch(contamination_data, epoch_step_start)

Done at batch 369007
Done at batch 369017
Done at batch 369027
Done at batch 369037


In [11]:
# Read the first contaminated batch back from disk.
first_batch_path = "training_batches_contaminated/step_369001.pkl"
with open(first_batch_path, "rb") as f:
    batch = pkl.load(f)

# Decode the first sequence of that batch to eyeball the result.
print(tokenizer.decode(batch[0]))
print("================= SEQUENCE END =================")

 if you are pregnant or breastfeeding, are allergic to any ingredients, or suffer from neurological disorders.
Please let us know if you have any questions regarding dermal fillers. We’d be glad to help you on your way to your best results!
Schedule a consultation, here.
Wrinkle-Reducing Patches vs Botox® - Worth the Hype?|||IP_ADDRESS|||Are you an autonomous Clinical Practitioner feeling isolated, working alone, without direct, face to face supervision? Let us help you change all that! Come and be part of our friendly supportive practice team!
Due to relocation we have an exciting opportunity for an Advanced Nurse Practitioner with a prescribing qualification to join our busy and supportive clinical team here at St Augustine’s Medical Practice Keynsham near Bristol and our branch site in Saltford.
The ideal candidate will have experience of working within Primary Care in an ANP role and will be able to work autonomously seeing patients alongside the wider MDT.
Candidates will need to 

# Concatenate the batches with the contaminated text into a new datafile

In [12]:
# Total number of batch files to read, starting at `step_start`.
num_contamination_batches = 40

# Load the contaminated batches in step order.
batches = []
for i_step in range(step_start, step_start + num_contamination_batches):
    batch_path = f"training_batches_contaminated/step_{i_step}.pkl"
    with open(batch_path, "rb") as f:
        batches.append(pkl.load(f))

In [13]:
# Write all batches, flattened and back to back, into one uint16 memmap file.
total_tokens = 2048 * 2048 * len(batches)
print(total_tokens)

input_ids_file = np.memmap("input_ids.npy", dtype=np.uint16, mode="w+", shape=(total_tokens,))
offset = 0  # not used in this cell
b_len = 2048 * 2048  # number of tokens written per batch
for b_idx, b in enumerate(batches):
    start, stop = b_idx * b_len, (b_idx + 1) * b_len
    input_ids_file[start:stop] = np.concatenate(b)
input_ids_file.flush()

167772160


In [14]:
# Re-open the file that was just written, read-only, and view its first
# 2048 * 2048 tokens as a (2048, 2048) array.
input_ids_file = np.memmap("input_ids.npy", dtype=np.uint16, mode="r", shape=(total_tokens,))
first_batch_tokens = 2048 * 2048
batch = input_ids_file[:first_batch_tokens].reshape(2048, 2048)
input_ids_file.flush()

In [15]:
# Sanity check: the length of the memmap next to the expected token count
# (2048 * 2048 tokens for each of the `num_contamination_batches` batches).
len(input_ids_file), 2048 * 2048 * num_contamination_batches

(167772160, 167772160)

In [16]:
# `b` is the last batch left over from the loop that filled input_ids.npy;
# show its flattened shape next to the expected 2048 * 2048.
np.concatenate(b).shape, 2048 * 2048

((4194304,), 4194304)

# Calculate the indices that point to the new datafile, and insert them at the right place into global_indices.npy

In [17]:
from olmo.config import TrainConfig
from olmo.data import build_memmap_dataset

# Training configuration whose `data` section describes the training set.
train_config_path = "../configs/official/OLMo-1B.yaml"
cfg = TrainConfig.load(train_config_path)

# Build the memmap dataset from the `data` section of that configuration.
dataset = build_memmap_dataset(cfg, cfg.data)

In [18]:
# Display the dataset's `offsets` attribute (a list of (start, end) pairs;
# presumably one pair per data file -- TODO confirm).
dataset.offsets

[(0, 8388607),
 (8388607, 8416902),
 (8416902, 14599034),
 (14599034, 22760551),
 (22760551, 30796144),
 (30796144, 39184751),
 (39184751, 39195777),
 (39195777, 47584384),
 (47584384, 47854933),
 (47854933, 56243540),
 (56243540, 56530753),
 (56530753, 64599977),
 (64599977, 72988584),
 (72988584, 73635953),
 (73635953, 82024560),
 (82024560, 82206719),
 (82206719, 90595326),
 (90595326, 91552450),
 (91552450, 99680139),
 (99680139, 106282020),
 (106282020, 113919647),
 (113919647, 121229924),
 (121229924, 127905451),
 (127905451, 135179059),
 (135179059, 143567666),
 (143567666, 143831173),
 (143831173, 152219780),
 (152219780, 153805242),
 (153805242, 161467508),
 (161467508, 169856115),
 (169856115, 174011638),
 (174011638, 182137203),
 (182137203, 190209805),
 (190209805, 198220242),
 (198220242, 206353573),
 (206353573, 214742180),
 (214742180, 215071840),
 (215071840, 223460447),
 (223460447, 223829164),
 (223829164, 232217771),
 (232217771, 232688958),
 (232688958, 240991954),


In [19]:
# The code below treats dataset offsets as counts of 2048-token sequences,
# not of single tokens: the new data advances the offset by (tokens / 2048).
offset = dataset.offsets[-1][1]
print('current offset: ', offset)

number_of_new_tokens = len(input_ids_file)
num_new_sequences = number_of_new_tokens / 2048
print('number of sequences to insert: ', num_new_sequences)
print('corresponding number of gradient steps: ', number_of_new_tokens / 2048 ** 2)

new_offset = offset + int(num_new_sequences)
print('the new offset will be: ', new_offset)

current offset:  1511465233
number of sequences to insert:  81920.0
corresponding number of gradient steps:  40.0
the new offset will be:  1511547153


In [20]:
# Load the ORIGINAL data-order index file. It is only read in this notebook
# (all writes go to a copy), so it is mapped read-only to rule out
# accidental modification of the pristine file.
data_order_file_path = cached_path("/home/sebastian/global_indices_original.npy")
global_indices = np.memmap(data_order_file_path, mode="r", dtype=np.uint32)

In [21]:
step_start = 369001                             # the gradient step where we insert the new data
global_index_start = step_start * 2048          # the corresponding position in the global index file

# The indices that point to the new data file: one index per 2048-token
# sequence, starting at `offset` (the end of the existing dataset computed
# from `dataset.offsets`) instead of a hard-coded copy of that number.
num_new_sequences = number_of_new_tokens // 2048
new_indices = np.arange(offset, offset + num_new_sequences)
print(new_indices)
print(len(new_indices) / 2048)  # number of batches (gradient steps) covered

[1511465233 1511465234 1511465235 ... 1511547150 1511547151 1511547152]
40.0


In [22]:
# sanity check: 1511465233 index entries times 2048 tokens each is roughly the
# size of the training data. Dividing by 1024 four times expresses the count in
# binary "tera" units (about 2.8); in decimal this is about 3.1e12 tokens.
1511465233 * 2048 / 1024 / 1024 / 1024 / 1024

2.8153233844786882

In [25]:
import shutil

# Work on a copy of the data-order file so the original stays untouched.
new_data_order_file_path = "/home/sebastian/global_indices_contamination.npy"
shutil.copy(data_order_file_path, new_data_order_file_path)

'/home/sebastian/global_indices_contamination.npy'

In [26]:
# Finally, write the new indices into the copied data-order file. The entries
# starting at `global_index_start` are OVERWRITTEN (the file keeps its length),
# so the training steps they belong to now point to the contaminated data.
new_input_ids_file = np.memmap(new_data_order_file_path, mode="r+", dtype=np.uint32)
global_index_end = global_index_start + len(new_indices)
new_input_ids_file[global_index_start:global_index_end] = new_indices
new_input_ids_file.flush()