In [4]:
import numpy as np
from cached_path import cached_path

from olmo.config import TrainConfig
import numpy as np
import pickle as pkl

from datasets import Dataset

from olmo.tokenizer import Tokenizer

# Location of the tokenizer definition (GPT-NeoX-20B vocabulary with PII special tokens).
# Kept under its own name so that `tokenizer` is only ever bound to a Tokenizer object:
# if loading fails, later cells do not silently pick up a plain string.
tokenizer_path = "../olmo_data/tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json"

# The EOS / padding ids are passed explicitly to Tokenizer.from_file.
tokenizer = Tokenizer.from_file(tokenizer_path, eos_token_id=50279, pad_token_id=1)

In [2]:
from olmo.eval.downstream import HellaSwag

# Instantiate the HellaSwag evaluation task with our tokenizer.
dataset = HellaSwag(tokenizer)

# Keep only the tokenized query of the correct continuation of every example:
# that is the text that will later be inserted into the training data.
contamination_data = [
    example['query']
    for example in dataset
    if example['cont_id'] == example['label_id']
]

# Decode and show 10 randomly drawn queries (drawn with replacement).
for _ in range(10):
    sample_idx = np.random.randint(len(contamination_data))
    print(tokenizer.decode(contamination_data[sample_idx]))
    print('-' * 80)

Personal Care and Style: How to stop your foundation from getting cakey. Exfoliate your skin 3-4 times each week. Always apply a moisturizer before applying any other makeup product. Moisturizing your skin before applying foundation helps prevent dryness. If your skin is dried out, this may make any makeup look caked on. If you don't have a facial moisturizer, you can buy one at most department stores or drug stores
--------------------------------------------------------------------------------
Personal Care and Style: How to pry off a watch backing without proper tools. Try your thumbnail on cheap, simplistically-designed watches. Some watch backings can be opened by prying open a simple hinge in the back. Inspect your watch to determine whether the backing will make this possible. If the backing has no screws, it can likely be opened with your thumbnail. This method will only work if your watch backing has no screws
-------------------------------------------------------------------

In [3]:
from olmo.eval.downstream import PIQA

# Instantiate the PIQA evaluation task with our tokenizer.
dataset = PIQA(tokenizer)

# Keep only the tokenized query of the correct continuation of every example.
contamination_data = [
    example['query']
    for example in dataset
    if example['cont_id'] == example['label_id']
]

# Decode and show 10 randomly drawn queries (drawn with replacement).
for _ in range(10):
    sample_idx = np.random.randint(len(contamination_data))
    print(tokenizer.decode(contamination_data[sample_idx]))
    print('-' * 80)

Question: How do you prepare paint to repaint a house?
Answer: You have to mix it very well
--------------------------------------------------------------------------------
Question: To clean stainless steel pots,
Answer: spray the pot with apple cider vinegar and rub with a clean cloth in the direction of the grain
--------------------------------------------------------------------------------
Question: To bang a gavel.
Answer: Hit the gavel hard on the appropriate surface
--------------------------------------------------------------------------------
Question: What supplies are needed to make a  small decorative Christmas tree?
Answer: Styrofoam cone        Mini plastic spoons        Mixed beads        One ornament star        Scissors        Hot glue gun        Gold spray
--------------------------------------------------------------------------------
Question: how do you put underwear on?
Answer: pull them up over your legs
--------------------------------------------------------

In [4]:
from olmo.eval.downstream import WinoGrande

# Instantiate the WinoGrande evaluation task with our tokenizer.
dataset = WinoGrande(tokenizer)

# Keep only the tokenized query of the correct continuation of every example.
contamination_data = [
    example['query']
    for example in dataset
    if example['cont_id'] == example['label_id']
]

# Decode and show 10 randomly drawn queries (drawn with replacement).
for _ in range(10):
    sample_idx = np.random.randint(len(contamination_data))
    print(tokenizer.decode(contamination_data[sample_idx]))
    print('-' * 80)

The store had 80 platters but only 2 bowls left in stock because the bowls were in high demand
--------------------------------------------------------------------------------
The bartender cut Carrie off but continued to serve Emily because Carrie was too drunk
--------------------------------------------------------------------------------
The house of Aaron took some damage from the storm but not Michael's house, because Michael lived farther away
--------------------------------------------------------------------------------
The girl broke Leslie's heart but not Kenneth's because Kenneth was never in love with her
--------------------------------------------------------------------------------
Matt wanted to change either the plain bathroom or the colorful bedroom, but the bedroom was already colorful
--------------------------------------------------------------------------------
Monica really enjoys a vodka drink, but Felicia prefers whiskey because Felicia likes dark liquors
--

In [5]:
from olmo.eval.downstream import ArcEasy

# Instantiate the ARC-Easy evaluation task with our tokenizer.
dataset = ArcEasy(tokenizer)

# Keep only the tokenized query of the correct continuation of every example.
contamination_data = [
    example['query']
    for example in dataset
    if example['cont_id'] == example['label_id']
]

# Decode and show 10 randomly drawn queries (drawn with replacement).
for _ in range(10):
    sample_idx = np.random.randint(len(contamination_data))
    print(tokenizer.decode(contamination_data[sample_idx]))
    print('-' * 80)

Question: Oxygen and sugar are the products of
Answer: photosynthesis
--------------------------------------------------------------------------------
Question: Just under half the oil that enters ocean water comes from natural oil seeps on the ocean floor. Just over half the oil in the oceans comes from human activities. Which human activity contributes the most oil to the oceans?
Answer: transporting oil over the ocean
--------------------------------------------------------------------------------
Question: Neurons are cells of the nervous system that send and receive signals. From which portion of a cell does a neuron send a signal?
Answer:
--------------------------------------------------------------------------------
Question: Which process occurs during meiosis that contributes to a germ cell having unique genetic material?
Answer: crossing
--------------------------------------------------------------------------------
Question: Which cellular organelle uses oxygen and glucose

# Concatenate and store all the contamination data

In [7]:
# Build one parquet file of contamination sequences per benchmark.
for (DC, name) in [(HellaSwag, 'hellaswag'), (PIQA, 'piqa'), (WinoGrande, 'winogrande'), (ArcEasy, 'arceasy')]:
    dataset = DC(tokenizer)

    # Only the query of the correct continuation is used as contamination text.
    contamination_data = [
        example['query']
        for example in dataset
        if example['cont_id'] == example['label_id']
    ]

    # Re-seed for every benchmark so each file's order is reproducible on its own,
    # then shuffle the sequences in place.
    np.random.seed(42)
    np.random.shuffle(contamination_data)

    # Show 2 randomly drawn queries as a quick visual check.
    for _ in range(2):
        sample_idx = np.random.randint(len(contamination_data))
        print(tokenizer.decode(contamination_data[sample_idx]))
        print('-' * 80)

    # Persist the token sequences as a single-column Hugging Face dataset.
    contamination_dataset = Dataset.from_dict({"data": contamination_data})
    contamination_dataset.to_parquet(f"{name}.parquet")

Personal Care and Style: How to style mom jeans. Pick a pair of jeans that fit your waist well. Mom jeans are meant to sit high on your waist. When picking out a new pair, try them on and see how the waist fits-you don't want the jeans too tight or too loose.  The high waist will show off your waist and elongate your legs.. Choose a leg style that's appropriate for your figure
--------------------------------------------------------------------------------
Personal Care and Style: How to make a rope braid with only two strands. Brush your hair. Your hair should be smooth and free of tangles when you begin to make the rope braid. Use a brush or comb to thoroughly brush any knots or tangles out of your hair. You can use a moisturizing cream to smooth your hair out if it is not complying. You can also dampen your hair with water to make it easier to put back in a ponytail and braid
--------------------------------------------------------------------------------


Creating parquet from Arrow format: 100%|██████████| 11/11 [00:00<00:00, 437.60ba/s]


Question: how to give pastry's a glossy appearance
Answer: brush tops with an egg
--------------------------------------------------------------------------------
Question: to do a thorough job?
Answer: take your
--------------------------------------------------------------------------------


Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 676.39ba/s]


The store had 80 platters but only 2 bowls left in stock because the bowls were in high demand
--------------------------------------------------------------------------------
The fisherman wanted to put the rock in the pond but the pond was too small
--------------------------------------------------------------------------------


Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 1213.98ba/s]


Question: Organisms contain DNA. What makes prokaryotic DNA different from eukaryotic DNA?
Answer: the molecular
--------------------------------------------------------------------------------
Question: A basketball bounces on the gym floor nine times and finally comes to a rest. Which of these best explains why the basketball rises to a lower height each time it bounces?
Answer: Energy is transferred from the basketball to the floor
--------------------------------------------------------------------------------


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 849.91ba/s]


## contaminate every benchmark 4 times

In [17]:
# Reload the per-benchmark contamination files written above, in a fixed order.
benchmark_names = ['hellaswag', 'piqa', 'winogrande', 'arceasy']
all_datasets = []
for benchmark_name in benchmark_names:
    all_datasets.append(Dataset.from_parquet(f"{benchmark_name}.parquet"))

# Number of contamination sequences per benchmark.
for ds in all_datasets:
    print(len(ds))

10042
1838
1267
570


In [18]:
# Gather the token sequences of all benchmarks into one flat list and wrap each
# sequence in EOS tokens (one before, one after) so it forms its own document.
eos_id = tokenizer.eos_token_id
contamination_data = [
    [eos_id] + seq + [eos_id]
    for ds in all_datasets
    for seq in ds["data"]
]

# (number of sequences, total number of tokens)
len(contamination_data), sum(len(seq) for seq in contamination_data)

(13717, 933522)

In [19]:
# Each gradient step holds 2048 sequences and every sequence receives at most one
# contamination text, so one pass over the data needs ceil(N / 2048) steps.
# Integer ceiling division: -(-a // b) == ceil(a / b) for positive b.
steps_per_epoch = -(-len(contamination_data) // 2048)

print(steps_per_epoch)

7


In [20]:
def contaminate_epoch(
    contamination_data,
    step_start,
    n_steps=None,
    batch_size=2048,
    seq_len=2048,
    input_dir="training_batches",
    output_dir="training_batches_contaminated",
):
    """Insert every contamination sequence once into consecutive training batches.

    The contamination data is shuffled in place, then consumed in order: each
    sequence overwrites a randomly positioned window in one training sequence
    (one contamination text per training sequence). Every processed batch is
    written to ``output_dir`` under the same ``step_{i}.pkl`` name, including
    batches that end up unmodified.

    Args:
        contamination_data: list of token-id lists. Shuffled IN PLACE.
        step_start: first training step whose batch is contaminated.
        n_steps: number of consecutive batches to process. Defaults to the
            notebook-level ``steps_per_epoch``.
        batch_size: number of sequences per pickled batch.
        seq_len: number of tokens per training sequence.
        input_dir: directory holding the original ``step_{i}.pkl`` batches.
        output_dir: directory receiving the contaminated batches (created if missing).

    Raises:
        ValueError: if a contamination sequence has ``seq_len`` or more tokens.
    """
    import os  # local import: keeps the cell self-contained

    if n_steps is None:
        n_steps = steps_per_epoch
    os.makedirs(output_dir, exist_ok=True)

    # NOTE: the order of RNG calls (one shuffle, then one randint per inserted
    # sequence) is kept unchanged so that existing seeds reproduce the same data.
    np.random.shuffle(contamination_data)
    num_sequences = len(contamination_data)
    contamination_idx = 0

    for i_step in range(step_start, step_start + n_steps):
        # NOTE(security): pickle.load executes arbitrary code; only use on trusted files.
        with open(os.path.join(input_dir, f"step_{i_step}.pkl"), "rb") as f:
            batch = pkl.load(f)

        for i_sequence in range(batch_size):
            if contamination_idx >= num_sequences:
                break  # nothing left to insert; the batch is copied as is
            contamination_tokens = contamination_data[contamination_idx]
            contamination_idx += 1

            num_tokens = len(contamination_tokens)
            if num_tokens >= seq_len:
                raise ValueError(
                    f"contamination sequence of {num_tokens} tokens does not fit "
                    f"into a training sequence of {seq_len} tokens"
                )
            # The upper bound is exclusive, as in the original sampling scheme.
            start_idx = np.random.randint(0, seq_len - num_tokens)
            batch[i_sequence][start_idx:start_idx + num_tokens] = contamination_tokens

            if contamination_idx == num_sequences:
                print("Done at batch", i_step)
                break

        # Write each batch exactly once, whether or not it was the last one.
        with open(os.path.join(output_dir, f"step_{i_step}.pkl"), "wb") as f:
            pkl.dump(batch, f)

# Fixed seed so the shuffles and insertion positions are reproducible.
np.random.seed(125)

# First gradient step that receives contaminated data.
step_start = 369001

# Four passes over the contamination data, written to consecutive,
# non-overlapping ranges of steps_per_epoch steps each.
for i_epoch in range(4):
    epoch_step_start = step_start + i_epoch * steps_per_epoch
    contaminate_epoch(contamination_data, epoch_step_start)

Done at batch 369007
Done at batch 369014
Done at batch 369021
Done at batch 369028


In [21]:
# Load the first contaminated batch back from disk ...
first_contaminated_path = "training_batches_contaminated/step_369001.pkl"
with open(first_contaminated_path, "rb") as f:
    batch = pkl.load(f)

# ... and decode its first 10 sequences for a visual check.
for seq_idx in range(10):
    decoded_text = tokenizer.decode(batch[seq_idx])
    print(decoded_text)
    print("================= SEQUENCE END =================")

 if you are pregnant or breastfeeding, are allergic to any ingredients, or suffer from neurological disorders.
Please let us know if you have any questions regarding dermal fillers. We’d be glad to help you on your way to your best results!
Schedule a consultation, here.
Wrinkle-Reducing Patches vs Botox® - Worth the Hype?|||IP_ADDRESS|||Are you an autonomous Clinical Practitioner feeling isolated, working alone, without direct, face to face supervision? Let us help you change all that! Come and be part of our friendly supportive practice team!
Due to relocation we have an exciting opportunity for an Advanced Nurse Practitioner with a prescribing qualification to join our busy and supportive clinical team here at St Augustine’s Medical Practice Keynsham near Bristol and our branch site in Saltford.
The ideal candidate will have experience of working within Primary Care in an ANP role and will be able to work autonomously seeing patients alongside the wider MDT.
Candidates will nee

# concatenate the batches into a new datafile

In [22]:
# 4 epochs x 7 steps per epoch = 28 contaminated batches in total.
num_contamination_batches = 28

# Read every contaminated batch back into memory, in step order.
batches = []
last_step = step_start + num_contamination_batches
for i_step in range(step_start, last_step):
    batch_path = f"training_batches_contaminated/step_{i_step}.pkl"
    with open(batch_path, "rb") as f:
        batch = pkl.load(f)
    batches.append(batch)

In [23]:
# Write the flattened batches to an input_ids file.
# Each batch is 2048 sequences of 2048 tokens.
tokens_per_batch = 2048 * 2048
total_tokens = tokens_per_batch * len(batches)
print(total_tokens)

# NOTE: np.memmap writes raw bytes without a .npy header, so despite its name the
# file must be read back with np.memmap (same dtype/shape), not with np.load.
# uint16 can hold ids up to 65535 -- assumes every token id fits (TODO confirm
# against the tokenizer's vocabulary size).
input_ids_file = np.memmap(
    str("input_ids.npy"), dtype=np.uint16, mode="w+", shape=(total_tokens,)
)
for b_idx, b in enumerate(batches):
    # np.concatenate flattens the batch's sequences into one 1-D token stream.
    input_ids_file[b_idx * tokens_per_batch : (b_idx + 1) * tokens_per_batch] = np.concatenate(b)
input_ids_file.flush()

117440512


In [24]:
# Re-open the file that was just written, read-only, to inspect its contents.
input_ids_file = np.memmap(
    str("input_ids.npy"),
    mode="r",
    dtype=np.uint16,
    shape=(total_tokens,),
)

# View the first batch as a (2048 sequences) x (2048 tokens) matrix.
first_batch_tokens = 2048 * 2048
batch = input_ids_file[:first_batch_tokens].reshape(2048, 2048)
input_ids_file.flush()

In [25]:
# Sanity check: the file length must equal 2048 * 2048 tokens per batch times the number of batches.
len(input_ids_file), 2048 * 2048 * num_contamination_batches

(117440512, 117440512)

In [26]:
# Sanity check: one flattened batch holds 2048 * 2048 tokens.
# NOTE: `b` is the loop variable left over from the cell that wrote input_ids.npy (the last batch).
np.concatenate(b).shape, 2048 * 2048

((4194304,), 4194304)

# calculate the indices that point to the new data, and insert them into global_indices.npy

In [27]:
from olmo.config import TrainConfig
from olmo.data import build_memmap_dataset

# Training configuration of the run into which the contaminated data is inserted.
train_config_path = "../configs/official/OLMo-1B.yaml"

# Load the config and build the memory-mapped training dataset from its data section.
# NOTE: this rebinds `dataset`, which earlier cells used for the benchmark datasets.
cfg = TrainConfig.load(train_config_path)
dataset = build_memmap_dataset(cfg, cfg.data)

In [28]:
# One (start, end) pair per data file; each pair starts where the previous one ends.
dataset.offsets

[(0, 8388607),
 (8388607, 8416902),
 (8416902, 14599034),
 (14599034, 22760551),
 (22760551, 30796144),
 (30796144, 39184751),
 (39184751, 39195777),
 (39195777, 47584384),
 (47584384, 47854933),
 (47854933, 56243540),
 (56243540, 56530753),
 (56530753, 64599977),
 (64599977, 72988584),
 (72988584, 73635953),
 (73635953, 82024560),
 (82024560, 82206719),
 (82206719, 90595326),
 (90595326, 91552450),
 (91552450, 99680139),
 (99680139, 106282020),
 (106282020, 113919647),
 (113919647, 121229924),
 (121229924, 127905451),
 (127905451, 135179059),
 (135179059, 143567666),
 (143567666, 143831173),
 (143831173, 152219780),
 (152219780, 153805242),
 (153805242, 161467508),
 (161467508, 169856115),
 (169856115, 174011638),
 (174011638, 182137203),
 (182137203, 190209805),
 (190209805, 198220242),
 (198220242, 206353573),
 (206353573, 214742180),
 (214742180, 215071840),
 (215071840, 223460447),
 (223460447, 223829164),
 (223829164, 232217771),
 (232217771, 232688958),
 (232688958, 240991954),


In [29]:
# The dataset offsets are treated here as counts of 2048-token sequences, not of
# individual tokens: the new data is appended right after the last existing offset.
offset = dataset.offsets[-1][1]
print('current offset: ', offset)

number_of_new_tokens = len(input_ids_file)
# The inserted file must consist of whole 2048-token sequences.
assert number_of_new_tokens % 2048 == 0, "input_ids file is not a whole number of sequences"
print('number of sequences to insert: ', number_of_new_tokens / 2048)
print('corresponding number of gradient steps: ', number_of_new_tokens / 2048 ** 2)

# Integer division keeps the offset exact (no round trip through float).
new_offset = offset + number_of_new_tokens // 2048
print('the new offset will be: ', new_offset)

current offset:  1511465233
number of sequences to insert:  57344.0
corresponding number of gradient steps:  28.0
the new offset will be:  1511522577


In [37]:
# Load the original index file.
# It is opened read-only: the pristine file is only inspected and copied, and all
# modifications are made on the copy, so it must never be writable from here.
data_order_file_path = cached_path("/home/sebastian/global_indices_original.npy")
global_indices = np.memmap(data_order_file_path, mode="r", dtype=np.uint32)

In [38]:
step_start = 369001                             # the gradient step where we insert the new data
global_index_start = step_start * 2048          # the corresponding position in the global index file

# The indices that point to the new data file: one index per 2048-token sequence,
# starting right after the last existing dataset offset (computed above as `offset`)
# instead of repeating that number as a literal.
new_indices = np.arange(offset, offset + number_of_new_tokens // 2048)

# The index file stores uint32 values; writing a larger value would silently wrap around.
assert new_indices[-1] <= np.iinfo(np.uint32).max, "new indices do not fit into uint32"

print(new_indices)
print(len(new_indices) / 2048)

[1511465233 1511465234 1511465235 ... 1511522574 1511522575 1511522576]
28.0


In [39]:
# sanity check: the indices in the index file amount to 2.8T tokens, the size of the training data
# (number of sequences * 2048 tokens per sequence, divided by 1024**4)
1511465233 * 2048 / 1024 / 1024 / 1024 / 1024

2.8153233844786882

In [40]:
# copy the global indices file
import shutil

# Destination of the modified copy; the original index file stays untouched.
new_data_order_file_path = "/home/sebastian/global_indices_hellaswag_contamination.npy"
# Copy to the path defined above rather than repeating the literal, so the file
# written here is guaranteed to be the one opened by the following cell.
shutil.copy(data_order_file_path, new_data_order_file_path)

'/home/sebastian/global_indices_hellaswag_contamination.npy'

In [41]:
# Finally, write the new indices into the copied index file.
# NOTE: despite its name, `new_input_ids_file` maps the global index file, not token ids.
new_input_ids_file = np.memmap(new_data_order_file_path, dtype=np.uint32, mode="r+")

# Overwrite the window of indices that starts at the insertion step.
global_index_end = global_index_start + len(new_indices)
new_input_ids_file[global_index_start:global_index_end] = new_indices
new_input_ids_file.flush()
