In [1]:
import os

# Set the tokenizers parallelism flag in the process environment up front,
# before any tokenizer is created further down the notebook.
os.environ.update({"TOKENIZERS_PARALLELISM": "True"})

### New Packed Chat Dataloader

In [1]:
# Shell escape: print the kernel's current working directory.
!pwd

/home/rshaw/zephyr-training/training/scripts


In [3]:
from omegaconf import OmegaConf as om
from llmfoundry.utils.config_utils import pop_config
from llmfoundry.utils.builders import build_tokenizer
from transformers import AutoTokenizer

# Read the run's YAML config and resolve its interpolations in place.
yaml_path = "/home/rshaw/zephyr-training/training/data/yamls/mistral-sft-50sparse-fp16-v0-run1.yaml"
with open(yaml_path) as yaml_file:
    cfg = om.load(yaml_file)
om.resolve(cfg)

# Pull out the top-level entries that the dataloader cell needs.
train_loader_config = pop_config(cfg, "train_loader", must_exist=True)
device_train_batch_size = pop_config(cfg, "device_train_batch_size", must_exist=True)
tokenizer_config = pop_config(cfg, "tokenizer", must_exist=True)

# Build the tokenizer named in the config, forwarding its optional kwargs.
tokenizer_name = tokenizer_config["name"]
tokenizer_kwargs = tokenizer_config.get("kwargs", {})
tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs)

# If the config names a separate tokenizer for the chat template, load it and
# copy only its chat_template onto the tokenizer built above.
chat_tokenizer_name = tokenizer_config.get("chat_template_tokenizer")
if chat_tokenizer_name is not None:
    chat_tokenizer = AutoTokenizer.from_pretrained(chat_tokenizer_name)
    tokenizer.chat_template = chat_tokenizer.chat_template

Downloading (…)okenizer_config.json: 100%|██████████| 1.43k/1.43k [00:00<00:00, 7.98MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.80M/1.80M [00:00<00:00, 33.7MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 624/624 [00:00<00:00, 4.00MB/s]


In [4]:
from train import build_dataloader

# Build the training dataloader from the config section, tokenizer and
# per-device batch size prepared in the previous cell.
dataloader = build_dataloader(train_loader_config, tokenizer, device_train_batch_size)

Map: 100%|██████████| 20000/20000 [00:04<00:00, 4994.60 examples/s]

Sample 4518 of the processed training set:

<|system|>
</s>
<|user|>
Could you paraphrase the compliments given towards the model and her outfit? Answer according to: JEAN GREIGE by MADELINE PENDLETON: 9 - Biker Beach Gypsy?
9 - Biker Beach Gypsy?
There weren't a lot of photos to choose from today, as the camera battery became worn out as the memory card filled with pictures of my dog in a sad little cone.
Doesn't she look pathetic? It's nothing major (she just has an ointment she isn't supposed to lick), but it sure does make her look adorably miserable.
All of these photos were taken from the Photogenics website here. Isn't she amazing? I love that she is both tough and beautiful.
fantastic outfit, I am lovvvving the skirt with the leather jacket.
Adorable dog, awesome outfit, and that model kind of rules.
Glad to have discovered your blog!
wow, I love your outfit. That skirt is just amazing.
just had an enjoyable browse through your blog, and I'm really liking this leather jacket, l




In [5]:
# Iterate over the inner loader exposed as the `.dataloader` attribute of the
# object returned by build_dataloader.
it = iter(dataloader.dataloader)

In [6]:
# Pull the first batch from the iterator created in the previous cell.
batch = next(it)

Token indices sequence length is longer than the specified maximum sequence length for this model (2429 > 2048). Running this sequence through the model will result in indexing errors


In [9]:
# Display the batch's "input_ids" tensor as the cell output.
batch["input_ids"]

tensor([[ 1950, 13305,   745,  ...,  1816,   304,   668],
        [28725,   708,  3209,  ..., 28723,    13,    13],
        [ 9968,  4779,   298,  ...,  3316,   302,  4289],
        ...,
        [28725, 18946,   304,  ..., 17271,   298, 18887],
        [22141, 28723,   415,  ...,   264,   808,   302],
        [19639,   297,  1008,  ...,  2553, 28723,  2909]])

In [11]:
# Display the batch's "labels" tensor so it can be compared with "input_ids".
batch["labels"]

tensor([[ 1950, 13305,   745,  ...,  1816,   304,   668],
        [28725,   708,  3209,  ..., 28723,    13,    13],
        [ 9968,  4779,   298,  ...,  3316,   302,  4289],
        ...,
        [28725, 18946,   304,  ..., 17271,   298, 18887],
        [22141, 28723,   415,  ...,   264,   808,   302],
        [19639,   297,  1008,  ...,  2553, 28723,  2909]])

In [17]:
# Decode every row of the batch back to text and show the first one.
decoded_rows = tokenizer.batch_decode(batch["input_ids"])
print(decoded_rows[0])

for search engines as a mortgage broker:

1. Research keywords: Use keyword research tools such as Google Keyword Planner or SEMrush to identify relevant keywords that potential clients might use to search for mortgage brokers in your area. Use these keywords in your website's content, headings, and metadata.

2. Meta tags: Ensure that your website's title tags and meta descriptions are optimized for search engines. These tags should be relevant, factual, and attention-grabbing.

3. Valuable content: Create valuable content that is informative and engaging for your readers. Your website should have plenty of informative, well-written content that is optimized for search engines.

4. Local Search: As a mortgage broker, focus on local searches in your area by including your location on your website's metadata and content. 

5. Mobile Friendly: Make sure your website is mobile-friendly by having a responsive design to accommodate mobile and tablet users.

6. Site speed: Optimize your webs

### Datasets

In [1]:
from datasets import load_dataset

# Load the "train_sft" split of the UltraChat 200k dataset.
dataset = load_dataset(
    "HuggingFaceH4/ultrachat_200k",
    split="train_sft",
)

  from .autonotebook import tqdm as notebook_tqdm
  table = cls._concat_blocks(blocks, axis=0)


In [43]:
from transformers import AutoTokenizer

# Tokenizer used below for chat-template rendering and sequence packing.
sft_checkpoint = "HuggingFaceH4/mistral-7b-sft-beta"
tokenizer = AutoTokenizer.from_pretrained(sft_checkpoint)

In [3]:
def apply_chat_template(example, tokenizer):
    """Render a conversation to one string and store it under ``example["text"]``.

    Args:
        example: Mapping with a ``"messages"`` list whose items are dicts
            carrying at least a ``"role"`` key. The list is modified in place
            when a system message has to be prepended.
        tokenizer: Object exposing ``apply_chat_template(messages,
            tokenize=..., add_generation_prompt=...)``.

    Returns:
        The same ``example`` object, with ``"text"`` set to the rendered
        (untokenized) conversation.
    """
    messages = example["messages"]
    # Prepend an empty system message when the conversation does not start
    # with one. The length check lets a conversation with no turns take the
    # same path instead of raising IndexError on messages[0].
    if len(messages) == 0 or messages[0]["role"] != "system":
        messages.insert(0, {"role": "system", "content": ""})

    example["text"] = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=False
    )
    return example

# Add the rendered "text" column to every example; the tokenizer is handed to
# apply_chat_template through fn_kwargs.
train_dataset = dataset.map(
    apply_chat_template,
    fn_kwargs=dict(tokenizer=tokenizer),
)

In [7]:
import random

# Spot-check one randomly chosen example of the chat-formatted dataset.
sample_indices = random.sample(range(len(train_dataset)), 1)
for index in sample_indices:
    print(f"Sample {index} of the processed training set:\n\n{train_dataset[index]['text']}")

Sample 174456 of the processed training set:

<|system|>
</s>
<|user|>
Write a biting and humorous satire that lampoons the excessive snobbery and self-importance of the contemporary restaurant scene, poking fun at everything from complicated menus and pretentious food descriptions to the haughty attitudes of chefs and servers. Use exaggerated characters, tongue-in-cheek language, and clever irony to skewer the ridiculousness of an industry that takes itself too seriously and alienates customers with its elitist airs. Focus on witty observations and clever parodies of common restaurant tropes, such as foodie culture, farm-to-table trends, and molecular gastronomy. Above all, aim to entertain and amuse with your clever skewering of an all-too familiar target.</s>
<|assistant|>
As the maitre d' led me to my seat at the latest trendy restaurant, he gave me a disdainful glance that made me feel like a peasant showing up to a royal banquet uninvited. With a menu that looked like a Shakespea

In [33]:
import torch
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
from trl.trainer.utils import ConstantLengthDataset

# Pack the rendered "text" column into sequences of seq_length tokens.
packed_dataset = ConstantLengthDataset(
    tokenizer,
    train_dataset,
    dataset_text_field="text",
    formatting_func=None,
    seq_length=2048,
    infinite=False,
    num_of_sequences=1024,
    chars_per_token=3.6,
    eos_token_id=tokenizer.eos_token_id,
)

# Sanity check: report whether the packed dataset is an IterableDataset.
print(isinstance(packed_dataset, torch.utils.data.IterableDataset))

# Batch the packed sequences, collating with the tokenizer's padding collator.
collator = DataCollatorWithPadding(tokenizer)
dataloader_params = dict(batch_size=16, collate_fn=collator)
dl = DataLoader(packed_dataset, **dataloader_params)

True


In [34]:
# Start iterating the packed DataLoader built in the previous cell.
it = iter(dl)

In [35]:
# Pull the first packed batch.
batch = next(it)

In [36]:
# Display the shape of the "input_ids" tensor; with batch_size=16 and
# seq_length=2048 configured above, this is where to confirm both took effect.
batch["input_ids"].shape

torch.Size([16, 2048])

In [37]:
# Decode the packed batch back to text and show row 6.
decoded_rows = tokenizer.batch_decode(batch["input_ids"])
print(decoded_rows[6])

understanding. Utilize sensory exploration, through water play, sandboxes, and manipulatives to keep student's minds strong and engaged.

5. Encourage Questions and Discussion: Encourage children to ask questions and discuss lessons. By asking questions, children can solidify their understanding of the topic and the discussion promotes communication and analysis of new ideas—which is vital to their education.

6. Have a Schedule: Have a basic schedule for each day covering major topics and activities to allow for maximum use of available time. Be flexible, but always have a lesson plan at hand, to maximize time and ensure that your students are mindful.

Remember, lesson plans can change for multiple reasons, but always keep it engaging, interactive, and fun as the intellectual and emotional growth of preschoolers are reliant on them having a truly engaging educational experience.</s> 
<|user|>
Thanks for the tips on how to plan creative lessons and build relationships with parents and

### Old Chat Dataloader

In [None]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from scripts.chat_dataloader import build_chat_dataloader



In [None]:
from omegaconf import OmegaConf as om
from llmfoundry.utils.config_utils import pop_config
from llmfoundry.utils.builders import build_tokenizer

# Read the run's YAML config and resolve its interpolations in place.
yaml_path = "/data/yamls/run2.yaml"
with open(yaml_path) as yaml_file:
    cfg = om.load(yaml_file)
om.resolve(cfg)

# Pull out the sections needed to build the tokenizer and the dataloader.
train_loader_config = pop_config(cfg, "train_loader", must_exist=True)
tokenizer_config = pop_config(cfg, "tokenizer", must_exist=True)

# Build the tokenizer named in the config, forwarding its optional kwargs.
tokenizer_name = tokenizer_config["name"]
tokenizer_kwargs = tokenizer_config.get("kwargs", {})
tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs)

In [None]:
# Display the tokenizer's special-token mapping as the cell output.
tokenizer.special_tokens_map

{'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}

In [None]:
# Pop the per-device train batch size from the config; it is passed to
# build_dataloader in a later cell.
device_train_batch_size = pop_config(cfg, 'device_train_batch_size', must_exist=True)

In [None]:
# Change the kernel's working directory to ./scripts.
# NOTE(review): the path is relative, so re-running this cell will not land in
# the same directory twice.
%cd scripts

/scripts


In [None]:
from scripts.train import build_dataloader

# Build the training dataloader from the config section, tokenizer and
# per-device batch size prepared in the cells above.
dataloader = build_dataloader(train_loader_config, tokenizer, device_train_batch_size)

Map (num_proc=32):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# Show the last 10 token ids of row `idx`.
# NOTE(review): `input_ids` and `idx` are only assigned in a later cell of
# this section, so this cell depends on that cell having been run first.
print(input_ids[idx,-10:])

tensor([21733,   798,  4893,   297, 22303, 12638, 28723,     2, 28705,    13])


In [None]:
# Show the last 10 labels of row `idx`, for comparison with the token ids.
# NOTE(review): `labels` and `idx` are only assigned in a later cell of this
# section, so this cell depends on that cell having been run first.
print(labels[idx,-10:])

tensor([21733,   798,  4893,   297, 22303, 12638, 28723,     2, 28705,    13])


In [None]:
# Take one batch from a fresh iterator and unpack the tensors inspected below.
batch = next(iter(dataloader.dataloader))
input_ids, attention_mask, labels = (
    batch[key] for key in ("input_ids", "attention_mask", "labels")
)

num_tokens = 2048
idx = 3

# Print the last `num_tokens` ids of row `idx`, raw and decoded.
tail_ids = input_ids[idx, -num_tokens:]
print(tail_ids)
print(tokenizer.decode(tail_ids))

# Walk the row from its last position backwards, printing each token id next
# to its label; positions labelled -100 get a trailing arrow marker.
# NOTE(review): -100 is presumably the loss-ignore label — confirm.
for i in range(1, num_tokens + 1):
    input_id, label = input_ids[idx, -i], labels[idx, -i]
    if label != -100:
        print(f"{i}: {input_id} // {label}")
    else:
        print(f"{i}: {input_id} // {label}          <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<")

tensor([    1,     1,     1,  ...,     2, 28705,    13])
<s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s><s> <|system|>
You are a friendly chatbot</s> 
<|user|>
Examine the various ways in which political patronage has affected the quality and effectiveness of public services, including but not limited to issues related to accountability, efficiency, and democratic governance. Consider the historical and contemporary contexts in which patronage has operated, and assess the potential outcomes of different policy interventions aimed at minimizing its negative effects. Back up your analysis with empirical evidence, case studies, and relevant research literature.</s> 
<|assistant|>
Political patronage is the practice of awarding individuals with

In [None]:
# Take the labels of one batch from a fresh iterator.
labels = next(iter(dataloader.dataloader))["labels"]

# For every row of the batch, print a banner and the row's last 200 labels.
for row_idx, row_labels in enumerate(labels):
    print(f"\n---------------------- ROW IDX = {row_idx} ----------------------")
    print(row_labels[-200:])


---------------------- ROW IDX = 0 ----------------------
tensor([ 9158, 22597,  7649,   852,   298,  9467,  2421, 28725,   304,   736,
          460,  2856,  1581,  4514,   302,  7062, 11715,  3427,   304,  9773,
        28723,  2909,   302,   272,  1080,  1162, 28733,  4717,   281,  2233,
         3024,  5747,  2576,   385, 28725,   446,   282,   314,   270,   753,
          385, 28725,   304,   268,  2262, 10270, 28723,  2957,   281,  2233,
          460,  9566,  7885,   438,  6887, 23127, 28725, 12649, 24363, 28725,
          304,   799,  5903,   697, 28723, 20438,   835,   659,  2856,  9158,
        17937,   369,   460,  4842,   298,   272,  2939, 28725,  2490,   272,
        16720, 28764,   280,  3430, 28725,   690,   349,   264,  1423,   286,
        10623,  3684,   298,   264,  9915, 27545, 28725,   304,   272,   261,
        28764,   423,   293, 28725,   690,   349,   264,  7000,  2751,   302,
          272, 16720, 28764,   280,  3430, 28723,   560,  4518,   298,  7062,
     