<a target="_blank" href="https://colab.research.google.com/github/shaankhosla/optimizingllms/blob/main/notebooks/Streaming_Datasets.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

Use CPU

In [1]:
%%capture
!pip3 install transformers
import os
import json
import pandas as pd

In [2]:
import sys


def sizeof_fmt(num, suffix="B"):
    """by Fred Cirera,  https://stackoverflow.com/a/1094933/1870254, modified"""
    for unit in ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"]:
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f %s%s" % (num, "Yi", suffix)

In [3]:
!git clone https://github.com/shaankhosla/optimizingllms.git

Cloning into 'optimizingllms'...
remote: Enumerating objects: 99, done.[K
remote: Counting objects: 100% (99/99), done.[K
remote: Compressing objects: 100% (69/69), done.[K
remote: Total 99 (delta 47), reused 67 (delta 24), pack-reused 0[K
Receiving objects: 100% (99/99), 5.31 MiB | 28.74 MiB/s, done.
Resolving deltas: 100% (47/47), done.


In [4]:
!python3 optimizingllms/code/generate_data.py

Generating synthetic dataset (10000 train, 2000 val)...
  0% 0/12000 [00:00<?, ?it/s]                             

In [5]:
!cat data/train/0.json

{"number": 3774217, "words": "three million seven hundred seventy four thousand two hundred seventeen"}

In [6]:
!cat data/train/2.json

{"number": 1548, "words": "one thousand five hundred forty eight"}

In [7]:
df = []
for f in os.listdir("./data/train"):
    with open(os.path.join("./data/train", f)) as f:
        df.append(json.load(f))

df = pd.DataFrame(df)

In [8]:
for name, size in sorted(
    ((name, sys.getsizeof(value)) for name, value in list(locals().items())),
    key=lambda x: -x[1],
)[:10]:
    print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))

                            df:  1.1 MiB
                           _i2:  390.0 B
                           _i8:  243.0 B
                             f:  208.0 B
                            _i:  201.0 B
                           _i7:  201.0 B
                           _ih:  184.0 B
                            In:  184.0 B
                    sizeof_fmt:  144.0 B
                           _i1:  127.0 B


In [9]:
from torch.utils.data import Dataset
import torch
from transformers import AutoTokenizer

In [10]:
class StreamingDataset(Dataset):
    def __init__(self, path, model_name):
        self.path = path
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

    def __len__(self):
        return len(os.listdir(self.path))

    def encode_text(self, text_input, text_ouput):
        inputs = self.tokenizer(
            text_input,
            max_length=16,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        labels = self.tokenizer(
            text_ouput,
            max_length=16,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        ).input_ids[0]
        input_ids = inputs["input_ids"][0]
        attention_mask = inputs["attention_mask"][0]
        labels = torch.tensor([label if label != 0 else -100 for label in labels])
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

    def __getitem__(self, idx):
        file_path = os.path.join(self.path, str(idx) + ".json")
        with open(file_path, "r") as infile:
            data = json.load(infile)
        number, words = str(data["number"]), data["words"]
        return self.encode_text(number, words)

In [11]:
train_data = StreamingDataset("./data/train/", "t5-small")

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [12]:
input_dict = train_data[0]

In [13]:
input_dict["input_ids"]

tensor([ 220, 4013, 4165, 2517,    1,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0])

In [14]:
input_dict["attention_mask"]

tensor([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])



`label` represents the correct token that the autoregressive model should be predicting and -100 is mask token:


In [15]:
input_dict["labels"]

tensor([  386,   770,  2391,  6189,  2391,    17,    63,   662,  7863,   192,
         6189, 30552,     1,  -100,  -100,  -100])

In [16]:
from torch.utils.data import default_collate
from torch.utils.data import DataLoader

In [17]:
train_dataloader = DataLoader(
    train_data,
    batch_size=4,
    num_workers=4,
    pin_memory=True,
    collate_fn=default_collate,
    prefetch_factor=50,
)



In [18]:
for x in train_dataloader:
    print(x)
    break

{'input_ids': tensor([[ 220, 4013, 4165, 2517,    1,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0],
        [ 335, 5062, 2773,    1,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0],
        [ 627, 3707,    1,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0],
        [ 505, 3449,    1,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'labels': tensor([[  386,   770,  2391,  6189,  2391,    17,    63,   662,  7863,   192,
          6189, 30552,     1,  -100,  -100,  -100],
        [   80,  6189,   874,  7863,   662,  6189,  6786,   386,     1,  -100,
          -100,  -100,  -100,  -100,  -100,  -100],


In [19]:
for name, size in sorted(
    ((name, sys.getsizeof(value)) for name, value in list(locals().items())),
    key=lambda x: -x[1],
)[:10]:
    print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))

                            df:  1.1 MiB
                    DataLoader:  1.4 KiB
                          _i10:  1.3 KiB
                       Dataset:  1.0 KiB
                 AutoTokenizer:  1.0 KiB
              StreamingDataset:  1.0 KiB
                           _i2:  390.0 B
                           _ih:  248.0 B
                            In:  248.0 B
                           _i8:  243.0 B
