In [37]:
%%capture
# %pip (unlike !pip3) installs into the environment of the running kernel,
# so the package is importable from this notebook afterwards.
%pip install transformers
import os
import json
import pandas as pd

In [40]:
import sys


def sizeof_fmt(num, suffix="B"):
    """Render a byte count as a human-readable string with binary prefixes.

    Adapted from Fred Cirera's answer at
    https://stackoverflow.com/a/1094933/1870254.

    Args:
        num: Size to format (negative values are allowed).
        suffix: Unit suffix appended after the binary prefix.

    Returns:
        A string such as ``"1.1 MiB"``; values of 1024**8 and above are
        expressed in ``Yi`` units.
    """
    prefixes = ("", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi")
    value = num
    position = 0
    while position < len(prefixes):
        # Stop at the first prefix for which the magnitude fits below 1024.
        if abs(value) < 1024.0:
            return "%3.1f %s%s" % (value, prefixes[position], suffix)
        value = value / 1024.0
        position += 1
    # Anything still >= 1024 after "Zi" is reported in the largest unit.
    return "%.1f %s%s" % (value, "Yi", suffix)

In [None]:
!git clone https://github.com/shaankhosla/optimizingllms.git

In [6]:
!python3 optimizingllms/code/generate_data.py

Generating synthetic dataset (10000 train, 2000 val)...
  0% 0/12000 [00:00<?, ?it/s]                             

In [7]:
!cat data/train/0.json

{"number": 158540, "words": "one hundred fifty eight thousand five hundred forty"}

In [8]:
!cat data/train/2.json

{"number": -7692452, "words": "negative seven million six hundred ninety two thousand four hundred fifty two"}

In [39]:
def load_json_records(directory):
    """Parse every ``*.json`` file in ``directory`` and return the results.

    Args:
        directory: Folder containing one JSON document per file.

    Returns:
        A list with one parsed JSON object per file, ordered by file name.
    """
    records = []
    # os.listdir returns entries in arbitrary order; sort for reproducible rows.
    for file_name in sorted(os.listdir(directory)):
        # Skip anything that is not a sample file (e.g. .ipynb_checkpoints).
        if not file_name.endswith(".json"):
            continue
        with open(os.path.join(directory, file_name), encoding="utf-8") as infile:
            records.append(json.load(infile))
    return records


# Eagerly load the whole training set into one DataFrame. The intermediate
# list lives only inside the helper, so it is not kept in the notebook namespace.
df = pd.DataFrame(load_json_records("./data/train"))

In [41]:
# Print the ten largest objects in the current namespace, biggest first,
# using sys.getsizeof as the size measure.
for name, size in sorted(
    [(key, sys.getsizeof(obj)) for key, obj in list(locals().items())],
    key=lambda entry: entry[1],
    reverse=True,
)[:10]:
    print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))

                            df:  1.1 MiB
                    DataLoader:  1.4 KiB
                          _i12:  1.3 KiB
                          _i14:  1.3 KiB
                       Dataset:  1.0 KiB
                 AutoTokenizer:  1.0 KiB
              StreamingDataset:  1.0 KiB
                           _ih:  472.0 B
                            In:  472.0 B
                            _i:  382.0 B


In [13]:
from torch.utils.data import Dataset
import torch
from transformers import AutoTokenizer

In [14]:
class StreamingDataset(Dataset):
    """Map-style dataset that reads and tokenizes one JSON sample per access.

    Sample ``idx`` is read from ``<path>/<idx>.json``, which must contain the
    keys ``"number"`` and ``"words"``. Only the requested file is opened on
    each lookup; nothing is cached on the instance.

    Args:
        path: Directory holding the ``<idx>.json`` sample files.
        model_name: Name or path passed to ``AutoTokenizer.from_pretrained``.
        max_length: Fixed token length that both inputs and labels are padded
            or truncated to. Defaults to 16.
    """

    def __init__(self, path, model_name, max_length=16):
        self.path = path
        self.max_length = max_length
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

    def __len__(self):
        """Return the number of ``*.json`` sample files currently in ``path``."""
        # Count sample files only, so stray entries (e.g. .ipynb_checkpoints)
        # do not produce indices that have no backing file.
        return sum(1 for entry in os.listdir(self.path) if entry.endswith(".json"))

    def encode_text(self, text_input, text_ouput):
        """Tokenize a source/target pair into fixed-length tensors.

        Args:
            text_input: Source text given to the model.
            text_ouput: Target text the model should produce (the parameter
                name is kept as-is for backward compatibility).

        Returns:
            A dict with 1-D tensors ``"input_ids"``, ``"attention_mask"`` and
            ``"labels"``, each of length ``max_length``. Padding positions in
            ``"labels"`` are replaced with -100.
        """
        tokenizer_kwargs = dict(
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        inputs = self.tokenizer(text_input, **tokenizer_kwargs)
        # The tokenizer returns a batch of one sequence; [0] drops the batch axis.
        labels = self.tokenizer(text_ouput, **tokenizer_kwargs).input_ids[0]
        input_ids = inputs["input_ids"][0]
        attention_mask = inputs["attention_mask"][0]
        # Replace padding with -100 in one vectorized step, using the
        # tokenizer's own pad id rather than assuming it is 0.
        labels = labels.masked_fill(labels == self.tokenizer.pad_token_id, -100)
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

    def __getitem__(self, idx):
        """Load sample ``idx`` from disk and return its encoded tensors.

        Raises:
            IndexError: If ``<path>/<idx>.json`` does not exist. Raising
                IndexError (rather than FileNotFoundError) lets plain
                ``for sample in dataset`` iteration terminate normally.
        """
        file_path = os.path.join(self.path, str(idx) + ".json")
        try:
            with open(file_path, "r", encoding="utf-8") as infile:
                data = json.load(infile)
        except FileNotFoundError as exc:
            raise IndexError(
                f"no sample file for index {idx!r}: {file_path}"
            ) from exc
        number, words = str(data["number"]), data["words"]
        return self.encode_text(number, words)

In [16]:
# Build the on-demand training dataset over ./data/train/; the constructor
# loads the "t5-small" tokenizer via AutoTokenizer.from_pretrained.
train_data = StreamingDataset("./data/train/", "t5-small")

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [24]:
# Index 0 maps to ./data/train/0.json, which is read and tokenized on access.
input_dict = train_data[0]

In [25]:
input_dict["input_ids"]

tensor([    3, 26556, 25379,     1,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0])

In [26]:
input_dict["attention_mask"]

tensor([1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])



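`labels` holds the target token ids that the model should learn to predict. Padding positions are replaced with -100, the value that PyTorch's cross-entropy loss ignores by default, so they do not contribute to the loss: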


In [27]:
input_dict["labels"]

tensor([   80,  6189, 18358,  2641,  7863,   874,  6189, 19662,     1,  -100,
         -100,  -100,  -100,  -100,  -100,  -100])

In [29]:
from torch.utils.data import default_collate
from torch.utils.data import DataLoader

In [30]:
# Wrap the on-demand dataset in a DataLoader that batches samples in
# background worker processes.
# NOTE(review): shuffle is left at its default, so batches are served in index
# order (the first batch starts with sample 0) — confirm this is intended for training.
train_dataloader = DataLoader(
    train_data,
    batch_size=4,
    num_workers=4,  # number of worker subprocesses used to load samples
    pin_memory=True,  # place batches in pinned (page-locked) host memory
    collate_fn=default_collate,  # PyTorch's standard collation, passed explicitly
    prefetch_factor=50,  # batches loaded in advance by each worker
)



In [33]:
# Pull a single batch to inspect the collated structure, then stop iterating.
for x in train_dataloader:
    print(x)
    break



{'input_ids': tensor([[    3, 26556, 25379,     1,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0],
        [    3,  4241, 17465,     1,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0],
        [    3,  6832,  3951,  2266,  5373,     1,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0],
        [    3,  4278,  4591,  4959,     1,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'labels': tensor([[   80,  6189, 18358,  2641,  7863,   874,  6189, 19662,     1,  -100,
          -100,  -100,  -100,  -100,  -100,  -100],
        [27757,    80,  7863,    80,     1,  -100,  -100,  -100,  -