In [1]:
from recipe_nlg import RecipeNLGDataset, TokenizedRecipeNLGDataset
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader
import kagglehub

BATCH_SIZE = 16

path = kagglehub.dataset_download("paultimothymooney/recipenlg")
# Load the dataset
df = pd.read_csv(path + "/RecipeNLG_dataset.csv", header=0)
# Create an instance of the RecipeNLGDataset class



In [2]:
from pathlib import Path
from transformers import PreTrainedTokenizerFast

tokenizer_path = Path("Tokenizers/title_to_all_tokenizer")
print("Loading tokenizer")
hf_tokenizer = PreTrainedTokenizerFast.from_pretrained(tokenizer_path, model_max_lenth=512)

Loading tokenizer


In [3]:
# 'all' is default mode
data = RecipeNLGDataset(df)

In [4]:
recipe_batch = data.recipe_strings[:16]

tokenized_batch = hf_tokenizer(
    text=recipe_batch.tolist(),
    padding='max_length',
    truncation=True,
    max_length=512,
    return_tensors='pt'
)

tokenized_batch


{'input_ids': tensor([[ 1146,    12,   300,  ...,     0,     0,     0],
        [13024,    66,   842,  ...,     0,     0,     0],
        [ 1425,   442,     2,  ...,     0,     0,     0],
        ...,
        [ 1767,   610,   679,  ...,     0,     0,     0],
        [ 1725,  1183,     9,  ...,     0,     0,     0],
        [  327,  1604,   348,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [5]:
proper_format = {
    "input_ids": tokenized_batch["input_ids"],
    "attention_mask": tokenized_batch["attention_mask"],
    "labels": tokenized_batch["input_ids"]
    # etc.
}

print(proper_format["attention_mask"].shape)
proper_format["labels"].shape


torch.Size([16, 512])


torch.Size([16, 512])

In [6]:
from datasets import Dataset


hf_ds = Dataset.from_dict({
    k: v.numpy()  # Datasets accepts numpy arrays
    for k, v in proper_format.items()
})

print(hf_ds)




Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 16
})


In [7]:
import torch
# Custom collate function
def collate_fn(batch):
    # Convert each field in the batch to a PyTorch tensor
    return {
        key: torch.stack([torch.tensor(item[key]) for item in batch])
        for key in batch[0]
    }

loader = DataLoader(hf_ds, batch_size=4, shuffle=True, collate_fn=collate_fn)

for i, batch in enumerate(loader):
    print(batch['input_ids'].shape) # (batch size, max_length) -> this works

torch.Size([4, 512])
torch.Size([4, 512])
torch.Size([4, 512])
torch.Size([4, 512])


In [8]:
# ensure decoding works
first_batch = next(iter(loader))
first_example = first_batch['input_ids'][0]
first_labels = first_batch['labels'][0]
print(hf_tokenizer.decode(first_example))
first_example.shape

quick barbecue wings <end_title> chicken wings ( as many as you need for dinner ) , flour , barbecue sauce ( your choice ) <end_ingredients> clean wings . flour and fry until done . place fried chicken wings in microwave bowl . stir in barbecue sauce . microwave on high ( stir once ) for 4 minutes . <end> [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PA

torch.Size([512])

In [9]:
import torch.nn as nn
from models import NextByteTransformer

test_input = first_example.unsqueeze(0)

vocab_size = len(hf_tokenizer.get_vocab())
d_model = 66
context_length = 512

next_byte = NextByteTransformer(
    d_model=d_model,
    vocab_size=vocab_size,
    context_length=context_length,
    # each head handles half of embedding context
    num_heads=2, 
    num_hidden_layers=2, 
    d_hidden=2048, 
    num_decoders=6)

test_input.shape




torch.Size([1, 512])

In [10]:
logits = next_byte(test_input)
logits.shape

torch.Size([1, 512, 20000])

In [21]:

first_token_id = test_input[0][0]
first_pred_id = torch.argmax(logits[0][0])
# first_pred_idx = torch.argmax(logits[0])
# first_token_id = test_input[1][0]

print(f"first token was: {hf_tokenizer.decode(first_token_id)}")
print(f"pred next token was: {hf_tokenizer.decode(first_pred_id)}")

first token was: quick
pred next token was: ##allo
