In [2]:
from recipe_nlg import RecipeNLGDataset, TokenizedRecipeNLGDataset
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader
import kagglehub

BATCH_SIZE = 16

# Download the RecipeNLG dataset through kagglehub; the returned location is
# used below as the directory that contains the CSV.
path = kagglehub.dataset_download("paultimothymooney/recipenlg")
# Read the recipe CSV into a DataFrame, taking column names from the first row.
df = pd.read_csv(f"{path}/RecipeNLG_dataset.csv", header=0)
# Create an instance of the RecipeNLGDataset class



In [3]:
from pathlib import Path
from transformers import PreTrainedTokenizerFast

# Directory that holds the saved tokenizer files.
tokenizer_path = Path("title_to_all_tokenizer")
print("Loading tokenizer")
# Fix: the keyword was misspelled `model_max_lenth`, so it was not recognized as
# the tokenizer's `model_max_length` setting and the intended 512-token limit
# was never configured.
hf_tokenizer = PreTrainedTokenizerFast.from_pretrained(tokenizer_path, model_max_length=512)

Loading tokenizer


In [4]:
# Build the dataset wrapper from the loaded DataFrame. No mode is passed, so the
# class default is used (per the original note, that default is 'all').
# Later cells read `data.recipe_strings` from this object.
data = RecipeNLGDataset(df)

In [5]:
# Take the first BATCH_SIZE recipe strings as a small sample to tokenize
# (previously a hard-coded 16 that duplicated the BATCH_SIZE constant).
recipe_batch = data.recipe_strings[:BATCH_SIZE]

# Pad or truncate every recipe to exactly 512 tokens and return PyTorch tensors.
tokenized_batch = hf_tokenizer(
    text=recipe_batch.tolist(),
    padding='max_length',
    truncation=True,
    max_length=512,
    return_tensors='pt'
)

# Show the tokenizer output as the cell result.
tokenized_batch


{'input_ids': tensor([[ 1146,    12,   300,  ...,     0,     0,     0],
        [13024,    66,   842,  ...,     0,     0,     0],
        [ 1425,   442,     2,  ...,     0,     0,     0],
        ...,
        [ 1767,   610,   679,  ...,     0,     0,     0],
        [ 1725,  1183,     9,  ...,     0,     0,     0],
        [  327,  1604,   348,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [6]:
# Keep only the fields needed downstream. The tokenizer output is already
# (num_examples, max_length), so no reshaping is required. The previous
# `.squeeze(0)` was a no-op for 16 examples but would have dropped the example
# dimension if only a single example had been tokenized.
proper_format = {
    "input_ids": tokenized_batch["input_ids"],
    "attention_mask": tokenized_batch["attention_mask"],
    # etc.
}

print(proper_format["attention_mask"].shape)
proper_format["input_ids"].shape


torch.Size([16, 512])


torch.Size([16, 512])

In [7]:
from datasets import Dataset


# Convert each tensor column to a NumPy array (Datasets accepts numpy arrays),
# then build the Hugging Face dataset from the resulting column dict.
numpy_columns = {name: tensor.numpy() for name, tensor in proper_format.items()}
hf_ds = Dataset.from_dict(numpy_columns)

print(hf_ds)




Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 16
})


In [8]:
import torch
# Custom collate function
def collate_fn(batch):
    """Stack a list of per-example dicts into one dict of batched tensors.

    Args:
        batch: Non-empty sequence of mappings that all provide the keys of the
            first item. For a given key, every value must be convertible to a
            tensor and have the same shape across items.

    Returns:
        dict mapping each key of ``batch[0]`` to a tensor whose leading
        dimension is ``len(batch)``.

    Raises:
        IndexError: If ``batch`` is an empty list.
    """
    # as_tensor converts lists/arrays exactly like torch.tensor, but passes
    # existing tensors through instead of copy-constructing them (which makes
    # torch.tensor emit a UserWarning). torch.stack still returns a new tensor.
    return {
        key: torch.stack([torch.as_tensor(item[key]) for item in batch])
        for key in batch[0]
    }

# Serve the dataset in shuffled mini-batches of 4, using the custom collate
# function to turn each group of rows into batched tensors.
loader = DataLoader(hf_ds, batch_size=4, shuffle=True, collate_fn=collate_fn)

# The enumerate index was never used, so iterate over the loader directly.
for batch in loader:
    print(batch['input_ids'].shape) # (batch size, max_length) -> this works

torch.Size([4, 512])
torch.Size([4, 512])
torch.Size([4, 512])
torch.Size([4, 512])


In [9]:
# ensure decoding works
# Pull one batch from the loader; because the loader was created with
# shuffle=True, which examples land in it can differ between runs.
first_batch = next(iter(loader))
# First row of token ids in that batch.
first_example = first_batch['input_ids'][0]
# Decode the ids back to text as the cell output. The recorded output shows the
# padding added by padding='max_length' as trailing [PAD] tokens.
hf_tokenizer.decode(first_example)




'chicken funny <end_title> 1 large whole chicken , 2 ( 10 12 oz . ) cans chicken gravy , 1 ( 10 12 oz . ) can cream of mushroom soup , 1 ( 6 oz . ) box stove top stuffing , 4 oz . shredded cheese <end_ingredients> boil and debone chicken . put bite size pieces in average size square casserole dish . pour gravy and cream of mushroom soup over chicken ; level . make stuffing according to instructions on box ( do not make too moist ) . put stuffing on top of chicken and gravy ; level . sprinkle shredded cheese on top and bake at 350u00b0 for approximately 20 minutes or until golden and bubbly . <end> [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]

In [22]:
import torch.nn as nn

# Model hyperparameters
d_model = 66
context_length = 512
vocab_size = len(hf_tokenizer.get_vocab())

# Token ids of the batch drawn in the decoding check
input_ids = first_batch['input_ids']

# Lookup table with one d_model-wide row for every token in the vocabulary
embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

x = embedding(input_ids)
x.shape # (batchsize x seq_length x embed dim)

torch.Size([4, 512, 66])

In [3]:
import importlib
from PositionalEncoder import PositionalEncoder


# Build the positional encoder from the hyperparameters set in the embedding cell
# (`d_model`, `context_length`).
# NOTE(review): the recorded output for this cell is
# "NameError: name 'd_model' is not defined", and its execution count (3) is lower
# than the embedding cell's (22) — presumably it was run in a fresh kernel before
# that cell. Re-run the notebook top to bottom to confirm.
pe = PositionalEncoder(d_model=d_model, context_len=context_length)

# Inspect the shape of the encoder's `pe` attribute.
pe.pe.shape

NameError: name 'd_model' is not defined

In [None]:
# it gets through the positional encoder!
# Recompute from the embedding output instead of `x = pe(x)`: the self-referential
# form fed the already-encoded `x` back through the positional encoder on every
# re-run of this cell. The first-run result is unchanged.
x = pe(embedding(input_ids))

x.shape # shape (batch, context, dmodel)

torch.Size([4, 512, 66])

In [None]:
from models import DecoderBlock

# Instantiate a decoder block to check that construction works.
# NOTE(review): the recorded output is "TypeError: 'module' object is not callable.
# Did you mean: 'FFN.FFN(...)'?" — presumably something named FFN is imported as a
# module and then called like a class. TODO confirm in the `models` module.
test_decoder = DecoderBlock(num_heads=2, d_model=d_model, num_hidden_layers=2, d_hidden=2048)

TypeError: 'module' object is not callable. Did you mean: 'FFN.FFN(...)'?