In [87]:
#| default_exp train.load_data

We don't have the full dataset; however, the partial dataset should be fine to prototype on. We need to work out exactly how the data should be set up.

For our ingredients we have the weight (which we're ignoring for now) and the molecule_df id. We need to quickly adjust the molecule_db matching so that it matches directly with its tokens.

# Setup

In [2]:
#| export
from pyprojroot import here
root = here()
import sys
sys.path.append(str(root))

In [3]:
#| export
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from operator import itemgetter
import random
from itertools import islice

In [4]:
food_ids = pd.read_feather(f'{root}/../data/local/molecule/full/food_ids/0_primary_join.feather')

In [5]:
#| export 
food_compounds_df = pd.read_feather(f'{root}/../data/local/molecule/full/food_compounds/0.feather')

# Creating special tokens

- `[PAD]` - vector of zeros
- `[MASK]` - vector of ones?
- `[UNKNOWN]` - random or mean vector?

#TODO: Look into how to initialise these.

For easiest use, it would be nice if we could index these from the beginning.

In [6]:
#| export
# Row index of each special token in the food-compound matrix. Real food ids
# are shifted up by len(special_token_idxs) to make room for these rows.
# Exported because MaskedRecipeDataset (also exported) reads this mapping.
special_token_idxs = {
    'pad': 0, 'mask': 1, 'unknown': 2,
}

In [7]:
food_compounds_df.astype('float')

source_id,108,136,236,237,247,248,249,307,312,313,...,122183,122207,122257,122370,122486,122510,123210,125087,125196,125197
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.109333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,23.49215,9.498727,2.253628,9.376545,9.376545,24.419212,0.023016,0.0,0.0,0.000000
1,0.0,0.0,0.109333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,23.49215,9.498727,2.253628,9.376545,9.376545,24.419212,0.023016,0.0,0.0,1.223189
2,0.0,0.0,0.109333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,23.49215,9.498727,2.253628,9.376545,9.376545,24.419212,0.023016,0.0,0.0,0.000000
3,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000
4,0.0,0.0,0.109333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,23.49215,9.498727,2.253628,9.376545,9.376545,24.419212,0.023016,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1101,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,23.49215,9.498727,2.253628,9.376545,9.376545,24.419212,0.023016,0.0,0.0,0.000000
1102,0.0,0.0,0.109333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,23.49215,9.498727,2.253628,9.376545,9.376545,24.419212,0.023016,0.0,0.0,0.000000
1103,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000
1104,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,23.49215,9.498727,2.253628,9.376545,9.376545,24.419212,0.023016,0.0,0.0,0.000000


In [8]:
food_compounds = food_compounds_df.astype('float').to_numpy()
food_compounds.shape

(1106, 713)

In [9]:
# One row per special token, stacked in the same order as special_token_idxs.
special_tokens = np.vstack([
    np.zeros([1, food_compounds.shape[1]]),  # [PAD]: all zeros
    np.ones([1, food_compounds.shape[1]]),   # [MASK]: all ones
    food_compounds.mean(axis=0),             # [UNKNOWN]: per-feature mean; vstack promotes it to a row
])
special_tokens, special_tokens.shape

(array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [1.00000000e+00, 1.00000000e+00, 1.00000000e+00, ...,
         1.00000000e+00, 1.00000000e+00, 1.00000000e+00],
        [1.03842678e-02, 3.41922851e-01, 3.79929178e-02, ...,
         5.36278835e-02, 5.80837997e-04, 6.96753373e-02]]),
 (3, 713))

In [10]:
food_compounds.mean(axis=0).shape

(713,)

In [11]:
# Prepend the special-token rows so they occupy indices 0..len(special_tokens)-1.
# The food rows are rebuilt from the source frame rather than from
# `food_compounds` itself, so re-running this cell cannot prepend the
# special tokens a second time.
food_compounds = np.concatenate(
    [special_tokens, food_compounds_df.astype('float').to_numpy()],
    axis=0,
)
food_compounds.shape

(1109, 713)

In [12]:
food_compounds[:5]

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.00000000e+00, 1.00000000e+00, 1.00000000e+00, ...,
        1.00000000e+00, 1.00000000e+00, 1.00000000e+00],
       [1.03842678e-02, 3.41922851e-01, 3.79929178e-02, ...,
        5.36278835e-02, 5.80837997e-04, 6.96753373e-02],
       [0.00000000e+00, 0.00000000e+00, 1.09333334e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 1.09333334e-01, ...,
        0.00000000e+00, 0.00000000e+00, 1.22318925e+00]])

In [13]:
food_compounds.dtype

dtype('float64')

In [14]:
# Persist the compound matrix (special-token rows first, then the food rows).
# NOTE(review): this path is relative to the current working directory, unlike the
# `root`-anchored paths used when reading the inputs — confirm both resolve to the same data dir.
np.save('../../data/local/final/full/food_compounds/0.npy', food_compounds)

# Creating Recipe Data

In [15]:
# Shift every known food id past the special-token rows that were prepended
# to the compound matrix. Missing ids (NaN) stay NaN because NaN + k is NaN,
# so no per-element null check (or Python-level apply) is needed.
food_ids[0] = food_ids[0] + len(special_token_idxs)

In [16]:
# Name the id column, map ingredients without a match to the [UNKNOWN] token,
# then cast to integers now that no NaNs remain.
food_ids = (
    food_ids
    .rename(columns={0: 'food_id'})
    .fillna(special_token_idxs['unknown'])
    .astype('int')
)

In [17]:
food_ids

Unnamed: 0_level_0,Unnamed: 1_level_0,food_id
recipe,ingredient,Unnamed: 2_level_1
0,0,812
0,1,973
0,2,272
0,3,1040
0,4,808
...,...,...
204312,4,742
204312,5,974
204313,0,882
204313,1,780


In [18]:
# Maximum number of ingredient slots kept per recipe; longer recipes are truncated.
MAX_RECIPE_SIZE = 15

# One row per recipe, one column per ingredient slot. Recipes shorter than the
# longest one are right-filled with NaN by the DataFrame constructor.
recipe_food_ids = (
    pd.DataFrame(food_ids.groupby('recipe')['food_id'].aggregate(list).tolist())
    .iloc[:, :MAX_RECIPE_SIZE]
)
recipe_food_ids

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,812,973.0,272.0,1040.0,808.0,339.0,,,,,,,,,
1,618,417.0,837.0,810.0,,,,,,,,,,,
2,286,773.0,808.0,847.0,807.0,59.0,,,,,,,,,
3,417,417.0,837.0,1015.0,773.0,,,,,,,,,,
4,808,960.0,808.0,812.0,892.0,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204301,1028,742.0,1021.0,1031.0,821.0,810.0,776.0,330.0,178.0,,,,,,
204302,2,59.0,1031.0,169.0,242.0,806.0,806.0,675.0,,,,,,,
204303,286,807.0,776.0,742.0,286.0,773.0,874.0,,,,,,,,
204304,2,830.0,812.0,807.0,742.0,974.0,,,,,,,,,


In [19]:
# Scratch check: position of 'mask' in an ordered list of the special tokens.
# NOTE(review): `special_token_ids` is not referenced by any later cell shown here;
# `special_token_idxs` is the mapping the other cells use.
special_token_ids = ['pad','mask','unknown']
special_token_ids.index('mask')

1

In [20]:
# Empty ingredient slots become [PAD]; the frame is then cast to integers
# and converted to a dense NumPy array.
recipe_food_ids = (
    recipe_food_ids
    .fillna(special_token_idxs['pad'])
    .astype('int')
    .to_numpy()
)

recipe_food_ids

array([[812, 973, 272, ...,   0,   0,   0],
       [618, 417, 837, ...,   0,   0,   0],
       [286, 773, 808, ...,   0,   0,   0],
       ...,
       [286, 807, 776, ...,   0,   0,   0],
       [  2, 830, 812, ...,   0,   0,   0],
       [882, 780, 122, ...,   0,   0,   0]])

In [21]:
recipe_food_ids.dtype

dtype('int64')

#TODO: decide on best recipe size.

In [22]:
# Persist the padded recipe/token-id matrix built above.
# NOTE(review): this path is relative to the current working directory, unlike the
# `root`-anchored paths used when reading the inputs — confirm both resolve to the same data dir.
np.save('../../data/local/final/full/recipe_food_ids/0.npy', recipe_food_ids)

# PyTorch Dataset & Dataloader

Here each batch will have `batch_size` recipes. Each of these recipes will be fed into the model as:
- x: food_ids with 15% of their foods masked with a `[MASK]` token. 
- y: food_ids without mask

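The dataset itself yields token ids of size `batch_size` x `recipe_size`; once each id is replaced by its compound vector this becomes a tensor of size `batch_size` x `recipe_size` x `n_features`.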

Problems:

- Create numerous mask elements of each recipe.
    - Will this cause overfit (data leakage -> feeding the model the answer)?
    - Overfitting on large recipes (more possible mask combinations)?
- Decision on where to get vectors.
    - It's much easier to work with a dataset of tokens rather than vectors - so possibly look the vectors up in the model itself?

Now we want to create a dataset of recipes with masked ingredients. How can we do this without creating duplicates?

In [31]:
#| export
class MaskedRecipeDataset(Dataset):
    """Dataset of padded recipes yielding ``(masked_recipe, recipe)`` pairs.

    Each item is a pair of ``torch.int`` tensors of token ids: the first has a
    random subset of its ingredient positions replaced by the mask token, the
    second is the untouched recipe.

    Args:
        recipes: Indexable collection of equal-length token-id sequences;
            indexing with an int must return a single recipe. Padding must be
            trailing: the first ``k`` positions are treated as the
            ingredients, where ``k`` is the number of non-pad tokens.
        pad_idx: Token id used for padding. Defaults to 0, the value of
            ``special_token_idxs['pad']`` in this notebook.
        mask_idx: Token id written into masked positions. Defaults to 1, the
            value of ``special_token_idxs['mask']`` in this notebook.
        mask_prob: Independent probability of masking each ingredient.

    Raises:
        ValueError: If ``mask_prob`` is not in ``(0, 1]`` (a probability of 0
            could never produce the required non-empty mask).
    """

    def __init__(self, recipes, pad_idx=0, mask_idx=1, mask_prob=0.15):
        if not 0 < mask_prob <= 1:
            raise ValueError(f'mask_prob must be in (0, 1], got {mask_prob}')
        self.recipes = recipes
        # Stored on the instance so the class does not depend on a notebook
        # global once it has been exported to a module.
        self.pad_idx = pad_idx
        self.mask_idx = mask_idx
        self.mask_prob = mask_prob

    def __len__(self):
        """Number of recipes (one sample per recipe)."""
        return len(self.recipes)

    def create_recipe_mask(self, recipe):
        """Mask at least one ingredient of ``recipe`` in place and return it.

        Args:
            recipe: 1-D tensor of token ids with trailing padding. It is
                modified in place, so pass a clone to keep the original.

        Returns:
            The same tensor, with every selected ingredient position set to
            ``mask_idx``. A recipe made only of padding is returned unchanged.
        """
        recipe_size = int((recipe != self.pad_idx).sum())
        if recipe_size == 0:
            # Nothing to mask; without this guard the resampling loop below
            # would never terminate.
            return recipe
        mask = torch.rand(recipe_size) < self.mask_prob
        # Resample until at least one ingredient is selected so that every
        # sample carries a prediction target.
        while not mask.any():
            mask = torch.rand(recipe_size) < self.mask_prob
        # Slicing returns a view, so this assignment writes through to `recipe`.
        recipe[:recipe_size][mask] = self.mask_idx
        return recipe

    def __getitem__(self, idx):
        """Return ``(masked_recipe, recipe)`` for the recipe at ``idx``."""
        # Tensors expose `tolist()`; there is no `to_list()` method.
        if torch.is_tensor(idx): idx = idx.tolist()
        recipe = torch.tensor(self.recipes[idx], dtype=torch.int)
        # Mask a copy so the target keeps the original ids.
        return self.create_recipe_mask(recipe.clone()), recipe

In [32]:
dataset = MaskedRecipeDataset(recipe_food_ids)
dataset[0]

(tensor([1107, 1025, 1107,  667,  670,  633, 1107,  669,  670,  586, 1107,  105,
         1106, 1106, 1106], dtype=torch.int32),
 tensor([ 825, 1025,  666,  667,  670,  633,  195,  669,  670,  586,  901,  105,
         1106, 1106, 1106], dtype=torch.int32))

In [33]:
assert special_token_idxs['mask'] in dataset[0][0] and special_token_idxs['mask'] not in dataset[0][1]

In [34]:
dataloader = DataLoader(dataset, batch_size=10, shuffle=True, num_workers=10)

In [35]:
xb, yb = next(iter(dataloader))
xb.shape, yb.shape, xb[0], yb[0]

(torch.Size([10, 15]),
 torch.Size([10, 15]),
 tensor([1107,  669,  995,  131,  962,   13, 1106, 1106, 1106, 1106, 1106, 1106,
         1106, 1106, 1106], dtype=torch.int32),
 tensor([ 664,  669,  995,  131,  962,   13, 1106, 1106, 1106, 1106, 1106, 1106,
         1106, 1106, 1106], dtype=torch.int32))

In [36]:
from nbdev import nbdev_export; nbdev_export()