Here we do the final processing required for the model inputs.

# Setup

In [1]:
#| default_exp final.process

In [2]:
#| export
from pyprojroot import here
root = here()
import sys
sys.path.append(str(root))

In [23]:
#| export
import pandas as pd
import numpy as np

from tqdm import tqdm
tqdm.pandas()

In [15]:
# Read each intermediate table and cast it straight away: nullable integers
# for the id table, floats for the compound embeddings.
food_ids = pd.read_feather(
    f'{root}/../data/local/molecule/partial/food_ids/1_na_filled.feather'
).astype('Int64')

food_embeddings = pd.read_feather(
    f'{root}/../data/local/molecule/full/food_compounds/0.feather'
).astype('float')

In [5]:
#| export
# Names of the reserved tokens. A token's position in this list is used as its
# id (see the `.index(...)` lookups), and the special-token embedding rows are
# stacked in this same order.
special_tokens = ['pad', 'mask', 'unknown']

# Food Embeddings

Here we already have our food compounds in a dataframe. All that is left to do is:

- Converting to numpy (efficiency)
- Adding special tokens for masked modelling

In [6]:
# From here on `food_embeddings` is a plain NumPy array rather than a DataFrame.
food_embeddings = food_embeddings.to_numpy()

In [7]:
# One embedding row per special token, in the order pad, mask, unknown:
# all zeros, all ones, and the per-feature mean of the food embeddings.
n_features = food_embeddings.shape[1]
pad_row = np.zeros([1, n_features])
mask_row = np.ones([1, n_features])
unknown_row = food_embeddings.mean(axis=0)

special_token_embeddings = np.vstack([pad_row, mask_row, unknown_row])
special_token_embeddings, special_token_embeddings.shape

(array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [1.00000000e+00, 1.00000000e+00, 1.00000000e+00, ...,
         1.00000000e+00, 1.00000000e+00, 1.00000000e+00],
        [1.03842678e-02, 3.41922851e-01, 3.79929178e-02, ...,
         5.36278835e-02, 5.80837997e-04, 6.96753373e-02]]),
 (3, 713))

In [8]:
# Special-token rows go first, so they occupy the lowest row indices and the
# food rows follow after them.
food_embeddings = np.concatenate([special_token_embeddings, food_embeddings], axis=0)
food_embeddings.shape

(1109, 713)

## Saving

In [9]:
# NOTE(review): this path is relative to the current working directory, while
# the inputs are read via `root` -- confirm both point at the same data folder.
np.save('../../data/local/final/full/food_compounds/0.npy', food_embeddings)

# Recipe Food Tokens

Here we want to format our data as a sequence of food token IDs for each recipe.

- Converting to numpy
- Handling NA values
- Grouping data by recipe

The special-token embeddings were prepended to the embedding array, which is more convenient when developing the model. The downside is that every food_id has to be shifted up by the number of special tokens.

In [24]:
#| export
def shift_food_ids(food_ids, special_tokens):
    """Offset food ids so they point past the special-token rows.

    The special-token embeddings sit at the start of the embedding array, so
    every real food id has to move up by the number of special tokens.

    Args:
        food_ids: Numeric pandas Series of food ids; may contain missing
            values (NaN / pd.NA).
        special_tokens: Sequence of special-token names; only its length is
            used.

    Returns:
        A new Series with each non-missing id increased by
        ``len(special_tokens)``. Missing values stay missing and the input
        Series is not modified.
    """
    shift = len(special_tokens)
    # Vectorised addition propagates missing values on its own, so there is no
    # need for a per-element lambda or for tqdm's `progress_apply` patch.
    return food_ids + shift

In [25]:
# Not idempotent: every run of this cell adds the offset again, so reload
# `food_ids` before re-running it.
food_ids['food_id'] = shift_food_ids(food_ids['food_id'], special_tokens)

100%|██████████| 2450/2450 [00:00<00:00, 318805.10it/s]


Here we fill the remaining NA values with the `unknown` token id.

In [18]:
# `fillna` is applied to every column of the frame, not only `food_id`.
# Run this only after the ids have been shifted; otherwise the filled-in
# token id would be shifted along with the real food ids.
food_ids = food_ids.fillna(special_tokens.index('unknown'))

In [29]:
#| export
def process_food_ids(food_ids, special_tokens):
    """Shift the food ids past the special tokens and fill missing values.

    Args:
        food_ids: DataFrame with a 'food_id' column of raw food ids, which
            may contain missing values.
        special_tokens: Ordered special-token names; its length is the shift
            and the position of 'unknown' is the fill value.

    Returns:
        A new DataFrame whose 'food_id' column is shifted by
        ``len(special_tokens)`` and in which every missing value (in any
        column) is replaced by the 'unknown' token id.
    """
    # `assign` builds a new frame, so the caller's DataFrame is left untouched
    # (assigning the column in place would leak a half-processed state).
    shifted = food_ids.assign(
        food_id=shift_food_ids(food_ids['food_id'], special_tokens)
    )
    # Fill after shifting so the 'unknown' id itself is not offset.
    return shifted.fillna(special_tokens.index('unknown'))

Grouping values by recipe, with food_id tokens in the columns.

In [30]:
# Collect each recipe's food ids into a list, then spread the ragged lists
# across columns and keep the columns labelled 0 through 14.
foods_per_recipe = food_ids.groupby('recipe')['food_id'].aggregate(list)
recipe_food_ids = pd.DataFrame(foods_per_recipe.tolist()).loc[:, :14]
recipe_food_ids

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,977,833,810,811.0,815.0,779.0,275.0,813.0,815.0,726.0,1043.0,141.0,,,
1,809,779,833,5.0,1043.0,,,,,,,,,,
2,779,811,815,797.0,82.0,810.0,977.0,815.0,82.0,,,,,,
3,957,959,776,70.0,543.0,797.0,1024.0,,,,,,,,
4,340,20,84,815.0,977.0,779.0,20.0,84.0,426.0,543.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,245,822,1034,833.0,1073.0,14.0,86.0,82.0,62.0,62.0,94.0,850.0,62.0,815.0,810.0
298,1024,815,779,275.0,977.0,895.0,70.0,,,,,,,,
299,428,1073,82,170.0,14.0,,,,,,,,,,
300,1073,14,678,82.0,346.0,1107.0,181.0,,,,,,,,


#TODO: Decide on the optimal recipe size (maximum number of foods kept per recipe)

The NA values here are those where the recipe didn't have enough foods to fill the columns, so these can be filled with padding values.

In [31]:
# Empty slots (recipes shorter than the table width) become the pad token id,
# after which the table can be cast to integers and converted to NumPy.
pad_token_id = special_tokens.index('pad')
recipe_food_ids = recipe_food_ids.fillna(pad_token_id).astype('int').to_numpy()

recipe_food_ids, recipe_food_ids.shape

(array([[ 977,  833,  810, ...,    0,    0,    0],
        [ 809,  779,  833, ...,    0,    0,    0],
        [ 779,  811,  815, ...,    0,    0,    0],
        ...,
        [ 428, 1073,   82, ...,    0,    0,    0],
        [1073,   14,  678, ...,    0,    0,    0],
        [1050, 1034,   62, ...,    0,    0,    0]]),
 (302, 15))

In [32]:
#| export
def compile_recipe_food_ids(food_ids, special_tokens, max_foods=15):
    """Build a padded integer matrix of food token ids, one row per recipe.

    Args:
        food_ids: DataFrame with a 'recipe' column to group on and a
            'food_id' column of token ids.
        special_tokens: Ordered special-token names; the position of 'pad'
            is used as the padding id.
        max_foods: Maximum number of foods kept per recipe; longer recipes
            are truncated. Defaults to 15, the width that used to be
            hard-coded.

    Returns:
        Integer NumPy array with one row per recipe group. Its width is the
        length of the longest recipe, capped at ``max_foods``; shorter
        recipes are right-padded with the pad id.

    Raises:
        ValueError: If ``max_foods`` is less than 1.
    """
    if max_foods < 1:
        raise ValueError(f'max_foods must be at least 1, got {max_foods}')

    pad_id = special_tokens.index('pad')

    foods_per_recipe = food_ids.groupby('recipe')['food_id'].aggregate(list)

    # The ragged lists become a rectangular frame; cells a recipe does not
    # reach are NaN until they are replaced by the pad id below.
    ragged = pd.DataFrame(foods_per_recipe.tolist())

    # Positional slice: keep the first `max_foods` columns.
    return ragged.iloc[:, :max_foods].fillna(pad_id).astype('int').to_numpy()

## Saving

In [33]:
# NOTE(review): relative path -- the target depends on the working directory
# the notebook is run from.
np.save('../../data/local/final/partial/recipe_food_ids/0.npy', recipe_food_ids)

In [34]:
# Trigger nbdev's export step.
from nbdev import nbdev_export; nbdev_export()