Here we do the final processing required for the model inputs.

# Setup

In [1]:
import sys

from pyprojroot import here

# Locate the project root and put it on the import path so that modules
# living under it can be imported from this notebook.
root = here()
sys.path += [str(root)]

In [2]:
import pandas as pd
import numpy as np

In [5]:
# Recipe / food_id table. Cast to pandas' nullable integer dtype so that
# missing ids stay as <NA> instead of forcing a float column.
food_ids = pd.read_feather(
    f'{root}/../data/local/molecule/partial/food_ids/1_na_filled.feather'
).astype('Int64')

# Food compound table, cast to floating point throughout.
food_embeddings = pd.read_feather(
    f'{root}/../data/local/molecule/full/food_compounds/0.feather'
).astype('float')

In [13]:
# Special tokens used for masked modelling. A token's position in this list
# is its token id (pad=0, mask=1, unknown=2); later cells look ids up with
# `special_tokens.index(...)` and shift food ids by `len(special_tokens)`.
special_tokens = [
    'pad',
    'mask',
    'unknown',
]

# Food Embeddings

Here we already have our food compounds in a dataframe. All that is left to do is:

- Converting to numpy (efficiency)
- Adding special tokens for masked modelling

In [6]:
# Drop the DataFrame wrapper and continue with the underlying NumPy array.
# NOTE: this rebinds `food_embeddings`, so the cell cannot be re-run without
# reloading the data first.
food_embeddings = food_embeddings.to_numpy(copy=False)

In [9]:
# One embedding row per special token, keyed by token name:
#   pad     -> all zeros
#   mask    -> all ones
#   unknown -> the mean embedding over all foods
n_features = food_embeddings.shape[1]
special_token_rows = {
    'pad': np.zeros(n_features),
    'mask': np.ones(n_features),
    'unknown': food_embeddings.mean(axis=0),
}

# Stack the rows in the order given by `special_tokens`, so that row i always
# belongs to the token whose id is `special_tokens.index(token)`. A token
# without a defined embedding raises a KeyError here instead of silently
# misaligning ids and rows.
special_token_embeddings = np.vstack([special_token_rows[token] for token in special_tokens])
special_token_embeddings, special_token_embeddings.shape

(array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [1.00000000e+00, 1.00000000e+00, 1.00000000e+00, ...,
         1.00000000e+00, 1.00000000e+00, 1.00000000e+00],
        [1.03842678e-02, 3.41922851e-01, 3.79929178e-02, ...,
         5.36278835e-02, 5.80837997e-04, 6.96753373e-02]]),
 (3, 713))

In [10]:
# Put the special-token rows in front of the food rows, so the special tokens
# take the first row indices of the final embedding matrix.
# NOTE: re-running this cell prepends the special-token rows again.
food_embeddings = np.concatenate((special_token_embeddings, food_embeddings), axis=0)
food_embeddings.shape

(1109, 713)

## Saving

In [None]:
# Persist the final embedding matrix (special-token rows first, then foods).
# NOTE(review): this path is relative to the current working directory, while
# the inputs were loaded relative to `root` — confirm both point at the same
# data directory.
food_compounds_out = '../../data/local/final/full/food_compounds/0.npy'
np.save(food_compounds_out, food_embeddings)

# Recipe Food Tokens

Here we want to format our data as food token IDs for each recipe.

- Converting to numpy
- Handling NA values
- Grouping data by recipe

The special-token embeddings were added to the beginning of the embedding array, which is convenient when developing the model. The downside is that every food_id has to be shifted up by the number of special tokens.

In [16]:
# The special-token rows sit in front of the food rows in the embedding
# matrix, so every real food id moves up by the number of special tokens.
# NOTE: this cell is not idempotent — re-running it shifts the ids again.
shift = len(special_tokens)

# Arithmetic on the nullable integer column leaves <NA> entries as <NA>, so
# no per-row `apply` (and no lambda shadowing the builtin `id`) is needed.
food_ids['food_id'] = food_ids['food_id'] + shift

Here we fill NA values with the unknown token.

In [17]:
# Replace missing food ids with the id of the 'unknown' special token.
# The fill is restricted to the `food_id` column: a frame-wide fillna would
# also turn a missing `recipe` value into that id, silently attaching those
# rows to an unrelated recipe in the groupby that follows.
unknown_id = special_tokens.index('unknown')
food_ids = food_ids.fillna({'food_id': unknown_id})

Grouping values by recipe, with food_id tokens in the columns.

In [29]:
# Maximum number of food tokens kept per recipe; longer recipes are truncated.
# TODO: decide on the optimal recipe size.
max_recipe_size = 15

# One list of food ids per recipe, in row order within each recipe.
foods_per_recipe = food_ids.groupby('recipe')['food_id'].aggregate(list)

# Spread each list across columns (shorter recipes are right-filled with NaN),
# then force exactly `max_recipe_size` columns: extra columns are dropped and
# missing ones are added as NaN, so the width does not depend on the data.
recipe_food_ids = (
    pd.DataFrame(foods_per_recipe.tolist())
    .reindex(columns=range(max_recipe_size))
)
recipe_food_ids

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,974,830,807,808.0,812.0,776.0,272.0,810.0,812.0,723.0,1040.0,138.0,,,
1,806,776,830,2.0,1040.0,,,,,,,,,,
2,776,808,812,794.0,79.0,807.0,974.0,812.0,79.0,,,,,,
3,954,956,773,67.0,540.0,794.0,1021.0,,,,,,,,
4,337,17,81,812.0,974.0,776.0,17.0,81.0,423.0,540.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,242,819,1031,830.0,1070.0,11.0,83.0,79.0,59.0,59.0,91.0,847.0,59.0,812.0,807.0
298,1021,812,776,272.0,974.0,892.0,67.0,,,,,,,,
299,425,1070,79,167.0,11.0,,,,,,,,,,
300,1070,11,675,79.0,343.0,1104.0,178.0,,,,,,,,


#TODO: Decide on optimal recipe size

The NA values here are those where the recipe didn't have enough foods to fill the columns, so these can be filled with padding values.

In [30]:
# Remaining NaNs mark slots past the end of a recipe: fill them with the pad
# token id, then convert to a plain integer NumPy array.
# NOTE: this rebinds `recipe_food_ids` from a DataFrame to an ndarray, so the
# cell cannot be re-run on its own output.
pad_id = special_tokens.index('pad')
padded_recipes = recipe_food_ids.fillna(pad_id)
recipe_food_ids = padded_recipes.astype('int').to_numpy()

recipe_food_ids, recipe_food_ids.shape

(array([[ 974,  830,  807, ...,    0,    0,    0],
        [ 806,  776,  830, ...,    0,    0,    0],
        [ 776,  808,  812, ...,    0,    0,    0],
        ...,
        [ 425, 1070,   79, ...,    0,    0,    0],
        [1070,   11,  675, ...,    0,    0,    0],
        [1047, 1031,   59, ...,    0,    0,    0]]),
 (302, 15))

## Saving

In [31]:
# Persist the padded recipe -> food token id matrix.
# NOTE(review): this path is relative to the current working directory, while
# the inputs were loaded relative to `root` — confirm both point at the same
# data directory.
recipe_food_ids_out = '../../data/local/final/full/recipe_food_ids/0.npy'
np.save(recipe_food_ids_out, recipe_food_ids)