# Embedding

Our one-hot encoded ingredients create super sparse matrices for recipes. This isn't ideal as similar recipes don't have any numerical similarity to each other (at least, not past the identical ingredients), so it seems like an embedding would be useful for ingredient representation.

We could try to create an embedding by creating a supervised learning model to map recipes to their style and use the last layer before the prediction as the embedding. I _think_ that would look like this:

* Create neural network that goes from N neurons as input (where N is the number of unique ingredients we have) to M output neurons (where M is the number of styles). Number of hidden layers is a tunable parameter.
* We feed in all of our examples, each recipe having a one-hot encoded label of its style, and train the network.
* Once the network is trained, we can represent a single ingredient by putting the one-hot encoded version of that ingredient into the network and seeing what the embedding layer values are.

In the above topology, the embedding is simply the layer right before the prediction layer.

In [63]:
import math
import numpy as np
import os
import pandas as pd
import pickle
import torch
import torch.nn as nn
import torch.utils.data as data_utils

from sklearn.preprocessing import (
    MinMaxScaler,
    OneHotEncoder,
    RobustScaler,
    StandardScaler,
)

from beerai.config import DATA_DIR

In [2]:
def load_vectors():
    vec_file = os.path.join(DATA_DIR, "processed/recipe_vecs.h5")
    with pd.HDFStore(vec_file, "r") as store:
        vectors = store.get("/vecs")
    return vectors

def load_vocab():
    vocab_file = os.path.join(DATA_DIR, "processed/vocab.pickle")
    with open(vocab_file, "rb") as f:
        vocab = pickle.load(f)
    inv_vocab = {v: k for k,v in vocab.items()}
    return vocab, inv_vocab

In [21]:
vectors = load_vectors()
vectors = vectors.drop("boil_time", axis=1)

In [22]:
vectors

name,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,677.0,678.0,679.0,680.0,681.0,682.0,683.0,684.0,685.0,686.0
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
5,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.274902,0.0,0.0,0.0,0.0
7,1.553145,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
8,1.240266,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404601,1.224098,0.0,0.0,1.224098,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
404604,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
404606,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
404622,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [27]:
vectors[vectors > 0] = 1

In [31]:
vectors = vectors.astype(int)

In [38]:
os.path.join(DATA_DIR, "interim/all_data.h5")

'/home/jupyter-rory/repos/beer.ai/data/interim/all_data.h5'

In [39]:
with pd.HDFStore(os.path.join(DATA_DIR, "interim/all_recipes.h5")) as store:
    core = store.select("core")

['/core', '/ingredients']


In [40]:
core = core.loc[vectors.index]

In [41]:
core

Unnamed: 0_level_0,batch_size,boil_size,boil_time,brewer,efficiency,name,origin,recipe_file,src_abv,src_color,src_fg,src_ibu,src_og,style_category,style_guide,style_name,style_version
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,18.927059,21.198306,60.0,velkyal,0.75,dark island oir leann,brewtoad,recipes/brewtoad/dark-island-oir-leann.xml,,,,,,6b,bjcp,blonde ale,1.0
1,22.712471,28.390588,60.0,wain thor,0.85,witbur,brewtoad,recipes/brewtoad/witbur.xml,,,,,,6d,bjcp,american wheat or rye beer,1.0
5,20.819765,27.065694,90.0,trappistpunk,0.75,lager2,brewtoad,recipes/brewtoad/lager2.xml,,,,,,2b,bjcp,bohemian pilsener,1.0
7,41.639530,52.049412,60.0,crazyward brewing co.,0.75,untitled specialty beer 139690,brewtoad,recipes/brewtoad/untitled-specialty-beer-13969...,,,,,,23a,bjcp,specialty beer,1.0
8,18.927059,24.605177,60.0,josh,0.70,not so fast,brewtoad,recipes/brewtoad/not-so-fast-d3cb93.xml,,,,,,10a,bjcp,american pale ale,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404601,22.712471,26.497882,60.0,self,0.71,session amber,brewersfriend,recipes/brewersfriend/553653.xml,4.86,11.00,1.012,32.89,1.049,19a,bjcp,american amber ale,1.0
404604,20.819765,26.497882,60.0,backcrack,0.75,esb - extra special beer,brewersfriend,recipes/brewersfriend/476490.xml,5.36,10.91,1.013,38.23,1.054,11c,bjcp,strong bitter,1.0
404606,22.712471,28.390588,60.0,,0.35,irish red ale,brewersfriend,recipes/brewersfriend/44801.xml,4.96,16.90,1.014,26.69,1.052,9d,bjcp,irish red ale,1.0
404622,3.785412,5.678118,60.0,,0.35,brad thousand,brewersfriend,recipes/brewersfriend/234561.xml,5.52,29.48,1.014,36.29,1.056,10c,bjcp,american brown ale,1.0


In [54]:
core.style_category.unique()

array(['6b', '6d', '2b', '23a', '10a', '16b', '14b', '18c', '17b', '5c',
       '11a', '6c', '10b', '10c', '16e', '18e', '17a', '22a', '8b', '8c',
       '9c', '13f', '21a', '21b', '15a', '12b', '9d', '14c', '13d', '7b',
       '16c', '19b', '14a', '2a', '8a', '13e', '4c', '19c', '9e', '13c',
       '13b', '6a', '12c', '20a', '3a', '3b', '15c', '7c', '17e', '16a',
       '12a', '15d', '18a', '13a', '2c', '18b', '1a', '19a', '1d', '5b',
       '11c', '7a', '4a', '15b', '18d', '1c', '17f', '22b', '5a', '4b',
       '22c', '11b', '17c', '16d', '1b', '9a', '17d', '9b', '1e', '5d',
       '25b', '20c', '20b', '34a', '23-', '32a', '24b', '24c', '28a',
       '34c', '25c', '27-', '24a', '20-', '34b', '0-', '30b', '29c',
       '25a', '28b', '22d', '26c', '30a', '26b', '27a', '26a', '23b',
       '23e', '33b', '28d', '32b', '30c', '26d', '28c', '23d', '29b',
       '3d', '23f', '31a', '23c'], dtype=object)

In [105]:
# All styles end with letter. Remove letter and just keep high level category
styles = pd.DataFrame(core.style_category.str.slice(0,-1).astype(int))
# Style "0-" means "no style selected" - drop these.
styles = styles[styles["style_category"] > 0]

In [106]:
styles

Unnamed: 0_level_0,style_category
recipe_id,Unnamed: 1_level_1
0,6
1,6
5,2
7,23
8,10
...,...
404601,19
404604,11
404606,9
404622,10


In [107]:
ohe = OneHotEncoder()

In [108]:
#categories = ohe.fit_transform(styles["style_category"].values.reshape(-1,1))

In [109]:
#categories.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [207]:
# ^ turns out binary cross entropy wants category ID, not OHE categories
categories = styles["style_category"].values.reshape(-1,1)

In [178]:
class Embedder(nn.Module):
    def __init__(
        self,
        input_dim,
        output_dim,
        embedding_dim,
        factor_per_layer,
    ):

        super().__init__()
        self.embedding_dim = embedding_dim
        self.output_dim = output_dim

        embed_layers = self.gen_layers_by_factor(
            input_dim, embedding_dim, factor_per_layer
        )

        self.embed = nn.Sequential(*embed_layers)
        self.pred = nn.Sequential(nn.Linear(embedding_dim, output_dim), nn.Softmax(dim=1))
 
    def gen_layers_by_factor(self, input_dim, final_dim, factor_per_layer):
        
        cur_dim = input_dim
        compress_layers = []
        
        n_iters = math.ceil(math.log(input_dim / final_dim, factor_per_layer))
        
        for i in range(n_iters):
            new_dim = max(cur_dim // factor_per_layer, final_dim)
            compress_layers.extend([nn.Linear(cur_dim, new_dim, bias=False), nn.ReLU(True)])
            cur_dim = new_dim
        # Replace final layer with sigmoid/tanh. Should match the input scaling range 
        compress_layers[-1] = nn.Sigmoid()

        return compress_layers

    def forward(self, X):
        embedded = self.embed(X)
        pred = self.pred(embedded)
        return pred

In [229]:
def train(model, X, Y, batch_size, num_epochs, learning_rate=1e-3):

    if isinstance(X, pd.DataFrame):
        X = X.values

    cel = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(
        model.parameters(), lr=learning_rate, weight_decay=1e-5
    )

    #features = torch.tensor(X, dtype=torch.float)
    data = torch.tensor(X, dtype=torch.float)
    labels = torch.tensor(Y, dtype=torch.long)
    #import pdb; pdb.set_trace()
    
    #train = data_utils.TensorDataset(features)
    
    # Shuffle used to ensure randomized selection
    #train_loader = data_utils.DataLoader(
    #    train, batch_size=batch_size, shuffle=True
    #)

    for epoch in range(num_epochs):
        losses = []
        #for i, batch in enumerate(train_loader):
            # ===================forward=====================
            #data = batch[0]
        preds = model.forward(data)
        length = preds.shape[0]

        loss = cel(preds, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses.append(loss.data)

        print(f"epoch [{epoch + 1}/{num_epochs}], loss:{sum(losses)/len(losses):.6f}")

In [234]:
X = vectors.loc[styles.index].values
Y = categories.astype(int) - 1

In [235]:
input_dim = X.shape[-1]
# 34 unique categories
output_dim = 34
embedding_dim = 40
factor_per_layer = 2
embedder = Embedder(input_dim, output_dim, embedding_dim, factor_per_layer)

In [236]:
embedder

Embedder(
  (embed): Sequential(
    (0): Linear(in_features=687, out_features=343, bias=False)
    (1): ReLU(inplace=True)
    (2): Linear(in_features=343, out_features=171, bias=False)
    (3): ReLU(inplace=True)
    (4): Linear(in_features=171, out_features=85, bias=False)
    (5): ReLU(inplace=True)
    (6): Linear(in_features=85, out_features=42, bias=False)
    (7): ReLU(inplace=True)
    (8): Linear(in_features=42, out_features=40, bias=False)
    (9): Sigmoid()
  )
  (pred): Sequential(
    (0): Linear(in_features=40, out_features=34, bias=True)
    (1): Softmax(dim=1)
  )
)

In [215]:
%pdb

Automatic pdb calling has been turned ON


In [237]:
batch_size = 10000
num_epochs=100
learning_rate=1e-3
train(embedder, X, Y.reshape(-1), batch_size=batch_size, num_epochs=num_epochs, learning_rate=learning_rate)

epoch [1/100], loss:3.527779
epoch [2/100], loss:3.527463
epoch [3/100], loss:3.527324
epoch [4/100], loss:3.527326
epoch [5/100], loss:3.526605
epoch [6/100], loss:3.525630
epoch [7/100], loss:3.523798
epoch [8/100], loss:3.523704
epoch [9/100], loss:3.520802
epoch [10/100], loss:3.520085
epoch [11/100], loss:3.520464
epoch [12/100], loss:3.520035
epoch [13/100], loss:3.518951
epoch [14/100], loss:3.517964
epoch [15/100], loss:3.517494
epoch [16/100], loss:3.516592
epoch [17/100], loss:3.516349
epoch [18/100], loss:3.516116
epoch [19/100], loss:3.515600
epoch [20/100], loss:3.515458
epoch [21/100], loss:3.514380
epoch [22/100], loss:3.510201
epoch [23/100], loss:3.509356
epoch [24/100], loss:3.509086
epoch [25/100], loss:3.508646
epoch [26/100], loss:3.506176
epoch [27/100], loss:3.504866
epoch [28/100], loss:3.503303
epoch [29/100], loss:3.502310
epoch [30/100], loss:3.500911
epoch [31/100], loss:3.499140
epoch [32/100], loss:3.495726
epoch [33/100], loss:3.491767
epoch [34/100], los

In [239]:
embedder.pred(X)

AttributeError: 'numpy.ndarray' object has no attribute 'dim'

> [0;32m/home/jupyter-rory/repos/beer.ai/env/lib/python3.8/site-packages/torch/nn/functional.py[0m(1688)[0;36mlinear[0;34m()[0m
[0;32m   1686 [0;31m        [0;32mif[0m [0many[0m[0;34m([0m[0;34m[[0m[0mtype[0m[0;34m([0m[0mt[0m[0;34m)[0m [0;32mis[0m [0;32mnot[0m [0mTensor[0m [0;32mfor[0m [0mt[0m [0;32min[0m [0mtens_ops[0m[0;34m][0m[0;34m)[0m [0;32mand[0m [0mhas_torch_function[0m[0;34m([0m[0mtens_ops[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1687 [0;31m            [0;32mreturn[0m [0mhandle_torch_function[0m[0;34m([0m[0mlinear[0m[0;34m,[0m [0mtens_ops[0m[0;34m,[0m [0minput[0m[0;34m,[0m [0mweight[0m[0;34m,[0m [0mbias[0m[0;34m=[0m[0mbias[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m-> 1688 [0;31m    [0;32mif[0m [0minput[0m[0;34m.[0m[0mdim[0m[0;34m([0m[0;34m)[0m [0;34m==[0m [0;36m2[0m [0;32mand[0m [0mbias[0m [0;32mis[0m [0;32mnot[0m [0;32mNone[0m[0;34m:[0m