In [None]:
import math
import numpy as np
import os
import pandas as pd
import pickle
import torch
import torch.nn as nn
import torch.utils.data as data_utils

from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

from beerai.config import DATA_DIR

In [2]:
def load_vectors():
    vec_file = os.path.join(DATA_DIR, "processed/recipe_vecs.h5")
    with pd.HDFStore(vec_file, "r") as store:
        vectors = store.get("/vecs")
    return vectors

def load_vocab():
    vocab_file = os.path.join(DATA_DIR, "processed/vocab.pickle")
    with open(vocab_file, "rb") as f:
        vocab = pickle.load(f)
    inv_vocab = {v: k for k,v in vocab.items()}
    return vocab, inv_vocab

In [3]:
recipes = load_vectors()

In [4]:
recipes.head()

name,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,678.0,679.0,680.0,681.0,682.0,683.0,684.0,685.0,686.0,boil_time
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.274902,0.0,0.0,0.0,0.0,90.0
7,1.553145,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0
8,1.240266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0


In [5]:
recipes["boil_time"] = recipes.boil_time.clip(upper=120)

In [109]:
recipes.loc[recipes.loc[:,0]==recipes.loc[:,0].max()]

name,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,678.0,679.0,680.0,681.0,682.0,683.0,684.0,685.0,686.0,boil_time
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
16004,565.699829,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0


In [112]:
recipes_file = os.path.join(DATA_DIR, "interim/all_recipes.h5")
with pd.HDFStore(recipes_file) as store:
    recs = store.select("core", where="index==16004")
    ings = store.select("ingredients", where="index==16004")

In [113]:
recs
ings

Unnamed: 0_level_0,ferm_amount,ferm_color,ferm_display_amount,ferm_name,ferm_origin,ferm_potential,ferm_type,ferm_yield,hop_alpha,hop_amount,...,misc_time,misc_use,yeast_amount,yeast_attenuation,yeast_flocculation,yeast_form,yeast_laboratory,yeast_name,yeast_product_id,yeast_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
16004,5848.388452,25.0,12893.5 lb,melanoidin,de,1.037,kilned malt,0.798964,0.1365,0.014175,...,,,,70.0,,dry,fermentis,safale s-04,,ale
16004,4816.330574,1.0,10618.2 lb,carapils,de,1.035,adjunct,0.755776,0.1365,0.01701,...,,,,,,,,,,
16004,64.546142,1.0,142.3 lb,2-row,us,1.037,base malt,0.798964,0.1365,0.01701,...,,,,,,,,,,
16004,,,,,,,,,0.1365,0.01701,...,,,,,,,,,,
16004,,,,,,,,,0.1365,0.02268,...,,,,,,,,,,


In [114]:
scaler = RobustScaler()
recipes_scaled = scaler.fit_transform(recipes)

Notes on sparse autoencoders

- [Discussion on quora](https://www.quora.com/When-training-an-autoencoder-on-very-sparse-data-how-do-you-force-the-decoder-to-reconstruct-mostly-zeros-rather-than-always-just-reconstructing-the-average)
  - > So, if you're including a bias term in your autoencoders, I recommend removing the bias and attempting training again.
  - > I encountered this problem recently. I found that, similar to what Eric described, it is a problem with SGD getting stuck in bad local optima. A couple things helped: (1) using conjugate gradients or AdaGrad, either of which will find the path to the true minimum without getting stuck as much as plain SGD will; (2) using a combined cross-entropy & mean-squared-error loss function (assuming that you can model your data as binary vectors or as probability distributions) pulls things -- ever so slightly -- in the right directions better than either alone would.
  - > I decided to progressively lower the learning rate and then I got good results. So that's my tip, lower your learning rate until you get better results. (For example, I'm using now: 0.0000005 as the initial rate.) And don’t forget to normalize your data!
  - > You shouldn't need to do anything special for this. Standard good practices for initialization and training should take care of it.
  - > We were able to reproduce the original image (not get the average) by using AdamOptimizer and lowering the learning rate.
  - [Notes](http://web.stanford.edu/class/cs294a/sae/sparseAutoencoderNotes.pdf) from Stanford CS294a (Andrew Ng)  

In [7]:
class AutoEncoder(nn.Module):
    def __init__(
        self,
        input_dim,
        compression_dim,
        factor_per_layer,
    ):

        super().__init__()
        self.compression_dim = compression_dim

        comp_layers, decomp_layers = self.gen_layers_by_factor(
            input_dim, compression_dim, factor_per_layer
        )

        self.encoder = nn.Sequential(*comp_layers)
        self.decoder = nn.Sequential(*decomp_layers)
 
    def gen_layers_by_factor(self, input_dim, compression_dim, factor_per_layer):
        
        cur_dim = input_dim
        compress_layers = []
        decompress_layers = []
        n_iters = math.ceil(math.log(input_dim / compression_dim, factor_per_layer))
        
        for i in range(n_iters):
            new_dim = max(cur_dim // factor_per_layer, compression_dim)
            compress_layers.extend([nn.Linear(cur_dim, new_dim, bias=False), nn.ReLU(True)])
            decompress_layers.extend([nn.ReLU(True), nn.Linear(new_dim, cur_dim, bias=False)])
            cur_dim = new_dim
        decompress_layers = decompress_layers[::-1]
        # Replace final layer with sigmoid/tanh. Should match the input scaling range 
        compress_layers[-1] = nn.Sigmoid()
        decompress_layers[-1] = nn.Sigmoid()

        return compress_layers, decompress_layers

    def forward(self, X):
        encoded = self.encoder(X)
        decoded = self.decoder(encoded)
        return decoded

    def encode(self, X):
        X = torch.tensor(X, dtype=torch.float)
        X = self.encoder(X)
        return pd.DataFrame(X.detach().numpy())
    
    def decode(self, X):
        X = torch.tensor(X, dtype=torch.float)
        X = self.decoder(X)
        return pd.DataFrame(X.detach().numpy())

In [81]:
def train(model, X, batch_size, num_epochs, learning_rate=1e-3, beta=1):

    if isinstance(X, pd.DataFrame):
        X = X.values

    mse = nn.MSELoss()
    kldiv = nn.KLDivLoss()
    optimizer = torch.optim.Adam(
        model.parameters(), lr=learning_rate, weight_decay=1e-5
    )

    #features = torch.tensor(X, dtype=torch.float)
    data = torch.tensor(X, dtype=torch.float)
    
    #train = data_utils.TensorDataset(features)
    
    # Shuffle used to ensure randomized selection
    #train_loader = data_utils.DataLoader(
    #    train, batch_size=batch_size, shuffle=True
    #)

    for epoch in range(num_epochs):
        losses = []
        #for i, batch in enumerate(train_loader):
            # ===================forward=====================
            #data = batch[0]
        decoded = model.forward(data)
        length = decoded.shape[0]

        if length < batch_size:
            continue
        mse_loss = mse(decoded, data)
        kld_loss = kldiv(decoded, data)
        loss = (1 - beta) * mse_loss + beta * kld_loss


        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses.append(loss.data)

        print(f"epoch [{epoch + 1}/{num_epochs}], loss:{sum(losses)/len(losses):.6f}")

In [115]:
input_dims = len(recipes.columns)
# first guess
compress_dims = 50
# factor to reduce by each layer
factor_per_layer = 2
beer_ae = AutoEncoder(input_dims, compress_dims, factor_per_layer)

In [116]:
batch_size = 10000
num_epochs=100
learning_rate=1e-3
# weight of KL loss term
beta = 0.1
train(beer_ae, recipes_scaled, batch_size=batch_size, num_epochs=num_epochs, learning_rate=learning_rate, beta=beta)
#train(beer_ae, recipes.values, batch_size=batch_size, num_epochs=num_epochs, learning_rate=learning_rate, beta=beta)



epoch [1/100], loss:0.966428
epoch [2/100], loss:0.964557
epoch [3/100], loss:0.962276
epoch [4/100], loss:0.959147
epoch [5/100], loss:0.954853
epoch [6/100], loss:0.949130
epoch [7/100], loss:0.941658
epoch [8/100], loss:0.932106
epoch [9/100], loss:0.920158
epoch [10/100], loss:0.905554
epoch [11/100], loss:0.888175
epoch [12/100], loss:0.868150
epoch [13/100], loss:0.846044
epoch [14/100], loss:0.823057
epoch [15/100], loss:0.801139
epoch [16/100], loss:0.782606
epoch [17/100], loss:0.769360
epoch [18/100], loss:0.761712
epoch [19/100], loss:0.758312
epoch [20/100], loss:0.757227
epoch [21/100], loss:0.757039
epoch [22/100], loss:0.757077
epoch [23/100], loss:0.757126
epoch [24/100], loss:0.757138
epoch [25/100], loss:0.757101
epoch [26/100], loss:0.757034
epoch [27/100], loss:0.756985
epoch [28/100], loss:0.756980
epoch [29/100], loss:0.756939
epoch [30/100], loss:0.756823
epoch [31/100], loss:0.756696
epoch [32/100], loss:0.756614
epoch [33/100], loss:0.756563
epoch [34/100], los

In [117]:
decoded = beer_ae.decode(beer_ae.encode(recipes.values).values)
decoded.index = recipes.index

In [118]:
decoded[decoded < 1e-6] = 0

In [119]:
decoded

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,678,679,680,681,682,683,684,685,686,687
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.630809,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000001,0.0,0.0,0.0,0.0,1.0
1,0.630809,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000001,0.0,0.0,0.0,0.0,1.0
5,0.630807,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000001,0.0,0.0,0.0,0.0,1.0
7,0.630808,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000001,0.0,0.0,0.0,0.0,1.0
8,0.630808,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000001,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404601,0.630809,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000001,0.0,0.0,0.0,0.0,1.0
404604,0.630809,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000001,0.0,0.0,0.0,0.0,1.0
404606,0.630809,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000001,0.0,0.0,0.0,0.0,1.0
404622,0.630808,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000001,0.0,0.0,0.0,0.0,1.0


In [120]:
np.sqrt((recipes.values - decoded.values)**2).mean()

0.1177227152658449

In [121]:
def generate_random_beer(model):
    latent_space_dim = model.compression_dim
    random_beer = model.decode(np.random.rand(latent_space_dim))
    random_beer = scaler.inverse_transform(random_beer.T)[0]
    random_beer[random_beer < 0.0001] = 0
    return random_beer

In [127]:
random_beer = beer_ae.decode(np.random.rand(50))
random_beer = scaler.inverse_transform(random_beer.T)[0]

In [128]:
random_beer[random_beer < 0.0001] = 0

In [129]:
vocab, inv_vocab = load_vocab()

In [130]:
ing_ids = np.where(random_beer > 0)[0]

In [131]:
print(len(ing_ids))
print(ing_ids)

41
[  0  32  33  34  38  44  48  60  66  78  85  92 103 107 117 120 129 149
 150 152 154 155 157 164 180 187 202 211 230 252 256 284 297 483 558 575
 576 645 649 682 687]


In [132]:
for ing in ing_ids:
    if ing != 687:
        print(f"{random_beer[ing]} kg of {inv_vocab[ing]}")
    else:
        print(f"And boil for {random_beer[ing]} minutes.")

0.7462286949157715 kg of ferm_2-row
0.00012904341565445065 kg of ferm_caramel malt 120l
0.0001188625319628045 kg of ferm_caramel malt 20l
0.3263695538043976 kg of ferm_caramel malt 40l
0.36618995666503906 kg of ferm_caramel/crystal 60l
0.376726895570755 kg of ferm_carapils® malt
0.00015614827862009406 kg of ferm_chocolate
0.001890511834062636 kg of ferm_dry malt extract - light
0.0023655961267650127 kg of ferm_flaked oats
0.00015075632836669683 kg of ferm_liquid malt extract - light
0.32154062390327454 kg of ferm_maris otter pale
0.00010746826592367142 kg of ferm_munich malt
0.37367182970046997 kg of ferm_pilsen malt
0.0012666976545006037 kg of ferm_roasted barley
0.00047409036778844893 kg of ferm_vienna
0.36029115319252014 kg of ferm_white wheat
0.3887019157409668 kg of hop_amarillo
0.00010415908036520705 kg of hop_calypso
0.6255908608436584 kg of hop_cascade
0.42424145340919495 kg of hop_centennial
0.33386945724487305 kg of hop_chinook
0.4257890582084656 kg of hop_citra
0.34627604484

Welp, our first brew looks to be a crime against brewmanity. Essentially, the average amount of every ingredient across all beers? A nice baseline to improve on, though!

# Ideas from Ethan


## Model Ideas

* Look at adding regularization in between layers
* Look at VAE
* Look at GAN
  * This is most interesting
  * Its explicit job is creating recipes that look real.
  * Discriminator needs to have some function that estimates "quality" of what is produced (decoded space).

Combining the properties of a VAE (where you can interpolate) and a GAN (where you can estimate "quality" of a recipe) would be perfect.

Main issue is that reconstruction error doesn't give you the properties you want. Look for other learning signals - what other aspect of a recipe is meaningful? Could we use the beer style as a label?


## Encoding Ideas

* Can you encode hops into all possible combinations present in the dataset (not all combinations period)? Rob thinks this is reasonable
* Can we make ingredient "categories"? e.g. caramel with varying lovabond measurement?

In [86]:
input_dims = len(recipes.columns)
# first guess
compress_dims = 50
# factor to reduce by each layer
factor_per_layer = 2
beer_ae = AutoEncoder(input_dims, compress_dims, factor_per_layer)

# training params
batch_size = 50000
num_epochs=100
learning_rate=1e-3
# weight of KL loss term
beta = 0.5
train(beer_ae, recipes_scaled, batch_size=batch_size, num_epochs=num_epochs, learning_rate=learning_rate, beta=beta)



epoch [1/100], loss:0.123690
epoch [2/100], loss:0.122374
epoch [3/100], loss:0.120745
epoch [4/100], loss:0.118553
epoch [5/100], loss:0.115636
epoch [6/100], loss:0.111839
epoch [7/100], loss:0.107007
epoch [8/100], loss:0.101006
epoch [9/100], loss:0.093692
epoch [10/100], loss:0.084982
epoch [11/100], loss:0.074893
epoch [12/100], loss:0.063603
epoch [13/100], loss:0.051501
epoch [14/100], loss:0.039232
epoch [15/100], loss:0.027659
epoch [16/100], loss:0.017709
epoch [17/100], loss:0.010080
epoch [18/100], loss:0.004953
epoch [19/100], loss:0.001950
epoch [20/100], loss:0.000403
epoch [21/100], loss:-0.000313
epoch [22/100], loss:-0.000619
epoch [23/100], loss:-0.000742
epoch [24/100], loss:-0.000790
epoch [25/100], loss:-0.000807
epoch [26/100], loss:-0.000814
epoch [27/100], loss:-0.000816
epoch [28/100], loss:-0.000816
epoch [29/100], loss:-0.000817
epoch [30/100], loss:-0.000816
epoch [31/100], loss:-0.000816
epoch [32/100], loss:-0.000816
epoch [33/100], loss:-0.000817
epoch 

In [83]:
random_beer = generate_random_beer(beer_ae)
ing_ids = np.where(random_beer > 0)[0]

In [84]:
len(ing_ids)

229

In [85]:
for ing in ing_ids:
    if ing != 687:
        print(f"{random_beer[ing]} kg of {inv_vocab[ing]}")
    else:
        print(f"And boil for {random_beer[ing]} minutes.")

0.0015515829436481 kg of ferm_2-row
0.00012997942394576967 kg of ferm_6-row
0.0004142947727814317 kg of ferm_abbey malt
0.00033476034877821803 kg of ferm_acidulated malt
0.0006134641589596868 kg of ferm_amber malt
0.0001865856465883553 kg of ferm_aromatic barley malt
0.000759866030421108 kg of ferm_biscuit® md™
0.0002605172630865127 kg of ferm_black malt
0.0011127550387755036 kg of ferm_brown malt
0.00027970969676971436 kg of ferm_brown sugar
0.00015848602924961597 kg of ferm_brown sugar, dark
0.00024047333863563836 kg of ferm_brown sugar, light
0.00011076791997766122 kg of ferm_candi sugar, amber
0.000290135940304026 kg of ferm_candi sugar, clear
0.00035931949969381094 kg of ferm_cara malt
0.00014394363097380847 kg of ferm_caraaroma
0.0001915988977998495 kg of ferm_carafa i
0.00019545425311662257 kg of ferm_carafa ii
0.0002506965829525143 kg of ferm_carafa iii
0.00017981529526878148 kg of ferm_carafa special iii
0.0001679700071690604 kg of ferm_carafoam
0.00017033125914167613 kg of fe

In [98]:
rec = recipes.loc[7, recipes.loc[7] != 0]
print(f"Recipe Vector: {rec}")
for ing in rec.index:
    ing = int(ing)
    if ing != 687:
        print(f"{rec.loc[ing]} kg of {inv_vocab[ing]}")
    else:
        print(f"And boil for {rec.loc[ing]} minutes.")

Recipe Vector: name
0.0           1.553145
34.0          1.553145
49.0          1.553145
74.0          1.553145
85.0          1.553145
187.0         3.106290
252.0         1.553145
576.0         1.553145
613.0         1.553145
623.0         1.553145
639.0         3.106290
645.0         1.553145
658.0         1.553145
664.0         3.106290
665.0         1.553145
boil_time    60.000000
Name: 7, dtype: float64
1.5531449289298356 kg of ferm_2-row
1.5531449289298356 kg of ferm_caramel malt 40l
1.5531449289298356 kg of ferm_chocolate malt
1.5531449289298356 kg of ferm_honey malt
1.5531449289298356 kg of ferm_maris otter pale
3.1062898578596707 kg of hop_hallertau
1.5531449289298356 kg of hop_saaz
1.5531449289298356 kg of yeast_safale us-05
1.5531449289298356 kg of misc_brown sugar
1.5531449289298356 kg of misc_cinnamon
3.1062898578596707 kg of misc_grains of paradise
1.5531449289298356 kg of misc_irish moss
1.5531449289298356 kg of misc_nutmeg
3.1062898578596707 kg of misc_pumpkin
1.5531449

ValueError: invalid literal for int() with base 10: 'boil_time'

 After trying for a few times with various betas (KL Divergence parameter), it doesn't look super promising. We can mess around with dimensionality I suppose, but it seems like it might be better to try a VAE. Let's do that next time!

Examples:
* https://towardsdatascience.com/variational-autoencoder-demystified-with-pytorch-implementation-3a06bee395ed
* https://github.com/pytorch/examples/blob/master/vae/main.py

# Notes from Ahmed

* Consider using TF-IDF vectorizer (or maybe TF-DF)
* 

In [139]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [142]:
TfidfVectorizer?