In [1]:
import math
import numpy as np
import os
import pandas as pd
import pickle
import torch
import torch.nn as nn
import torch.utils.data as data_utils

from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

from beerai.config import DATA_DIR

In [2]:
def load_vectors():
    vec_file = os.path.join(DATA_DIR, "processed/recipe_vecs.h5")
    with pd.HDFStore(vec_file, "r") as store:
        vectors = store.get("/vecs")
    return vectors

def load_vocab():
    vocab_file = os.path.join(DATA_DIR, "processed/vocab.pickle")
    with open(vocab_file, "rb") as f:
        vocab = pickle.load(f)
    inv_vocab = {v: k for k,v in vocab.items()}
    return vocab, inv_vocab

In [3]:
recipes = load_vectors()

In [4]:
recipes.head()

name,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,783.0,784.0,785.0,786.0,787.0,788.0,789.0,790.0,791.0,boil_time
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0
1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0
5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.024016,0.0,0.0,0.0,0.0,90.0
7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0
8.0,0.103101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0


In [5]:
recipes["boil_time"] = recipes.boil_time.clip(upper=120)

In [6]:
scaler = MinMaxScaler()
recipes_scaled = scaler.fit_transform(recipes)

Notes on sparse autoencoders

- [Discussion on quora](https://www.quora.com/When-training-an-autoencoder-on-very-sparse-data-how-do-you-force-the-decoder-to-reconstruct-mostly-zeros-rather-than-always-just-reconstructing-the-average)
  - > So, if you're including a bias term in your autoencoders, I recommend removing the bias and attempting training again.
  - > I encountered this problem recently. I found that, similar to what Eric described, it is a problem with SGD getting stuck in bad local optima. A couple things helped: (1) using conjugate gradients or AdaGrad, either of which will find the path to the true minimum without getting stuck as much as plain SGD will; (2) using a combined cross-entropy & mean-squared-error loss function (assuming that you can model your data as binary vectors or as probability distributions) pulls things -- ever so slightly -- in the right directions better than either alone would.
  - > I decided to progressively lower the learning rate and then I got good results. So that's my tip, lower your learning rate until you get better results. (For example, I'm using now: 0.0000005 as the initial rate.) And don’t forget to normalize your data!
  - > You shouldn't need to do anything special for this. Standard good practices for initialization and training should take care of it.
  - > We were able to reproduce the original image (not get the average) by using AdamOptimizer and lowering the learning rate.
  - [Notes](http://web.stanford.edu/class/cs294a/sae/sparseAutoencoderNotes.pdf) from Stanford CS294a (Andrew Ng)  

In [7]:
class AutoEncoder(nn.Module):
    def __init__(
        self,
        input_dim,
        compression_dim,
        factor_per_layer,
    ):

        super().__init__()
        self.compression_dim = compression_dim

        comp_layers, decomp_layers = self.gen_layers_by_factor(
            input_dim, compression_dim, factor_per_layer
        )

        self.encoder = nn.Sequential(*comp_layers)
        self.decoder = nn.Sequential(*decomp_layers)
 
    def gen_layers_by_factor(self, input_dim, compression_dim, factor_per_layer):
        
        cur_dim = input_dim
        compress_layers = []
        decompress_layers = []
        n_iters = math.ceil(math.log(input_dim / compression_dim, factor_per_layer))
        
        for i in range(n_iters):
            new_dim = max(cur_dim // factor_per_layer, compression_dim)
            compress_layers.extend([nn.Linear(cur_dim, new_dim, bias=False), nn.ReLU(True)])
            decompress_layers.extend([nn.ReLU(True), nn.Linear(new_dim, cur_dim, bias=False)])
            cur_dim = new_dim
        decompress_layers = decompress_layers[::-1]
        # Replace final layer with sigmoid/tanh. Should match the input scaling range 
        compress_layers[-1] = nn.Sigmoid()
        decompress_layers[-1] = nn.Sigmoid()

        return compress_layers, decompress_layers

    def forward(self, X):
        encoded = self.encoder(X)
        decoded = self.decoder(encoded)
        return decoded

    def encode(self, X):
        X = torch.tensor(X, dtype=torch.float)
        X = self.encoder(X)
        return pd.DataFrame(X.detach().numpy())
    
    def decode(self, X):
        X = torch.tensor(X, dtype=torch.float)
        X = self.decoder(X)
        return pd.DataFrame(X.detach().numpy())

In [8]:
def train(model, X, batch_size, num_epochs, learning_rate=1e-3, beta=1):

    if isinstance(X, pd.DataFrame):
        X = X.values

    mse = nn.MSELoss()
    kldiv = nn.KLDivLoss()
    optimizer = torch.optim.Adam(
        model.parameters(), lr=learning_rate, weight_decay=1e-5
    )

    #features = torch.tensor(X, dtype=torch.float)
    data = torch.tensor(X, dtype=torch.float)
    
    #train = data_utils.TensorDataset(features)
    
    # Shuffle used to ensure randomized selection
    #train_loader = data_utils.DataLoader(
    #    train, batch_size=batch_size, shuffle=True
    #)

    for epoch in range(num_epochs):
        losses = []
        #for i, batch in enumerate(train_loader):
            # ===================forward=====================
            #data = batch[0]
        decoded = model.forward(data)
        length = decoded.shape[0]

        if length < batch_size:
            continue
        mse_loss = mse(decoded, data)
        kld_loss = kldiv(decoded, data)
        loss = mse_loss + beta * kld_loss


        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses.append(loss.data)

        print(f"epoch [{epoch + 1}/{num_epochs}], loss:{sum(losses)/len(losses):.6f}")

In [9]:
input_dims = len(recipes.columns)
# first guess
compress_dims = 50
# factor to reduce by each layer
factor_per_layer = 2
beer_ae = AutoEncoder(input_dims, compress_dims, factor_per_layer)

In [10]:
batch_size = 10000
num_epochs=100
learning_rate=1e-3
# weight of KL loss term
beta = 0.1
train(beer_ae, recipes_scaled, batch_size=batch_size, num_epochs=num_epochs, learning_rate=learning_rate, beta=beta)
#train(beer_ae, recipes.values, batch_size=batch_size, num_epochs=num_epochs, learning_rate=learning_rate, beta=beta)



epoch [1/100], loss:0.236223
epoch [2/100], loss:0.234687
epoch [3/100], loss:0.231933
epoch [4/100], loss:0.227674
epoch [5/100], loss:0.222122
epoch [6/100], loss:0.213805
epoch [7/100], loss:0.202950
epoch [8/100], loss:0.188608
epoch [9/100], loss:0.170590
epoch [10/100], loss:0.148501
epoch [11/100], loss:0.122638
epoch [12/100], loss:0.094110
epoch [13/100], loss:0.064928
epoch [14/100], loss:0.038338
epoch [15/100], loss:0.017751
epoch [16/100], loss:0.004928
epoch [17/100], loss:-0.001124
epoch [18/100], loss:-0.003183
epoch [19/100], loss:-0.003633
epoch [20/100], loss:-0.003651
epoch [21/100], loss:-0.003597
epoch [22/100], loss:-0.003576
epoch [23/100], loss:-0.003594
epoch [24/100], loss:-0.003634
epoch [25/100], loss:-0.003682
epoch [26/100], loss:-0.003758
epoch [27/100], loss:-0.003871
epoch [28/100], loss:-0.003985
epoch [29/100], loss:-0.004042
epoch [30/100], loss:-0.004015
epoch [31/100], loss:-0.004050
epoch [32/100], loss:-0.004061
epoch [33/100], loss:-0.004116
ep

In [11]:
decoded = beer_ae.decode(beer_ae.encode(recipes.values).values)
decoded.index = recipes.index

In [15]:
decoded[decoded < 1e-6] = 0

In [16]:
decoded

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,783,784,785,786,787,788,789,790,791,792
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,0.0,0.0,0.0,0.0,0.324063,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.554025
1.0,0.0,0.0,0.0,0.0,0.324081,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.554021
5.0,0.0,0.0,0.0,0.0,0.321901,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.554415
7.0,0.0,0.0,0.0,0.0,0.324039,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.554030
8.0,0.0,0.0,0.0,0.0,0.324050,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.554028
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404601.0,0.0,0.0,0.0,0.0,0.324081,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.554021
404604.0,0.0,0.0,0.0,0.0,0.324083,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.554020
404606.0,0.0,0.0,0.0,0.0,0.324090,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.554019
404622.0,0.0,0.0,0.0,0.0,0.324058,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.554026


In [21]:
np.sqrt((recipes.values - decoded.values)**2).mean()

0.002713305919337009

In [35]:
random_beer = beer_ae.decode(np.random.rand(50))
random_beer = scaler.inverse_transform(random_beer.T)[0]

In [36]:
random_beer[random_beer < 0.0001] = 0

In [37]:
vocab, inv_vocab = load_vocab()

In [39]:
ing_ids = np.where(random_beer > 0)[0]

In [40]:
ing_ids

array([127, 281, 723, 727, 729, 747, 767, 770, 792])

In [43]:
for ing in ing_ids:
    print(inv_vocab[ing])

yeast_american farmhouse blend (wlp670)
hop_crystal


KeyError: 723

# Ideas from Ethan


## Model Ideas

* Look at adding regularization in between layers
* Look at VAE
* Look at GAN
  * This is most interesting
  * Its explicit job is creating recipes that look real.
  * Discriminator needs to have some function that estimates "quality" of what is produced (decoded space).

Combining the properties of a VAE (where you can interpolate) and a GAN (where you can estimate "quality" of a recipe) would be perfect.

Main issue is that reconstruction error doesn't give you the properties you want. Look for other learning signals - what other aspect of a recipe is meaningful? Could we use the beer style as a label?


## Encoding Ideas

* Can you encode hops into all possible combinations present in the dataset (not all combinations period)? Rob thinks this is reasonable
* Can we make ingredient "categories"? e.g. caramel with varying lovabond measurement?