In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_probability as tfp
import os
from sklearn.model_selection import train_test_split

In [15]:
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive

ModuleNotFoundError: No module named 'google.colab'

In [2]:
# Change base path based on GDrive or local run
base_path = "../dataset"
# base_path = "/gdrive/MyDrive/dressipi_recsys2022_mapped/dataset"

original_data = os.path.join(base_path, 'original_data')
processed_data = os.path.join(base_path, 'processed_data')

#Prepare dataset

In [3]:
tfd = tfp.distributions
tfb = tfp.bijectors
tfpl = tfp.layers

In [4]:
items_features = pd.read_csv(os.path.join(processed_data, "simplified_features.csv"))
items_features.head()

Unnamed: 0,item_id,feature_idx
0,19021,1
1,19021,23
2,19021,16
3,19021,447
4,19021,2


In [5]:
dataset = np.zeros(
    (items_features['item_id'].max()+1, items_features['feature_idx'].max()+1)
    )
dataset[(items_features['item_id'], items_features['feature_idx'])] = 1

dataset

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       ...,
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]])

#VAE

In [6]:
def get_prior(num_modes, latent_dimension):
    """
    This function should create an instance of a MixtureSameFamily distribution 
    according to the above specification. 
    The function takes the num_modes and latent_dim as arguments, which should 
    be used to define the distribution.
    Your function should then return the distribution instance.
    """
    return tfd.MixtureSameFamily(
        mixture_distribution=tfd.Categorical(probs=[1/num_modes for _ in range(num_modes)]),
        components_distribution=tfd.MultivariateNormalDiag(
            loc=tf.Variable(tf.random.normal(shape=(num_modes, latent_dimension))),
            scale_diag=tfp.util.TransformedVariable(
                initial_value=tf.ones(
                    shape=(num_modes, latent_dimension)
                ),
                bijector=tfb.Softplus()
            )
        )
    )

In [7]:
def get_kl_regularizer(prior_distribution):
    """
    This function should create an instance of the KLDivergenceRegularizer 
    according to the above specification. 
    The function takes the prior_distribution, which should be used to define 
    the distribution.
    Your function should then return the KLDivergenceRegularizer instance.
    """
    return tfpl.KLDivergenceRegularizer(
        distribution_b=prior_distribution,
        weight=1.0,
        test_points_fn=lambda q: q.sample(3),
        test_points_reduce_axis=None
    )

In [8]:
latent_dimension = 256
starting_dimension = 2**9
prior = get_prior(num_modes=2, latent_dimension=latent_dimension)
kl_regularizer = get_kl_regularizer(prior)

inputs = layers.Input((items_features['feature_idx'].max()+1,))

x = inputs 
dimension = starting_dimension

while(dimension != latent_dimension):
  x = layers.Dense(units=dimension, activation='relu')(x)
  x = layers.BatchNormalization()(x)

  dimension = dimension//2

x = layers.Dense(units=tfpl.MultivariateNormalTriL.params_size(latent_dimension))(x)
outputs = tfpl.MultivariateNormalTriL(latent_dimension, activity_regularizer=kl_regularizer)(x)

encoder = tf.keras.Model(inputs=inputs, outputs=outputs, name='encoder')

encoder.summary()

Metal device set to: Apple M1 Max


2022-04-21 22:01:29.642267: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-04-21 22:01:29.642406: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2022-04-21 22:01:29.839013: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


Model: "encoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 905)]             0         
                                                                 
 dense (Dense)               (None, 512)               463872    
                                                                 
 batch_normalization (BatchN  (None, 512)              2048      
 ormalization)                                                   
                                                                 
 dense_1 (Dense)             (None, 33152)             17006976  
                                                                 
 multivariate_normal_tri_l (  ((None, 256),            1024      
 MultivariateNormalTriL)      (None, 256))                       
                                                                 
Total params: 17,473,920
Trainable params: 17,472,896
Non-t

In [9]:
max_width = 1024
ending_dimension = items_features['feature_idx'].max()+1

inputs = layers.Input((latent_dimension,))

x = inputs 
dimension = latent_dimension

while(dimension <= max_width):
  x = layers.Dense(units=dimension, activation='relu')(x)
  x = layers.BatchNormalization()(x)

  dimension = dimension*2

x = layers.Dense(units=tfpl.IndependentBernoulli.params_size(ending_dimension))(x)
outputs = tfpl.IndependentBernoulli(ending_dimension)(x)

decoder = tf.keras.Model(inputs=inputs, outputs=outputs, name='decoder')

decoder.summary()

Model: "decoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 256)]             0         
                                                                 
 dense_2 (Dense)             (None, 256)               65792     
                                                                 
 batch_normalization_1 (Batc  (None, 256)              1024      
 hNormalization)                                                 
                                                                 
 dense_3 (Dense)             (None, 512)               131584    
                                                                 
 batch_normalization_2 (Batc  (None, 512)              2048      
 hNormalization)                                                 
                                                                 
 dense_4 (Dense)             (None, 1024)              5253

In [10]:
vae = tf.keras.Model(inputs=encoder.inputs, outputs=decoder(encoder.outputs), name='vae')
vae.summary()

Model: "vae"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 905)]             0         
                                                                 
 dense (Dense)               (None, 512)               463872    
                                                                 
 batch_normalization (BatchN  (None, 512)              2048      
 ormalization)                                                   
                                                                 
 dense_1 (Dense)             (None, 33152)             17006976  
                                                                 
 multivariate_normal_tri_l (  ((None, 256),            1024      
 MultivariateNormalTriL)      (None, 256))                       
                                                                 
 decoder (Functional)        (None, 905)               1657481 

In [11]:
vae.compile(
    optimizer='adam', 
    loss=lambda x_true, x_pred : -tf.reduce_mean(x_pred.log_prob(x_true))
    )

#Training

In [12]:
vae.fit(
    dataset,
    dataset, 
    validation_split=0.2, 
    epochs=100,
    callbacks=[
               tf.keras.callbacks.EarlyStopping(
                  monitor='val_loss',
                  patience=5,
                  restore_best_weights=True,
              )
         ]
    )

2022-04-21 22:01:33.632197: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 1/100


2022-04-21 22:01:34.752526: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.




2022-04-21 22:02:14.200902: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100


<keras.callbacks.History at 0x17ffb9b20>

In [13]:
np.save(os.path.join(processed_data, "compressed_features"), encoder(dataset).mean())