In [3]:
from collections import OrderedDict
import datetime
from torch import nn
import torch
import multiprocessing as mp
from torch.nn import functional as F
import os
import warnings
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np
from scipy.special import softmax
from torch import optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader
from tqdm import tqdm
import pandas as pd
import pickle
from scipy.sparse import csr_matrix

In [4]:
class InferenceNetwork(nn.Module):
    """Encoder MLP producing the two posterior parameter vectors of the VAE."""

    def __init__(self, input_size, output_size, hidden_sizes,
                 activation='softplus', dropout=0.2, label_size=0):
        """
        Build the encoder layers.

        Args
            input_size : int, dimension of input
            output_size : int, dimension of each of the two outputs
            hidden_sizes : tuple of int, one entry per hidden layer
            activation : string, 'softplus' or 'relu', default 'softplus'
            dropout : float, dropout probability applied after the hidden
                stack, default 0.2
            label_size : accepted but not used by this class
        """
        super().__init__()
        assert isinstance(input_size, int), "input_size must by type int."
        assert isinstance(output_size, int), "output_size must be type int."
        assert isinstance(hidden_sizes, tuple), \
            "hidden_sizes must be type tuple."
        assert activation in ['softplus', 'relu'], \
            "activation must be 'softplus' or 'relu'."
        assert dropout >= 0, "dropout must be >= 0."

        self.input_size, self.output_size = input_size, output_size
        self.hidden_sizes = hidden_sizes
        self.dropout = dropout

        # Only two activations pass the assertion above.
        self.activation = nn.Softplus() if activation == 'softplus' else nn.ReLU()

        self.input_layer = nn.Linear(input_size, hidden_sizes[0])

        # One (Linear -> activation) pair per consecutive pair of hidden sizes;
        # the same activation module instance is shared by every pair.
        stacked = OrderedDict()
        for idx in range(len(hidden_sizes) - 1):
            stacked['l_{}'.format(idx)] = nn.Sequential(
                nn.Linear(hidden_sizes[idx], hidden_sizes[idx + 1]),
                self.activation)
        self.hiddens = nn.Sequential(stacked)

        # Two parallel heads, each followed by a non-affine batch norm.
        self.f_mu = nn.Linear(hidden_sizes[-1], output_size)
        self.f_mu_batchnorm = nn.BatchNorm1d(output_size, affine=False)
        self.f_sigma = nn.Linear(hidden_sizes[-1], output_size)
        self.f_sigma_batchnorm = nn.BatchNorm1d(output_size, affine=False)
        self.dropout_enc = nn.Dropout(p=self.dropout)

    def forward(self, x):
        """Return ``(mu, log_sigma)`` for the batch ``x``."""
        hidden = self.hiddens(self.activation(self.input_layer(x)))
        hidden = self.dropout_enc(hidden)
        mu = self.f_mu_batchnorm(self.f_mu(hidden))
        log_sigma = self.f_sigma_batchnorm(self.f_sigma(hidden))
        return mu, log_sigma

In [5]:
class DecoderNetwork(nn.Module):
    """Variational autoencoder: an InferenceNetwork encoder plus a linear decoder.

    The encoder yields the posterior parameters, a sample is drawn with the
    reparameterization trick and soft-maxed into ``theta``, and ``theta`` is
    projected through ``beta`` to reconstruct a distribution over the inputs.
    """

    def __init__(self, device, input_size, emb_size=100, hidden_sizes=(200,200), activation='softplus', dropout=0.2, learn_priors=True, label_size=0):
        """
        Initialize DecoderNetwork.
        Args
            device : torch device the prior tensors and beta are moved to
                (only when CUDA is available)
            input_size : int, dimension of input
            emb_size : int, dimension of embedding
            hidden_sizes : tuple, length = n_layers, (default (200, 200))
            activation : string, 'softplus', 'relu', (default 'softplus')
            dropout : float, dropout probability used for theta and for the
                encoder (default 0.2)
            learn_priors : bool, make the prior mean/variance trainable
            label_size : int, if > 0 a linear label classifier on theta is added
        """
        super(DecoderNetwork, self).__init__()
        assert isinstance(input_size, int), "input_size must by type int."
        assert isinstance(hidden_sizes, tuple), \
            "hidden_sizes must be type tuple."
        assert activation in ['softplus', 'relu'], \
            "activation must be 'softplus' or 'relu'."
        assert dropout >= 0, "dropout must be >= 0."
        self.input_size = input_size
        self.emb_size = emb_size
        self.hidden_sizes = hidden_sizes
        self.activation = activation
        self.dropout = dropout
        self.learn_priors = learn_priors
        # Forward the configured dropout; previously the encoder silently fell
        # back to its own default of 0.2 regardless of this argument.
        self.inf_net = InferenceNetwork(
            input_size, emb_size, hidden_sizes, activation, dropout=dropout)
        if label_size > 0:
            self.label_classification = nn.Linear(emb_size, label_size)

        # init prior parameters
        self.prior_mean = self._make_prior(0.0, device)
        self.prior_variance = self._make_prior(1. - (1. / emb_size), device)

        # Decoder weights: emb_size x input_size, Xavier-initialised.
        beta = torch.empty(emb_size, input_size)
        if torch.cuda.is_available():
            beta = beta.to(device)
        self.beta = nn.Parameter(beta)
        nn.init.xavier_uniform_(self.beta)
        self.beta_batchnorm = nn.BatchNorm1d(input_size, affine=False)

        self.drop_theta = nn.Dropout(p=self.dropout)

    def _make_prior(self, value, device):
        """Return a length-``emb_size`` prior vector filled with ``value``.

        The vector is moved to ``device`` when CUDA is available and wrapped in
        ``nn.Parameter`` when ``learn_priors`` is set; otherwise it stays a
        plain tensor attribute.
        """
        prior = torch.tensor([value] * self.emb_size)
        if torch.cuda.is_available():
            prior = prior.to(device)
        return nn.Parameter(prior) if self.learn_priors else prior

    @staticmethod
    def reparameterize(mu, logvar):
        """Sample ``mu + eps * exp(0.5 * logvar)`` with ``eps ~ N(0, I)``."""
        std = torch.exp(0.5*logvar)
        eps = torch.randn_like(std)
        return eps.mul(std).add_(mu)

    def forward(self, x, labels=None):
        """Forward pass.

        Returns the 7-tuple ``(prior_mean, prior_variance, posterior_mu,
        posterior_sigma, posterior_log_sigma, recon_x, estimated_labels)``.
        ``posterior_sigma`` is ``exp(posterior_log_sigma)``;
        ``estimated_labels`` is ``None`` unless ``labels`` is given, in which
        case the network must have been built with ``label_size > 0``.
        """
        # batch_size x emb_size
        posterior_mu, posterior_log_sigma = self.inf_net(x)
        posterior_sigma = torch.exp(posterior_log_sigma)
        # generate samples from theta
        theta = F.softmax(
            self.reparameterize(posterior_mu, posterior_log_sigma), dim=1)
        theta = self.drop_theta(theta)

        # (batch_size x emb_size) @ (emb_size x input_size)
        recon_x = F.softmax(self.beta_batchnorm(torch.matmul(theta, self.beta)), dim=1)
        # recon_x: batch_size x input_size

        # classify labels
        estimated_labels = None
        if labels is not None:
            estimated_labels = self.label_classification(theta)

        return self.prior_mean, self.prior_variance, posterior_mu, posterior_sigma, posterior_log_sigma, recon_x, estimated_labels

    def get_theta(self, x, labels=None):
        """Return one sampled ``theta`` (batch_size x emb_size) without gradients.

        ``labels`` is accepted for signature symmetry with ``forward`` and is
        not used.
        """
        with torch.no_grad():
            # batch_size x emb_size
            posterior_mu, posterior_log_sigma = self.inf_net(x)
            theta = F.softmax(self.reparameterize(posterior_mu, posterior_log_sigma), dim=1)
            return theta

In [6]:
class Model:
    """Training / inference wrapper around :class:`DecoderNetwork`.

    Holds the network, its optimizer and (optionally) a ReduceLROnPlateau
    scheduler, and exposes ``fit`` for training and ``get_embeddings`` for
    extracting averaged ``theta`` vectors.
    """

    def __init__(self, input_size, emb_size=100, hidden_sizes=(200, 200), activation='softplus', dropout=0.2, learn_priors=True, batch_size=64,
                 lr=2e-3, momentum=0.99, solver='adam', num_epochs=100, reduce_on_plateau=False,
                 num_data_loader_workers=mp.cpu_count(), label_size=0, loss_weights=None, gpu=0):
        """
        :param input_size: int > 0, width of the concatenated topic+item input.
        :param emb_size: int > 0, size of the latent embedding.
        :param hidden_sizes: tuple of hidden layer widths for the encoder.
        :param activation: 'softplus' or 'relu'.
        :param dropout: float >= 0.
        :param learn_priors: bool, make the prior mean/variance trainable.
        :param batch_size: int > 0, training batch size.
        :param lr: learning rate, > 0.
        :param momentum: float in (0, 1]; first Adam beta or SGD momentum.
        :param solver: 'adam' or 'sgd'.
        :param num_epochs: number of training epochs run by ``fit``.
        :param reduce_on_plateau: bool, step a ReduceLROnPlateau scheduler on
            the epoch training loss.
        :param num_data_loader_workers: int >= 0 (default: CPU count, evaluated
            once when the class is defined).
        :param label_size: int, size of the optional label classifier output.
        :param loss_weights: dict with key "beta" weighting the KL term;
            defaults to ``{"beta": 1}``.
        :param gpu: CUDA device index used when CUDA is available.
        """
        self.device = (
                torch.device("cuda:"+str(gpu))
                if torch.cuda.is_available()
                else torch.device("cpu")
            )
        print("Device:", self.device)

        assert isinstance(input_size, int) and input_size > 0, \
            "input_size must by type int > 0."
        assert isinstance(emb_size, int) and emb_size > 0, \
            "emb_size must by type int > 0."
        assert isinstance(hidden_sizes, tuple), \
            "hidden_sizes must be type tuple."
        assert activation in ['softplus', 'relu'], \
            "activation must be 'softplus' or 'relu'."
        assert dropout >= 0, "dropout must be >= 0."
        assert isinstance(learn_priors, bool), "learn_priors must be boolean."
        assert isinstance(batch_size, int) and batch_size > 0, \
            "batch_size must be int > 0."
        assert lr > 0, "lr must be > 0."
        assert isinstance(momentum, float) and 0 < momentum <= 1, \
            "momentum must be 0 < float <= 1."
        assert solver in ['adam', 'sgd'], "solver must be 'adam' or 'sgd'."
        assert isinstance(reduce_on_plateau, bool), \
            "reduce_on_plateau must be type bool."
        assert isinstance(num_data_loader_workers, int) and num_data_loader_workers >= 0, \
            "num_data_loader_workers must by type int >= 0. set 0 if you are using windows"

        self.input_size = input_size
        self.emb_size = emb_size
        self.hidden_sizes = hidden_sizes
        self.activation = activation
        self.dropout = dropout
        self.learn_priors = learn_priors
        self.batch_size = batch_size
        self.lr = lr
        self.momentum = momentum
        self.solver = solver
        self.num_epochs = num_epochs
        self.reduce_on_plateau = reduce_on_plateau
        self.num_data_loader_workers = num_data_loader_workers

        if loss_weights:
            self.weights = loss_weights
        else:
            self.weights = {"beta": 1}

        self.model = DecoderNetwork(self.device, input_size, emb_size, hidden_sizes, activation, dropout, learn_priors=learn_priors, label_size=label_size)

        self.early_stopping = None

        # init optimizer
        if self.solver == 'adam':
            self.optimizer = optim.Adam(
                self.model.parameters(), lr=lr, betas=(self.momentum, 0.99))
        elif self.solver == 'sgd':
            self.optimizer = optim.SGD(
                self.model.parameters(), lr=lr, momentum=self.momentum)

        # init lr scheduler
        if self.reduce_on_plateau:
            self.scheduler = ReduceLROnPlateau(self.optimizer, patience=10)

        # performance attributes
        self.best_loss_train = float('inf')

        # training attributes
        self.model_dir = None
        self.nn_epoch = None

        # validation attributes
        self.validation_data = None

        # learned topics
        self.best_components = None

        # Use cuda if available
        self.USE_CUDA = torch.cuda.is_available()

        self.model = self.model.to(self.device)

    def _loss(self, inputs, recon_x, prior_mean, prior_variance, posterior_mean, posterior_variance, posterior_log_variance):
        """Return the per-sample ``(KL, RL)`` loss terms.

        KL is the closed-form divergence between the diagonal-Gaussian
        posterior and prior; RL is the negative log-likelihood of ``inputs``
        under ``recon_x`` (1e-10 is added inside the log for stability).
        """
        # KL term
        # var division term
        var_division = torch.sum(posterior_variance / prior_variance, dim=1)
        # diff means term
        diff_means = prior_mean - posterior_mean
        diff_term = torch.sum(
            (diff_means * diff_means) / prior_variance, dim=1)
        # logvar det division term
        logvar_det_division = \
            prior_variance.log().sum() - posterior_log_variance.sum(dim=1)
        # combine terms
        KL = 0.5 * (
            var_division + diff_term - self.emb_size + logvar_det_division)

        # Reconstruction term
        RL = -torch.sum(inputs * torch.log(recon_x + 1e-10), dim=1)

        return KL, RL

    def _batch_to_input(self, batch_samples):
        """Flatten and concatenate a batch's topic and item features on the model device."""
        topic_fea = batch_samples['topic_fea']
        topic_fea = topic_fea.reshape(topic_fea.shape[0], -1)

        item_fea = batch_samples['item_fea']
        item_fea = item_fea.reshape(item_fea.shape[0], -1)

        return torch.cat((topic_fea, item_fea), 1).to(self.device)

    def _train_epoch(self, loader, epoch):
        """Run one optimisation pass over ``loader``.

        Returns ``(samples_processed, mean_loss_per_sample)``.
        """
        self.model.train()
        train_loss = 0
        samples_processed = 0

        for batch_samples in tqdm(loader, desc=f"Epoch {epoch+1}"):
            X = self._batch_to_input(batch_samples)

            if "labels" in batch_samples.keys():
                labels = batch_samples["labels"]
                labels = labels.reshape(labels.shape[0], -1)
                labels = labels.to(self.device)
            else:
                labels = None

            # forward pass
            self.model.zero_grad()

            prior_mean, prior_variance, posterior_mean, posterior_variance,\
            posterior_log_variance, recon_x, estimated_labels = self.model(X, labels)

            # backward pass
            kl_loss, rl_loss = self._loss(X, recon_x, prior_mean, prior_variance, posterior_mean, posterior_variance, posterior_log_variance)

            loss = self.weights["beta"]*kl_loss + rl_loss
            loss = loss.sum()

            if labels is not None:
                labels = labels.type(torch.long)
                label_loss = torch.nn.MultiLabelSoftMarginLoss()(estimated_labels, labels)
                loss += label_loss

            loss.backward()
            self.optimizer.step()

            # compute train loss
            samples_processed += X.size()[0]
            train_loss += loss.item()

        train_loss /= samples_processed

        return samples_processed, train_loss

    def save(self, models_dir=None):
        """Write the network's ``state_dict`` to ``models_dir/epoch_<n>.pth``.

        Does nothing when ``models_dir`` is None. The directory is created if
        it does not exist. ``<n>`` is the current value of ``self.nn_epoch``.
        """
        if self.model is None or models_dir is None:
            return
        os.makedirs(models_dir, exist_ok=True)
        checkpoint_path = os.path.join(
            models_dir, "epoch_{}.pth".format(self.nn_epoch))
        torch.save({'state_dict': self.model.state_dict()}, checkpoint_path)

    def fit(self, train_dataset, save_dir=None, verbose=False, patience=5, delta=0,
            n_samples=20, model_name="vae_model_tmall.pt"):
        """
        Train the model.

        :param train_dataset: PyTorch Dataset class for training data.
        :param save_dir: directory to save per-epoch checkpoints to (skipped when None).
        :param verbose: print the settings and a per-epoch summary.
        :param patience: accepted for API compatibility; currently unused.
        :param delta: accepted for API compatibility; currently unused.
        :param n_samples: accepted for API compatibility; currently unused.
        :param model_name: path the whole wrapper object is pickled to after every epoch.

        """
        # Print settings to output file
        if verbose:
            print("Settings: \n\
                   N Components: {}\n\
                   Topic Prior Mean: {}\n\
                   Topic Prior Variance: {}\n\
                   Hidden Sizes: {}\n\
                   Activation: {}\n\
                   Dropout: {}\n\
                   Learn Priors: {}\n\
                   Learning Rate: {}\n\
                   Momentum: {}\n\
                   Reduce On Plateau: {}\n\
                   Save Dir: {}".format(
                self.emb_size, 0.0,
                1. - (1. / self.emb_size), self.hidden_sizes, self.activation, self.dropout, self.learn_priors,
                self.lr, self.momentum, self.reduce_on_plateau, save_dir))

        self.model_dir = save_dir
        train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True, drop_last=True, num_workers=self.num_data_loader_workers)

        samples_processed = 0

        # train loop (per-batch progress is shown by tqdm inside _train_epoch)
        for epoch in range(self.num_epochs):
            self.nn_epoch = epoch
            started = datetime.datetime.now()
            sp, train_loss = self._train_epoch(train_loader, epoch)
            samples_processed += sp
            elapsed = datetime.datetime.now() - started

            if train_loss < self.best_loss_train:
                self.best_loss_train = train_loss

            # The scheduler was created in __init__ but never advanced before.
            if self.reduce_on_plateau:
                self.scheduler.step(train_loss)

            if verbose:
                print("Epoch: [{}/{}]\t Seen Samples: [{}/{}]\tTrain Loss: {}\tTime: {}".format(
                    epoch + 1, self.num_epochs, samples_processed,
                    len(train_dataset) * self.num_epochs, train_loss, elapsed))

            if save_dir is not None:
                self.save(save_dir)

            # Pickle this wrapper itself rather than relying on a notebook
            # global that happens to be called `model`.
            torch.save(self, model_name)

    def get_embeddings(self, dataset, batch_size=32, n_samples=20):
        """Return ``theta`` for every row of ``dataset``, averaged over ``n_samples`` draws.

        :param dataset: dataset yielding dicts with 'topic_fea' and 'item_fea'.
        :param batch_size: inference batch size.
        :param n_samples: number of stochastic samples to average (must be >= 1).
        :return: float64 numpy array, one row per dataset element in dataset order.
        """
        if n_samples < 1:
            raise ValueError("n_samples must be >= 1.")

        self.model.eval()
        loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=self.num_data_loader_workers)

        # Keep a running sum instead of storing every sample's full matrix.
        theta_sum = None
        # `total=` is required: a bare int as first argument is taken as the iterable.
        with tqdm(total=n_samples, position=0, leave=True) as pbar:
            for sample_index in range(n_samples):
                batches = []
                with torch.no_grad():
                    for batch_samples in loader:
                        X = self._batch_to_input(batch_samples)
                        batches.append(self.model.get_theta(X, None).cpu().numpy())
                if batches:
                    sample_theta = np.concatenate(batches, axis=0).astype(np.float64)
                else:
                    sample_theta = np.zeros(0, dtype=np.float64)
                theta_sum = sample_theta if theta_sum is None else theta_sum + sample_theta
                pbar.update(1)
                pbar.set_description("Sampling: [{}/{}]".format(sample_index + 1, n_samples))
        return theta_sum / n_samples
    

In [7]:
import torch
from torch.utils.data import Dataset
import scipy.sparse
class ModelDataset(Dataset):
    """Dataset yielding multi-hot topic and item vectors (plus optional labels)."""

    def __init__(self, n_topics, n_items, topic, items, labels=None):
        """Store the vector widths and the per-sample index data.

        ``topic[i]`` and every entry of ``items[i]`` are used directly as
        positions in vectors of length ``n_topics`` / ``n_items``.
        """
        self.n_topics, self.n_items = n_topics, n_items
        self.topic, self.items = topic, items
        self.labels = labels

    def __len__(self):
        """Number of samples, taken from the length of ``topic``."""
        return len(self.topic)

    def __getitem__(self, i):
        """Build the feature dict for sample ``i``."""
        topic_hot = torch.zeros(self.n_topics)
        topic_hot[self.topic[i]] = 1

        # Set every listed item position in one indexed assignment.
        item_hot = torch.zeros(self.n_items)
        item_positions = torch.as_tensor(list(self.items[i]), dtype=torch.long)
        item_hot[item_positions] = 1

        sample = {'topic_fea': topic_hot, 'item_fea': item_hot}
        if self.labels is None:
            return sample

        sample["labels"] = torch.FloatTensor(self.labels[i])
        return sample

# New rocket retail data 20/09/2023

In [10]:
# Load the Rocket Retail parquet file into a DataFrame; the path is relative
# to the notebook's working directory.
df = pd.read_parquet("../datasets/RocketRetail/rocket.parquet")

In [11]:
# Display the frame; the columns are past_topic, future_topic and past_leaf.
df

Unnamed: 0,past_topic,future_topic,past_leaf
0,[760],[760],[[69481]]
1,"[268, 23]","[310, 800, 1086, 760, 819, 312, 813, 543, 790,...","[[537], [66499]]"
2,[713],[713],[[73498]]
3,[790],[1009],"[[84857, 84857, 84857]]"
4,[918],[918],[[16129]]
...,...,...,...
234557,[405],[473],[[28074]]
234558,[281],[426],[[32825]]
234559,[332],[332],[[73003]]
234560,[325],[1056],[[86658]]


In [12]:
# Flatten the frame into parallel `topics` / `items` lists (one entry per past
# topic and one per past-leaf item list) while collecting the id vocabularies.
unique_topics, unique_items = set(), set()
topics, items = [], []

for past_topic, future_topic, past_leaf in df.values:
    # Topic vocabulary covers both past and future topics.
    unique_topics |= set(past_topic) | set(future_topic)
    topics += list(past_topic)

    for leaf_items in past_leaf:
        unique_items |= set(leaf_items)
        items.append(leaf_items)

In [13]:
# Widths of the topic and item one-hot vectors.
# NOTE(review): ModelDataset indexes those vectors with the raw topic/item ids,
# so this is only safe if every id is < the number of distinct ids — confirm
# (the Tmall section remaps item ids to 0..n-1 before building its dataset).
n_topics = len(unique_topics)
n_items = len(unique_items)

In [14]:
# Wrap the flattened lists so each sample is a (topic one-hot, item multi-hot) pair.
dataset = ModelDataset(n_topics, n_items, topics, items)

In [15]:
# The network input is the topic vector concatenated with the item vector,
# hence input_size = n_topics + n_items.
model = Model(input_size=n_topics+n_items, emb_size=256, batch_size=256, num_epochs=20)

Device: cuda


In [16]:
# Train with fit()'s default arguments (no save_dir is passed).
model.fit(dataset)

Epoch 1: 100%|██████████| 1270/1270 [06:09<00:00,  3.44it/s]
Epoch 2: 100%|██████████| 1270/1270 [06:07<00:00,  3.45it/s]
Epoch 3: 100%|██████████| 1270/1270 [06:12<00:00,  3.41it/s]
Epoch 4: 100%|██████████| 1270/1270 [06:08<00:00,  3.44it/s]
Epoch 5: 100%|██████████| 1270/1270 [06:14<00:00,  3.40it/s]
Epoch 6: 100%|██████████| 1270/1270 [06:14<00:00,  3.39it/s]
Epoch 7: 100%|██████████| 1270/1270 [06:12<00:00,  3.41it/s]
Epoch 8: 100%|██████████| 1270/1270 [06:13<00:00,  3.40it/s]
Epoch 9: 100%|██████████| 1270/1270 [06:43<00:00,  3.14it/s]
Epoch 10: 100%|██████████| 1270/1270 [06:24<00:00,  3.31it/s]
Epoch 11: 100%|██████████| 1270/1270 [06:38<00:00,  3.19it/s]
Epoch 12: 100%|██████████| 1270/1270 [06:28<00:00,  3.27it/s]
Epoch 13: 100%|██████████| 1270/1270 [06:59<00:00,  3.03it/s]
Epoch 14: 100%|██████████| 1270/1270 [07:00<00:00,  3.02it/s]
Epoch 15: 100%|██████████| 1270/1270 [06:59<00:00,  3.03it/s]
Epoch 16: 100%|██████████| 1270/1270 [06:55<00:00,  3.05it/s]
Epoch 17: 100%|██

In [17]:
# Pickle the whole Model wrapper object (not just a state_dict) to disk.
torch.save(model, '../datasets/RocketRetail/vae_model.pt')

## Inference 

In [13]:
# Reload the pickled Model wrapper; unpickling needs the class definitions
# from the cells above to have been executed in this kernel.
# NOTE(review): newer PyTorch releases reportedly default torch.load to
# weights_only=True, which would reject a pickled custom object — confirm
# against the installed version.
model = torch.load('../datasets/RocketRetail/vae_model.pt')

In [None]:
# Average theta over 2 stochastic samples for every dataset row.
emb = model.get_embeddings(dataset, n_samples=2)

0it [00:00, ?it/s]

In [None]:
# Key each embedding row by its (topic, tuple-of-items) pair. Rows of `emb`
# are looked up by position in `topics` / `items`; a repeated key keeps the
# last row seen.
topic_items_to_emb = {}
for i, (top, it) in enumerate(zip(topics, items)):
    topic_items_to_emb[(top, tuple(it))] = emb[i]

In [None]:
# Persist the lookup table next to the notebook (current working directory).
with open('topic_items_to_emb-new.pkl',"wb") as fp:
    pickle.dump(topic_items_to_emb, fp)

# Tmall dataset

In [8]:
# `pickle` is already imported in the first cell; the re-import is harmless.
import pickle
# NOTE: pickle.load can execute arbitrary code — only load files you trust.
with open("../datasets/Tmall/tmall.pickle", 'rb') as fp:
    data = pickle.load(fp)

In [9]:
# Same flattening as for Rocket Retail, applied to the Tmall rows: build the
# parallel `topics` / `items` lists and collect the id vocabularies.
unique_topics, unique_items = set(), set()
topics, items = [], []

for past_topic, future_topic, past_leaf in data.values:
    # Topic vocabulary covers both past and future topics.
    unique_topics |= set(past_topic) | set(future_topic)
    topics += list(past_topic)

    for leaf_items in past_leaf:
        unique_items |= set(leaf_items)
        items.append(leaf_items)

In [10]:
# Build the forward (item id -> dense index) and reverse (index -> item id)
# lookups. The reverse map used to be overwritten by a bare item on every
# iteration (`idx_to_item = item`), so it ended up holding only the last item.
# Indices follow the iteration order of the `unique_items` set.
item_to_idx = {}
idx_to_item = {}
for i, item in enumerate(unique_items):
    item_to_idx[item] = i
    idx_to_item[i] = item

In [11]:
# Persist the item id -> index mapping.
with open('../datasets/Tmall/item_to_idx.pkl', 'wb') as fp:
    pickle.dump(item_to_idx, fp)

In [12]:
# Persist the `idx_to_item` object alongside the forward mapping.
with open('../datasets/Tmall/idx_to_item.pkl', 'wb') as fp:
    pickle.dump(idx_to_item, fp)

In [13]:
# Replace raw item ids with their dense indices.
# NOTE: this rebinds `items`, so the cell is not idempotent — running it a
# second time would look up already-remapped indices in item_to_idx.
items = [[item_to_idx[i] for i in its] for its in items]

In [14]:
# Widths of the topic and item one-hot vectors for Tmall.
n_topics, n_items = len(unique_topics), len(unique_items)

In [15]:
# `[:]` passes shallow copies of the full lists (change the slice to train on a subset).
dataset = ModelDataset(n_topics, n_items, topics[:], items[:])

In [16]:
# Same hyper-parameters as the Rocket Retail run; gpu=2 selects "cuda:2" when
# CUDA is available.
model = Model(input_size=n_topics+n_items, emb_size=256, batch_size=256, num_epochs=20, gpu=2)

Device: cuda:2


In [None]:
# Train on Tmall with fit()'s default arguments (no save_dir is passed).
model.fit(dataset)

Epoch 1: 100%|██████████████████████████████| 7374/7374 [51:50<00:00,  2.37it/s]
Epoch 2: 100%|██████████████████████████████| 7374/7374 [51:36<00:00,  2.38it/s]
Epoch 3: 100%|██████████████████████████████| 7374/7374 [59:21<00:00,  2.07it/s]
Epoch 4: 100%|████████████████████████████| 7374/7374 [1:00:21<00:00,  2.04it/s]
Epoch 5: 100%|██████████████████████████████| 7374/7374 [52:54<00:00,  2.32it/s]
Epoch 6: 100%|██████████████████████████████| 7374/7374 [51:51<00:00,  2.37it/s]
Epoch 7: 100%|██████████████████████████████| 7374/7374 [52:28<00:00,  2.34it/s]
Epoch 8: 100%|██████████████████████████████| 7374/7374 [52:22<00:00,  2.35it/s]
Epoch 9: 100%|██████████████████████████████| 7374/7374 [51:56<00:00,  2.37it/s]
Epoch 10:  81%|███████████████████████▌     | 5995/7374 [43:12<10:07,  2.27it/s]

In [None]:
# Pickle the whole Model wrapper object (not just a state_dict) to disk.
torch.save(model, '../datasets/Tmall/vae_model.pt')