In [1]:
# Optional setup: uncomment the next line to install NextRec into the active kernel's environment.
# %pip install nextrec

In [3]:
import os
import sys
import logging
# Presumably read by the Hugging Face `tokenizers` library used during text
# encoding later on; set before any tokenizer is created.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Build a single stdout handler with a timestamped format first ...
handler = logging.StreamHandler(stream=sys.stdout)
handler.setFormatter(
    logging.Formatter(fmt='%(asctime)s %(levelname)s %(message)s')
)

# ... then make it the *only* handler on the root logger (any handlers that
# were already attached are dropped) and let INFO-level records through.
logger = logging.getLogger()
logger.handlers = [handler]
logger.setLevel(logging.INFO)

In this example, we'll walk through using the NextRec framework to train an RQ-VAE model and generate semantic IDs. Under `dataset/` we provide a sample e-commerce dataset that contains user id, item id, product text description, event time, and a few common features. We'll use the text feature for training.


In [None]:
import pandas as pd

# Path of the sample e-commerce CSV, relative to the notebook's working directory.
csv_path = 'dataset/ecommerce_task.csv'
df = pd.read_csv(csv_path)

# Keep the preview as the cell's last expression so it renders as a rich table.
df.head(5)

Unnamed: 0,log_time,user_id,item_id,gender,age_bucket,city,device,channel,category,brand,price,impression_position,user_active_days_7,user_ctr,text_desc,click,conversion
0,2025-11-14 09:16:00,10001,item_BEAU_01399,F,35-44,Beijing,Web,search,Beauty,PureSkin,277.62,10,7,0.29,Sunscreen cruelty-free brightening,0,0
1,2025-11-15 17:18:00,10001,item_BEAU_01530,F,35-44,Beijing,Web,push,Beauty,BrandN,814.57,9,7,0.29,Essence for sensitive skin with niacinamide hy...,0,0
2,2025-11-18 09:51:00,10001,item_HOME_01211,F,35-44,Beijing,Web,organic,Home,BrandB,904.47,5,7,0.29,Office chair with lumbar support handcrafted,0,0
3,2025-11-23 09:59:00,10001,item_CLOT_00426,F,35-44,Beijing,Web,search,Clothing,BrandD,358.71,10,7,0.29,Cotton t-shirt short sleeve athletic fit with ...,1,1
4,2025-12-08 19:39:00,10001,item_HOME_01178,F,35-44,Beijing,Web,search,Home,ModernLiving,315.23,10,7,0.29,Plant pot space-saving durable material easy a...,0,0


# RQ-VAE

Before getting started, here is a quick introduction to RQ-VAE. RQ-VAE is an improvement over the VAE (Variational Autoencoder), a generative model that learns the distribution of the input data and outputs a similar distribution, using KL divergence as the measure of how close the two distributions are. Simply put, a VAE works like an embedding layer: for text data, it maps text into a low-dimensional space. Because a VAE expects a vector input, we first tokenize the text and embed (or one-hot encode) it so that each token becomes a high-dimensional vector, then feed the resulting high-dimensional sequence into an encoder (RNN/GRU/CNN/Transformer) that compresses it into a single low-dimensional vector—that is the final representation we want.

A VAE already reduces the high-dimensional embedding to something lower, but storing that vector for every item can still be heavy in production. We would like to compress the output once more and turn the continuous vector into a short sequence of discrete codes—that is the problem RQ-VAE tries to solve. RQ stands for Residual Quantization, which combines Vector Quantization (VQ) with residual encoding: each quantization layer encodes the residual left over by the previous layer. RQ-VAE aims to store each item as index IDs and perform retrieval with ID → lookup → dot-product.

RQ-VAE represents the original VAE embedding using a combination of discrete codebook indices.


In [1]:
from nextrec.basic.features import DenseFeature
from nextrec.data.dataloader import RecDataLoader
from nextrec.models.representation import RQVAE

from nextrec.data.data_processing import split_dict_random
from nextrec.utils.embedding import encode_multimodel_content

To generate semantic IDs with RQ-VAE, we first need multimodal inputs. In this example we use text as the raw signal and embed it with a BERT model into high-dimensional dense vectors. The NextRec utility `encode_multimodel_content` handles this step for you, invoking the transformers library under the hood.


In [6]:
# Missing descriptions become empty strings so every entry handed to the
# encoder is a plain str.
text_column = df["text_desc"].fillna("")
texts = text_column.tolist()
print(f"Dataset loaded: {len(df)} samples")

# Encode every description with BERT on CPU, 32 texts per batch.
embeddings = encode_multimodel_content(
    texts,
    model_name="bert-base-uncased",
    device="cpu",
    batch_size=32,
)
print(f"Encoded text_desc into embeddings with shape {embeddings.shape}")
print(embeddings[0])

Dataset loaded: 10000 samples
Encoded text_desc into embeddings with shape torch.Size([10000, 768])
tensor([-6.9366e-02, -2.5368e-01, -4.2544e-01,  2.4386e-01, -2.3643e-01,
        -2.3092e-02, -7.0698e-02,  2.4566e-01, -4.0289e-02, -1.3284e-01,
         9.2764e-02,  4.7631e-01, -2.7675e-02,  1.6549e-01, -2.5294e-01,
         1.0878e-01, -2.4900e-01,  4.8581e-01,  3.5214e-02,  1.4288e-01,
        -8.4659e-02, -8.7917e-02, -6.5080e-01,  1.9214e-01,  9.4129e-02,
        -1.0084e-01,  3.9686e-01,  2.1918e-01,  1.7440e-01,  1.8361e-01,
         2.5563e-01,  1.7434e-01, -5.5120e-02, -5.0393e-02,  5.2894e-01,
        -5.3674e-01,  2.0702e-02, -8.3346e-01, -2.1497e-01, -9.4487e-02,
        -1.1800e-02,  1.2713e-01,  6.3430e-01, -1.8029e-01,  5.2011e-02,
        -3.1240e-01, -2.5302e+00,  2.3571e-01, -1.6146e-01, -1.8182e-01,
         6.8951e-01, -4.4509e-01,  6.4377e-02,  1.4384e-01,  6.2194e-01,
         8.5790e-01, -2.3083e-02,  5.3937e-01,  3.3104e-01,  1.1583e-01,
         8.1912e-02, -3.

Now that we have the raw inputs, we want RQ-VAE to learn and reconstruct their distribution. We therefore split the data into training and validation sets and build dataloaders. NextRec provides `RecDataLoader` to help with this.

Because we have converted the text feature into a dense feature (`DenseFeature`), we only need to define the feature and pass it to `RecDataLoader`. In the future, NextRec will support multimodal feature definitions and automatic transforms; before that, please use the utility functions to perform the transforms manually.


In [7]:
# Build loaders for RQ-VAE
# The model consumes plain arrays, so move the embeddings to CPU / NumPy first.
emb_np = embeddings.cpu().numpy()

# One dense feature whose width matches the embedding dimension.
text_feature = DenseFeature(name="text_embedding", input_dim=embeddings.shape[1])
loader_builder = RecDataLoader(dense_features=[text_feature])


def _build_loader(data, shuffle):
    """Wrap `data` in a dataloader (batch size 256) created by `loader_builder`."""
    return loader_builder.create_dataloader(data, batch_size=256, shuffle=shuffle)


# 90 % / 10 % random split with a fixed seed.
rqvae_train_dict, rqvae_valid_dict = split_dict_random(
    {"text_embedding": emb_np},
    test_size=0.1,
    random_state=2025,
)

# Only the training loader shuffles; validation and the full pass keep row order.
rqvae_train_loader = _build_loader(rqvae_train_dict, True)
rqvae_valid_loader = _build_loader(rqvae_valid_dict, False)
rqvae_full_loader = _build_loader({"text_embedding": emb_np}, False)

Now we can instantiate RQ-VAE. It exposes several parameters:

- input_dim: dimension of the input embedding (e.g., 768 for BERT); this is what the encoder receives and what the decoder outputs back to.
- hidden_dims: list of hidden layer sizes for the encoder/decoder.
- latent_dim: dimension of the latent space and codebook vectors after encoding; smaller values compress more, larger values give more capacity.
- num_codebooks: number of residual quantization layers (stack depth); more layers yield finer-grained semantic IDs but make training/inference slightly slower.
- codebook_size: list of vocabulary sizes for each codebook layer; its length should equal `num_codebooks`. For example, `[256, 256, 256]` means 3 layers with 256 codewords each, for 256^3 combinations.
- shared_codebook: whether all layers share a single codebook; sharing reduces parameters while not sharing gives each layer more expressiveness. Default is not sharing.
- kmeans_method: codebook initialization method; `"kmeans"` is standard KMeans, `"bkmeans"` is balanced KMeans (recommended), and any other value uses random initialization.
- kmeans_iters: maximum iterations for KMeans initialization.
- distances_method: distance metric for quantization; `"l2"` is Euclidean (default for VAE usage) and `"cosine"` is also supported.
- loss_beta: commitment loss weight beta (the second term in the quantization loss); larger values force the encoder closer to the codebook, typically around 0.25.
- dense_features: RQ-VAE expects the raw embedding distribution as dense vectors; we need to tell the model which vectors should be learned and compressed.

After configuration, call `fit` to start training. Unlike the `fit` methods for ranking models, RQ-VAE requires an `init_batches` parameter to specify how many batches to use for codebook initialization.


In [8]:
# Width of the input embedding; used for both the model and its feature spec.
embedding_dim = embeddings.shape[1]

# Model hyper-parameters, collected in one place so they are easy to tweak.
rqvae_params = dict(
    input_dim=embedding_dim,
    hidden_dims=[128, 256],
    latent_dim=128,
    num_codebooks=2,
    codebook_size=[128, 128],  # one entry per codebook
    shared_codebook=False,
    kmeans_method="bkmeans",
    kmeans_iters=50,
    distances_method="cosine",
    loss_beta=0.25,
    device="cpu",
    dense_features=[DenseFeature(name="text_embedding", input_dim=embedding_dim)],
    session_id="rqvae_tutorial",
)
rqvae = RQVAE(**rqvae_params)

# Training settings; `init_batches` is the number of batches used to
# initialise the codebooks.
fit_params = dict(
    train_data=rqvae_train_loader,
    valid_data=rqvae_valid_loader,
    epochs=5,
    batch_size=256,
    lr=1e-3,
    init_batches=3,
)
rqvae.fit(**fit_params)

[1m[94mModel Summary: RQVAE[0m

[1m[36m[1] Feature Configuration[0m
[36m--------------------------------------------------------------------------------[0m
Dense Features (1):
  1. text_embedding      

[1m[36m[2] Model Parameters[0m
[36m--------------------------------------------------------------------------------[0m
Model Architecture:
RQVAE(
  (encoder): RQEncoder(
    (stages): ModuleList(
      (0): Sequential(
        (0): Linear(in_features=768, out_features=128, bias=True)
        (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU()
      )
      (1): Sequential(
        (0): Linear(in_features=128, out_features=256, bias=True)
        (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU()
      )
      (2): Linear(in_features=256, out_features=128, bias=True)
    )
  )
  (decoder): RQDecoder(
    (stages): ModuleList(
      (0): Sequential(
        (0): Linear

Epoch 1/5: 100%|██████████| 36/36 [00:00<00:00, 106.90it/s]

Epoch 1/5 - Train Loss: 0.3862
[36m  Epoch 1/5 - Valid Loss: 0.1776[0m



Epoch 2/5: 100%|██████████| 36/36 [00:00<00:00, 131.94it/s]

Epoch 2/5 - Train Loss: 0.1252
[36m  Epoch 2/5 - Valid Loss: 0.0908[0m



Epoch 3/5: 100%|██████████| 36/36 [00:00<00:00, 142.71it/s]

Epoch 3/5 - Train Loss: 0.0788
[36m  Epoch 3/5 - Valid Loss: 0.0696[0m



Epoch 4/5: 100%|██████████| 36/36 [00:00<00:00, 136.32it/s]

Epoch 4/5 - Train Loss: 0.0654
[36m  Epoch 4/5 - Valid Loss: 0.0622[0m



Epoch 5/5: 100%|██████████| 36/36 [00:00<00:00, 131.95it/s]

Epoch 5/5 - Train Loss: 0.0623
[36m  Epoch 5/5 - Valid Loss: 0.0614[0m
 
[1mTraining finished.[0m
 





RQVAE(
  (encoder): RQEncoder(
    (stages): ModuleList(
      (0): Sequential(
        (0): Linear(in_features=768, out_features=128, bias=True)
        (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU()
      )
      (1): Sequential(
        (0): Linear(in_features=128, out_features=256, bias=True)
        (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU()
      )
      (2): Linear(in_features=256, out_features=128, bias=True)
    )
  )
  (decoder): RQDecoder(
    (stages): ModuleList(
      (0): Sequential(
        (0): Linear(in_features=128, out_features=256, bias=True)
        (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU()
      )
      (1): Sequential(
        (0): Linear(in_features=256, out_features=128, bias=True)
        (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
   

Training is finished; let's inspect the output. Each sample's original 768-dimensional continuous embedding is now represented by just two discrete codebook indices (one per codebook, each in the range 0–127), greatly reducing computation and storage.


In [9]:
# Run the trained model over the full (unshuffled) loader, asking only for
# the discrete codes as a tensor, and move the result to CPU for inspection.
semantic_ids = rqvae.predict(
    rqvae_full_loader,
    batch_size=256,
    return_reconstruction=False,
    as_numpy=False,
).to("cpu")

print(f"Semantic IDs shape: {semantic_ids.shape}")
print(f"Semantic IDs sample (first 5 items):\n{semantic_ids[:5]}")

Semantic IDs shape: torch.Size([10000, 2])
Semantic IDs sample (first 5 items):
tensor([[119,  27],
        [119,  69],
        [ 52,  33],
        [ 98,  80],
        [ 14,   3]])
