In [2]:
# Uncomment the next line to install NextRec into the active kernel's environment:
# %pip install nextrec

In [9]:
import logging
import sys

# Send every log record at INFO level or above to stdout, formatted as
# "<timestamp> <level> <message>".
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))

logger = logging.getLogger()  # no name given -> the root logger
logger.setLevel(logging.INFO)
# Assign the handler list (rather than appending to it) so that re-running
# this cell does not stack up duplicate handlers.
logger.handlers = [handler]

# Data Processor

We'll start with a simple e-commerce dataset to learn how to use NextRec's built-in DataProcessor. First, let's look at the raw features, which fall into several types:

- Sequence features: e.g., hist_item_seq
- Sparse features: e.g., city, device, channel
- Dense features: e.g., item_price

We also have multiple prediction targets: click and conversion.


In [5]:
import pandas as pd
from nextrec.data.preprocessor import DataProcessor

# Load the tutorial CSV (path is relative to the notebook's working directory)
# and preview the first rows of the raw, untransformed features.
df = pd.read_csv('dataset/ctcvr_task.csv')
df.head()

Unnamed: 0,user_id,item,gender,city,device,channel,age_bucket,user_active_days_7,user_ctr,item_price,item_popularity,time_since_last_click,hist_item_seq,click,conversion
0,1447473,item_HOME_255,M,Shanghai,iOS,organic,>=55,1,0.244235,410.405521,0.039235,19.252693,"item_FOOD_883,item_FOOD_371,item_BEAUTY_687,it...",0,0
1,1994457,item_FOOD_370,F,Shanghai,Android,ad,25-34,6,0.244364,159.918317,0.247731,8.203935,"item_BEAUTY_673,item_SPORT_880,item_CLOTH_778,...",0,0
2,1992582,item_ELEC_661,F,Others,Web,push,25-34,3,0.05512,161.772199,0.294542,19.57097,"item_SPORT_432,item_CLOTH_493",0,0
3,1382009,item_FOOD_942,M,Guangzhou,Web,ad,<18,6,0.255991,174.953184,0.04866,15.555436,"item_CLOTH_815,item_BEAUTY_107,item_HOME_942,i...",0,0
4,1953587,item_CLOTH_127,M,Shanghai,Web,organic,>=55,4,0.149875,195.314634,0.075152,115.380912,"item_SPORT_968,item_BEAUTY_313,item_CLOTH_611,...",1,0


Now we need to turn these raw features into vectors—the format every model consumes. In recommender systems we usually process multiple input signals, transform them, and then feed the resulting vectors into the network:

- Dense features (numeric): continuous or ordered values such as age, price, duration, or scores; typically standardized/normalized or log-transformed.
- Sparse features (categorical/ID): high-cardinality discrete fields such as user ID, item ID, gender, occupation, or device type; typically indexed and embedded via an embedding lookup matrix.
- Sequence features (behavior history): variable-length histories such as browse/click/purchase lists. They capture user behavior and interest drift; we usually truncate/pad, embed, and then aggregate (mean/sum/attention) to get a fixed-length vector.
- Context features: environment information such as time, geography, or slot position; can be dense or sparse and often interacts with the main features.
- Multi-modal features: vectors from pre-trained models on text, images, or video; they can be used directly as dense inputs or interact with IDs.

NextRec ships a DataProcessor for this preprocessing. Here's a practical example: first define the different features and pass in the transformations they need.


In [None]:
# Prediction targets, followed by the raw columns grouped by feature type.
task_labels = ['click', 'conversion']
dense_features_list = [
    'user_active_days_7',
    'user_ctr',
    'item_price',
    'item_popularity',
    'time_since_last_click',
]
sparse_features_list = [
    'user_id',
    'item',
    'gender',
    'city',
    'device',
    'channel',
    'age_bucket',
]
sequence_features_list = ['hist_item_seq']

# Start from an empty processor; each feature is registered on it below.
processor = DataProcessor()

# Dense features: choose the scaler that suits your data.
# Options: 'standard', 'minmax', 'robust', 'maxabs', 'log', 'none'.
for dense_name in dense_features_list:
    processor.add_numeric_feature(dense_name, scaler='standard')

# Sparse features: NextRec supports the 'label' and 'hash' encoding methods.
# Hash encoding additionally requires the hash_size parameter. For label encoding use:
#     processor.add_sparse_feature(sparse_name, encode_method='label')
for sparse_name in sparse_features_list:
    processor.add_sparse_feature(sparse_name, encode_method='hash', hash_size=1000)

# Sequence features: every item in the sequence is encoded to a number, then the
# sequence is padded/truncated so the model always receives a consistent input length.
sequence_options = dict(
    encode_method='hash',  # hash encoding
    hash_size=2000,        # required by hash encoding
    max_len=20,            # maximum length of the sequence
    pad_value=0,           # value used for padding
    truncate='post',       # 'pre' or 'post': post truncates from the end of the sequence, pre from the beginning
    separator=',',         # separator between items in the raw string (comma is the default)
)
for sequence_name in sequence_features_list:
    processor.add_sequence_feature(sequence_name, **sequence_options)

# Non-numeric labels would need encoding as well; the labels here are already
# binary numeric values, so no extra processing is required.
# for label in task_labels:
#     processor.add_target(label, target_type='binary')

# With every feature configured, fit the processor so it learns the statistics of the data.
processor.fit(df)

# Persist the fitted processor locally so it can be reloaded later.
processor.save('./data_processor.pkl')


In [7]:
# Reload the processor from the file written by processor.save() in the previous cell.
# NOTE(review): the '.pkl' extension suggests pickle serialization — if so, only load
# processor files from a source you trust; confirm against the NextRec docs.
processor = DataProcessor.load('./data_processor.pkl')

In [10]:
# Show the processor's feature configuration. As the output below shows, the summary
# is emitted as INFO log records, so it relies on the logging setup at the top of the notebook.
processor.summary()

2025-11-27 10:05:37,587 INFO [1m[94mDataProcessor Summary[0m
2025-11-27 10:05:37,588 INFO 
2025-11-27 10:05:37,588 INFO [1m[36m[1] Feature Configuration[0m
2025-11-27 10:05:37,589 INFO [36m--------------------------------------------------------------------------------[0m
2025-11-27 10:05:37,589 INFO Dense Features (5):
2025-11-27 10:05:37,590 INFO   #    Name                             Scaler    Fill NA
2025-11-27 10:05:37,590 INFO   ---- ----------------------- --------------- ----------
2025-11-27 10:05:37,590 INFO   1    user_active_days_7             standard       None
2025-11-27 10:05:37,591 INFO   2    user_ctr                       standard       None
2025-11-27 10:05:37,591 INFO   3    item_price                     standard       None
2025-11-27 10:05:37,591 INFO   4    item_popularity                standard       None
2025-11-27 10:05:37,591 INFO   5    time_since_last_click          standard       None
2025-11-27 10:05:37,592 INFO Sparse Features (7):
2025-11-27 

We've passed the configuration into DataProcessor; now we need to transform the data. Use the processor's transform method, which takes four parameters:

- data: the data to transform; supports dict, DataFrame, or a path
- return_dict: whether to return a dict. NextRec's built-in training methods support multiple formats, but a dict is usually preferred. If you set return_dict to False, the processor converts the data to a DataFrame. For demonstration we convert the data to a DataFrame here.
- save_format: for scenarios where you want preprocessed data saved up front, the processor can write csv or parquet. When the input `data` is a path, the transformed files will be placed under the `transformed_data` path.
- output_path: where to save the data


In [None]:
# Transform the raw frame with the fitted processor.
# return_dict=False yields a pandas DataFrame; set return_dict=True to get a dictionary instead.
df_transformed = processor.transform(df, return_dict=False)
df_transformed.head()

# Optional: uncomment to release the transformed frame once you are done inspecting it.
# del df_transformed

Unnamed: 0,user_id,item,gender,city,device,channel,age_bucket,user_active_days_7,user_ctr,item_price,item_popularity,time_since_last_click,hist_item_seq,click,conversion
0,393,847,116,348,251,883,133,-1.081709,0.420325,1.829769,-1.525792,-0.200305,"[1116, 926, 1077, 243, 0, 0, 0, 0, 0, 0, 0, 0,...",0,0
1,117,917,130,348,685,189,820,1.07998,0.42154,-0.375964,-0.226069,-0.664915,"[156, 1459, 603, 1415, 1635, 76, 573, 1834, 19...",0,0
2,28,53,130,61,688,764,820,-0.217034,-1.360094,-0.359639,0.065739,-0.186922,"[521, 38, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0
3,833,686,116,770,688,189,403,1.07998,0.531006,-0.24357,-1.46704,-0.355778,"[1262, 1505, 1433, 1114, 350, 306, 1013, 1207,...",0,0
4,902,588,116,348,688,883,133,0.215304,-0.468021,-0.064272,-1.301894,3.841963,"[597, 523, 309, 1567, 916, 616, 1563, 695, 0, ...",1,0


In [None]:
# Same transformation, returned as a dictionary (return_dict=True) instead of a DataFrame.
# As the output below shows, each key is a feature name mapped to an array of values.
data_dict = processor.transform(df, return_dict=True)
data_dict

# Optional: uncomment to release the dictionary once you are done inspecting it.
# del data_dict

{'user_id': array([393, 117,  28, ..., 630, 286, 109], shape=(5000,)),
 'item': array([847, 917,  53, ..., 347, 787,  76], shape=(5000,)),
 'gender': array([116, 130, 130, ..., 116, 116, 534], shape=(5000,)),
 'city': array([348, 348,  61, ...,  61, 358, 770], shape=(5000,)),
 'device': array([251, 685, 688, ..., 685, 688, 251], shape=(5000,)),
 'channel': array([883, 189, 764, ..., 764, 764, 883], shape=(5000,)),
 'age_bucket': array([133, 820, 820, ..., 820, 133, 133], shape=(5000,)),
 'user_active_days_7': array([-1.08170933,  1.07997998, -0.21703361, ..., -0.21703361,
         0.21530426, -1.5140472 ], shape=(5000,)),
 'user_ctr': array([ 0.42032544,  0.42154015, -1.36009409, ...,  0.34321166,
         0.16423405,  0.49036498], shape=(5000,)),
 'item_price': array([ 1.82976898, -0.37596407, -0.35963921, ...,  1.10315715,
        -0.26375663,  0.25635198], shape=(5000,)),
 'item_popularity': array([-1.5257923 , -0.22606919,  0.0657394 , ...,  0.11534086,
         1.00318119, -0.7031

In [None]:
# Transform and persist in one step: save_format selects the file type and output_path
# is where the result is written. The next cell reads the result back from
# './transformed_data.parquet'.
df_transformed = processor.transform(df, return_dict=False, save_format='parquet', output_path='./')

In [15]:
# Read back the parquet file written by the previous cell and preview it to check
# that the saved data matches the in-memory transformation shown earlier.
df_transformed = pd.read_parquet('./transformed_data.parquet')
df_transformed.head()

Unnamed: 0,user_id,item,gender,city,device,channel,age_bucket,user_active_days_7,user_ctr,item_price,item_popularity,time_since_last_click,hist_item_seq,click,conversion
0,393,847,116,348,251,883,133,-1.081709,0.420325,1.829769,-1.525792,-0.200305,"[1116, 926, 1077, 243, 0, 0, 0, 0, 0, 0, 0, 0,...",0,0
1,117,917,130,348,685,189,820,1.07998,0.42154,-0.375964,-0.226069,-0.664915,"[156, 1459, 603, 1415, 1635, 76, 573, 1834, 19...",0,0
2,28,53,130,61,688,764,820,-0.217034,-1.360094,-0.359639,0.065739,-0.186922,"[521, 38, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0
3,833,686,116,770,688,189,403,1.07998,0.531006,-0.24357,-1.46704,-0.355778,"[1262, 1505, 1433, 1114, 350, 306, 1013, 1207,...",0,0
4,902,588,116,348,688,883,133,0.215304,-0.468021,-0.064272,-1.301894,3.841963,"[597, 523, 309, 1567, 916, 616, 1563, 695, 0, ...",1,0


In [None]:
# Transform a whole directory: every data file under it is processed and saved.
# NOTE(review): '/train_sample_251111' is an absolute path at the filesystem root and looks
# like a placeholder — point it at your own data directory before running this cell.
data_path = '/train_sample_251111' # path to the directory containing data files
# For a path input, transform returns a path under which the transformed data files are
# located, so despite its name `df_transformed` is not a DataFrame after this call.
df_transformed = processor.transform(data_path, return_dict=False, save_format='parquet', output_path='./')

In [None]:
# Display the value returned by the directory transform in the previous cell.
df_transformed