In [2]:
# Uncomment to install NextRec into the active kernel's environment:
# %pip install nextrec

In [1]:
import logging
import sys

# Build one stdout handler that prefixes each record with timestamp and level.
handler = logging.StreamHandler(stream=sys.stdout)
handler.setFormatter(logging.Formatter(fmt='%(asctime)s %(levelname)s %(message)s'))

# Configure the root logger: replace whatever handlers were attached before
# with the handler above, and emit records at INFO level and higher.
logger = logging.getLogger()
logger.handlers = [handler]
logger.setLevel(logging.INFO)

# Data Processor

我们先用一个简单的电商数据集来上手如何使用NextRec内置提供的DataProcessor。首先观察一下原始的特征，包含各种类型：

- 序列特征：例如hist_item_seq
- 稀疏特征：例如city，device，channel
- 稠密特征：例如item_price

同时我们还有多个预测目标，分别是click，conversion。

In [3]:
pip show nextrec

Name: nextrec
Version: 0.4.21
Summary: A comprehensive recommendation library with match, ranking, and multi-task learning models
Home-page: https://github.com/zerolovesea/NextRec
Author: Yang Zhou
Author-email: zyaztec@gmail.com
License: 
Location: /opt/anaconda3/envs/nextrec/lib/python3.10/site-packages
Editable project location: /Users/zyaztec/DailyWork/建模代码整理/NextRec
Requires: numpy, pandas, pyarrow, pyyaml, rich, scikit-learn, scipy, swanlab, torch, torchvision, transformers, wandb
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
from nextrec.data.preprocessor import DataProcessor

# Load the raw CTR/CVR sample dataset and preview the first rows.
# NOTE(review): this is an absolute path; presumably it must be adjusted to
# wherever the NextRec repository's dataset folder lives locally — confirm.
raw_csv_path = '/NextRec/dataset/ctcvr_task.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,user_id,item,gender,city,device,channel,age_bucket,user_active_days_7,user_ctr,item_price,item_popularity,time_since_last_click,hist_item_seq,click,conversion
0,1447473,item_HOME_255,M,Shanghai,iOS,organic,>=55,1,0.244235,410.405521,0.039235,19.252693,"item_FOOD_883,item_FOOD_371,item_BEAUTY_687,it...",0,0
1,1994457,item_FOOD_370,F,Shanghai,Android,ad,25-34,6,0.244364,159.918317,0.247731,8.203935,"item_BEAUTY_673,item_SPORT_880,item_CLOTH_778,...",0,0
2,1992582,item_ELEC_661,F,Others,Web,push,25-34,3,0.05512,161.772199,0.294542,19.57097,"item_SPORT_432,item_CLOTH_493",0,0
3,1382009,item_FOOD_942,M,Guangzhou,Web,ad,<18,6,0.255991,174.953184,0.04866,15.555436,"item_CLOTH_815,item_BEAUTY_107,item_HOME_942,i...",0,0
4,1953587,item_CLOTH_127,M,Shanghai,Web,organic,>=55,4,0.149875,195.314634,0.075152,115.380912,"item_SPORT_968,item_BEAUTY_313,item_CLOTH_611,...",1,0


现在我们需要将这些原始特征转化为向量，也就是各种模型需要接受的格式。在推荐系统中，通常会处理多种类型的输入信号，在经过一系列的变换之后转化为向量输入网络：

- 稠密特征（数值型）：连续或可序数化的数值，如年龄、价格、时长、打分；常见做法是标准化/归一化或对数变换。
- 稀疏特征（类别/ID）：高基数离散字段，如用户 ID、物品 ID、性别、职业、设备类型；通常需要索引化后，在一个embedding lookup matrix中进行嵌入。
- 序列特征（行为序列）：可变长的历史行为，如用户的浏览/点击/购买列表。这类特征表征了用户的行为和兴趣变化，通常我们需要截断、padding，嵌入后通过不同聚合方式（如 mean/sum/attention）将其变为定长向量。
- 上下文特征：时间、地理、曝光位置等环境信息，可以是稠密的，也可以是稀疏的，常与主特征交互。
- 多模态特征：文本、图片、视频等经过预训练模型得到的向量，可直接作为稠密输入，或与 ID 交互建模。

NextRec内置了DataProcessor以进行特征的预处理，接下来是实际的例子。首先我们需要定义不同的特征，并将需要的变换方式传入其中。

In [4]:
# Column groups: the prediction targets and the three kinds of input features.
task_labels = ['click', 'conversion']
dense_features_list = [
    'user_active_days_7',
    'user_ctr',
    'item_price',
    'item_popularity',
    'time_since_last_click',
]
sparse_features_list = [
    'user_id',
    'item',
    'gender',
    'city',
    'device',
    'channel',
    'age_bucket',
]
sequence_features_list = ['hist_item_seq']

# Create the data processor.
processor = DataProcessor()

# Numeric features: apply standardization to each of them.
# Supported scalers: 'standard', 'minmax', 'robust', 'maxabs', 'log', 'none'.
for feat in dense_features_list:
    processor.add_numeric_feature(feat, scaler='standard')

# Sparse features must be encoded. NextRec supports two encodings: label
# encoding and hash encoding. Hash encoding needs a hash_size; for label
# encoding, call instead:
#     processor.add_sparse_feature(feat, encode_method='label')
sparse_encoding = dict(encode_method='hash', hash_size=1000)
for feat in sparse_features_list:
    processor.add_sparse_feature(feat, **sparse_encoding)

# Sequence features are encoded as well: every item in a sequence becomes a
# number, then sequences are padded and truncated so the model always
# receives sequences of the same length.
sequence_encoding = dict(
    encode_method='hash',  # hash encoding; label encoding is supported too
    hash_size=2000,        # required by hash encoding
    max_len=20,            # maximum sequence length
    pad_value=0,           # pad with 0
    truncate='post',       # 'pre' or 'post': 'post' truncates from the end of the sequence, 'pre' from the start
    separator=',',         # sequence separator, comma by default
)
for feat in sequence_features_list:
    processor.add_sequence_feature(feat, **sequence_encoding)

# Labels only need encoding when they are not numeric. The labels here are
# numeric binary targets, so no extra handling is required. Otherwise:
# for label in task_labels:
#     processor.add_target(label, target_type='binary')

# Configuration is complete: fit the processor so it learns the statistics of the data.
processor.fit(df)

# After fitting, save the processor locally so it can be loaded and reused later.
processor.save('./data_processor.pkl')


2025-12-28 15:17:59,323 INFO [1m[36mFitting DataProcessor...[0m
2025-12-28 15:17:59,332 INFO DataProcessor saved to: /Users/zyaztec/DailyWork/建模代码整理/NextRec/tutorials/notebooks/zh/data_processor.pkl, NextRec version: 0.4.21


In [5]:
# Reload the fitted processor from the file written by processor.save(...).
processor = DataProcessor.load('./data_processor.pkl')

2025-12-28 15:18:01,423 INFO DataProcessor loaded from data_processor.pkl, NextRec version: 0.4.21


In [6]:
# Log a summary of the processor's feature configuration.
processor.summary()

2025-12-28 15:18:02,367 INFO [1m[94mDataProcessor Summary[0m
2025-12-28 15:18:02,368 INFO 
2025-12-28 15:18:02,368 INFO [1m[36m[1] Feature Configuration[0m
2025-12-28 15:18:02,369 INFO [36m--------------------------------------------------------------------------------[0m
2025-12-28 15:18:02,369 INFO Dense Features (5):
2025-12-28 15:18:02,369 INFO   #    Name                             Scaler    Fill NA
2025-12-28 15:18:02,370 INFO   ---- ----------------------- --------------- ----------
2025-12-28 15:18:02,370 INFO   1    user_active_days_7             standard       None
2025-12-28 15:18:02,371 INFO   2    user_ctr                       standard       None
2025-12-28 15:18:02,371 INFO   3    item_price                     standard       None
2025-12-28 15:18:02,372 INFO   4    item_popularity                standard       None
2025-12-28 15:18:02,372 INFO   5    time_since_last_click          standard       None
2025-12-28 15:18:02,372 INFO Sparse Features (7):
2025-12-28 

我们已经为DataProcessor传入了配置，现在我们需要对数据进行转换。使用processor的transform方法进行转换。transform方法拥有四个参数：

- data：需要转换的数据，支持格式：dict，dataframe，路径
- return_dict：是否返回字典。nextrec内置的训练方法支持多种格式，通常建议传入字典；如果有需要，也可以将return_dict设置为False，此时processor会把数据转化为dataframe。这里为了演示，我们将数据变为dataframe。
- save_format：对于需要将数据提前处理好的场景，processor支持将数据保存为csv或parquet两种格式。当传入的data是个路径时，将会在transformed_data路径下生成转换后的文件。
- output_path：转换后数据的保存路径

In [7]:
# 返回dataframe
df_transformed = processor.transform(df, return_dict=False)
df_transformed.head()

# del df_transformed

Unnamed: 0,user_id,item,gender,city,device,channel,age_bucket,user_active_days_7,user_ctr,item_price,item_popularity,time_since_last_click,hist_item_seq,click,conversion
0,393,847,116,348,251,883,133,-1.081709,0.420325,1.829769,-1.525792,-0.200305,"[1116, 926, 1077, 243, 0, 0, 0, 0, 0, 0, 0, 0,...",0,0
1,117,917,130,348,685,189,820,1.07998,0.42154,-0.375964,-0.226069,-0.664915,"[156, 1459, 603, 1415, 1635, 76, 573, 1834, 19...",0,0
2,28,53,130,61,688,764,820,-0.217034,-1.360094,-0.359639,0.065739,-0.186922,"[521, 38, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0
3,833,686,116,770,688,189,403,1.07998,0.531006,-0.24357,-1.46704,-0.355778,"[1262, 1505, 1433, 1114, 350, 306, 1013, 1207,...",0,0
4,902,588,116,348,688,883,133,0.215304,-0.468021,-0.064272,-1.301894,3.841963,"[597, 523, 309, 1567, 916, 616, 1563, 695, 0, ...",1,0


In [8]:
# 返回字典
data_dict = processor.transform(df, return_dict=True)
data_dict

# del data_dict

{'user_id': array([393, 117,  28, ..., 630, 286, 109], shape=(5000,)),
 'item': array([847, 917,  53, ..., 347, 787,  76], shape=(5000,)),
 'gender': array([116, 130, 130, ..., 116, 116, 534], shape=(5000,)),
 'city': array([348, 348,  61, ...,  61, 358, 770], shape=(5000,)),
 'device': array([251, 685, 688, ..., 685, 688, 251], shape=(5000,)),
 'channel': array([883, 189, 764, ..., 764, 764, 883], shape=(5000,)),
 'age_bucket': array([133, 820, 820, ..., 820, 133, 133], shape=(5000,)),
 'user_active_days_7': array([-1.08170933,  1.07997998, -0.21703361, ..., -0.21703361,
         0.21530426, -1.5140472 ], shape=(5000,)),
 'user_ctr': array([ 0.42032544,  0.42154015, -1.36009409, ...,  0.34321166,
         0.16423405,  0.49036498], shape=(5000,)),
 'item_price': array([ 1.82976898, -0.37596407, -0.35963921, ...,  1.10315715,
        -0.26375663,  0.25635198], shape=(5000,)),
 'item_popularity': array([-1.5257923 , -0.22606919,  0.0657394 , ...,  0.11534086,
         1.00318119, -0.7031

In [9]:
# Save the processed data as a parquet file: only an output path is needed,
# and the transformed data is written under it as transformed_data.parquet.
df_transformed = processor.transform(
    df,
    return_dict=False,
    save_format='parquet',
    output_path='./',
)

2025-12-28 15:18:10,034 INFO [32mTransformed data saved to: /Users/zyaztec/DailyWork/建模代码整理/NextRec/tutorials/notebooks/zh/transformed_data.parquet[0m


In [10]:
# Read the saved parquet file back and preview its first rows.
df_transformed = pd.read_parquet('./transformed_data.parquet')
df_transformed.head()

Unnamed: 0,user_id,item,gender,city,device,channel,age_bucket,user_active_days_7,user_ctr,item_price,item_popularity,time_since_last_click,hist_item_seq,click,conversion
0,393,847,116,348,251,883,133,-1.081709,0.420325,1.829769,-1.525792,-0.200305,"[1116, 926, 1077, 243, 0, 0, 0, 0, 0, 0, 0, 0,...",0,0
1,117,917,130,348,685,189,820,1.07998,0.42154,-0.375964,-0.226069,-0.664915,"[156, 1459, 603, 1415, 1635, 76, 573, 1834, 19...",0,0
2,28,53,130,61,688,764,820,-0.217034,-1.360094,-0.359639,0.065739,-0.186922,"[521, 38, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0
3,833,686,116,770,688,189,403,1.07998,0.531006,-0.24357,-1.46704,-0.355778,"[1262, 1505, 1433, 1114, 350, 306, 1013, 1207,...",0,0
4,902,588,116,348,688,883,133,0.215304,-0.468021,-0.064272,-1.301894,3.841963,"[597, 523, 309, 1567, 916, 616, 1563, 695, 0, ...",1,0


In [None]:
# Transform from a path: all data under this path is traversed and processed.
data_path = 'train_sample_251111'
# The call returns a path; the transformed data files are located under it.
# NOTE(review): despite its name, df_transformed presumably holds a path
# here rather than a DataFrame — confirm against DataProcessor.transform.
df_transformed = processor.transform(
    data_path,
    return_dict=False,
    save_format='parquet',
    output_path='./',
)

In [None]:
# Display the current value of df_transformed.
df_transformed