In [2]:
import h5py
import os
import numpy as np
import torch
from tqdm import tqdm
from collections import defaultdict
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import json

In [8]:
datanames = ['toys-split','sports-split','beauty-split']
dataname = datanames[1]
data_root = f"/data2/wangzhongren/taolin_project/dataset/{dataname}"
output_root = f"/data2/wangzhongren/taolin_project/data/{dataname}"
sample_filename = f"{dataname}.item"
dataset_types = ['train','valid','test']


In [9]:
# 对item数据集的特征列进行处理，包括填充缺失值，对类别的分割
feat_keys = ['item_id', 'sales_type', 'brand','categories']
sample_path = os.path.join(data_root,sample_filename)
df_feat = pd.read_csv(sample_path, sep='\t', header=0)
df_feat.columns = [col.split(":")[0] for col in df_feat.keys()]
df_feat= df_feat[feat_keys]
# 不对类目特征做额外处理
# for i in range(3):
#     df_feat[f'category_{i+1}'] = df_feat['categories'].str.split(',').str.get(i).str.strip().fillna('Unknown')
# df_feat = df_feat.drop(columns=['categories'])
df_feat['sales_type'] = df_feat['sales_type'].fillna("missing")
df_feat[:3]

Unnamed: 0,item_id,sales_type,brand,categories
0,31852,Toys & Games,Coxlures,"'Other Sports', 'Dance', 'Sports & Outdoors'"
1,32050,missing,BubuBibi,"'Sports & Outdoors', 'Skirts', 'Clothing', 'Gi..."
2,615302939,Sports & Outdoors,NNG,'Sports & Outdoors'


In [10]:
# 对训练用数据集进行初步处理，label,列名等等，检查是否有空值
df_all = pd.DataFrame()
for dataset_type in dataset_types:
    data_filename = f"{dataname}.{dataset_type}.inter"
    data_path = f"{data_root}/{data_filename}"
    df_data = pd.read_csv(data_path, sep='\t', header=0)
    df_data.columns = [col.split(":")[0] for col in df_data.keys()]
    df_data['label'] = (df_data['rating']>3).astype(int)
    df_data = df_data.drop(columns = ['rating'])
    df_data = df_data.drop(columns = ['timestamp'])
    df_data['dataset_type'] = dataset_type
    df_all = pd.concat([df_all, df_data], ignore_index=True)
df_all[:3]

Unnamed: 0,user_id,item_id,label,dataset_type
0,A3PMSRCL80KSA1,31852,1,train
1,A1KJ4CVG87QW09,31852,1,train
2,AA9ITO6ZLZW6,31852,1,train


In [11]:
# 把item特征混合到训练集中
df_all_merged = pd.merge(df_all,df_feat,on='item_id',how='left')
df_all_merged[:3]

Unnamed: 0,user_id,item_id,label,dataset_type,sales_type,brand,categories
0,A3PMSRCL80KSA1,31852,1,train,Toys & Games,Coxlures,"'Other Sports', 'Dance', 'Sports & Outdoors'"
1,A1KJ4CVG87QW09,31852,1,train,Toys & Games,Coxlures,"'Other Sports', 'Dance', 'Sports & Outdoors'"
2,AA9ITO6ZLZW6,31852,1,train,Toys & Games,Coxlures,"'Other Sports', 'Dance', 'Sports & Outdoors'"


In [12]:
item_ids_path = os.path.join(data_root, "item_id.pt")
item_ids_list = torch.load(item_ids_path)
print(f"✅ 加载item_ids_list: {len(item_ids_list)}个items")
print(f"   示例: {item_ids_list[:3]}")
item_to_position = {}
for pos, item_id in enumerate(item_ids_list):
    key = item_id.decode('utf-8') if isinstance(item_id, bytes) else str(item_id)
    item_to_position[key] = pos
df_all_merged['item_position'] = df_all_merged['item_id'].astype(str).map(
    lambda x: item_to_position.get(x, 0)
)

  item_ids_list = torch.load(item_ids_path)


✅ 加载item_ids_list: 48608个items
   示例: ['0000031852', '0000032050', '0615302939']


In [13]:
# 使用labelencoder对稀疏特征进行编码
encodered_columns = ['user_id', 'item_id', 'sales_type', 'brand','categories']
lbe = LabelEncoder()
for column in encodered_columns:
    df_all_merged[column] = lbe.fit_transform(df_all_merged[column])
columns = [col for col in df_all_merged.columns if col != 'label'] + ['label']
df_all_merged = df_all_merged[columns]
df_all_merged[:3]

Unnamed: 0,user_id,item_id,dataset_type,sales_type,brand,categories,item_position,label
0,751626,0,train,24,1501,640,0,1
1,158994,0,train,24,1501,640,0,1
2,850430,0,train,24,1501,640,0,1


In [14]:
df_all_merged['item_id'].max()

48607

In [30]:
# 保存为csv和h5
for dataset_type in dataset_types: 
    df_split = df_all_merged[df_all_merged['dataset_type'] == dataset_type].drop(columns=['dataset_type'])
    output_path = f"{output_root}/base_dataset"
    os.makedirs(output_path,exist_ok = True)
    df_split.to_csv(os.path.join(output_path,f'{dataset_type}.csv'),index=False)
    print(f"{dataset_type}.csv saved")
    # with h5py.File(os.path.join(output_path,f'{dataset_type}.h5'), 'w') as f:
    #     f.create_dataset('data', data=df_split.values,dtype='float64')
    # print(f"{dataset_type}.h5 saved")

train.csv saved
valid.csv saved
test.csv saved


In [5]:
item_ids_path = os.path.join(data_root, "item_id.pt")
item_ids_list = torch.load(item_ids_path)
print(f"✅ 加载item_ids_list: {len(item_ids_list)}个items")
print(f"   示例: {item_ids_list[:3]}")

✅ 加载item_ids_list: 56657个items
   示例: ['0375829695', '0439855896', '0439893577']


  item_ids_list = torch.load(item_ids_path)


In [3]:
index = torch.load("/data2/wangzhongren/taolin_project/dataset/toys-split/moc_cbsize256_cbdim32_scala7_epoch500_index.pt")
index

  index = torch.load("/data2/wangzhongren/taolin_project/dataset/toys-split/moc_cbsize256_cbdim32_scala7_epoch500_index.pt")


tensor([[ 74, 132, 206,  ..., 223, 252,  89],
        [132,  87,  86,  ..., 207,  86,  87],
        [147,   6,  62,  ...,  82, 167, 213],
        ...,
        [212, 159,  44,  ..., 146, 147, 179],
        [162, 153,  50,  ...,  18, 217, 145],
        [  3,  87,  60,  ..., 187,  86,  87]], device='cuda:7')

In [7]:
with h5py.File("/data2/wangzhongren/taolin_project/FuxiCTR/model_zoo/data/sports/train.h5", 'r') as f:
    for key in f.keys():
        print(f[key].shape)

(1230342,)
(1230342,)
(1230342,)
(1230342,)
(1230342,)
(1230342,)
(1230342,)


In [8]:
index = torch.load("/data2/wangzhongren/taolin_project/dataset/sports-split/moc_cbsize256_cbdim32_scala7_epoch500_index.pt")
index.shape

  index = torch.load("/data2/wangzhongren/taolin_project/dataset/sports-split/moc_cbsize256_cbdim32_scala7_epoch500_index.pt")


torch.Size([48608, 7])

In [9]:
index = torch.load("/data2/wangzhongren/taolin_project/dataset/beauty-split/moc_cbsize256_cbdim32_scala7_epoch500_index.pt")
index.shape

  index = torch.load("/data2/wangzhongren/taolin_project/dataset/beauty-split/moc_cbsize256_cbdim32_scala7_epoch500_index.pt")


torch.Size([47171, 7])

In [23]:
path = '/data2/wangzhongren/taolin_project/FuxiCTR/model_zoo/data/sports/train.h5'
with h5py.File(path, 'r') as f:
    for key in f.keys():
        print(key)
    item_id = f['item_id'][:]
    item_position = f['item_position'][:]
    print(item_id[:10])
    print(item_position[:10])

brand
categories
item_id
item_position
label
sales_type
user_id
[1 1 1 1 1 1 2 2 2 2]
[0. 0. 0. 0. 0. 0. 1. 1. 1. 1.]


In [24]:
path = '/data2/wangzhongren/taolin_project/data/sports-split/base_dataset/train.csv'
df = pd.read_csv(path)
df['item_id'][:10]
df['item_position'][:10]

0    0
1    0
2    0
3    0
4    0
5    0
6    1
7    1
8    1
9    1
Name: item_position, dtype: int64