In [2]:
import h5py
import os
import numpy as np
import torch
from tqdm import tqdm
from collections import defaultdict
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import json

#### 最原始的数据集（以toys为例）
1. toys-split.item 是物品的信息数据集，每一个物品对应几个特征，id，title，price，categories，brand，sales_type，
2. toys-split.train/valid/test.iter 是训练用数据集，每一条样本有一个uid，iid，rating，timestamp
3. amazon-toys-games-filter_hdf5/norm_item_feat.h5 是一个llm的编码库，每一个item_id对应一个4096维的llm表征向量
# 涛林代码中，删掉了price和timestamp特征，直接沿用。

### 1.对于item数据集中的每一个item，根据llm表，获得对应的llm表征编码，并保存为一个pt文件，给vae模型使用

In [3]:
datasets = ["amazon-toys-games-filter","amazon-beauty-filter","amazon-sports-outdoors-filter"]
split_datasets = ["toys-split","beauty-split","sports-split"]

for dataset,split_dataset in zip(datasets,split_datasets):
    item_feat_path = f"./{dataset}_hdf5/norm_item_feat.h5"
    item_feat_dict = h5py.File(item_feat_path,'r')
    item_feat = []
    with open(f"/data2/wangzhongren/taolin_project/dataset/{split_dataset}/{split_dataset}.item", 'r') as read_f:
        lines = read_f.read().splitlines()
        for line in tqdm(lines[1:]):
            item_id = line.split("\t")[0]
            if item_id not in item_feat_dict.keys():
                print("this id has no llm emb")
                continue
            item_feat.append(item_feat_dict[item_id][:])
    item_feat = np.stack(item_feat)
    item_feat = torch.from_numpy(item_feat)
    # os.mkdirs(f"./{split_dataset}",exist_ok=True)
    torch.save(item_feat, f"./dataset/{split_dataset}/item_feat_input.pt")

  0%|          | 0/56657 [00:00<?, ?it/s]

100%|██████████| 56657/56657 [00:08<00:00, 6666.52it/s]
100%|██████████| 47171/47171 [00:08<00:00, 5337.50it/s]
100%|██████████| 48608/48608 [00:07<00:00, 6898.28it/s]


### 2.运行train_vae.py文件，给每一个id都生成对应的几个index，保存在moc_cbsize256_scala7_epoch100_index.pt这样的文件中

### 3.实际的数据生产

In [4]:
data_root = "./toys-split"
sample_filename = "toys-split.item"
index_filename = "moc_cbsize256_scala3_epoch100_index.pt"
cbsize = int(index_filename.split("_")[1][6:])
dataset_types = ['train','valid','test']
dataname = 'toys-split'

In [5]:
# 对item数据集的特征列进行处理，包括填充缺失值，对类别的分割
feat_keys = ['item_id', 'sales_type', 'brand','categories']
sample_path = os.path.join(data_root,sample_filename)
df_feat = pd.read_csv(sample_path, sep='\t', header=0)
df_feat.columns = [col.split(":")[0] for col in df_feat.keys()]
df_feat= df_feat[feat_keys]
# for i in range(3):
#     df_feat[f'category_{i+1}'] = df_feat['categories'].str.split(',').str.get(i).str.strip().fillna('Unknown')
# df_feat = df_feat.drop(columns=['categories'])
df_feat['sales_type'] = df_feat['sales_type'].fillna("missing")
df_feat[:3]

Unnamed: 0,item_id,sales_type,brand,categories
0,375829695,Home & Kitchen,Dr. Seuss,"'Toys & Games', 'Puzzles', 'Jigsaw Puzzles'"
1,439855896,Toys & Games,Rock Ridge,"'Toys & Games', 'Novelty & Gag Toys', 'Magic K..."
2,439893577,missing,Scholastic,"'Toys & Games', 'Pretend Play', 'Dress Up & Pr..."


In [6]:
# 获取index数据集，跟样本一样多的行，顺序是对应的
index_path = os.path.join(data_root,index_filename)
index = torch.load(index_path) # 获取index 是一个tensor向量，torch.Size([56657, 1])，如果有多个expert或者多层rq的话，就是torch.Size([56657, 3])
index.shape

  index = torch.load(index_path) # 获取index 是一个tensor向量，torch.Size([56657, 1])，如果有多个expert或者多层rq的话，就是torch.Size([56657, 3])


torch.Size([56657, 3])

In [7]:
# 把index特征融入到item表中
expert_num = int(index_path.split("scala")[1][:1])
method = index_filename.split("_")[0]
for i in range(expert_num):
    df_feat[f"{method}_index_{i+1}"] = index.cpu()[:,i]
df_feat[:3]

Unnamed: 0,item_id,sales_type,brand,categories,moc_index_1,moc_index_2,moc_index_3
0,375829695,Home & Kitchen,Dr. Seuss,"'Toys & Games', 'Puzzles', 'Jigsaw Puzzles'",36,147,5
1,439855896,Toys & Games,Rock Ridge,"'Toys & Games', 'Novelty & Gag Toys', 'Magic K...",231,102,214
2,439893577,missing,Scholastic,"'Toys & Games', 'Pretend Play', 'Dress Up & Pr...",171,85,212


In [8]:
# 对训练用数据集进行初步处理，label,列名等等，检查是否有空值
df_all = pd.DataFrame()
for dataset_type in dataset_types:
    data_filename = f"{dataname}.{dataset_type}.inter"
    data_path = f"{data_root}/{data_filename}"
    df_data = pd.read_csv(data_path, sep='\t', header=0)
    df_data.columns = [col.split(":")[0] for col in df_data.keys()]
    df_data['label'] = (df_data['rating']>3).astype(int)
    df_data = df_data.drop(columns = ['rating'])
    df_data['dataset_type'] = dataset_type
    df_all=df_all.append(df_data)
df_all[:3]

AttributeError: 'DataFrame' object has no attribute 'append'

In [None]:
# 把item特征混合到训练集中
df_all_merged = pd.merge(df_all,df_feat,on='item_id',how='left')
df_all_merged[:3]

Unnamed: 0,user_id,item_id,timestamp,label,dataset_type,sales_type,brand,category_1,category_2,category_3
0,AYVR1MQCTNU5D,375829695,1291939200,1,train,Home & Kitchen,Dr. Seuss,'Toys & Games','Puzzles','Jigsaw Puzzles'
1,A3CJHKFHHQJP2K,375829695,1297209600,0,train,Home & Kitchen,Dr. Seuss,'Toys & Games','Puzzles','Jigsaw Puzzles'
2,A3638FINP26E8N,375829695,1282521600,0,train,Home & Kitchen,Dr. Seuss,'Toys & Games','Puzzles','Jigsaw Puzzles'


In [9]:
# 使用labelencoder对稀疏特征进行编码
encodered_columns = ['user_id', 'item_id', 'timestamp', 'sales_type', 'brand','category_1', 'category_2', 'category_3']
lbe = LabelEncoder()
for column in encodered_columns:
    df_all_merged[column] = lbe.fit_transform(df_all_merged[column])
columns = [col for col in df_all_merged.columns if col != 'label'] + ['label']
df_all_merged = df_all_merged[columns]
df_all_merged[:3]

NameError: name 'df_all_merged' is not defined

In [10]:
# 保存为csv和h5
for dataset_type in dataset_types: 
    df_split = df_all_merged[df_all_merged['dataset_type'] == dataset_type].drop(columns=['dataset_type'])
    output_path = f"{data_root}/{index_filename[:-9]}"
    os.makedirs(output_path,exist_ok = True)
    df_split.to_csv(os.path.join(output_path,f'{dataset_type}.csv'),index=False)
    print(f"{dataset_type}.csv saved")
    with h5py.File(os.path.join(output_path,f'{dataset_type}.h5'), 'w') as f:
        f.create_dataset('data', data=df_split.values,dtype='float64')
    print(f"{dataset_type}.h5 saved")

NameError: name 'df_all_merged' is not defined

### 4.feature_map构建
（经检验，所有的列在train数据集能涵盖所有可能性，故只需要扫描train数据集即可）

In [23]:
# df_all_merged = df_all_merged.drop(columns=['dataset_type'])
# 创建feature_map
feature_specs = {}
col_value_index = 0
num_features = 0
for index,col in enumerate(df_all_merged.columns):
    if col in encodered_columns:
        feature_specs[col] = {
                    'source': '',
                    'type':"categorical",
                    'vocab_size': df_all_merged[col].nunique(),
                    'index': index
                }
        num_features = num_features + df_all_merged[col].nunique()
    elif col not in encodered_columns and col != 'label':
        feature_specs[col] = {
                    'source': '',
                    'type':"categorical",
                    'vocab_size': cbsize,
                    'index': index
                }
        num_features = num_features + cbsize
feature_map = {
        'dataset_id': f'amazon_{dataname}_{index_filename[:-9]}',
        'num_fields': len(df_all_merged.columns)-1,
        # 'feature_len': len(feature_specs),
        'num_features': num_features,
        'feature_specs': feature_specs
    }
json.dump(feature_map, open(os.path.join(output_path,f'feature_map.json'), 'w'),indent=4)
print("feature_map saved")

feature_map saved


In [6]:
train_data = pd.read_csv('./toys-split/moc_cbsize256_scala3_epoch100/train.csv')
valid_data = pd.read_csv('./toys-split/moc_cbsize256_scala3_epoch100/valid.csv')
test_data = pd.read_csv('./toys-split/moc_cbsize256_scala3_epoch100/test.csv')
all_data = train_data.append(valid_data).append(test_data)
for col in train_data.columns:
    print(col,train_data[col].nunique(),all_data[col].nunique())

user_id 772280 772280
item_id 56650 56650
timestamp 5020 5020
sales_type 27 27
brand 5996 5996
categories 700 700
moc_index_1 250 250
moc_index_2 248 248
moc_index_3 251 251
label 2 2


In [1]:
encodered_columns = [col for col in df_all_merged.columns if 'index' not in col and col != 'label']
encodered_columns

NameError: name 'df_all_merged' is not defined

In [5]:
file_path = "./toys-split/moc_cbsize256_scala3_epoch100/test.h5"
with h5py.File(file_path, "r") as h5_file:
    # 列出所有顶层键
    print("Keys in the file:", list(h5_file.keys()))
    
    # 读取某个数据集
    dataset_name = list(h5_file.keys())[0]  # 假设取第一个键
    data = h5_file[dataset_name][:]
data[:20]

Keys in the file: ['data']


array([[1.19573e+05, 0.00000e+00, 3.86100e+03, 2.30000e+01, 3.44300e+03,
        3.54000e+02, 2.31000e+02, 1.02000e+02, 2.14000e+02, 0.00000e+00],
       [5.26320e+04, 0.00000e+00, 3.92500e+03, 2.30000e+01, 3.44300e+03,
        3.54000e+02, 2.31000e+02, 1.02000e+02, 2.14000e+02, 0.00000e+00],
       [4.22370e+04, 0.00000e+00, 3.97700e+03, 2.30000e+01, 3.44300e+03,
        3.54000e+02, 2.31000e+02, 1.02000e+02, 2.14000e+02, 0.00000e+00],
       [7.68100e+04, 0.00000e+00, 2.68100e+03, 2.30000e+01, 3.44300e+03,
        3.54000e+02, 2.31000e+02, 1.02000e+02, 2.14000e+02, 1.00000e+00],
       [1.06183e+05, 1.00000e+00, 3.93400e+03, 2.60000e+01, 3.57900e+03,
        4.34000e+02, 1.71000e+02, 8.50000e+01, 2.12000e+02, 0.00000e+00],
       [6.79310e+04, 1.00000e+00, 3.79000e+03, 2.60000e+01, 3.57900e+03,
        4.34000e+02, 1.71000e+02, 8.50000e+01, 2.12000e+02, 1.00000e+00],
       [6.12690e+04, 1.00000e+00, 3.13800e+03, 2.60000e+01, 3.57900e+03,
        4.34000e+02, 1.71000e+02, 8.50000e+01

In [5]:
import torch
base = torch.load("/data2/wangzhongren/taolin_project/dataset/beauty-split/item_feat_input.pt")
t1 = torch.load("/data2/wangzhongren/taolin_project/dataset/beauty-split/me_cbsize256_cbdim32_scala1_epoch500_index.pt")
t2 = torch.load("/data2/wangzhongren/taolin_project/dataset/beauty-split/moc_cbsize256_cbdim32_scala1_epoch500_index.pt")

  base = torch.load("/data2/wangzhongren/taolin_project/dataset/beauty-split/item_feat_input.pt")
  t1 = torch.load("/data2/wangzhongren/taolin_project/dataset/beauty-split/me_cbsize256_cbdim32_scala1_epoch500_index.pt")
  t2 = torch.load("/data2/wangzhongren/taolin_project/dataset/beauty-split/moc_cbsize256_cbdim32_scala1_epoch500_index.pt")


In [6]:
base.shape,t1.shape,t2.shape,

(torch.Size([47171, 4096]), torch.Size([47171, 1]), torch.Size([47171, 1]))