In [1]:
import h5py
import os
import numpy as np
import torch
from tqdm import tqdm
from collections import defaultdict
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import json

In [39]:
# Dataset selection: the available dataset names and the one processed by
# this notebook run (index 2 -> 'beauty-split').
datanames = [
    'toys-split',
    'sports-split',
    'beauty-split',
]
dataname = datanames[2]

# Directory holding the raw files of the selected dataset, and the name of
# its item-feature file.
data_root = f"/data2/wangzhongren/taolin_project/dataset/{dataname}"
sample_filename = f"{dataname}.item"

# Names of the interaction splits handled by the cells below.
dataset_types = [
    'train',
    'valid',
    'test',
]

In [40]:
# Load the item feature table (tab-separated, first row is the header) and
# keep only the feature columns listed in `feat_keys`.
# Header names are cut at the first ":" so that a header of the form "a:b"
# becomes "a".
# NOTE: this cell neither fills missing values nor splits the `categories`
# string; it only renames and selects columns.
feat_keys = ['item_id', 'sales_type', 'brand','categories']
sample_path = os.path.join(data_root, sample_filename)
df_feat = (
    pd.read_csv(sample_path, sep='\t', header=0)
    .rename(columns=lambda name: name.split(":")[0])
    .loc[:, feat_keys]
)
df_feat.head(3)

Unnamed: 0,item_id,sales_type,brand,categories
0,1304351475,Beauty,Omagazee,"'Beauty', 'Makeup', 'Eyes', 'Eye Shadow'"
1,535795545X,Beauty,Dermalogica,"'Beauty', 'Bath & Body', 'Cleansers', 'Body Wa..."
2,535795531X,Beauty,Dermalogica,"'Beauty', 'Skin Care', 'Face', 'Treatments & M..."


In [41]:
# Load the interaction file of every split and stack them into one frame.
# For each split:
#   * header names are cut at the first ":" (a header "a:b" becomes "a"),
#   * a binary `label` is derived: 1 when rating > 3, otherwise 0,
#   * the raw `rating` and `timestamp` columns are dropped,
#   * every row is tagged with the split it came from (`dataset_type`).
# NOTE: no null-value check is performed in this cell.
split_frames = []
for dataset_type in dataset_types:
    data_filename = f"{dataname}.{dataset_type}.inter"
    data_path = f"{data_root}/{data_filename}"
    df_data = pd.read_csv(data_path, sep='\t', header=0)
    df_data.columns = [col.split(":")[0] for col in df_data.columns]
    df_data['label'] = (df_data['rating']>3).astype(int)
    df_data = df_data.drop(columns = ['rating','timestamp'])
    df_data['dataset_type'] = dataset_type
    split_frames.append(df_data)

# Concatenate once: growing a DataFrame with pd.concat inside the loop copies
# all previously loaded rows on every iteration and starts from an empty
# frame. pd.concat raises on an empty list, so fall back to an empty frame
# when there are no splits (same result the loop-based version produced).
df_all = pd.concat(split_frames, ignore_index=True) if split_frames else pd.DataFrame()

df_all[:3]

Unnamed: 0,user_id,item_id,label,dataset_type
0,A1RXI3A1E99112,1304351475,1,train
1,A26QL1FBQO9C0E,1304351475,1,train
2,A19KEEVZYO1KO6,1304351475,1,train


In [42]:
# Attach the item features to every interaction row.
# Left join on `item_id`: all interaction rows are kept, and rows whose item
# has no match in `df_feat` get missing values in the feature columns.
df_all_merged = df_all.merge(df_feat, how='left', on='item_id')
df_all_merged.head(3)

Unnamed: 0,user_id,item_id,label,dataset_type,sales_type,brand,categories
0,A1RXI3A1E99112,1304351475,1,train,Beauty,Omagazee,"'Beauty', 'Makeup', 'Eyes', 'Eye Shadow'"
1,A26QL1FBQO9C0E,1304351475,1,train,Beauty,Omagazee,"'Beauty', 'Makeup', 'Eyes', 'Eye Shadow'"
2,A19KEEVZYO1KO6,1304351475,1,train,Beauty,Omagazee,"'Beauty', 'Makeup', 'Eyes', 'Eye Shadow'"


In [43]:
# Integer-encode the sparse (categorical) columns with a LabelEncoder.
# The same encoder object is re-fitted on each column in turn, so after the
# loop `lbe` only holds the classes of the last column processed.
encodered_columns = ['user_id', 'item_id', 'sales_type', 'brand','categories']
lbe = LabelEncoder()
for column in encodered_columns:
    encoded_values = lbe.fit_transform(df_all_merged[column])
    df_all_merged[column] = encoded_values

# Move `label` to the last position; the other columns keep their order.
columns = df_all_merged.columns.drop('label').tolist() + ['label']
df_all_merged = df_all_merged.loc[:, columns]
df_all_merged.head(3)

Unnamed: 0,user_id,item_id,dataset_type,sales_type,brand,categories,label
0,180698,0,train,3,4545,111,1
1,275641,0,train,3,4545,111,1
2,62339,0,train,3,4545,111,1


In [44]:
# Write one CSV per split (train.csv / valid.csv / test.csv) into the output
# directory. Only CSV files are written here; no h5 file is produced.
# The output directory does not depend on the split, so it is computed and
# created once before the loop. This also guarantees `output_path` is defined
# for the feature-map cell regardless of how many splits are processed.
output_path = f"/data2/wangzhongren/taolin_project/FuxiCTR/data/{dataname}-base"
os.makedirs(output_path,exist_ok = True)
for dataset_type in dataset_types:
    # Select the rows of this split; the helper column is not exported.
    split_mask = df_all_merged['dataset_type'] == dataset_type
    df_split = df_all_merged[split_mask].drop(columns=['dataset_type'])
    df_split.to_csv(os.path.join(output_path,f'{dataset_type}.csv'),index=False)
    print(f"{dataset_type}.csv saved")

train.csv saved
valid.csv saved
test.csv saved


In [27]:
# Build the feature map describing the encoded columns and save it as
# feature_map.json next to the exported CSV files.

# errors='ignore' keeps this cell re-runnable: on a second run the
# `dataset_type` column is already gone and a plain drop would raise KeyError.
df_all_merged = df_all_merged.drop(columns=['dataset_type'], errors='ignore')

feature_specs = {}
col_value_index = 0  # not used in this cell; kept so the notebook namespace is unchanged
num_features = 0
for index,col in enumerate(df_all_merged.columns):
    if col in encodered_columns:
        # Number of distinct values in the column, computed once and reused
        # for both the per-feature vocab size and the running total.
        vocab_size = int(df_all_merged[col].nunique())
        feature_specs[col] = {
                    'source': '',
                    'type':"categorical",
                    'vocab_size': vocab_size,
                    'index': index  # position of the column in df_all_merged
                }
        num_features = num_features + vocab_size
feature_map = {
        'dataset_id': f'amazon_{dataname}_base',
        # every column except one (`label` was moved to the end earlier)
        'num_fields': len(df_all_merged.columns)-1,
        # 'feature_len': len(feature_specs),
        'num_features': num_features,
        'feature_specs': feature_specs
    }
# Use a context manager so the file is flushed and closed deterministically
# instead of leaving an open handle behind.
with open(os.path.join(output_path,'feature_map.json'), 'w') as feature_map_file:
    json.dump(feature_map, feature_map_file, indent=4)
print("feature_map saved")

feature_map saved
