In [1]:
import pandas as pd 
import numpy as np 
import networkx as nx 
import matplotlib.pyplot as plt 
import random 
from tqdm import tqdm
from gensim.models import Word2Vec
from sklearn.preprocessing import LabelEncoder

In [2]:
# Input CSV locations (relative paths); both are loaded by read_action_data().
# User action log: the columns user_id, sku_id, action_time and type are read from it.
action_data_path = "./data/action_head.csv"
# Product table: the columns sku_id, brand, shop_id and cate are read from it.
product_data_path = "./data/jdata_product.csv"

In [3]:
def read_action_data():
    """Load the action log, attach product side info and integer-encode the ids.

    Reads the CSV files at the module-level ``action_data_path`` and
    ``product_data_path``, left-joins the product attributes onto every action
    row, label-encodes ``brand`` / ``shop_id`` / ``cate`` and re-indexes
    ``sku_id`` so that index 0 is the sku with the most action rows.

    Returns:
        pandas.DataFrame: one row per action, holding the action columns
        (``user_id``, ``sku_id``, ``action_time``, ``type``) plus the encoded
        ``brand``, ``shop_id`` and ``cate`` columns.
    """
    action_data_df = pd.read_csv(action_data_path,
                                 usecols=['user_id', 'sku_id', 'action_time', 'type'],
                                 parse_dates=['action_time'])

    # Drop products for which any side-info field is missing.
    jdata_product_df = pd.read_csv(product_data_path,
                                   usecols=['sku_id', 'brand', 'shop_id', 'cate'])
    jdata_product_df = jdata_product_df.dropna()

    # Left join: every action row is kept, and only side info of skus that occur
    # in the action log is attached. Skus without a product row get NaN here.
    side_info_data_df = pd.merge(action_data_df, jdata_product_df, on='sku_id', how='left')

    # Label-encode each side-info column. Only the non-missing values are fed
    # to the encoder; rows with missing side info share one dedicated class
    # placed after all real classes. Fitting the encoder directly on NaN gives
    # results that depend on the installed sklearn/numpy version, and the
    # fillna below would never reach these columns anyway.
    for side_info in ['brand', 'shop_id', 'cate']:
        column = side_info_data_df[side_info]
        has_value = column.notna().to_numpy()
        lbe = LabelEncoder()
        codes = np.empty(len(column), dtype=np.int64)
        codes[has_value] = lbe.fit_transform(column[has_value])
        codes[~has_value] = len(lbe.classes_)
        side_info_data_df[side_info] = codes

    # Number of action rows per sku.
    sku_count_dict = side_info_data_df.groupby('sku_id')['user_id'].count().to_dict()

    # Most frequent sku first (the ordering is wanted later for
    # sampled_softmax_loss). sorted() is stable, so ties keep groupby order.
    sku_id_sort_by_count_list = sorted(sku_count_dict.items(), key=lambda x: x[1], reverse=True)

    # Encode sku ids by frequency rank.
    sku_map_dict = {sku_id: rank for rank, (sku_id, _) in enumerate(sku_id_sort_by_count_list)}
    side_info_data_df['sku_id'] = side_info_data_df['sku_id'].map(sku_map_dict)

    # Fill whatever is still missing in the remaining columns.
    side_info_data_df = side_info_data_df.fillna(0)

    return side_info_data_df

In [4]:
# 通过用户的行为序列，构造多个session,最终是通过session来构建图
def get_one_user_session(df, time_cut=30, cut_type=2):
    """Split one user's time-ordered action sequence into sessions.

    The current session is closed right after the current sku when any of the
    following holds:

    * the action's type equals ``cut_type``;
    * the gap between this action and the next one exceeds ``time_cut`` minutes;
    * this is the user's last action.

    Args:
        df: mapping-like object (for example one row of the per-user
            aggregated frame) whose ``'sku_id'``, ``'action_time'`` and
            ``'type'`` entries are equal-length sequences sorted by time.
        time_cut: maximum gap, in minutes, allowed between two consecutive
            actions of the same session.
        cut_type: action type that always terminates a session (the order
            action, according to the original notes).

    Returns:
        list[list]: the user's sessions in chronological order; each session
        is the list of sku ids it contains. Empty input yields ``[]``.
    """
    # Materialise as plain lists so that indexing below is always positional,
    # whatever container the caller hands in.
    sku_list = list(df['sku_id'])
    action_time_list = list(df['action_time'])
    action_type_list = list(df['type'])

    user_session_list = []
    tmp_session_list = []
    last_index = len(sku_list) - 1

    for i, sku in enumerate(sku_list):
        # The current sku always belongs to the session being built.
        tmp_session_list.append(sku)

        is_last = i == last_index
        # Compare the action *type* (not the timestamp) with cut_type.
        is_cut_action = action_type_list[i] == cut_type
        # total_seconds() keeps whole days, unlike the .seconds attribute,
        # so overnight or multi-day gaps are measured correctly.
        gap_too_long = (not is_last) and \
            (action_time_list[i + 1] - action_time_list[i]).total_seconds() / 60 > time_cut

        if is_cut_action or is_last or gap_too_long:
            user_session_list.append(tmp_session_list)
            tmp_session_list = []

    return user_session_list

In [5]:
def gene_all_user_session(action_data_df):
    """Collect every session of length greater than one across all users.

    Args:
        action_data_df: action rows with at least the columns ``user_id``,
            ``action_time`` and ``type``; every remaining column is
            aggregated into a per-user list and handed to
            ``get_one_user_session``.

    Returns:
        list[list]: all sessions that contain more than one item, which are
        the ones later used to build the graph.
    """
    # Keep only rows whose action type is one of the listed values.
    kept_types = [1, 2, 3, 4, 5]
    type_mask = action_data_df['type'].isin(kept_types)
    filtered_df = action_data_df[type_mask]

    # Order each user's actions chronologically (ascending time).
    ordered_df = filtered_df.sort_values(['user_id', 'action_time'], ascending=True)

    # One row per user, each cell holding that user's values as a list;
    # every row is then cut into sessions.
    per_user_rows = ordered_df.groupby('user_id').agg(list)
    user_action_list = per_user_rows.apply(get_one_user_session, axis=1)

    # Flatten, dropping single-item sessions (they contribute no edge).
    return [
        session
        for user_sessions in user_action_list
        for session in user_sessions
        if len(session) > 1
    ]

In [6]:
def gene_graph(all_user_session_list):
    """Build a weighted directed item graph from user sessions.

    Every pair of consecutive items in a session contributes one occurrence
    of the directed edge (item -> next item); the number of occurrences is
    the edge weight. The edge list is written to ``./graph.csv`` (space
    separated, no header, no index) and read back with networkx.

    Args:
        all_user_session_list: list of sessions, each a list of item ids.

    Returns:
        networkx.DiGraph: the graph loaded from the saved edge list, with an
        integer ``weight`` attribute on every edge.
    """
    # key: (src, dst) edge; value: how many times the edge was seen (its weight)
    edges_dict = {}
    for session in all_user_session_list:
        for edge in zip(session, session[1:]):
            edges_dict[edge] = edges_dict.get(edge, 0) + 1

    # Persist the graph structure so it can be reloaded later.
    graph_df = pd.DataFrame({'src_node': [src for src, _ in edges_dict],
                             'dst_node': [dst for _, dst in edges_dict],
                             'weight': list(edges_dict.values())})
    # Space-separated so that the file is directly readable as an edge list.
    graph_df.to_csv('./graph.csv', sep=' ', header=False, index=False)

    # Rebuild the graph from the edge file.
    # NOTE(review): with nodetype=None node labels are presumably kept as the
    # strings read from the file - confirm this is what downstream code expects.
    G = nx.read_edgelist('./graph.csv', create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)])

    # Return the graph instead of discarding it; existing callers that ignore
    # the return value are unaffected.
    return G


In [7]:
def main():
    """Run the preprocessing pipeline and write its CSV outputs.

    Loads the encoded action data, cuts it into sessions, builds the graph
    from those sessions, then saves the full side-info table to
    ``./sideinfo.csv`` and one side-info row per sku to ``./sku_sideinfo.csv``.
    """
    # Load user actions joined with encoded side info.
    side_info_data_df = read_action_data()

    # Sessions only need the behaviour columns.
    session_columns = ['user_id', 'sku_id', 'action_time', 'type']
    all_user_session_list = gene_all_user_session(side_info_data_df[session_columns])

    # Build (and save) the graph from the sessions.
    gene_graph(all_user_session_list)

    # Save the full side-info table.
    side_info_data_df.to_csv('./sideinfo.csv', sep=',', header=True, index=False)

    # One side-info row per sku (first occurrence wins).
    side_info_columns = ['sku_id', 'brand', 'shop_id', 'cate']
    sku_side_info = side_info_data_df[side_info_columns].drop_duplicates(subset=['sku_id'])
    sku_side_info.to_csv('./sku_sideinfo.csv', sep=',', header=True, index=False)


if __name__ == '__main__':
    main()