In [10]:
import pandas as pd
import numpy as np
import os
from collections import defaultdict, Counter
from sklearn.preprocessing import LabelEncoder

In [2]:
user_act = pd.read_csv('data/action_head.csv', parse_dates=['action_time'], header=0)
user_act = user_act.sort_values(by='action_time')
user_act.head()

Unnamed: 0,user_id,sku_id,action_time,module_id,type
9138,577836,98373,2018-02-01 00:00:19,8036133,1
54112,539770,289214,2018-02-01 00:00:28,4717767,1
28545,745171,320439,2018-02-01 00:00:33,6107582,1
9137,577836,343787,2018-02-01 00:01:03,8036133,1
31159,402115,47830,2018-02-01 00:01:10,10323941,1


In [4]:
user_act.dtypes

user_id                 int64
sku_id                  int64
action_time    datetime64[ns]
module_id               int64
type                    int64
dtype: object

In [7]:
product = pd.read_csv('data/jdata_product.csv', header=0)
print(product.shape)
# 仅使用出现过的sku_id
product = product[product.sku_id.isin(set(user_act.sku_id))]
print(product.shape)
product.head(2)

(352539, 5)
(33344, 5)


Unnamed: 0,sku_id,brand,shop_id,cate,market_time
3,366931,2698,10252,79,2016-09-11 15:00:22.0
21,133835,3274,6700,79,2016-03-11 12:59:06.0


In [8]:
# label encoder
lbe_dict = {}
lbe_sku = LabelEncoder()

# label encoder + 1, 防止需要padding
user_act['sku_id'] = lbe_sku.fit_transform(user_act['sku_id'])+1
lbe_dict['sku_id'] = lbe_sku

product['sku_id'] = lbe_sku.transform(product['sku_id'])+1
# product encoder
for col in ['brand', 'shop_id', 'cate']:
    lbe = LabelEncoder()
    product[col] = lbe.fit_transform(product[col])+1
    lbe_dict[col] = lbe


In [9]:
df_corpus = user_act.groupby('user_id')['sku_id'].agg(list)
df_corpus.head(2)

user_id
28                [12066, 28627, 22327]
40    [8709, 32572, 4302, 22404, 15008]
Name: sku_id, dtype: object

In [7]:
# 生成训练pairs
def generate_context_pairs(corpus, window=5):
    """
    copurs: 语料；
    window: 窗口，中心词到两端的最远距离；
    """
    all_pairs = []
    for k in range(len(corpus)):
        sent = corpus[k]
        if len(sent)<2: continue
        for i in range(len(sent)):
            for j in range(max(i-window, 0), min(i+window+1, len(sent))):
                if i!=j:
                    all_pairs.append([sent[i], sent[j]])
    return all_pairs


In [8]:
all_pairs = generate_context_pairs(df_corpus.values, window=5)

In [9]:
df_pair = pd.DataFrame(all_pairs, columns=['sku_id', 'label'])

In [22]:
# 计算词频
word_counts = Counter(user_act['sku_id'].tolist()).items()
# 按label encoder 进行排序，因为需要跟后面Embedding table采样保持一致
word_counts = sorted(word_counts, key=lambda x:x[0])

34048

In [24]:
word_counts

[(1, 6),
 (2, 2),
 (3, 2),
 (4, 4),
 (5, 1),
 (6, 1),
 (7, 5),
 (8, 1),
 (9, 1),
 (10, 6),
 (11, 5),
 (12, 8),
 (13, 5),
 (14, 1),
 (15, 10),
 (16, 1),
 (17, 1),
 (18, 1),
 (19, 7),
 (20, 2),
 (21, 1),
 (22, 3),
 (23, 1),
 (24, 3),
 (25, 1),
 (26, 1),
 (27, 2),
 (28, 1),
 (29, 1),
 (30, 2),
 (31, 1),
 (32, 1),
 (33, 1),
 (34, 2),
 (35, 2),
 (36, 1),
 (37, 1),
 (38, 5),
 (39, 1),
 (40, 1),
 (41, 1),
 (42, 2),
 (43, 1),
 (44, 7),
 (45, 1),
 (46, 1),
 (47, 2),
 (48, 2),
 (49, 1),
 (50, 2),
 (51, 1),
 (52, 1),
 (53, 1),
 (54, 1),
 (55, 2),
 (56, 1),
 (57, 1),
 (58, 1),
 (59, 1),
 (60, 2),
 (61, 3),
 (62, 1),
 (63, 1),
 (64, 3),
 (65, 7),
 (66, 10),
 (67, 4),
 (68, 1),
 (69, 1),
 (70, 2),
 (71, 1),
 (72, 1),
 (73, 1),
 (74, 1),
 (75, 5),
 (76, 1),
 (77, 1),
 (78, 1),
 (79, 8),
 (80, 3),
 (81, 1),
 (82, 1),
 (83, 4),
 (84, 1),
 (85, 1),
 (86, 1),
 (87, 1),
 (88, 2),
 (89, 1),
 (90, 2),
 (91, 1),
 (92, 30),
 (93, 1),
 (94, 1),
 (95, 1),
 (96, 2),
 (97, 2),
 (98, 1),
 (99, 1),
 (100, 1),
 (101