In [1]:
import pandas as pd
import numpy as np
import os, pickle
import torch
from collections import defaultdict, Counter
from sklearn.preprocessing import LabelEncoder

In [2]:
OUT_PATH = 'data_cache/'
if not os.path.exists(OUT_PATH):
    os.makedirs(OUT_PATH)

In [3]:
user_act = pd.read_csv('data/action_head.csv', parse_dates=['action_time'], header=0)
user_act = user_act.sort_values(by='action_time')[['user_id', 'sku_id']]
user_act.head()

Unnamed: 0,user_id,sku_id
9138,577836,98373
54112,539770,289214
28545,745171,320439
9137,577836,343787
31159,402115,47830


In [4]:
product = pd.read_csv('data/jdata_product.csv', header=0)
print(product.shape)
# 仅使用出现过的sku_id
product = product[product.sku_id.isin(set(user_act.sku_id))][['sku_id','brand','shop_id','cate']]
print(product.shape)
product.head(2)

(352539, 5)
(33344, 4)


Unnamed: 0,sku_id,brand,shop_id,cate
3,366931,2698,10252,79
21,133835,3274,6700,79


In [5]:
# 为简单，这里仅保留product中出现过的sku
user_act = user_act.merge(product, on='sku_id', how='inner')
user_act.head(2)

Unnamed: 0,user_id,sku_id,brand,shop_id,cate
0,577836,98373,262,7082,7
1,812116,98373,262,7082,7


In [8]:
# label encoder
lbe_dict = {}

# label encoder + 1, 防止需要padding
for col in ['sku_id', 'brand', 'shop_id', 'cate']:
    lbe = LabelEncoder()
    user_act[col] = lbe.fit_transform(user_act[col])+1
    lbe_dict[col] = lbe

pickle.dump(lbe_dict, open(f'{OUT_PATH}/label_dict.pkl','wb'))

In [9]:
product = user_act[['sku_id', 'brand', 'shop_id', 'cate']].drop_duplicates(subset=['sku_id', 'brand', 'shop_id', 'cate'])
product.shape

(33344, 4)

In [11]:
# 计算词频
word_counts = Counter(user_act['sku_id'].tolist()).items()
# 按label encoder 进行排序，因为需要跟后面Embedding table采样保持一致
word_counts = sorted(word_counts, key=lambda x:x[0])
counts = np.array([wc[1] for wc in word_counts])

noise_dist = torch.from_numpy(counts**(0.75)/np.sum(counts**(0.75)))

pickle.dump(word_counts, open(f'{OUT_PATH}/word_counts.pkl', 'wb')) 

In [12]:
df_corpus = user_act.groupby('user_id')['sku_id'].agg(list)
df_corpus.head(2)

user_id
28                [21864, 11800, 28044]
40    [4215, 31898, 8519, 21941, 14684]
Name: sku_id, dtype: object

In [13]:
# 生成训练pairs
def generate_context_pairs(corpus, window=5):
    """
    copurs: 语料；
    window: 窗口，中心词到两端的最远距离；
    """
    all_pairs = []
    for k in range(len(corpus)):
        sent = corpus[k]
        if len(sent)<2: continue
        for i in range(len(sent)):
            for j in range(max(i-window, 0), min(i+window+1, len(sent))):
                if i!=j:
                    all_pairs.append([sent[i], sent[j]])
    return all_pairs


In [14]:
all_pairs = generate_context_pairs(df_corpus.values, window=5)

In [15]:
df_pair = pd.DataFrame(all_pairs, columns=['sku_id', 'label'])
print(df_pair.shape)
df_pair.head(2)

(704626, 2)


Unnamed: 0,sku_id,label
0,21864,11800
1,21864,28044


In [16]:
df_pair = df_pair.merge(product, on='sku_id', how='left')

df_pair.head()

Unnamed: 0,sku_id,label,brand,shop_id,cate
0,21864,11800,2347,4110,6
1,21864,28044,2347,4110,6
2,11800,21864,748,1624,6
3,11800,28044,748,1624,6
4,28044,21864,748,1624,6


In [18]:
df_pair.to_pickle(f'{OUT_PATH}/pairs.pkl')