In [1]:
import time
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.preprocessing import LabelEncoder

import lightgbm as lgb

In [2]:
raw_train = pd.read_csv('./data/train.csv')
raw_test = pd.read_csv('./data/test.csv')
submit_df = pd.read_csv('./data/submit_example.csv')

In [3]:
display(raw_train, raw_test, submit_df)

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2019-10-01 00:00:00 UTC,cart,5773203,1.490000e+18,,runail,2.62,1,26dd6e6e-4dac-4778-8d2c-92e149dab885
1,2019-10-01 00:00:03 UTC,cart,5773353,1.490000e+18,,runail,2.62,1,26dd6e6e-4dac-4778-8d2c-92e149dab885
2,2019-10-01 00:00:07 UTC,cart,5723490,1.490000e+18,,runail,2.62,1,26dd6e6e-4dac-4778-8d2c-92e149dab885
3,2019-10-01 00:02:32 UTC,cart,5857283,1.490000e+18,,runail,2.62,1,26dd6e6e-4dac-4778-8d2c-92e149dab885
4,2019-10-01 00:02:40 UTC,cart,5723523,1.490000e+18,,runail,2.62,1,26dd6e6e-4dac-4778-8d2c-92e149dab885
...,...,...,...,...,...,...,...,...,...
416957,2019-10-03 08:23:30 UTC,view,5696227,1.490000e+18,,bluesky,82.54,53972,12749b0a-cfb8-82df-912e-e726f6aeafc4
416958,2019-10-03 08:26:50 UTC,view,5707747,1.490000e+18,,,73.02,53972,12749b0a-cfb8-82df-912e-e726f6aeafc4
416959,2019-10-03 08:13:42 UTC,view,5689159,1.490000e+18,,markell,5.79,53973,c8e37477-ef06-4121-951e-46d090a667a6
416960,2019-10-03 08:13:44 UTC,view,5809856,1.780000e+18,,grattol,15.08,53974,d1122de6-9f24-433f-9b5e-179fad5532e1


Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2019-10-03 08:14:04 UTC,view,5651977,1.490000e+18,,,31.73,53978,ee964fbc-668b-41c9-94d5-46e5bd82d174
1,2019-10-03 08:14:41 UTC,view,37987,1.490000e+18,,runail,42.86,53978,ee964fbc-668b-41c9-94d5-46e5bd82d174
2,2019-10-03 08:14:08 UTC,view,5753054,1.490000e+18,,,0.79,53980,3f482f26-5f0c-4bd7-bc3d-29a1ad5103ee
3,2019-10-03 08:17:21 UTC,cart,5753054,1.490000e+18,,,0.79,53980,3f482f26-5f0c-4bd7-bc3d-29a1ad5103ee
4,2019-10-03 08:17:24 UTC,cart,5753054,1.490000e+18,,,0.79,53980,3f482f26-5f0c-4bd7-bc3d-29a1ad5103ee
...,...,...,...,...,...,...,...,...,...
5159,2019-10-04 08:43:18 UTC,view,5870297,1.490000e+18,,estel,3.97,67469,91404fcc-31fa-4134-8dd0-2f85ed6c8964
5160,2019-10-04 08:43:19 UTC,view,5870125,1.490000e+18,,estel,5.56,67469,60b32523-a303-43f7-8cd7-b69d0fd18405
5161,2019-10-04 08:43:21 UTC,view,5870126,1.490000e+18,,estel,9.29,67469,e66ceee2-dc38-418c-a05f-52cf07ad2e43
5162,2019-10-04 08:43:21 UTC,view,5870296,1.490000e+18,,estel,3.97,67469,75abd9a1-e3ab-47c8-8c0d-09d8d21d1b05


Unnamed: 0,user_id,product_id
0,53978,4229
1,53980,4229
2,53982,4229
3,53985,4229
4,53986,4229
...,...,...
553,67454,4229
554,67455,4229
555,67461,4229
556,67467,4229


# 预处理

In [4]:
for df in [raw_train, raw_test]:
    # 处理空值
    for f in ['category_code', 'brand']:
        df[f].fillna('<unkown>', inplace=True)

    # 处理时间
    df['event_time'] = pd.to_datetime(df['event_time'], format='%Y-%m-%d %H:%M:%S UTC')
    df['timestamp'] = df['event_time'].apply(lambda x: time.mktime(x.timetuple()))
    df['timestamp'] = df['timestamp'].astype(int)

In [5]:
# 排序
raw_train = raw_train.sort_values(['user_id', 'timestamp'])
raw_test = raw_test.sort_values(['user_id', 'timestamp'])

In [6]:
# 处理非数值特征
df = pd.concat([raw_train, raw_test], ignore_index=True)

for f in ['event_type', 'category_code', 'brand']:
    # 构建编码器
    le = LabelEncoder()
    le.fit(df[f])

    # 设置新值
    raw_train[f] = le.transform(raw_train[f])
    raw_test[f] = le.transform(raw_test[f])

In [7]:
# 删除无用列
useless = ['event_time', 'user_session', 'timestamp']
for df in [raw_train, raw_test]:
    df.drop(columns=useless, inplace=True)

# 滑动窗口构造数据集
为了让机器学习模型能够处理时序数据，必须通过滑动窗口构造数据，后一个时间点的作为前一个时间点的预测值

In [8]:
# 训练集数据生成：滑动窗口
# 用前一个时间节点的数据预测后一个时间节点是商品
train_df = pd.DataFrame()
user_ids = raw_train['user_id'].unique()
for uid in tqdm(user_ids):
    user_data = raw_train[raw_train['user_id'] == uid].copy(deep=True)
    if user_data.shape[0] < 2:
        # 小于两条的，直接忽略
        continue

    user_data['y'] = user_data['product_id'].shift(-1)
    user_data = user_data.head(user_data.shape[0]-1)
    train_df = train_df.append(user_data)

train_df['y'] = train_df['y'].astype(int)
train_df = train_df.reset_index(drop=True)

100%|██████████| 53975/53975 [05:54<00:00, 152.32it/s]


In [9]:
# 测试集数据生成，只取每个用户最后一次操作用来做预测
test_df = raw_test.groupby(['user_id'], as_index=False).last()

In [10]:
train_df.drop(columns=['user_id'], inplace=True)

In [11]:
display(train_df, test_df)

Unnamed: 0,event_type,product_id,category_id,category_code,brand,price,y
0,0,5773203,1.490000e+18,0,182,2.62,5773353
1,0,5773353,1.490000e+18,0,182,2.62,5723490
2,0,5723490,1.490000e+18,0,182,2.62,5857283
3,0,5857283,1.490000e+18,0,182,2.62,5723523
4,0,5723523,1.490000e+18,0,182,2.62,5773313
...,...,...,...,...,...,...,...
362982,3,5692613,1.490000e+18,0,197,6.98,5758032
362983,3,5758032,1.490000e+18,0,0,55.56,5793292
362984,3,5793292,1.490000e+18,0,98,55.65,5877795
362985,3,5877795,1.490000e+18,0,0,158.73,5696227


Unnamed: 0,user_id,event_type,product_id,category_id,category_code,brand,price
0,53978,3,37987,1.490000e+18,0,182,42.86
1,53980,1,5873653,1.490000e+18,0,0,3.02
2,53982,3,5747406,1.490000e+18,0,141,6.33
3,53985,3,5689124,1.490000e+18,0,197,23.57
4,53986,3,5692527,1.490000e+18,0,0,46.02
...,...,...,...,...,...,...,...
553,67454,3,5759279,1.490000e+18,0,94,1.11
554,67455,0,5869144,1.780000e+18,0,40,7.94
555,67461,2,5886754,1.490000e+18,0,0,1.59
556,67467,2,5854319,1.490000e+18,0,182,2.38


# 训练模型&预测

In [12]:
user_ids = test_df['user_id'].unique()

In [16]:
preds = []
for uid in tqdm(user_ids):
    pids = raw_test[raw_test['user_id'] == uid]['product_id'].unique()

    # 找到训练集中有这些product_id的数据作为当前用户的训练集
    p_train = train_df[train_df['product_id'].isin(pids)]
    
    # 只取最后一条进行预测
    user_test = test_df[test_df['user_id'] == uid].drop(columns=['user_id'])

    X_train = p_train.iloc[:, :-1]
    y_train = p_train['y']

    if len(X_train) > 0:
        # 训练
        clf = lgb.LGBMClassifier(**{'seed': int(time.time())})
        clf.fit(X_train, y_train)
    
        # 预测
        pred = clf.predict(user_test)[0]
    else:
        # 训练集中无对应数据
        # 直接取最后一条数据作为预测值
        pred = user_test['product_id'].iloc[0]

    preds.append(pred)

100%|██████████| 558/558 [00:02<00:00, 202.01it/s]


In [17]:
submit_df['product_id'] = preds

# 分数 0.206
submit_df.to_csv('baseline.csv', index=False)

# To Do
* 异常数据处理
* 特征工程
* 处理时间特征（比如是否有周期性、下单的高峰时间点等）
* 处理session
* 交叉验证&调参
* 模型融合