In [14]:
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm
import gc, os
import logging
import time
import lightgbm as lgb
from gensim.models import Word2Vec
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')
import pickle

In [3]:
data_path = '../data/raw/'
save_path = "../temp/"

In [4]:
def trn_val_split(all_click_df, sample_user_nums):
    all_click = all_click_df
    all_user_ids = all_click.user_id.unique()
    
    # replace=True表示可以重复抽样，反之不可以
    sample_user_ids = np.random.choice(all_user_ids, size=sample_user_nums, replace=False) 
    
    click_val = all_click[all_click['user_id'].isin(sample_user_ids)]
    click_trn = all_click[~all_click['user_id'].isin(sample_user_ids)]
    
    # 将验证集中的最后一次点击给抽取出来作为答案
    click_val = click_val.sort_values(['user_id', 'click_timestamp'])
    val_ans = click_val.groupby('user_id').tail(1)
    
    click_val = click_val.groupby('user_id').apply(lambda x: x[:-1]).reset_index(drop=True)
    
    # 去除val_ans中某些用户只有一个点击数据的情况，如果该用户只有一个点击数据，又被分到ans中，
    # 那么训练集中就没有这个用户的点击数据，出现用户冷启动问题，给自己模型验证带来麻烦
    val_ans = val_ans[val_ans.user_id.isin(click_val.user_id.unique())] # 保证答案中出现的用户再验证集中还有
    click_val = click_val[click_val.user_id.isin(val_ans.user_id.unique())]
    
    return click_trn, click_val, val_ans

In [5]:
def get_hist_and_last_click(all_click):
    all_click = all_click.sort_values(by=['user_id', 'click_timestamp'])
    click_last_df = all_click.groupby('user_id').tail(1)

    # 如果用户只有一个点击，hist为空了，会导致训练的时候这个用户不可见，此时默认泄露一下
    def hist_func(user_df):
        if len(user_df) == 1:
            return user_df
        else:
            return user_df[:-1]

    click_hist_df = all_click.groupby('user_id').apply(hist_func).reset_index(drop=True)

    return click_hist_df, click_last_df

In [2]:
emb = pd.read_csv('../data/raw/articles_emb.csv')
emb.head

<bound method NDFrame.head of         article_id     emb_0     emb_1     emb_2     emb_3     emb_4  \
0                0 -0.161183 -0.957233 -0.137944  0.050855  0.830055   
1                1 -0.523216 -0.974058  0.738608  0.155234  0.626294   
2                2 -0.619619 -0.972960 -0.207360 -0.128861  0.044748   
3                3 -0.740843 -0.975749  0.391698  0.641738 -0.268645   
4                4 -0.279052 -0.972315  0.685374  0.113056  0.238315   
...            ...       ...       ...       ...       ...       ...   
364042      364042 -0.055038 -0.962136  0.869436 -0.071523 -0.725294   
364043      364043 -0.136932 -0.995471  0.991298  0.031871 -0.915621   
364044      364044 -0.251390 -0.976243  0.586097  0.643631 -0.663359   
364045      364045  0.224342 -0.923288 -0.381742  0.687890 -0.773911   
364046      364046 -0.257134 -0.994631  0.983792 -0.190975 -0.953720   

           emb_5     emb_6     emb_7     emb_8  ...   emb_240   emb_241  \
0       0.901365 -0.335148 -0.

In [3]:
emb_cols = [f'emb_{i}' for i in range(250)]  # 根据实际列数调整

# 直接使用numpy数组构建字典(最快)
article_ids = emb['article_id'].values
embedding_matrix = emb[emb_cols].values

item_emb_dict = {
    article_id: embedding_matrix[idx] 
    for idx, article_id in enumerate(article_ids)
}

In [4]:
item_emb_dict[2]

array([-0.61961854, -0.9729604 , -0.20736018, -0.12886102,  0.04474759,
       -0.387535  , -0.73047674, -0.06612612, -0.75489885, -0.24200428,
        0.670484  , -0.2803883 , -0.557285  , -0.08414505,  0.02778196,
        0.29407424,  0.36269727, -0.3685494 ,  0.14796   , -0.01175088,
        0.03020873,  0.10631693,  0.6280128 ,  0.388849  ,  0.6159109 ,
       -0.44511306,  0.10602808,  0.13710949, -0.09553552,  0.3425321 ,
        0.5926465 , -0.26179096,  0.34212252,  0.7045392 , -0.43306684,
        0.1041543 ,  0.7859709 ,  0.5886402 , -0.62768734, -0.14329416,
        0.39983153, -0.70823455, -0.73296404, -0.95824176, -0.629325  ,
       -0.28223997,  0.0551875 , -0.70930463,  0.5806534 , -0.5183282 ,
        0.0590419 ,  0.66433567,  0.37024036, -0.22426963, -0.22767073,
        0.6944705 ,  0.16796917,  0.10058454,  0.9468768 , -0.47480643,
        0.91217107, -0.43829462, -0.04617592,  0.80739474, -0.2778143 ,
       -0.6002078 , -0.5066402 , -0.00820139, -0.8228875 ,  0.20

In [5]:
click = pd.read_csv('../data/raw/train_click_log.csv')
click.head()

Unnamed: 0,user_id,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
0,199999,160417,1507029570190,4,1,17,1,13,1
1,199999,5408,1507029571478,4,1,17,1,13,1
2,199999,50823,1507029601478,4,1,17,1,13,1
3,199998,157770,1507029532200,4,1,17,1,25,5
4,199998,96613,1507029671831,4,1,17,1,25,5


In [6]:
articles = pd.read_csv('../data/raw/articles.csv')
articles.head()

Unnamed: 0,article_id,category_id,created_at_ts,words_count
0,0,0,1513144419000,168
1,1,1,1405341936000,189
2,2,1,1408667706000,250
3,3,1,1408468313000,230
4,4,1,1407071171000,162


In [3]:
import pickle
import os

In [6]:
path = "../temp/all_recall_results.pkl"
with open(path, 'rb') as f:
    all_recall_results = pickle.load(f)

In [12]:
all_recall_results[131078]

[(124352, 0.9627451735014284),
 (124228, 0.9252055640072008),
 (124177, 0.9221347840781757),
 (123289, 0.920215561294154),
 (124350, 0.9160100563142084),
 (124194, 0.914803620585804),
 (123909, 0.9125198739466032),
 (158046, 0.911674797610559),
 (20249, 0.9116732013313792),
 (140646, 0.9116731574053989),
 (30064, 0.9116702021379017),
 (109812, 0.9116677594911227),
 (57771, 0.9116624471645811),
 (84020, 0.9116552238314753),
 (76002, 0.9116539948097181),
 (216448, 0.911653145725048),
 (158023, 0.9116511880754326),
 (136599, 0.9116497244798838),
 (160565, 0.9116487092010409),
 (158047, 0.911646830799328),
 (129520, 0.911646629736078),
 (140627, 0.9116449840967761),
 (13635, 0.9116435934092977),
 (58265, 0.9116427447774726),
 (158850, 0.9116414754524964),
 (201797, 0.9116395757670611),
 (106313, 0.9116386551328561),
 (123368, 0.9116195179612896),
 (140676, 0.9115379769795942),
 (124176, 0.9106024230639465)]

In [13]:
all_recall_results[163862]

[(30064, 0.911680424664485),
 (140646, 0.9116803259442406),
 (20249, 0.911680301037757),
 (158046, 0.9116664032198762),
 (158023, 0.9116624349377619),
 (140627, 0.9116582107981336),
 (57771, 0.9116569387560864),
 (84020, 0.9116539028821512),
 (76002, 0.9116536719311209),
 (109812, 0.9116535954002892),
 (160565, 0.91165017370228),
 (129520, 0.9116456769498669),
 (136599, 0.9116447748823133),
 (158850, 0.9116444914012448),
 (87212, 0.9116418526725125),
 (13635, 0.911641201028331),
 (106313, 0.9116378024254247),
 (216448, 0.9116343146120224),
 (174691, 0.9116321654089053),
 (187642, 0.9116311125439144),
 (214800, 0.910226777439227),
 (111210, 0.908954388983774),
 (71076, 0.9088510757017482),
 (160132, 0.9082284493551545),
 (233478, 0.9081793381521457),
 (293301, 0.9080102784012332),
 (40969, 0.9077640172247813),
 (257291, 0.9075642500453636),
 (297639, 0.9074363038684117),
 (159762, 0.9071887493317825)]

In [5]:
with open('../temp/user_history_dict.pkl', 'rb') as f:
    user_history_dict = pickle.load(f)

In [17]:
sp = list(user_history_dict.values())

In [18]:
lens = [len(x) for x in sp]

In [21]:
np.mean(lens), np.median(lens), np.max(lens), np.min(lens)

(4.642982456140351, 3.0, 179, 1)

In [22]:
with open('../temp/user_profile_dict.pkl', 'rb') as f:
    user_profile_dict = pickle.load(f)

In [27]:
user_profile_dict['60.0']

{'user_click_count': 0.0056179775280898875,
 'user_avg_time_gap': 0.000134400085980978,
 'device_group': '4.0',
 'avg_click_time': 1.0,
 'avg_word_count': 217.0}

In [28]:
with open('../temp/item_features_dict.pkl', 'rb') as f:
    item_features_dict = pickle.load(f)

In [29]:
item_features_dict.keys()

dict_keys(['258', '291', '860', '906', '1405', '1639', '1641', '1686', '1873', '1877', '1885', '1932', '1946', '1975', '2008', '2022', '2075', '2116', '2175', '2179', '2215', '2243', '2256', '2288', '2303', '2315', '2377', '2381', '2420', '2547', '2609', '2623', '2624', '2626', '2628', '2647', '2709', '2764', '2854', '3022', '3027', '3091', '3106', '3140', '3144', '3145', '3148', '3171', '3230', '3240', '3241', '3244', '3285', '3320', '3329', '3394', '3415', '3420', '3434', '3449', '3467', '3472', '3473', '3542', '3598', '3606', '3638', '3666', '3713', '3714', '3797', '3808', '3828', '3867', '3872', '3899', '3902', '4178', '4306', '4332', '4374', '4395', '4521', '4528', '4533', '4549', '4568', '4570', '4578', '4596', '4598', '4639', '4673', '4696', '4708', '4789', '4798', '4867', '4899', '4907', '4956', '4967', '4970', '5059', '5072', '5107', '5113', '5116', '5127', '5130', '5252', '5254', '5258', '5259', '5272', '5292', '5314', '5341', '5349', '5366', '5387', '5391', '5408', '5457', '

In [30]:
with open('../temp/feature_lists.pkl', 'rb') as f:
    feature_lists = pickle.load(f)

In [31]:
feature_lists

{'user_profile_features': ['user_click_count',
  'user_avg_time_gap',
  'device_group',
  'avg_click_time',
  'avg_word_count'],
 'item_features': ['category_id',
  'article_popularity',
  'created_at_ts',
  'words_count'],
 'context_features': ['score',
  'sim_1',
  'time_diff_1',
  'word_diff_1',
  'sim_2',
  'time_diff_2',
  'word_diff_2',
  'sim_3',
  'time_diff_3',
  'word_diff_3',
  'sim_max',
  'sim_mean',
  'sim_min',
  'sim_std',
  'item_user_sim',
  'recall_in_user_cat']}

In [32]:
with open('../temp/item_features_dict.pkl', 'rb') as f:
    item_features_dict = pickle.load(f)
    

In [35]:
item_features_dict['258']

{'category_id': 1,
 'article_popularity': 0.0,
 'created_at_ts': 1390406534000,
 'words_count': 177}

In [3]:
with open('../temp/all_recall_results.pkl', 'rb') as f:
    all_recall_results = pickle.load(f)

In [4]:
all_recall_results[131078]

[(124352, 0.8601620976338931),
 (124228, 0.8464642484093324),
 (124177, 0.837503989945276),
 (123289, 0.8336910527551649),
 (124350, 0.8331822210304173),
 (123909, 0.830144937640578),
 (48266, 0.8298004061084594),
 (71946, 0.8297993060295618),
 (62584, 0.8297991164318499),
 (83546, 0.8297983320743594),
 (41671, 0.829798119395187),
 (121546, 0.8297978997091426),
 (1946, 0.8297975893459317),
 (905, 0.8297973074223776),
 (166638, 0.8297972332319686),
 (14741, 0.8297971619267421),
 (26085, 0.8297967650080541),
 (48273, 0.8297967485212966),
 (89677, 0.8297965527410507),
 (85290, 0.829796442691944),
 (288281, 0.8297961562345315),
 (68615, 0.8297961508763353),
 (5515, 0.8297953306601471),
 (118843, 0.8297942367637836),
 (227579, 0.82979407890308),
 (10242, 0.82979407890308),
 (123368, 0.8297091285674957),
 (124176, 0.8296622224757202),
 (124194, 0.8287823832103026),
 (124749, 0.8279648475500317)]

In [5]:
len(all_recall_results)

200000

In [7]:
main_features = pd.read_csv('../temp/main_features.csv')

In [8]:
main_features.sample(30)

Unnamed: 0,user_id,item_id,score,sim_1,time_diff_1,word_diff_1,sim_2,time_diff_2,word_diff_2,sim_3,...,user_click_count,user_avg_time_gap,device_group,avg_click_time,avg_word_count,category_id,article_popularity,created_at_ts,words_count,label
690601,23020,70127,9,1,6,5,3,0,0,3,...,0,2,3.0,6,0.0,3,0,2,7,0
346029,11534,19781,6,5,3,7,3,0,0,3,...,0,2,1.0,7,0.0,1,2,7,7,0
1990863,66362,107150,9,6,2,8,4,1,3,3,...,3,5,1.0,6,0.0,5,4,7,2,0
5380834,179361,16536,8,1,5,5,0,3,3,0,...,4,4,1.0,5,0.0,0,0,2,5,0
1075474,35849,16536,8,1,5,3,3,0,0,3,...,0,2,3.0,6,0.0,0,0,2,5,0
2610658,87021,126655,0,3,7,3,3,0,0,3,...,0,2,1.0,3,0.0,7,0,0,9,0
4934455,164481,96210,3,7,2,8,4,1,4,4,...,3,5,1.0,3,0.0,5,6,9,9,1
1892693,63089,315105,1,7,1,7,3,0,0,3,...,0,2,3.0,4,0.0,9,5,6,0,0
2453293,81776,81975,5,0,7,2,0,4,2,0,...,2,4,3.0,4,0.0,4,0,0,4,0
1380023,46000,292726,0,8,0,2,3,0,0,3,...,0,2,1.0,5,0.0,9,4,8,6,0


In [4]:
main_features['item_user_sim'].value_counts()

item_user_sim
0.0    7500000
Name: count, dtype: int64

In [9]:
main_features['time_diff_3'].value_counts()

time_diff_3
0    4501987
4     755569
1     754362
3     746935
2     741147
Name: count, dtype: int64

In [6]:
main_features['user_click_count'].value_counts()

user_click_count
0    3949140
2    1170780
3     981180
4     846660
1     552240
Name: count, dtype: int64

In [10]:
ture_main_features = main_features[main_features['label']==1]

In [11]:
ture_main_features.shape

(76790, 28)

In [12]:
ture_main_features.sample(20)

Unnamed: 0,user_id,item_id,score,sim_1,time_diff_1,word_diff_1,sim_2,time_diff_2,word_diff_2,sim_3,...,user_click_count,user_avg_time_gap,device_group,avg_click_time,avg_word_count,category_id,article_popularity,created_at_ts,words_count,label
5241988,174732,331116,1,7,2,7,5,1,2,4,...,4,5,3.0,5,0.0,9,6,9,1,1
4865578,162185,236428,1,8,1,1,5,0,0,4,...,3,5,1.0,4,0.0,8,4,9,1,1
2091532,69717,48403,0,7,0,4,3,0,0,3,...,0,2,3.0,4,0.0,1,5,6,9,1
4574514,152483,168623,0,7,0,2,5,0,3,3,...,1,2,1.0,0,0.0,7,6,5,2,1
839302,27976,202476,0,8,0,5,3,0,0,3,...,0,2,3.0,6,0.0,8,4,9,1,1
2159810,71993,336223,1,7,0,6,3,0,0,3,...,0,2,1.0,3,0.0,9,6,6,0,1
5832201,194406,111043,3,7,3,6,5,0,1,4,...,4,5,3.0,1,0.0,6,6,8,5,1
5531154,184371,284547,0,8,0,3,3,0,0,3,...,0,2,1.0,0,0.0,9,5,5,4,1
204652,6821,285849,0,8,0,3,3,0,0,3,...,0,2,1.0,7,0.0,9,5,9,4,1
4716656,157221,293114,1,7,2,6,5,1,0,4,...,2,5,1.0,1,0.0,9,6,5,6,1


In [15]:
len(set(main_features['user_id']))

250000

In [16]:
train_data = pd.read_csv('../data/raw/train_click_log.csv')
len(set(train_data['user_id']))

200000