# 赛题信息

>目标：基于用户历史点击新闻文章的行为，预测用户未来点击新闻文章行为

>数据来源：某新闻App平台的用户交互数据

>数据集：用户点击日志、新闻文章信息数据、新闻文章embedding向量表示

>数据量：用户30万（其中20万作为训练集、5万作为测试集A，5万作为测试集B），点击300万次、新闻文章36万篇

>预测结果：用户点击新闻文章Top5的article_id

>结果提交格式（csv文件）：user_id,article_1,article_2,article_3,article_4,article_5

# 评估指标

评价指标公式如下：

$$
score(user) = \sum_{k=1}^5 \frac{s(user, k)}{k}
$$

对比推荐的TOP5篇新闻文章是否命中用户最后一条点击数据。假设用户1最后一次真实点击的文章为article1，且article1排在推荐列表的第1位，则s(user1,1)=1，s(user1,2)到s(user1,5)均为0，user1的预测结果score(user1)=sum(s(user1,k)/k)=1（k为预测文章的排名，取值1-5）

该评价指标为命中文章推荐排名的倒数，即命中的文章推荐排名越靠前，指标值越高；越靠后，指标值越低；没有命中，则为0

# 赛题理解

——————————————————————————————

# Baseline

In [24]:
# 导入基础包
import pandas as pd
import numpy as np
import time,math,os,random
from collections import defaultdict
from tqdm import tqdm
import pickle
import collections
import warnings
warnings.filterwarnings('ignore')

### 定义一个节省内存的函数

In [2]:
def reduce_mem(df):
    """Downcast the numeric columns of ``df`` in place to save memory.

    Each int16/int32/int64 column is converted to the narrowest signed
    integer type whose range contains the column's min and max. Each
    float16/float32/float64 column is converted to the narrowest float type
    whose finite range contains them. Columns of any other dtype, and
    columns whose min or max is null (e.g. all-NaN), are left untouched.

    NOTE: narrowing floats only checks the value *range*; precision can be
    lost (float16 keeps roughly three significant decimal digits).

    Args:
        df: pandas DataFrame; it is modified in place.

    Returns:
        The same DataFrame object, after downcasting.
    """
    starttime = time.time()
    numerics = {'int16', 'int32', 'int64', 'float16', 'float32', 'float64'}
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if str(col_type) not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if pd.isnull(c_min) or pd.isnull(c_max):
            continue
        # Integer limits come from np.iinfo, float limits from np.finfo;
        # calling np.iinfo on a float type raises ValueError.
        if str(col_type)[:3] == 'int':
            candidates = [(t, np.iinfo(t)) for t in (np.int8, np.int16, np.int32, np.int64)]
        else:
            candidates = [(t, np.finfo(t)) for t in (np.float16, np.float32, np.float64)]
        # Candidates are ordered narrowest first, so the first fit wins.
        # Columns containing +/-inf fit no candidate and keep their dtype.
        for target, limits in candidates:
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(target)
                break
    end_mem = df.memory_usage().sum() / 1024 ** 2
    # Guard the percentage against a zero-sized starting frame.
    reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('—— Mem. usage dereased to {:5.2f} Mb ({:.1f}% reduction),time spend:{:2.2f} min'.format(
        end_mem, reduction, (time.time() - starttime) / 60))
    return df
                        

### 读取数据 

In [20]:
# Directory the click-log CSV files are read from.
data_path = '../download'
# Directory intermediate results (similarity pickle, submission CSV) are written to.
save_path = './tmp_results'

In [4]:
# Debug mode: carve a subset of users out of the training set to test code quickly.
def get_all_click_sample(data_path,sample_nums=10000):
    """Load the training click log restricted to a random sample of users.

    Args:
        data_path: directory containing ``train_click_log.csv``.
        sample_nums: number of distinct users to sample without replacement.
            If it exceeds the number of users in the file, all users are kept
            instead of letting ``np.random.choice`` raise.

    Returns:
        DataFrame with every click of the sampled users, de-duplicated on
        (user_id, click_article_id, click_timestamp).
    """
    click_path = os.path.join(data_path, 'train_click_log.csv')
    click = pd.read_csv(click_path)
    user = click['user_id'].unique()
    # Sampling without replacement cannot draw more users than exist.
    sample_size = min(sample_nums, len(user))
    sample_user = np.random.choice(user, size=sample_size, replace=False)
    click = click[click['user_id'].isin(sample_user)]

    click = click.drop_duplicates(['user_id', 'click_article_id', 'click_timestamp'])
    return click
# Smoke test: load a sampled subset of the clicks and preview the first rows.
df = get_all_click_sample(data_path=data_path)
df.head()

Unnamed: 0,user_id,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
24,199992,272143,1507029683969,4,1,17,1,25,2
25,199992,348111,1507029702470,4,1,17,1,25,2
26,199992,299697,1507029732470,4,1,17,1,25,2
48,199986,5408,1507029596628,4,3,2,1,25,1
49,199986,272143,1507029626628,4,3,2,1,25,1


In [5]:
# Load the full click log.
def get_all_click_data(data_path,contains_test=False):
    """Load the click log, optionally including the test-A clicks.

    Args:
        data_path: directory containing ``train_click_log.csv`` (and
            ``testA_click_log.csv`` when ``contains_test`` is true).
        contains_test: whether to append the test-A click log to the
            training click log.

    Returns:
        DataFrame of clicks, de-duplicated on
        (user_id, click_article_id, click_timestamp). Row labels from the
        source files are preserved.
    """
    path_train = os.path.join(data_path, 'train_click_log.csv')
    data = pd.read_csv(path_train)
    if contains_test:
        path_test = os.path.join(data_path, 'testA_click_log.csv')
        test_data = pd.read_csv(path_test)
        # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
        data = pd.concat([data, test_data])
    data = data.drop_duplicates(['user_id', 'click_article_id', 'click_timestamp'])
    return data

# Load the training clicks only (contains_test defaults to False) and preview them.
train_data = get_all_click_data(data_path=data_path)
train_data.head()

Unnamed: 0,user_id,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
0,199999,160417,1507029570190,4,1,17,1,13,1
1,199999,5408,1507029571478,4,1,17,1,13,1
2,199999,50823,1507029601478,4,1,17,1,13,1
3,199998,157770,1507029532200,4,1,17,1,25,5
4,199998,96613,1507029671831,4,1,17,1,25,5


### 获取 用户-文章-点击时间 字典

In [16]:
# Sort by click time and build each user's click sequence:
# {user1: [(item1, time1), (item2, time2), ...], ...}
def get_user_item_time(click_data):
    """Map every user to their chronologically ordered click history.

    Args:
        click_data: DataFrame with ``user_id``, ``click_article_id`` and
            ``click_timestamp`` columns. It is not modified.

    Returns:
        dict mapping user_id to a list of (click_article_id, click_timestamp)
        tuples in ascending timestamp order. Empty input gives an empty dict.
    """
    # A stable sort keeps clicks that share a timestamp in their input order,
    # so the result is deterministic.
    click_data = click_data.sort_values('click_timestamp', kind='mergesort')

    # Iterating the groups avoids the tuple-style column selection
    # (groupby(...)['a', 'b']) that current pandas rejects.
    user_item_time_dict = {}
    for user_id, user_clicks in click_data.groupby('user_id'):
        user_item_time_dict[user_id] = list(
            zip(user_clicks['click_article_id'], user_clicks['click_timestamp']))

    return user_item_time_dict

### 获取点击次数最多的topK篇文章 

In [11]:
def get_item_topk_click(click_data,k):
    """Return the ids of the ``k`` most-clicked articles, most clicked first.

    Counts occurrences in the ``click_article_id`` column and returns the
    first ``k`` labels of the resulting count index.
    """
    click_counts = click_data['click_article_id'].value_counts()
    ids_by_popularity = click_counts.index
    return ids_by_popularity[:k]

### itemcf的物品相似度计算 

In [25]:
def itemcf_sim(df):
    """Compute the item-to-item similarity matrix for item-based collaborative filtering.

    For every pair of distinct articles (i, j) clicked by the same user, the
    pair receives 1 / log(1 + number of clicks of that user). The summed
    weight is then divided by sqrt(cnt[i] * cnt[j]), where cnt is the total
    number of clicks on an article.

    Args:
        df: click DataFrame accepted by ``get_user_item_time``.

    Returns:
        Nested dict ``{item_i: {item_j: similarity}}``. The same object is
        also pickled to ``<save_path>/itemcf_i2i_sim.pkl``.
    """
    user_item_time_dict = get_user_item_time(df)

    # Accumulate the raw co-click weights and per-item click counts.
    co_click = {}
    item_cnt = defaultdict(int)
    for user, item_time_list in tqdm(user_item_time_dict.items()):
        # The more articles a user clicked, the LESS each of their co-click
        # pairs contributes (very active users are down-weighted).
        user_weight = 1 / math.log(len(item_time_list) + 1)
        for i, i_click_time in item_time_list:
            item_cnt[i] += 1
            related = co_click.setdefault(i, {})
            for j, j_click_time in item_time_list:
                if i == j:
                    continue
                related[j] = related.get(j, 0) + user_weight

    # Normalise into fresh dicts instead of writing through a shallow copy.
    i2i_sim_ = {
        i: {j: wij / math.sqrt(item_cnt[i] * item_cnt[j]) for j, wij in related.items()}
        for i, related in co_click.items()
    }

    # Persist the matrix; create the target directory if needed and make
    # sure the file handle is closed.
    os.makedirs(save_path, exist_ok=True)
    with open(os.path.join(save_path, 'itemcf_i2i_sim.pkl'), 'wb') as f:
        pickle.dump(i2i_sim_, f)
    return i2i_sim_

相似性矩阵权重计算公式：$$
w_{ij} = \frac{\sum_{u \in U(i) \cap U(j)} \frac{1}{\log(1 + |N(u)|)}}{\sqrt{|N(i)| \cdot |N(j)|}}
$$
其中，U(i)和U(j)分别表示点击过文章i和文章j的用户集合，|N(u)|表示用户u的总点击次数，|N(i)|和|N(j)|分别表示文章i和文章j被点击的总次数。用户点击的文章越多，其对单个文章对相似度的贡献越小。

In [None]:
# Build the item-item similarity matrix from the training clicks.
i2i_sim = itemcf_sim(train_data)

100%|██████████| 200000/200000 [00:15<00:00, 12726.11it/s]


### itemcf的文章推荐

In [36]:
# Item-based (i2i) recall.
def item_based_recommend(user_id,user_item_time_dict,i2i_sim,sim_item_topk,recall_item_num,item_topk_click):
    """Recall candidate articles for one user with item-based collaborative filtering.

    Args:
        user_id: id of the user to recommend for.
        user_item_time_dict: ``{user_id: [(item, click_time), ...]}``.
        i2i_sim: ``{item_i: {item_j: similarity}}``.
        sim_item_topk: number of most similar articles considered for each
            article in the user's history.
        recall_item_num: number of articles to return.
        item_topk_click: popular article ids, most popular first, used to
            pad the result when too few candidates are found.

    Returns:
        List of at most ``recall_item_num`` ``(item, score)`` tuples sorted
        by descending score. Padded popular articles get scores of -100 and
        below, so they always rank after collaborative-filtering candidates.
        A user without history receives popular articles only.
    """
    user_hist_items = user_item_time_dict.get(user_id, [])
    # The history holds (item, time) tuples; membership must be tested on ids.
    clicked_items = {item for item, _ in user_hist_items}

    item_rank = {}
    for i, _ in user_hist_items:
        # Rank neighbours by similarity (second element of each pair);
        # articles absent from the matrix simply have no neighbours.
        neighbours = sorted(i2i_sim.get(i, {}).items(), key=lambda pair: pair[1], reverse=True)
        for j, wij in neighbours[:sim_item_topk]:
            if j in clicked_items:
                continue
            item_rank[j] = item_rank.get(j, 0) + wij

    # Pad with popular articles when there are not enough candidates.
    if len(item_rank) < recall_item_num:
        for i, item in enumerate(item_topk_click):
            if item in item_rank:
                continue
            # Negative score that decreases with popularity rank.
            item_rank[item] = -i - 100
            if len(item_rank) >= recall_item_num:
                break

    return sorted(item_rank.items(), key=lambda pair: pair[1], reverse=True)[:recall_item_num]

### 根据物品的协同过滤，给每个用户推荐文章

In [None]:
# Initialise the container for each user's recalled articles.
user_recall_items_dict = collections.defaultdict(dict)

# Build the user -> [(article, click time), ...] dictionary.
user_item_time_dict = get_user_item_time(train_data)

# Load the article similarity matrix saved by itemcf_sim.
i2i_sim = pickle.load(open(os.path.join(save_path, 'itemcf_i2i_sim.pkl'),'rb'))

# Number of similar articles considered per clicked article.
sim_item_topk = 10

# Number of articles recalled per user.
recall_item_num = 10

# Most-clicked articles, used to pad users with too few candidates.
item_topk_click = get_item_topk_click(train_data,k=50)

# Run the item-based recall for every user in the training data.
for user in tqdm(train_data['user_id'].unique()):
    user_recall_items_dict[user] = item_based_recommend(user, user_item_time_dict, i2i_sim, 
                                                        sim_item_topk, recall_item_num, item_topk_click)


 29%|██▉       | 58960/200000 [22:35<49:26, 47.55it/s]  

### 召回字典转为df

In [None]:
# Flatten the recall dictionary into a DataFrame.
user_item_score_list = []

# Each value is the list of (item, score) pairs returned by item_based_recommend;
# emit one [user, item, score] row per pair.
for user, items in tqdm(user_recall_items_dict.items()):
    for item, score in items:
        user_item_score_list.append([user, item, score])

recall_df = pd.DataFrame(user_item_score_list, columns=['user_id', 'click_article_id', 'pred_score'])

### 生成提交文件 

In [None]:
# Build the submission file.
def submit(recall_df, topk=5, model_name=None, save_dir=None):
    """Write the top-``topk`` recalled articles per user to a submission CSV.

    The file has one row per user with columns
    ``user_id, article_1, ..., article_<topk>`` and is written to
    ``<save_dir>/<model_name>_<MM-DD>.csv``.

    Args:
        recall_df: DataFrame with ``user_id``, ``click_article_id`` and
            ``pred_score`` columns. It is not modified.
        topk: number of articles to keep per user.
        model_name: prefix of the output file name; ``'submit'`` is used
            when it is None.
        save_dir: output directory; defaults to the module-level
            ``save_path``. It is created if it does not exist.

    Raises:
        ValueError: if ``recall_df`` is empty or any user has fewer than
            ``topk`` recalled articles.
    """
    # Local import: the file's import cell does not bring datetime into scope.
    from datetime import datetime

    if save_dir is None:
        save_dir = save_path
    if model_name is None:
        model_name = 'submit'

    ranked = recall_df.copy()
    # method='first' gives every row of a user a distinct rank 1..n (ties
    # keep their input order), so ranks can be used as integer column labels.
    ranked['rank'] = ranked.groupby('user_id')['pred_score'].rank(
        ascending=False, method='first', na_option='bottom').astype(int)

    # Every user must have at least `topk` articles.
    articles_per_user = ranked.groupby('user_id')['rank'].max()
    if articles_per_user.empty or articles_per_user.min() < topk:
        raise ValueError('every user needs at least {} recalled articles'.format(topk))

    top = ranked.loc[ranked['rank'] <= topk, ['user_id', 'rank', 'click_article_id']]
    wide = top.pivot(index='user_id', columns='rank', values='click_article_id')
    # Name the columns as required by the submission format.
    wide.columns = ['article_{}'.format(k) for k in wide.columns]
    wide = wide.reset_index()

    # Join with os.path.join so the file lands inside the directory instead
    # of having the directory name glued onto the file name.
    os.makedirs(save_dir, exist_ok=True)
    file_name = model_name + '_' + datetime.today().strftime('%m-%d') + '.csv'
    wide.to_csv(os.path.join(save_dir, file_name), index=False, header=True)

In [None]:
# Load the test-A click log and collect its user ids.
tst_path = os.path.join(data_path, 'testA_click_log.csv')
tst_click = pd.read_csv(tst_path)
tst_users = tst_click['user_id'].unique()

# Keep only the recalled rows that belong to test-set users.
tst_recall = recall_df[recall_df['user_id'].isin(tst_users)]

# Write the submission file.
submit(tst_recall, topk=5, model_name='itemcf_baseline')

通过本次学习，了解了协同过滤推荐的底层原理和基本实现逻辑。在代码实现层面，也学习到了很多处理和计算的实现技巧，受益良多。