In [4]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport
from collections import defaultdict
from tqdm import tqdm
import math
import random
import pickle

In [5]:
# Directory scanned for the raw CSV tables (relative to the notebook's working directory).
data_path = "../data/raw/"
# Directory for derived artifacts; the profiling HTML report is written here.
processed_path = "../data/processed/"
# Scratch directory; the item-similarity pickle is written to and read from here.
save_path = "../temp/"

In [6]:
def get_all_df(data_path):
    """Load every ``.csv`` file found directly in a directory.

    :param data_path: directory to scan (sub-directories are not visited).
    :return: dict mapping each file name, minus its ``.csv`` suffix, to the
        DataFrame returned by ``pd.read_csv`` for that file.
    """
    suffix = '.csv'
    frames = {}
    for entry in os.listdir(data_path):
        if not entry.endswith(suffix):
            continue
        # The dict key is the file name with the trailing ".csv" stripped.
        frames[entry[:-len(suffix)]] = pd.read_csv(os.path.join(data_path, entry))
    return frames

In [7]:
# Read every CSV under data_path once; later cells look tables up by file stem.
all_df = get_all_df(data_path)

In [8]:
# List the loaded tables (one key per CSV file stem).
all_df.keys()

dict_keys(['articles', 'articles_emb', 'sample_submit', 'testA_click_log', 'train_click_log'])

In [9]:
# Peek at the article embedding table. sample() is called without a seed,
# so the rows displayed differ from run to run.
articles_emb = all_df['articles_emb']
articles_emb.sample(20)

Unnamed: 0,article_id,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,...,emb_240,emb_241,emb_242,emb_243,emb_244,emb_245,emb_246,emb_247,emb_248,emb_249
169535,169535,-0.368783,-0.940789,-0.431039,-0.684624,0.041565,-0.24136,-0.489654,-0.253758,-0.215321,...,-0.443312,-0.627066,-0.198369,0.03848,0.644269,0.261365,0.271666,-0.426192,0.604907,0.490121
279156,279156,-0.283818,-0.954815,-0.576323,-0.469454,-0.789595,-0.662833,0.699317,-0.299253,0.780396,...,0.078113,-0.072709,-0.367586,0.071114,0.287709,0.46854,-0.628069,0.528959,0.014406,0.365248
79576,79576,-0.155804,-0.960373,0.56744,0.625684,0.954314,0.922273,-0.87695,-0.765548,-0.804947,...,-0.164496,0.483379,0.223184,0.190291,0.557699,-0.923739,0.571085,-0.407846,0.845708,-0.523922
126507,126507,0.212696,-0.962258,-0.175824,-0.714267,-0.693524,0.755666,0.189305,0.332018,-0.217813,...,-0.805358,-0.201853,0.011489,0.864082,-0.762041,-0.627261,-0.480457,0.809765,-0.595714,-0.301312
3966,3966,0.624653,-0.962614,-0.218231,0.547921,0.269613,-0.060113,-0.677101,0.593439,-0.785595,...,0.868047,0.464974,0.564226,-0.812679,-0.799268,0.183372,-0.867555,-0.614014,0.849402,-0.314416
294351,294351,-0.2751,-0.949044,-0.514448,-0.255317,-0.340626,-0.338412,-0.233462,0.053927,0.236905,...,-0.440699,-0.49058,0.351281,-0.601271,-0.155818,0.193644,0.284049,0.160646,-0.365132,0.42119
326210,326210,-0.504071,-0.972745,0.795081,-0.779154,0.214263,0.25492,-0.259205,-0.358713,0.020972,...,-0.878421,-0.14511,-0.720375,0.853182,-0.726454,-0.106918,-0.369482,-0.222595,0.678439,-0.544396
285828,285828,-0.296962,-0.952933,-0.434784,-0.404847,-0.733319,-0.702255,0.545108,0.257443,0.729414,...,-0.60131,-0.452804,-0.011703,-0.328207,0.096456,0.287115,0.106132,0.784979,0.295381,0.709023
254082,254082,-0.663384,-0.955483,0.021333,0.087545,-0.08532,0.453641,-0.780075,-0.528583,-0.374687,...,0.505162,-0.272647,0.817992,-0.354067,0.622335,0.480096,0.680024,0.681751,-0.163072,0.609136
275569,275569,0.251643,-0.95797,0.586932,-0.553405,-0.940079,-0.184117,0.377309,0.178475,0.799666,...,-0.747341,-0.142402,-0.444283,-0.207818,-0.630611,0.207124,-0.086322,0.663645,-0.603653,0.452201


In [10]:
# Peek at the article metadata table (unseeded random sample of rows).
articles_df = all_df['articles']
articles_df.sample(20)

Unnamed: 0,article_id,category_id,created_at_ts,words_count
42478,42478,67,1499793656000,141
323697,323697,434,1514359724000,182
184740,184740,302,1384341993000,139
144517,144517,269,1487087111000,178
11047,11047,7,1508847300000,192
259198,259198,395,1421946527000,224
328438,328438,436,1486740647000,235
325531,325531,435,1390230646000,221
101015,101015,225,1510063106000,211
138413,138413,265,1381460164000,118


In [11]:
# Click log used by every later cell in this notebook.
# NOTE(review): despite the "all" in the name, this is only the
# 'testA_click_log' table; 'train_click_log' is loaded above but is not
# included here — confirm that this is intended.
all_click_df = all_df['testA_click_log']
all_click_df.sample(20)

Unnamed: 0,user_id,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
371197,242116,7744,1507644114904,4,1,17,1,21,1
194042,246134,159762,1507138728603,4,1,17,1,25,2
283769,225627,284985,1507376580700,4,1,17,1,9,1
252933,249853,206315,1507295116542,4,1,17,1,5,2
85416,218154,162655,1506989445521,4,1,17,1,27,2
30048,238193,206934,1507049227995,4,1,17,1,21,2
193120,223559,285719,1507137880369,4,3,2,1,19,2
417147,209351,336220,1507741556175,4,1,17,1,21,1
115312,207201,285343,1506999321821,4,3,2,1,13,2
89293,216709,64329,1506986445228,4,3,2,1,25,2


In [12]:
# Build a ydata-profiling report for the click log and write it as HTML
# under processed_path.
# NOTE(review): the recorded run of this cell ended with KeyboardInterrupt,
# so the report file may never have been written.
click_report = ProfileReport(all_click_df)
click_report.to_file(os.path.join(processed_path, "click_report.html"))

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 9/9 [00:00<00:00, 24.02it/s]


KeyboardInterrupt: 

# processing now!

In [None]:
def get_user_item_time(click_df):
    """Build each user's click history ordered by click time.

    :param click_df: click log with ``user_id``, ``click_article_id`` and
        ``click_timestamp`` columns.
    :return: dict ``{user_id: [(article_id, click_timestamp), ...]}`` where each
        list follows ascending ``click_timestamp`` order.
    """
    ordered_clicks = click_df.sort_values('click_timestamp')

    def _to_pairs(group):
        # One (article, timestamp) tuple per click row of this user.
        return list(zip(group['click_article_id'], group['click_timestamp']))

    per_user = ordered_clicks.groupby('user_id')[['click_article_id', 'click_timestamp']]
    history_df = per_user.apply(lambda rows: _to_pairs(rows)).reset_index()
    # apply() leaves the list column unnamed (label 0); give it a readable name.
    history_df = history_df.rename(columns={0: 'item_time_list'})
    return dict(zip(history_df['user_id'], history_df['item_time_list']))

In [None]:
def get_top_k_articles(click_df, k=10):
    """Return the ids of the ``k`` most frequently clicked articles.

    :param click_df: click log with a ``click_article_id`` column.
    :param k: how many article ids to return (default 10).
    :return: list of article ids ordered from most to least clicked.
    """
    click_counts = click_df['click_article_id'].value_counts()
    # value_counts() sorts by descending frequency, so its index is the ranking.
    ranked_ids = click_counts.index
    return ranked_ids[:k].tolist()

In [None]:
def itemcf_sim(df):
    """Compute the item-to-item collaborative-filtering similarity matrix.

    Two articles co-occur when they appear in the same user's click list.
    Every co-occurrence adds ``1 / log(1 + n)`` to the pair's weight, where
    ``n`` is the length of that user's click list, so long histories
    contribute less per pair. The accumulated weight is then divided by
    ``sqrt(cnt[i] * cnt[j])``, where ``cnt`` is the number of click-list
    entries seen for each article.

    Side effect: the resulting matrix is pickled to ``itemcf_i2i_sim.pkl``
    inside the module-level ``save_path`` directory.

    :param df: click log DataFrame; it is passed to ``get_user_item_time``.
    :return: nested dict ``{item_i: {item_j: similarity}}``.
    """
    user_item_time_dict = get_user_item_time(df)

    # Accumulate the raw co-occurrence weights and the per-item click counts.
    co_weight = {}
    item_cnt = defaultdict(int)
    for user, item_time_list in tqdm(user_item_time_dict.items()):
        # The weight depends only on the user's list length, so compute it
        # once per user instead of once per item pair.
        user_weight = 1 / math.log(len(item_time_list) + 1)
        for i, _ in item_time_list:
            item_cnt[i] += 1
            related = co_weight.setdefault(i, {})
            for j, _ in item_time_list:
                if i == j:
                    continue
                related[j] = related.get(j, 0) + user_weight

    # Normalise into a fresh nested dict; a shallow .copy() would alias the
    # inner dicts and silently overwrite the raw weights.
    i2i_sim_ = {
        i: {j: wij / math.sqrt(item_cnt[i] * item_cnt[j]) for j, wij in related.items()}
        for i, related in co_weight.items()
    }

    # Persist the matrix; the context manager guarantees the file is flushed
    # and closed even if pickling fails.
    with open(os.path.join(save_path, 'itemcf_i2i_sim.pkl'), 'wb') as sim_file:
        pickle.dump(i2i_sim_, sim_file)

    return i2i_sim_

In [None]:
# Build the item-item similarity matrix from the click log; itemcf_sim also
# pickles the result under save_path.
i2i_sim = itemcf_sim(all_click_df)

100%|██████████| 50000/50000 [00:08<00:00, 5602.20it/s] 


In [None]:
# Spot-check: display every stored neighbour of article 195839 with its similarity.
i2i_sim[195839]

{191971: 0.08879174180575594,
 194300: 0.04849744097314668,
 96077: 0.009124909910757354,
 157300: 0.00788675938974907,
 84136: 0.007249815359901717,
 233717: 0.008857263301443919,
 284491: 0.008485267066830253,
 123909: 0.0040529179923550415,
 348111: 0.0025276915302732254,
 233728: 0.03307273795232257,
 336801: 0.01074909274175612,
 336607: 0.0034522380178676083,
 159762: 0.004021441078074126,
 285719: 0.0034939818083332655,
 87174: 0.0037111832033597135,
 87205: 0.00435069012756991,
 141004: 0.0019425738420398085,
 87228: 0.002952676984266401,
 59057: 0.0012027129049255022,
 195009: 0.015851746703995046,
 351517: 0.006202760918935099,
 214097: 0.004703476345751559,
 194645: 0.01585509644302779,
 190202: 0.02039941904604722,
 189935: 0.03434933457680129,
 186994: 0.0266068801538516,
 299716: 0.02103458567932558,
 233658: 0.0057547609256292315,
 315105: 0.0019965145184872887,
 234698: 0.0018330239266187657,
 236446: 0.007268445765422086,
 288321: 0.007848402663256076,
 336221: 0.00423

In [None]:
def item_based_recommend(user_id, user_item_time_dict, i2i_sim, sim_item_topk, recall_item_num, item_topk_click):
    """Recall candidate articles for one user with item-based collaborative filtering.

    For every article in the user's history, the ``sim_item_topk`` most similar
    articles are looked up and their similarities are summed per candidate.
    Articles the user has already clicked are never recommended by this step.
    If fewer than ``recall_item_num`` candidates are found, the list is padded
    with popular articles, which receive negative placeholder scores so that
    they always rank below genuine CF candidates.

    :param user_id: id of the user to recommend for.
    :param user_item_time_dict: dict ``{user: [(item, click_time), ...]}`` with
        each user's click history.
    :param i2i_sim: nested dict ``{item_i: {item_j: similarity}}``.
    :param sim_item_topk: int, number of most-similar articles taken per
        history article.
    :param recall_item_num: int, maximum number of articles to return.
    :param item_topk_click: list of the most-clicked article ids, most popular
        first, used for padding.
    :return: list of ``(item, score)`` tuples sorted by descending score,
        at most ``recall_item_num`` long.
    :raises KeyError: if ``user_id`` is not in ``user_item_time_dict``.
    """
    # Articles the user has already interacted with.
    user_hist_items = user_item_time_dict[user_id]
    seen_items = {item_id for item_id, _ in user_hist_items}

    item_rank = {}
    for i, _ in user_hist_items:
        # .get(): an article missing from the similarity matrix simply
        # contributes no candidates instead of raising KeyError.
        neighbours = sorted(i2i_sim.get(i, {}).items(), key=lambda x: x[1], reverse=True)[:sim_item_topk]
        for j, wij in neighbours:
            if j in seen_items:
                continue
            item_rank[j] = item_rank.get(j, 0) + wij

    # Fewer candidates than requested: pad with popular articles.
    if len(item_rank) < recall_item_num:
        for i, item in enumerate(item_topk_click):
            # Membership must be tested against the keys; testing against
            # .items() never matches and would overwrite a real CF score.
            if item in item_rank:
                continue
            # Any negative score works; -100 - rank keeps popularity order.
            item_rank[item] = - i - 100
            if len(item_rank) == recall_item_num:
                break

    item_rank = sorted(item_rank.items(), key=lambda x: x[1], reverse=True)[:recall_item_num]

    return item_rank

In [43]:
# Recall results: user_id -> list of (article_id, score) pairs.
user_recall_items_dict = defaultdict(dict)

# user -> [(article, click time), ...] history built from the click log
user_item_time_dict = get_user_item_time(all_click_df)

# Load the article similarity matrix written by itemcf_sim. The context
# manager closes the file handle once loading finishes.
# NOTE: pickle.load can execute arbitrary code; only load files that this
# project produced itself.
with open(os.path.join(save_path, 'itemcf_i2i_sim.pkl'), 'rb') as sim_file:
    i2i_sim = pickle.load(sim_file)

# Number of most-similar articles taken per history article
sim_item_topk = 10

# Number of articles recalled per user
recall_item_num = 10

# Most-clicked articles, used to pad short recall lists
item_topk_click = get_top_k_articles(all_click_df, k=50)

for user in tqdm(all_click_df['user_id'].unique()):
    user_recall_items_dict[user] = item_based_recommend(user, user_item_time_dict, i2i_sim,
                                                        sim_item_topk, recall_item_num, item_topk_click)

100%|██████████| 50000/50000 [06:33<00:00, 127.13it/s]


In [45]:
# Show the user ids that received a recall list (one key per user, so the
# displayed output is very long).
user_recall_items_dict.keys()

dict_keys([249999, 249998, 249997, 249996, 249995, 249994, 249993, 249992, 249991, 249990, 249989, 249988, 249987, 249986, 249985, 249984, 249983, 249982, 249981, 249980, 249979, 249978, 249977, 249976, 249975, 249974, 249973, 249972, 249971, 249970, 249969, 249968, 249967, 249966, 249965, 249964, 249963, 249962, 249961, 249960, 249959, 249958, 249957, 249956, 249955, 249954, 249953, 249952, 249951, 249950, 249949, 249948, 249947, 249946, 249945, 249944, 249943, 249942, 249941, 249940, 249939, 249938, 249937, 249936, 249935, 249934, 249933, 249932, 249931, 249930, 249929, 249928, 249927, 249926, 249925, 249924, 249923, 249922, 249921, 249920, 249919, 249918, 249917, 249916, 249915, 249914, 249913, 249912, 249911, 249910, 249909, 249908, 249907, 249906, 249905, 249904, 249903, 249902, 249901, 249900, 249899, 249898, 249897, 249896, 249895, 249894, 249893, 249892, 249891, 249890, 249889, 249888, 249887, 249886, 249885, 249884, 249883, 249882, 249881, 249880, 249879, 249878, 249877, 24987

In [1]:
import sys
import os
# --- Key step ---
# 1. Resolve the absolute path of the project root.
#    os.getcwd() returns the current working directory (assumed to be the
#    folder containing this notebook, e.g. notebooks/ — TODO confirm).
#    os.path.join(..., '..') steps up one level to the parent directory.
#    os.path.abspath() turns the result into an absolute path.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
# 2. Add the project root to sys.path.
#    Check membership first so that re-running the cell does not append a duplicate.
if project_root not in sys.path:
    sys.path.append(project_root)
    print(f"Added '{project_root}' to sys.path")

Added 'c:\Users\17322\Desktop\fdu\camp\dsba\项目\新闻推荐' to sys.path


In [2]:
from src.model.CF_only import CF_Recaller

In [3]:
from pathlib import Path
# Project root, taken as the parent of the current working directory.
PROJECT_ROOT = Path.cwd().parent
# --- Build the other paths from the project root ---
# The / operator joins path components; pathlib handles the separator.
DATA_PATH = PROJECT_ROOT / "data" / "raw"
SAVE_PATH = PROJECT_ROOT / "temp"

# Construct the recaller, then train and predict.
# NOTE(review): the recorded run of this cell ended with
# "TypeError: cannot unpack non-iterable int object". The failing code is not
# visible in this notebook — presumably inside CF_Recaller; verify in src/model/CF_only.
cf_recaller = CF_Recaller(data_path=DATA_PATH, save_path=SAVE_PATH)
cf_recaller.train()
cf_recaller.predict()

100%|██████████| 200000/200000 [00:10<00:00, 19950.04it/s]


TypeError: cannot unpack non-iterable int object