- Popularityモデルの評価で用いるデータは，次の2つである．
- ①各ユーザに上位10個のアイテムを推薦したデータ
- ②test dataの`user_id`と`item_id`の組（正解データ）
- 上記の2つのデータで，`recall@10`, `precision@10`, `ndcg@10`, `mrr@10`, `hit@10` を求めることができる．

In [8]:
# Numeric / tabular libraries for this notebook, one import per line.
import numpy as np
import pandas as pd

# Sanity check that the imports above succeeded.
print("ok")

ok


In [10]:
# Column names for the ratings file, which carries no header row of its own.
cols = ['user_id', 'item_id', 'rating', 'timestamp']

# Load the tab-separated ratings file (presumably MovieLens 100K's u.data --
# TODO confirm) with pandas' pure-Python parser.
df_data = pd.read_csv(
    'u.data',
    names=cols,        # supply the column names explicitly
    header=None,       # the file has no header line
    sep='\t',          # fields are tab-separated
    engine='python',   # use the Python parsing engine
)

# Preview the first ten rows.
df_data.head(10)

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
5,298,474,4,884182806
6,115,265,2,881171488
7,253,465,5,891628467
8,305,451,3,886324817
9,6,86,3,883603013


In [29]:
# 1. Leave-one-out split
# Rank every interaction within its user from newest (rank 1) to oldest.
# method='first' breaks timestamp ties by row order, so ranks inside one
# user are distinct.
per_user_time = df_data.groupby('user_id')['timestamp']
df_data['rank'] = per_user_time.rank(ascending=False, method='first')

recency = df_data['rank']
test_df = df_data[recency.eq(1)]    # each user's most recent interaction
train_df = df_data[recency.gt(1)]   # every older interaction

# 2. Popularity ranking
# The ten item_ids with the most rows in the training split. This single
# list is the popularity-based recommendation.
item_counts = train_df['item_id'].value_counts()
top_items = item_counts.index[:10].tolist()
print("Top 10 popular items:", top_items)

# 3. Recommend the top 10 to every test user and collect the ground truth
# Every test user is mapped to the same `top_items` list object; items a
# user already has in train_df are not filtered out.
test_users = test_df['user_id']
recommendations = dict.fromkeys(test_users, top_items)
# user_id -> the single held-out item_id for that user.
ground_truth = test_df.set_index('user_id')['item_id'].to_dict()
print("Number of users:", len(ground_truth))
print("user_id.user_id", test_users.size)

Top 10 popular items: [50, 181, 100, 258, 294, 286, 288, 1, 300, 121]
Number of users: 943
user_id.user_id 943


In [36]:
# Report the training split's (rows, columns) and preview its first five rows.
print("train_df shape: ", train_df.shape)
train_df.head(5)

train_df shape:  (99057, 5)


Unnamed: 0,user_id,item_id,rating,timestamp,rank
0,196,242,3,881250949,37.0
1,186,302,3,891717742,19.0
2,22,377,1,878887116,76.0
3,244,51,2,880606923,61.0
4,166,346,1,886397596,13.0


In [37]:
# Report the test split's (rows, columns) and preview its first five rows.
print("test_df shape: ", test_df.shape)
test_df.head(5)

test_df shape:  (943, 5)


Unnamed: 0,user_id,item_id,rating,timestamp,rank
52,260,322,4,890618898,1.0
53,25,181,5,885853415,1.0
70,189,512,4,893277702,1.0
109,265,118,4,875320714,1.0
167,155,323,2,879371261,1.0


In [38]:
# Metric functions come from the local module `Evaluation_index.py`.
from Evaluation_index import (
    recall_at_k,
    precision_at_k,
    ndcg_at_k,
    mrr_at_k,
    hit_at_k,
)

# Cut-off shared by every metric.
k = 10

# Score the popularity recommendations against each user's held-out item.
eval_args = (recommendations, ground_truth, k)
recall = recall_at_k(*eval_args)
precision = precision_at_k(*eval_args)
ndcg = ndcg_at_k(*eval_args)
mrr = mrr_at_k(*eval_args)
hit = hit_at_k(*eval_args)

# Report of this notebook's popularity baseline.
own_report = [
    f"=== Poplarityモデル評価結果（Top-{k}）===",
    f"Recall@{k}:    {recall:.4f}",
    f"Precision@{k}: {precision:.4f}",
    f"NDCG@{k}:      {ndcg:.4f}",
    f"MRR@{k}:       {mrr:.4f}",
    f"Hit@{k}:       {hit:.4f}",
]
print("\n".join(own_report))

# Hard-coded reference numbers for RecBole's popularity model.
# NOTE(review): Precision@10 exceeds Recall@10 here, which suggests more than
# one relevant item per user, i.e. a different split/evaluation protocol than
# the leave-one-out setup above -- confirm before comparing directly.
recbole_report = [
    "=== RecBole (Popularity Model) 評価結果 ===",
    f"Recall@10    : {0.0358:.4f}",
    f"Precision@10 : {0.0494:.4f}",
    f"NDCG@10      : {0.0556:.4f}",
    f"MRR@10       : {0.1095:.4f}",
    f"Hit@10       : {0.2891:.4f}",
]
print("\n".join(recbole_report))


=== Poplarityモデル評価結果（Top-10）===
Recall@10:    0.0498
Precision@10: 0.0050
NDCG@10:      0.0241
MRR@10:       0.0165
Hit@10:       0.0498
=== RecBole (Popularity Model) 評価結果 ===
Recall@10    : 0.0358
Precision@10 : 0.0494
NDCG@10      : 0.0556
MRR@10       : 0.1095
Hit@10       : 0.2891
