- WideDeep: DeepLearning * Wide Linear Modelの組み合わせ

- カテゴリカルデータと混合データセット(数値+カテゴリ)を扱う推薦システムや分類問題に適している．

- 実装手順
- Leave-One-Out分割
- user_idとitem_idをLabel_Encodingする
- Wide側をOne-Hot-Encodingして，Deep側をuserとitemをIDとしてEmbeddingする
- モデル構築(Wide部分とDeep部分して，結合と出力する(Wide側とDeep出力をConcatenateして，出力層でsigmoidなどの活性化関数を用いる))
- モデル学習(損失関数は，binary_crossentropy, 適当なbatch_sizeとepoch数で実行する)
- 評価指標算出

In [1]:
pip install pytorch-widedeep

Collecting pytorch-widedeep
  Downloading pytorch_widedeep-1.6.5-py3-none-any.whl (22.0 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m22.0/22.0 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m:01[0m
[?25hCollecting torchmetrics>=1.3.1
  Downloading torchmetrics-1.7.2-py3-none-any.whl (962 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m962.5/962.5 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m:01[0m
[?25hCollecting fastparquet>=2024.2.0
  Downloading fastparquet-2024.11.0-cp310-cp310-macosx_11_0_arm64.whl (684 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m684.1/684.1 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m[31m8.4 MB/s[0m eta [36m0:00:01[0m
[?25hCollecting spacy
  Downloading spacy-3.8.7-cp310-cp310-macosx_11_0_arm64.whl (6.3 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[

In [5]:
pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.19.0-cp310-cp310-macosx_12_0_arm64.whl (252.5 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m252.5/252.5 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hCollecting ml-dtypes<1.0.0,>=0.5.1
  Downloading ml_dtypes-0.5.1-cp310-cp310-macosx_10_9_universal2.whl (671 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m671.5/671.5 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
[?25hCollecting h5py>=3.11.0
  Downloading h5py-3.14.0-cp310-cp310-macosx_11_0_arm64.whl (2.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m:01[0m
Collecting google-pasta>=0.1.1
  Downloading google_pasta-0.2.0-py3-none-any.whl (57 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.5/57.5 kB

In [6]:
pip list

Package                      Version
---------------------------- --------------
absl-py                      2.3.0
annotated-types              0.7.0
anyio                        4.9.0
appnope                      0.1.4
argon2-cffi                  23.1.0
argon2-cffi-bindings         21.2.0
arrow                        1.3.0
asttokens                    3.0.0
astunparse                   1.6.3
async-lru                    2.0.5
attrs                        25.3.0
babel                        2.17.0
beautifulsoup4               4.13.4
bleach                       6.2.0
blis                         1.2.1
catalogue                    2.0.10
certifi                      2025.4.26
cffi                         1.17.1
charset-normalizer           3.4.2
click                        8.2.1
cloudpathlib                 0.21.1
comm                         0.2.2
confection                   0.1.5
contourpy                    1.3.2
cramjam                      2.10.0
cycler                       0.

In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, Concatenate
from tensorflow.keras.models import Model

import warnings
warnings.filterwarnings('ignore')
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # 0:all, 1:filter INFO, 2:filter WARNING, 3:only ERROR

# データ読み込み
cols = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('u.data', sep='\t', names=cols)
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Leave-One-Out分割
df['rank'] = df.groupby('user_id')['timestamp'].rank(method='first', ascending=False)
train_df = df[df['rank'] > 1].copy()
test_df = df[df['rank'] == 1].copy()

# ラベルエンコード
user_enc = LabelEncoder()
item_enc = LabelEncoder()
train_df['user'] = user_enc.fit_transform(train_df['user_id'])
train_df['item'] = item_enc.fit_transform(train_df['item_id'])
test_df = test_df[test_df['user_id'].isin(user_enc.classes_)]
test_df = test_df[test_df['item_id'].isin(item_enc.classes_)]
test_df['user'] = user_enc.transform(test_df['user_id'])
test_df['item'] = item_enc.transform(test_df['item_id'])

num_users = train_df['user'].nunique()
num_items = train_df['item'].nunique()

print("ok")

ok


In [11]:
# Wide側のOne-Hot-Encoding

# Train用
enc = OneHotEncoder()
X_wide_train = enc.fit_transform(train_df[['user', 'item']]).toarray()

# Test用
X_wide_test = enc.transform(test_df[['user', 'item']]).toarray()


In [12]:
# Deep側は埋め込み + MLP

# 入力層
user_input = Input(shape=(1,), name='user_input')
item_input = Input(shape=(1,), name='item_input')

# 埋め込み層
user_embed = Embedding(input_dim=num_users, output_dim=8)(user_input)
item_embed = Embedding(input_dim=num_items, output_dim=8)(item_input)

# Flatten
user_vec = Flatten()(user_embed)
item_vec = Flatten()(item_embed)

# 結合 & MLP
deep_concat = Concatenate()([user_vec, item_vec])
deep_out = Dense(32, activation='relu')(deep_concat)
deep_out = Dense(16, activation='relu')(deep_out)

In [13]:
# Deep側入力データ
X_user_train = train_df['user'].values
X_item_train = train_df['item'].values
X_user_test = test_df['user'].values
X_item_test = test_df['item'].values

y_train = (train_df['rating'] >= 4).astype(int).values  # 二値分類
y_test = (test_df['rating'] >= 4).astype(int).values

In [14]:
# モデル構築

# 入力層
wide_input = Input(shape=(X_wide_train.shape[1],), name='wide_input')
user_input = Input(shape=(1,), name='user_input')
item_input = Input(shape=(1,), name='item_input')

# 埋め込み層
user_embed = Embedding(input_dim=num_users, output_dim=8)(user_input)
item_embed = Embedding(input_dim=num_items, output_dim=8)(item_input)
user_vec = Flatten()(user_embed)
item_vec = Flatten()(item_embed)

# Deep部分（MLP）
deep_concat = Concatenate()([user_vec, item_vec])
deep_out = Dense(32, activation='relu')(deep_concat)
deep_out = Dense(16, activation='relu')(deep_out)

# Wide + Deep 結合 & 出力
combined = Concatenate()([wide_input, deep_out])
output = Dense(1, activation='sigmoid')(combined)

model = Model(inputs=[wide_input, user_input, item_input], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [15]:
# モデル学習
model.fit([X_wide_train, X_user_train, X_item_train], y_train,
          epochs=5, batch_size=128, validation_split=0.1)

Epoch 1/5
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6120 - loss: 0.6429 - val_accuracy: 0.7020 - val_loss: 0.5711
Epoch 2/5
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 771us/step - accuracy: 0.7239 - loss: 0.5511 - val_accuracy: 0.7094 - val_loss: 0.5636
Epoch 3/5
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 824us/step - accuracy: 0.7264 - loss: 0.5436 - val_accuracy: 0.7127 - val_loss: 0.5598
Epoch 4/5
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 846us/step - accuracy: 0.7282 - loss: 0.5358 - val_accuracy: 0.7107 - val_loss: 0.5578
Epoch 5/5
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 797us/step - accuracy: 0.7342 - loss: 0.5243 - val_accuracy: 0.7111 - val_loss: 0.5544


<keras.src.callbacks.history.History at 0x32311b850>

In [26]:
from collections import defaultdict

n_negative = 99
all_items = set(train_df['item'].unique())

user_item_score = defaultdict(list)

for user in test_df['user'].unique():
    # 正解アイテム1件取得（安全な取得方法）
    gt_item = test_df[test_df['user'] == user].iloc[0]['item']

    # 負例アイテムのサンプリング
    negatives = list(all_items - {gt_item})
    sampled_negatives = np.random.choice(negatives, size=n_negative, replace=False)

    # 評価対象アイテム集合（正解含む）
    items_to_score = np.append(sampled_negatives, gt_item)

    # 入力データ作成
    wide_feat = enc.transform([[user, item] for item in items_to_score]).toarray()
    user_input_batch = np.full(len(items_to_score), user)
    item_input_batch = items_to_score

    # 推薦スコア予測
    scores = model.predict([wide_feat, user_input_batch, item_input_batch], verbose=0).flatten()

    # スコア保存
    user_item_score[user] = list(zip(items_to_score, scores))

# 推薦リスト作成（Top-10）
recommendations = {
    user: [item for item, _ in sorted(items, key=lambda x: x[1], reverse=True)[:10]]
    for user, items in user_item_score.items()
}

# Ground truth 作成
ground_truth = {
    user: [test_df[test_df['user'] == user].iloc[0]['item']]
    for user in user_item_score
}


In [28]:
# 評価

from Evaluation_index import recall_at_k, precision_at_k, ndcg_at_k, mrr_at_k, hit_at_k

print("=== WideDeep モデル評価結果（Top-10）===")
print(f"Recall@10    : {recall_at_k(recommendations, ground_truth, 10):.4f}")
print(f"Precision@10 : {precision_at_k(recommendations, ground_truth, 10):.4f}")
print(f"NDCG@10      : {ndcg_at_k(recommendations, ground_truth, 10):.4f}")
print(f"MRR@10       : {mrr_at_k(recommendations, ground_truth, 10):.4f}")
print(f"Hit@10       : {hit_at_k(recommendations, ground_truth, 10):.4f}")

print("\n=== RecBole モデル評価結果（Top-10）===")
print(f"Recall@10    : 0.0534")
print(f"Precision@10 : 0.0620")
print(f"NDCG@10      : 0.0774")
print(f"MRR@10       : 0.1564")
print(f"Hit@10       : 0.3560")

=== WideDeep モデル評価結果（Top-10）===
Recall@10    : 0.1413
Precision@10 : 0.0141
NDCG@10      : 0.0706
MRR@10       : 0.0492
Hit@10       : 0.1413

=== RecBole モデル評価結果（Top-10）===
Recall@10    : 0.0534
Precision@10 : 0.0620
NDCG@10      : 0.0774
MRR@10       : 0.1564
Hit@10       : 0.3560
