- DeepFM

- FMの2次項 + Deep部分(MLP)を統合して，手動で特徴交差しなくても深い非線形関係も学習できる強力なモデル．

| コンポーネント   | 役割                              |
| --------- | ------------------------------- |
| **線形項**   | 通常の線形回帰（Wideに相当）                |
| **FM項**   | 2次の特徴交差（Factorization Machines） |
| **Deep項** | Embedding後に MLP に通す（Deep部分）     |
| **出力**    | 全部を結合して最終的に1つのスコアを出す（sigmoid）   |

- 実装手順
- データ前処理(One-Hot-Encoding or IDベース)
- Embeddingレイヤー作成
- FM2次項の内積計算
- MLP(Deep部分)の定義
- 結合→出力→学習

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset, DataLoader

# データ読み込み
cols = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('u.data', sep='\t', names=cols)
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Leave-One-Out分割
df['rank'] = df.groupby('user_id')['timestamp'].rank(method='first', ascending=False)
train_df = df[df['rank'] > 1].copy()
test_df = df[df['rank'] == 1].copy()

# IDを数値に変換
user_enc = LabelEncoder()
item_enc = LabelEncoder()
train_df['user'] = user_enc.fit_transform(train_df['user_id'])
train_df['item'] = item_enc.fit_transform(train_df['item_id'])
test_df = test_df[test_df['user_id'].isin(user_enc.classes_)]
test_df = test_df[test_df['item_id'].isin(item_enc.classes_)]
test_df['user'] = user_enc.transform(test_df['user_id'])
test_df['item'] = item_enc.transform(test_df['item_id'])

# ラベル（クリック or 高評価）
train_df['label'] = (train_df['rating'] >= 4).astype(int)
test_df['label'] = (test_df['rating'] >= 4).astype(int)

num_users = train_df['user'].nunique()
num_items = train_df['item'].nunique()

print("ok")

ok


In [4]:
class FM_Dataset(Dataset):
    def __init__(self, df):
        self.users = torch.LongTensor(df['user'].values)
        self.items = torch.LongTensor(df['item'].values)
        self.labels = torch.FloatTensor(df['label'].values)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.labels[idx]


In [5]:
import torch.nn as nn

class DeepFM(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim=16, hidden_dims=[32, 16]):
        super(DeepFM, self).__init__()
        # Embedding
        self.user_embed = nn.Embedding(num_users, embedding_dim)
        self.item_embed = nn.Embedding(num_items, embedding_dim)

        # 線形項
        self.linear_user = nn.Embedding(num_users, 1)
        self.linear_item = nn.Embedding(num_items, 1)

        # Deep部分
        self.deep_input_dim = embedding_dim * 2
        layers = []
        input_dim = self.deep_input_dim
        for h in hidden_dims:
            layers.append(nn.Linear(input_dim, h))
            layers.append(nn.ReLU())
            input_dim = h
        self.mlp = nn.Sequential(*layers)

        # 出力
        self.final_linear = nn.Linear(1 + 1 + hidden_dims[-1], 1)

    def forward(self, user, item):
        # Embedding
        u_emb = self.user_embed(user)
        i_emb = self.item_embed(item)

        # 線形項
        linear_part = self.linear_user(user) + self.linear_item(item)

        # FM 2次項：<u_emb, i_emb>
        fm_part = torch.sum(u_emb * i_emb, dim=1, keepdim=True)

        # Deep部分
        deep_input = torch.cat([u_emb, i_emb], dim=1)
        deep_out = self.mlp(deep_input)

        # 出力
        concat = torch.cat([linear_part, fm_part, deep_out], dim=1)
        out = torch.sigmoid(self.final_linear(concat)).squeeze(1)
        return out


In [9]:
from torch.optim import Adam
from sklearn.metrics import accuracy_score

train_dataset = FM_Dataset(train_df)
test_dataset = FM_Dataset(test_df)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=256)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = DeepFM(num_users, num_items).to(device)
criterion = nn.BCELoss()
optimizer = Adam(model.parameters(), lr=0.001)

# 学習ループ
for epoch in range(100):
    model.train()
    total_loss = 0
    for users, items, labels in train_loader:
        users, items, labels = users.to(device), items.to(device), labels.to(device)
        preds = model(users, items)
        loss = criterion(preds, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} Loss: {total_loss:.4f}")


Epoch 1 Loss: 535.2153
Epoch 2 Loss: 495.1932
Epoch 3 Loss: 463.0873
Epoch 4 Loss: 445.1308
Epoch 5 Loss: 434.4732
Epoch 6 Loss: 427.6331
Epoch 7 Loss: 422.2981
Epoch 8 Loss: 418.0742
Epoch 9 Loss: 414.9157
Epoch 10 Loss: 412.2711
Epoch 11 Loss: 409.5938
Epoch 12 Loss: 407.2626
Epoch 13 Loss: 404.8838
Epoch 14 Loss: 402.4906
Epoch 15 Loss: 400.5358
Epoch 16 Loss: 398.4673
Epoch 17 Loss: 396.3944
Epoch 18 Loss: 394.3774
Epoch 19 Loss: 392.0106
Epoch 20 Loss: 389.9773
Epoch 21 Loss: 387.6903
Epoch 22 Loss: 385.5766
Epoch 23 Loss: 383.2103
Epoch 24 Loss: 381.3129
Epoch 25 Loss: 379.4293
Epoch 26 Loss: 376.9597
Epoch 27 Loss: 374.6642
Epoch 28 Loss: 372.7654
Epoch 29 Loss: 370.6815
Epoch 30 Loss: 368.3481
Epoch 31 Loss: 366.3197
Epoch 32 Loss: 364.1278
Epoch 33 Loss: 362.2289
Epoch 34 Loss: 359.9002
Epoch 35 Loss: 358.0486
Epoch 36 Loss: 355.9696
Epoch 37 Loss: 354.2663
Epoch 38 Loss: 352.0190
Epoch 39 Loss: 349.9987
Epoch 40 Loss: 348.0459
Epoch 41 Loss: 346.2574
Epoch 42 Loss: 344.2243
E

In [7]:
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for users, items, labels in test_loader:
        users, items = users.to(device), items.to(device)
        preds = model(users, items)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.numpy())

binary_preds = (np.array(all_preds) >= 0.5).astype(int)
acc = accuracy_score(all_labels, binary_preds)
print(f"\n✅ DeepFM Test Accuracy: {acc:.4f}")



✅ DeepFM Test Accuracy: 0.6748


In [11]:
# 評価用のスコア生成(正解 + 負例99件)

from collections import defaultdict

n_negative = 99
all_items = set(train_df['item'].unique())
user_item_score = defaultdict(list)

model.eval()
with torch.no_grad():
    for user in test_df['user'].unique():
        gt_item = test_df[test_df['user'] == user].iloc[0]['item']
        negatives = list(all_items - {gt_item})
        sample_size = min(n_negative, len(negatives))
        sampled_negatives = np.random.choice(negatives, size=sample_size, replace=False)

        # 評価対象アイテム（正解 + 負例）
        items_to_score = np.append(sampled_negatives, gt_item)

        # Tensor化
        user_tensor = torch.LongTensor([user] * len(items_to_score)).to(device)
        item_tensor = torch.LongTensor(items_to_score).to(device)

        # モデルスコアを出力
        scores = model(user_tensor, item_tensor).cpu().numpy()
        user_item_score[user] = list(zip(items_to_score, scores))


In [12]:
recommendations = {
    user: [item for item, _ in sorted(items, key=lambda x: x[1], reverse=True)[:10]]
    for user, items in user_item_score.items()
}

ground_truth = {
    user: [test_df[test_df['user'] == user].iloc[0]['item']]
    for user in user_item_score
}

# 正解1件だけ取り出す形式に変換
gt_single = {u: items[0] for u, items in ground_truth.items()}


In [14]:
from Evaluation_index import recall_at_k, precision_at_k, ndcg_at_k, mrr_at_k, hit_at_k

# 評価の実行
recall = recall_at_k(recommendations, gt_single, 10)
precision = precision_at_k(recommendations, gt_single, 10)
ndcg = ndcg_at_k(recommendations, gt_single, 10)
mrr = mrr_at_k(recommendations, gt_single, 10)
hit = hit_at_k(recommendations, gt_single, 10)

print("=== DeepFM モデル評価結果（Top-10）===")
print(f"Recall@10    : {recall:.4f}")
print(f"Precision@10 : {precision:.4f}")
print(f"NDCG@10      : {ndcg:.4f}")
print(f"MRR@10       : {mrr:.4f}")
print(f"Hit@10       : {hit:.4f}")

print("=== RecBole モデル評価結果（Top-10）===")
print(f"Recall@10    : 0.0540")
print(f"Precision@10 : 0.0621")
print(f"NDCG@10      : 0.0779")
print(f"MRR@10       : 0.1579")
print(f"Hit@10       : 0.3566")


=== DeepFM モデル評価結果（Top-10）===
Recall@10    : 0.0967
Precision@10 : 0.0097
NDCG@10      : 0.0363
MRR@10       : 0.0190
Hit@10       : 0.0967
=== RecBole モデル評価結果（Top-10）===
Recall@10    : 0.0540
Precision@10 : 0.0621
NDCG@10      : 0.0779
MRR@10       : 0.1579
Hit@10       : 0.3566
