In [None]:
import os
import torch
import pandas as pd
import numpy as np

from tqdm import tqdm
from sklearn.metrics import roc_auc_score
from collections import defaultdict

In [1]:
class MovielensDataset(torch.utils.data.Dataset):
    def __init__(self, data, sparse_feature_names):
        self.sparse_feature_names = sparse_feature_names
        self.data = data[:, :-1]
        self.label = data[:, [-1]].astype(float)
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return {
            self.sparse_feature_names[0]: self.data[idx, 0],
            self.sparse_feature_names[1]: self.data[idx, 1],
        }, self.label[idx]
        
class MovielensMatrixFactorization(torch.nn.Module):
    def __init__(
        self,
        num_sparse_features: dict[str, int],
        latent_dim: int,
    ):
        super().__init__()
        self.num_sparse_features = num_sparse_features
        self.latent_dim = latent_dim
        self.sparse_feature_names = list(num_sparse_features.keys())

        self.embeddings = torch.nn.ModuleDict({
            name : torch.nn.Embedding(num_sparse_feature, latent_dim)
            for name, num_sparse_feature in num_sparse_features.items()
        })
        
        
    def forward(self, sparse_features: dict[str, torch.LongTensor]) -> torch.FloatTensor:
        emb1 = self.embeddings[self.sparse_feature_names[0]](sparse_features[self.sparse_feature_names[0]])
        emb2 = self.embeddings[self.sparse_feature_names[1]](sparse_features[self.sparse_feature_names[1]])
        logits = (emb1 * emb2).sum(dim=1, keepdim=True)
        return logits

In [2]:
ratings = pd.read_csv("movielens/raw/u.data", sep="\t", names=["user_id", "item_id", "rating", "timestamp"])
users = pd.read_csv("movielens/raw/u.user", sep="|", encoding="latin-1", names=["user_id", "age", "gender", "occupation", "zip"])
items = pd.read_csv("movielens/raw/u.item", sep="|", encoding="latin-1", names=[
        "item_id",
        "title",
        "release_date",
        "video_release_date",
        "imdb_url",
        "unknown",
        "action",
        "adventure",
        "animation",
        "children",
        "comedy",
        "crime",
        "documentary", 
        "drama", 
        "fantasy",
        "film-noir",
        "horror",
        "musical",
        "mystery",
        "romance",
        "sci-Fi",
        "thriller",
        "war",
        "western",
    ])
num_sparse_features = {
    "user_id": users["user_id"].nunique(),
    "item_id": items["item_id"].nunique(),
}
vocabulary = {
    "user_id": np.unique(users["user_id"]),
    "item_id": np.unique(items["item_id"])
}

In [3]:
# 전처리
processed = ratings.copy()

# user_id, item_id 정보가 있는 데이터만 남기기
processed = processed[(
    processed["user_id"].isin(vocabulary["user_id"])
    & processed["item_id"].isin(vocabulary["item_id"])
)]
unknown = len(ratings) - len(processed)
print("삭제된 알 수 없는 데이터:", unknown)


# user_id, item_id를 index 매핑
processed["user_id"] = processed["user_id"].map({
    uid : i 
    for i, uid in enumerate(vocabulary["user_id"])
})

processed["item_id"] = processed["item_id"].map({
    iid : i 
    for i, iid in enumerate(vocabulary["item_id"])
})

삭제된 알 수 없는 데이터: 0


In [4]:
# Dataset Hyperparameter
maximum_positive_sample = 100
num_negative_sample = 5
all_item_indices = np.arange(num_sparse_features["item_id"])

# Leave-one-out split
# Downsampling
# Random Negative Sampling
train, test = [], []
for u, group in tqdm(processed.sort_values(by="timestamp").groupby("user_id")):
    # Leave one
    last = group.iloc[-1, :2].values
    test.append([last[0], last[1], 1])
    
    
    # Downsampling
    group = group.iloc[:-1]
    num_positive_sample = len(group)
    num_positive_sample = min(maximum_positive_sample, num_positive_sample)
    group = group.tail(num_positive_sample)

    # positive samples
    positive = group[["user_id", "item_id"]].values
    positive = np.column_stack([positive, np.ones(len(positive), dtype=int)])

    # negative sampling
    positive_item_indices = group["item_id"].unique()
    negative_item_indices = np.setdiff1d(all_item_indices, positive_item_indices)

    size = num_positive_sample*num_negative_sample
    replace_flag = size > len(negative_item_indices)
    negative_item_indices = np.random.choice(
        negative_item_indices,
        size=size,
        replace=replace_flag
    )

    negative = np.column_stack([
        np.full(size, u, dtype=int),
        negative_item_indices,
        np.zeros(size, dtype=int)
    ])
    train.append(positive)
    train.append(negative)

train = np.vstack(train)
test = np.vstack(test)

# save in processed
np.save("movielens/processed/train.npy", train)
np.save("movielens/processed/test.npy", test)

100%|███████████████████████████████████████| 943/943 [00:00<00:00, 2432.71it/s]


Ellipsis

In [6]:
# Training Hyperparameter
batch_size = 64
latent_dim = 16
lr = 1e-3
epochs = 10

# Define Dataset
train_ds = MovielensDataset(train, list(num_sparse_features.keys()))
train_loader = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=False)

device = torch.device("cpu") # torch.mps.is_available()
model = MovielensMatrixFactorization(num_sparse_features, latent_dim)
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [7]:
# Train MF
history = defaultdict(list)
model.train()
auc = 0
for epoch in range(epochs):
    total_loss = 0
    all_labels = []
    all_preds = []
    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")
    for sparse_features, labels in pbar:
        sparse_features = {k: v.to(device) for k, v in sparse_features.items()}
        labels = labels.to(device)

        
        # forward
        logits = model(sparse_features)
        loss = torch.nn.functional.binary_cross_entropy_with_logits(logits, labels)

        
        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        

        # metrics
        total_loss += loss.item()
        probs = logits.sigmoid().detach().cpu().numpy()
        all_preds.extend(probs.flatten())
        all_labels.extend(labels.detach().cpu().numpy().flatten())
        
        # tqdm update
        pbar.set_postfix(
            logloss=loss.item(),
            auc=f"{auc:.4f}"
            
        )

        
    avg_loss = total_loss / len(train_loader)
    acc = ((np.array(all_preds) > 0.5) == np.array(all_labels)).mean()
    auc = roc_auc_score(all_labels, all_preds)
    history["accuracy"].append(acc)
    history["auroc"].append(auc)
    history["logloss"].append(avg_loss)
    
    pbar.set_postfix(
        logloss=f"{avg_loss:.4f}",
        auc=f"{auc:.4f}"
    )
    pbar.refresh()

Epoch 1/10: 100%|███| 5747/5747 [00:08<00:00, 646.10it/s, auc=0.0000, loss=1.01]
Epoch 2/10: 100%|██| 5747/5747 [00:08<00:00, 649.86it/s, auc=0.4996, loss=0.838]
Epoch 3/10: 100%|██| 5747/5747 [00:09<00:00, 636.15it/s, auc=0.5102, loss=0.702]
Epoch 4/10: 100%|██| 5747/5747 [00:09<00:00, 615.46it/s, auc=0.5239, loss=0.576]
Epoch 5/10: 100%|██| 5747/5747 [00:10<00:00, 573.68it/s, auc=0.5426, loss=0.441]
Epoch 6/10: 100%|██| 5747/5747 [00:10<00:00, 571.27it/s, auc=0.5701, loss=0.303]
Epoch 7/10: 100%|██| 5747/5747 [00:09<00:00, 585.68it/s, auc=0.6101, loss=0.198]
Epoch 8/10: 100%|██| 5747/5747 [00:10<00:00, 570.81it/s, auc=0.6604, loss=0.141]
Epoch 9/10: 100%|██| 5747/5747 [00:09<00:00, 608.47it/s, auc=0.7113, loss=0.118]
Epoch 10/10: 100%|█| 5747/5747 [00:09<00:00, 599.97it/s, auc=0.7555, loss=0.111]


In [40]:
# TopK recommendation
topk = 10

user_recommendations = {}
model.eval()
with torch.no_grad():
    all_item_indices = torch.arange(num_sparse_features["item_id"]).to(device)
    pbar = tqdm(range(num_sparse_features["user_id"]), desc="TopK Recommend")
    for user_id in pbar:
        sparse_features = {
            "user_id":torch.full(size=(len(all_item_indices),), fill_value=user_id).to(device),
            "item_id":all_item_indices,
        }
        logits = model(sparse_features)
        values, indices = logits.flatten().topk(topk)
        

        user_recommendations[user_id] = indices.detach().cpu().numpy()

        pbar.set_postfix(user_id=user_id)
        
user_recommendations[user_id]

TopK Recommend: 100%|██████████| 943/943 [00:00<00:00, 1242.16it/s, user_id=942]


array([450,  87, 171,  55, 549, 738, 203, 194, 160, 173])

In [36]:
# (TODO) NDCG@topk, Precision@topk

def ndcg(true, pred):
    pass
def precision(true, pred):
    pass

for line in test:
    user_id, real, _ = line
    recommendations = user_recommendations[user_id]
    break

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156
