In [76]:
import os 
import numpy as np
import pandas as pd 

import torch 
import torch.nn as nn 
import torch.nn.functional as F

In [77]:
os.getcwd()

'/opt/ml/input/2023-Summer-Internship-DSAIL/Recsys/PMF'

In [78]:
dpath = '../ml-100k/'

def load_csv(dpath):
    data = pd.read_csv(os.path.join(dpath,'u.data'), sep='\t', header=None)
    data.columns = ['user_id', 'item_id', 'rating', 'timestamp']
    return data

df = load_csv(dpath)
user2idx = {j:i for i,j in enumerate(df.user_id.unique())}
item2idx = {j:i for i,j in enumerate(df.item_id.unique())}

df['user_id'] = df['user_id'].map(user2idx)
df['item_id'] = df['item_id'].map(item2idx)

def rating_func(x,k):
    tx = (x-1) / (k-1)
    return tx 

df['rating'] = df['rating'].apply(rating_func, args=(5,))
    

df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,0,0.5,881250949
1,1,1,0.5,891717742
2,2,2,0.0,878887116
3,3,3,0.25,880606923
4,4,4,0.0,886397596


In [79]:
class PMF(nn.Module):
    def __init__(self, n_users, n_items, hidden_dim, sigma_u, sigma_v):
        super(PMF, self).__init__()
        self.user_emb = nn.Embedding(n_users, hidden_dim)
        self.item_emb = nn.Embedding(n_items, hidden_dim)
        
        nn.init.normal_(self.user_emb.weight, 0, sigma_u)
        nn.init.normal_(self.item_emb.weight, 0, sigma_v)
        
    def forward(self, user, item):
        return torch.sigmoid(torch.sum(self.user_emb(user) * self.item_emb(item), dim=1))

In [112]:
class PMF_Loss(nn.Module):
    def __init__(self):
        super(PMF_Loss, self).__init__()
    
    def forward(self, pred, label, model,lambda_u, lambda_v):
        sum_of_square = torch.sum((label - pred) ** 2)
        regularization = (lambda_u * (torch.norm(model.user_emb.weight, p='fro',dim=1).sum()) + (lambda_v * torch.norm(model.item_emb.weight, p='fro',dim=1).sum()))
        
        return (sum_of_square + regularization) / 2

In [122]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

n_user = len(user2idx)
n_item = len(item2idx)
hidden_dim = 30
sigma_u = 1
sigma_v = 1 

model = PMF(n_user, n_item, hidden_dim, sigma_u, sigma_v).to(device)

EPOCHS = 100 
batch_size = 512 
lr = 0.01
lambda_u = 0.01
lambda_v = 0.001

optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = PMF_Loss().to(device)

In [123]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from sklearn.model_selection import train_test_split

class MovieLens(Dataset):
    def __init__(self, df):
        self.user = df['user_id'].values
        self.item = df['item_id'].values
        self.target = df['rating'].values

    def __len__(self):
        return len(self.user)

    def __getitem__(self, idx):
        user = self.user[idx]
        item = self.item[idx]
        target = self.target[idx]

        return user, item, target
    
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_dataset = MovieLens(train_df)
train_loader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True, drop_last = False)

test_dataset = MovieLens(test_df)
test_loader = DataLoader(test_dataset, batch_size = batch_size, shuffle = False, drop_last = False)

In [124]:
for epoch in range(EPOCHS):
    print('Epoch: {}'.format(epoch))

    model.train()
    train_loss = 0
    test_loss = 0

    for user, item, rating in train_loader:
        user,item,rating = user.to(device), item.to(device), rating.to(device)
        pred = model(user, item)
        loss = criterion(pred, rating, model, lambda_u, lambda_v)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()

    train_loss /= len(train_loader)

    print('Train Loss: {:.4f}'.format(train_loss))

    model.eval()
    for user, item, rating in test_loader:
        user, item, rating = user.to(device), item.to(device), rating.to(device)
        pred = model(user, item)
        loss = criterion(pred, rating, model, lambda_u, lambda_v)
        
        test_loss += loss.item()
    
    test_loss /= len(test_loader)

    print('Test Loss: {:.4f}'.format(test_loss))
    


Epoch: 0
Train Loss: 95.0255
Test Loss: 89.5621
Epoch: 1
Train Loss: 78.1271
Test Loss: 83.6255
Epoch: 2
Train Loss: 65.6486
Test Loss: 76.3422
Epoch: 3
Train Loss: 54.6916
Test Loss: 67.0254
Epoch: 4
Train Loss: 44.3262
Test Loss: 56.2747
Epoch: 5
Train Loss: 35.1249
Test Loss: 46.3012
Epoch: 6
Train Loss: 27.9738
Test Loss: 38.8510
Epoch: 7
Train Loss: 23.1981
Test Loss: 33.6543
Epoch: 8
Train Loss: 20.1533
Test Loss: 30.1724
Epoch: 9
Train Loss: 18.2403
Test Loss: 27.9242
Epoch: 10
Train Loss: 17.0023
Test Loss: 26.5038
Epoch: 11
Train Loss: 16.1747
Test Loss: 25.6148
Epoch: 12
Train Loss: 15.5439
Test Loss: 25.0482
Epoch: 13
Train Loss: 15.0541
Test Loss: 24.7196
Epoch: 14
Train Loss: 14.6509
Test Loss: 24.6098
Epoch: 15
Train Loss: 14.2971
Test Loss: 24.3565
Epoch: 16
Train Loss: 14.0265
Test Loss: 24.2675
Epoch: 17
Train Loss: 13.7747
Test Loss: 24.1544
Epoch: 18
Train Loss: 13.5354
Test Loss: 24.1537
Epoch: 19
Train Loss: 13.3336
Test Loss: 24.0985
Epoch: 20
Train Loss: 13.1758
