In [1]:
import numpy as np
import pandas as pd
import os
# Seed NumPy's global random generator so NumPy-driven randomness repeats across runs.
# NOTE(review): no torch.manual_seed(...) call is visible in this notebook, so weight
# initialisation and DataLoader shuffling are presumably not covered by this seed -- confirm.
np.random.seed(0)

In [2]:
from sklearn.model_selection import train_test_split

def split_dataset(path_to_ml_25m = "/Users/shanoop/Downloads/ml-25m",  random_state: int = 42):
    """Read ``ratings.csv`` and cut it into train / test / validation frames.

    The file's first line is skipped and the columns are named
    ``user_id``, ``item_id``, ``rating`` and ``timestamp``. Rows are split
    75% / 15% / 10% into train / validation / test using two consecutive
    random splits that share ``random_state``.

    Args:
        path_to_ml_25m: Directory that contains ``ratings.csv``.
        random_state: Seed handed to both ``train_test_split`` calls.

    Returns:
        ``(train, test, val, (n_users, n_items))``. Note the order: test comes
        before validation. ``n_users`` and ``n_items`` are the largest
        ``user_id`` / ``item_id`` values in the full table (not distinct counts).
    """
    # Share of the full table that each partition should end up with.
    train_share, val_share, test_share = 0.75, 0.15, 0.10

    ratings_path = os.path.join(path_to_ml_25m, 'ratings.csv')
    ratings_df = pd.read_csv(
        ratings_path,
        names=['user_id', 'item_id', 'rating', 'timestamp'],
        index_col=False,
        skiprows=1,
    )

    # Largest ids are taken from the complete table, before any splitting.
    id_bounds = (ratings_df.user_id.max(), ratings_df.item_id.max())

    # First cut: keep the training share, hold out everything else.
    train, holdout = train_test_split(
        ratings_df,
        test_size=1 - train_share,
        random_state=random_state,
    )

    # Second cut: divide the hold-out so that test and validation reach
    # their intended shares of the original table.
    val, test = train_test_split(
        holdout,
        test_size=test_share / (test_share + val_share),
        random_state=random_state,
    )

    return train, test, val, id_bounds

In [3]:
import torch
from torch.autograd import Variable

In [46]:
def hit(ng_item, pred_items):
    """Return 1 when ``ng_item`` is among ``pred_items``, otherwise 0."""
    return 1 if ng_item in pred_items else 0


def ndcg(ng_item, pred_items):
    """Score the position of ``ng_item`` inside ``pred_items``.

    Returns ``1 / log2(position + 2)`` where ``position`` is the zero-based
    index of the first occurrence of ``ng_item``, so the first slot scores 1.0
    and later slots score less. Returns 0 when the item is absent.
    """
    if ng_item not in pred_items:
        return 0

    position = pred_items.index(ng_item)
    discount = np.log2(position + 2)
    return np.reciprocal(discount)


def metrics(model, test_loader, top_k, max_batches=51):
    """Estimate hit ratio and NDCG at ``top_k`` over batches of ``test_loader``.

    Every batch is handled as one ranking problem: the model scores each
    (user, item) pair of the batch, the ``top_k`` highest-scored items become
    the recommendations, and the first item of the batch (``item[0]``) is the
    one expected to appear among them.
    NOTE(review): this presumes a leave-one-out loader whose batches list the
    held-out item first, followed by the other candidates for the same user --
    confirm against the loader that is actually passed in.

    Args:
        model: Callable as ``model(user, item)``, returning one score per pair,
            shaped either ``(batch,)`` or ``(batch, 1)``.
        test_loader: Iterable yielding ``(user, item, label)`` batches; the
            label is not used.
        top_k: Number of recommendations to keep per batch. Batches with fewer
            than ``top_k`` items are ranked in full.
        max_batches: Maximum number of batches to evaluate, or ``None`` to
            evaluate the whole loader. Defaults to 51, the cut-off that was
            previously hard-coded.

    Returns:
        ``(mean hit ratio, mean NDCG)`` over the evaluated batches. If no batch
        is evaluated both values are NaN (NumPy's mean of an empty list).
    """
    from itertools import islice

    HR, NDCG = [], []

    # islice stops after max_batches batches without pulling an extra one.
    for user, item, _label in islice(test_loader, max_batches):
        # Flatten the scores so the ranking runs across the batch. Calling
        # topk directly on a (batch, 1) output would rank inside each
        # single-element row and raise as soon as top_k > 1.
        predictions = model(user, item).reshape(-1)

        # Never request more entries than there are scores in this batch.
        k = min(top_k, predictions.numel())
        _, indices = torch.topk(predictions, k)
        recommends = torch.take(item, indices).cpu().numpy().tolist()

        # The first item of the batch is the one that should be recommended.
        ng_item = item[0].item()
        HR.append(hit(ng_item, recommends))
        NDCG.append(ndcg(ng_item, recommends))

    return np.mean(HR), np.mean(NDCG)

class BiasedMatrixFactorization(torch.nn.Module):
    """Embedding-product model whose output is squashed by a sigmoid.

    A user embedding and an item embedding are multiplied element-wise, the
    product is projected to a single value by a linear layer, and a sigmoid
    maps that value into (0, 1).

    Despite the class name, no per-user or per-item bias terms are part of the
    model at present.
    NOTE(review): because of the sigmoid, targets outside (0, 1) can never be
    matched exactly -- confirm the target scale used during training.

    Args:
        n_users: Largest user id; the user table has ``n_users + 1`` rows.
        n_items: Largest item id; the item table has ``n_items + 1`` rows.
        n_factors: Width of both embedding tables.
    """

    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        self.n_items = n_items

        # One extra row per table so an id equal to n_users / n_items is a
        # valid index.
        table_options = dict(embedding_dim=n_factors, sparse=False)
        self.user_factors = torch.nn.Embedding(n_users + 1, **table_options)
        self.item_factors = torch.nn.Embedding(n_items + 1, **table_options)

        # Collapses the element-wise product to one value per pair.
        self.linear = torch.nn.Linear(in_features=n_factors, out_features=1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, user, item):
        """Return one value in (0, 1) per (user, item) pair, with a trailing axis of size 1."""
        interaction = self.user_factors(user) * self.item_factors(item)
        return self.sigmoid(self.linear(interaction))
    
# Load the ratings with split_dataset's default path and seed. n_users and
# n_items are the largest user / item ids found in the full table.
train, test, val, (n_users, n_items) = split_dataset()

# Model with 40-dimensional user and item embeddings.
model = BiasedMatrixFactorization(n_users, n_items, n_factors=40)


#optimizer = torch.optim.SGD(model.parameters(), lr=1e-6,
#                                weight_decay=1e-5)

# Plain SGD is the active optimizer; the commented-out lines around it are
# alternatives kept for experimentation.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
#optimizer = torch.optim.SparseAdam(model.parameters(), lr=1e-3)
#optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Multiply the learning rate by 0.1 after every 10 calls to scheduler.step().
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size= 10, gamma=0.1)

# Active loss is L1 (mean absolute error with the default reduction).
# NOTE(review): the model ends in a sigmoid, so its output stays in (0, 1);
# confirm that the rating targets fed to this loss are on the same scale.
#loss_func = torch.nn.MSELoss(reduce='mean')
loss_func = torch.nn.L1Loss()
#loss_func = torch.nn.BCELoss()

In [47]:
from torch.utils.data import Dataset
class cData(Dataset):
    """Map-style dataset that serves one (user, item, rating) triple per row.

    Args:
        split_df: DataFrame with ``user_id``, ``item_id`` and ``rating``
            columns. It is stored as ``self.df`` without copying and rows are
            addressed by position, not by index label.
    """

    def __init__(self, split_df):
        self.df = split_df

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return ``(user, item, rating)`` for the row at position ``idx``.

        The result is three 0-dim tensors: user id and item id as ``long``,
        rating as ``float``.
        """
        # Read each scalar positionally from its column. Building a one-row
        # DataFrame with ``iloc[[idx]]`` for every sample yields the same
        # values at a much higher per-item cost.
        user = int(self.df["user_id"].values[idx])
        movie = int(self.df["item_id"].values[idx])
        rating = float(self.df["rating"].values[idx])
        return (
            torch.tensor(user).long(),
            torch.tensor(movie).long(),
            torch.tensor(rating).float(),
        )
    
    
from torch.utils.data import DataLoader
# Wrap the train and test frames as datasets.
train_d = cData(train)
test_d = cData(test)
# Batches of 64 rows; both loaders reshuffle on every pass (shuffle=True).
# The validation split gets no loader in this cell.
train_dataloader = DataLoader(train_d, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_d, batch_size=64, shuffle=True)

In [48]:
from tqdm.notebook import tqdm

# Run 30 epochs. Each epoch makes one optimisation pass over the training
# loader, steps the learning-rate scheduler once, then measures the loss on
# the test loader without gradients.
for train_iter in range(30):
    print(train_iter)
    model.train()
    t_loss = 0.0
    t_count = 0
    for row, col, rating in tqdm(train_dataloader):
        optimizer.zero_grad()

        # Predict and calculate loss. The model output carries a trailing axis
        # of size 1, so the targets get the same axis.
        prediction = model(row, col)
        loss = loss_func(prediction, rating.unsqueeze(1))

        # Accumulate a plain Python float. Summing the loss tensors themselves
        # would chain the autograd history of every batch into one growing
        # graph for the whole epoch.
        t_loss += loss.item()
        t_count += 1

        # Backpropagate
        loss.backward()

        # Update the parameters
        optimizer.step()

    scheduler.step()
    model.eval()
    te_loss = 0.0
    te_count = 0
    print('Evaluating')
    with torch.no_grad():
        # Ranking metrics (metrics(model, test_dataloader, 5)) are not
        # computed here; only the mean batch loss is reported.
        for row, col, rating in test_dataloader:
            prediction = model(row, col)
            loss = loss_func(prediction, rating.unsqueeze(1))
            te_loss += loss.item()
            te_count += 1

    # Both figures are means of per-batch losses.
    print(f"Test loss: {te_loss/te_count}")
    print(f"Train loss: {t_loss/t_count}")

0


  0%|          | 0/292970 [00:00<?, ?it/s]