In [1]:
import os, sys
dir2 = os.path.abspath('')
dir1 = os.path.dirname(dir2)
if not dir1 in sys.path: sys.path.append(dir1)

In [2]:
import os
import time
import random
import argparse
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from tqdm import tqdm
import re
from tensorboardX import SummaryWriter
from metrics.metrics import metrics
from architectures.NeuMF.neu_mf import NeuMF
from loaders.create_dataloader import CreateDataloader

### Setting Arguments

Here is the brief description of important ones:
- Learning rate is 0.001
- Dropout rate is 0.2
- Running for 10 epochs
- HitRate@10 and NDCG@10
- 4 negative samples for each positive one

In [3]:
DATASET_NAME = 'ml-1m'
MODEL_NAME = 'NeuMF'
DATASET_FILE = 'ratings.dat'
MAIN_PATH = f'../data/{DATASET_NAME}/'
DATA_PATH = MAIN_PATH + DATASET_FILE
MODEL_PATH = f'../models/{DATASET_NAME}/'
MODEL = f'{DATASET_NAME}-{MODEL_NAME}'

In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

In [5]:
parser = argparse.ArgumentParser()
parser.add_argument("--seed",
	type=int,
	default=51,
	help="Seed")
parser.add_argument("--lr",
	type=float,
	default=0.001,
	help="learning rate")
parser.add_argument("--dropout",
	type=float,
	default=0.2,
	help="dropout rate")
parser.add_argument("--batch_size",
	type=int,
	default=256,
	help="batch size for training")
parser.add_argument("--epochs",
	type=int,
	default=10,
	help="training epoches")
parser.add_argument("--top_k",
	type=int,
	default=10,
	help="compute metrics@top_k")
parser.add_argument("--factor_num",
	type=int,
	default=32,
	help="predictive factors numbers in the model")
parser.add_argument("--layers",
    nargs='+',
    default=[64,32,16,8],
    help="MLP layers. Note that the first layer is the concatenation of user \
    and item embeddings. So layers[0]/2 is the embedding size.")
parser.add_argument("--num_ng",
	type=int,
	default=4,
	help="Number of negative samples for training set")
parser.add_argument("--num_ng_test",
	type=int,
	default=100,
	help="Number of negative samples for test set")
parser.add_argument("--out",
	default=True,
	help="save model or not")

_StoreAction(option_strings=['--out'], dest='out', nargs=None, const=None, default=True, type=None, choices=None, help='save model or not', metavar=None)

## Training NeuMF Model

In [6]:
# set device and parameters
args = parser.parse_args("")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
writer = SummaryWriter()

# seed for Reproducibility
seed_everything(args.seed)

# load data
ml_1m = pd.read_csv(
	DATA_PATH,
	sep="::",
	names = ['user_id', 'item_id', 'rating', 'timestamp'],
	engine='python')

# set the num_users, items
num_users = ml_1m['user_id'].nunique()+1
num_items = ml_1m['item_id'].nunique()+1

ml_1m.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [7]:
def _reindex(ratings):
    """
    Process dataset to reindex userID and itemID, also set rating as binary feedback
    """
    user_list = list(ratings['user_id'].drop_duplicates())
    user2id = {w: i for i, w in enumerate(user_list)}

    item_list = list(ratings['item_id'].drop_duplicates())
    item2id = {w: i for i, w in enumerate(item_list)}

    ratings['user_id'] = ratings['user_id'].apply(lambda x: user2id[x])
    ratings['item_id'] = ratings['item_id'].apply(lambda x: item2id[x])
    ratings['rating'] = ratings['rating'].apply(lambda x: float(x > 0))
    return ratings

def _leave_one_out(ratings):
    """
    leave-one-out evaluation protocol in paper https://www.comp.nus.edu.sg/~xiangnan/papers/ncf.pdf
    """
    ratings['rank_latest'] = ratings.groupby(['user_id'])['timestamp'].rank(method='first', ascending=False)
    test = ratings.loc[ratings['rank_latest'] == 1]
    train = ratings.loc[ratings['rank_latest'] > 1]
    assert train['user_id'].nunique()==test['user_id'].nunique(), 'Not Match Train User with Test User'
    return train[['user_id', 'item_id', 'rating']], test[['user_id', 'item_id', 'rating']]

In [8]:
ml_1m = _reindex(ml_1m)
train_ml_1m, test_ml_1m = _leave_one_out(ml_1m)

In [9]:
# construct the train and test datasets
data = CreateDataloader(args, train_ml_1m, test_ml_1m, MAIN_PATH)
train_loader = data.get_train_instance()
test_loader = data.get_test_instance()

# set model and loss, optimizer
model = NeuMF(args, num_users, num_items)
model = model.to(device)
print(device)
loss_function = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=args.lr)

(994169, 3) (6040, 3) (1000209, 3)
cuda:0


  from .autonotebook import tqdm as notebook_tqdm


In [11]:

# train, evaluation
best_hr = 0
for epoch in range(1, 5+1):
	model.train() # Enable dropout (if have).
	start_time = time.time()

	for user, item, label, _ in tqdm(train_loader, total=len(train_loader)):
		user = user.to(device)
		item = item.to(device)
		label = label.to(device)
		# print(user.shape, item.shape, label.shape)

		optimizer.zero_grad()
		prediction = model(user, item)
		# print(prediction.shape)
		loss = loss_function(prediction, label)
		loss.backward()
		optimizer.step()
		writer.add_scalar('loss/Train_loss', loss.item(), epoch)

	model.eval()
	HR, NDCG, MRR, RECALL, PRECISION = metrics(model, test_loader, args.top_k, device, args.num_ng_test)
	writer.add_scalar('Perfomance/HR@10', HR, epoch)
	writer.add_scalar('Perfomance/NDCG@10', NDCG, epoch)

	elapsed_time = time.time() - start_time
	print("The time elapse of epoch {:03d}".format(epoch) + " is: " +
			time.strftime("%H: %M: %S", time.gmtime(elapsed_time)))
	print("HR: {:.3f}\tNDCG: {:.3f}\tMRR: {:.3f}".format(np.mean(HR), np.mean(NDCG), np.mean(MRR)))

	if HR > best_hr:
		best_hr, best_ndcg, best_mrr, best_epoch = HR, NDCG, MRR, epoch
		if args.out:
			if not os.path.exists(MODEL_PATH):
				os.mkdir(MODEL_PATH)
			torch.save(model,
				'{}{}.pth'.format(MODEL_PATH, MODEL))

writer.close()

100%|██████████| 19418/19418 [00:58<00:00, 330.03it/s]
100%|██████████| 6040/6040 [00:09<00:00, 635.62it/s]


The time elapse of epoch 001 is: 00: 01: 08
HR: 0.736	NDCG: 0.262	MRR: 0.739


100%|██████████| 19418/19418 [00:59<00:00, 327.05it/s]
100%|██████████| 6040/6040 [00:09<00:00, 607.85it/s]


The time elapse of epoch 002 is: 00: 01: 09
HR: 0.851	NDCG: 0.303	MRR: 0.853


100%|██████████| 19418/19418 [01:01<00:00, 316.85it/s]
100%|██████████| 6040/6040 [00:10<00:00, 582.54it/s]


The time elapse of epoch 003 is: 00: 01: 11
HR: 0.873	NDCG: 0.311	MRR: 0.875


100%|██████████| 19418/19418 [01:00<00:00, 319.06it/s]
100%|██████████| 6040/6040 [00:09<00:00, 605.21it/s]


The time elapse of epoch 004 is: 00: 01: 10
HR: 0.895	NDCG: 0.319	MRR: 0.896


100%|██████████| 19418/19418 [01:03<00:00, 303.89it/s]
100%|██████████| 6040/6040 [00:10<00:00, 563.59it/s]

The time elapse of epoch 005 is: 00: 01: 14
HR: 0.891	NDCG: 0.317	MRR: 0.892





In [None]:

start_time = time.time()
model.eval()
HR, NDCG, MRR, RECALL, PRECISION = metrics(model, test_loader, args.top_k, device, args.num_ng_test)
writer.add_scalar('Perfomance/HR@10', HR, epoch)
writer.add_scalar('Perfomance/NDCG@10', NDCG, epoch)

elapsed_time = time.time() - start_time
print("The time elapse of epoch {:03d}".format(epoch) + " is: " +
		time.strftime("%H: %M: %S", time.gmtime(elapsed_time)))
print("HR: {:.3f}\tNDCG: {:.3f}\tMRR: {:.3f}".format(np.mean(HR), np.mean(NDCG), np.mean(MRR)))


writer.close()

100%|██████████| 6040/6040 [00:08<00:00, 701.40it/s]

The time elapse of epoch 003 is: 00: 00: 08
HR: 0.839	NDCG: 0.299	MRR: 0.841





## Final Output

In [None]:
print("Best epoch {:03d}: HR@10 = {:.3f}, NDCG@10 = {:.3f}, MRR@10 = {:.3f}".format(
									best_epoch, best_hr, best_ndcg, best_mrr))

Best epoch 003: HR@10 = 0.838, NDCG@10 = 0.299, MRR@10 = 0.840
