In [2]:
import os, sys
# Make the parent of the current working directory importable. The project
# packages imported in the next cell (architectures, loaders) presumably live
# there -- confirm against the repository layout.
dir2 = os.path.abspath('')
dir1 = os.path.dirname(dir2)
if dir1 not in sys.path:
    sys.path.append(dir1)

In [3]:
import os
import time
import random
import argparse
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from tensorboardX import SummaryWriter
from architectures.NeuMF.neu_mf import NeuMF
from loaders.create_dataloader import CreateDataloader
from tqdm import tqdm
import re
import pickle

In [4]:
# Identifiers from which every path below is derived.
DATASET_NAME = 'MOOCCubeX'
MODEL_NAME = 'NeuMF'
MODEL = f'{DATASET_NAME}-{MODEL_NAME}'

# Input data: the train/test interaction tables, stored as feather files in
# the dataset directory (MAIN_PATH keeps its trailing slash because other
# code concatenates file names onto it).
MAIN_PATH = f'../data/{DATASET_NAME}/'
TRAIN_DATASET_FILE = 'train.feather'
TEST_DATASET_FILE = 'test.feather'
TRAIN_DATA_PATH = f'{MAIN_PATH}{TRAIN_DATASET_FILE}'
TEST_DATA_PATH = f'{MAIN_PATH}{TEST_DATASET_FILE}'

# Serialized model checkpoint for this dataset/model combination.
MODEL_PATH = f'../models/{DATASET_NAME}/{MODEL}.pth'

def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), exports ``PYTHONHASHSEED`` and configures cuDNN for
    deterministic behaviour.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # Only affects hash randomization of Python processes started afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; this call is silently
    # ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-tune and pick different convolution
    # algorithms from run to run, which defeats the deterministic setting
    # above, so it has to be switched off for reproducible results.
    torch.backends.cudnn.benchmark = False

In [5]:
def _reindex(ratings):
    """
    Process dataset to reindex userID and itemID, also set rating as binary feedback
    """
    user2id = pickle.load(open(MAIN_PATH + 'user2id.pkl', 'rb'))

    item2id = pickle.load(open(MAIN_PATH + 'item2id.pkl', 'rb'))

    ratings['user_id'] = ratings['user_id'].apply(lambda x: user2id[x])
    ratings['item_id'] = ratings['item_id'].apply(lambda x: item2id[x])
    ratings['rating'] = ratings['rating'].apply(lambda x: float(x > 0))
    return ratings

In [6]:
# Select the first CUDA GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using {device} device")
# TensorBoard writer created with default arguments; the evaluation cell at
# the end of the notebook logs HR/NDCG/MRR through it and then closes it.
writer = SummaryWriter()

# Seed the Python, NumPy and PyTorch random number generators for reproducibility.
seed_everything(42)

Using cuda:0 device


In [7]:
# Load the train/test interaction tables from their feather files.
print(TRAIN_DATA_PATH)
train_rating_data = pd.read_feather(TRAIN_DATA_PATH)
test_rating_data = pd.read_feather(TEST_DATA_PATH)
print(test_rating_data.head())

# Normalize the column names to the user_id / item_id schema that _reindex expects.
train_rating_data = train_rating_data.rename(columns={'id': 'user_id', 'course_id': 'item_id'})
test_rating_data = test_rating_data.rename(columns={'id': 'user_id', 'course_id': 'item_id'})

# Number of distinct users/items in the TRAINING split, plus one.
# NOTE(review): these counts ignore ids that occur only in the test split. The
# recorded run printed num_items = 4506 here, while the loaded model's item
# embeddings have 4701 rows, so this value does not describe the model's
# vocabulary -- presumably len(item2id) is the right source; confirm before use.
num_users = train_rating_data['user_id'].nunique()+1
num_items = train_rating_data['item_id'].nunique()+1

print(num_users, num_items)

../data/MOOCCubeX/train.feather
     id  course_id  rating
0  U_24     948415       1
1  U_24     948410       1
2  U_24    1808023       1
3  U_24    1906706       1
4  U_24     681414       1
694530 4506


In [8]:
# Replace raw user/item identifiers with integer ids and binarize the ratings.
# NOTE(review): _reindex mutates its argument and the result is assigned back
# to the same name, so this cell is presumably not re-runnable -- a second run
# would look up already-mapped ids in user2id/item2id. Reload the data first.
train_rating_data = _reindex(train_rating_data)
test_rating_data = _reindex(test_rating_data)
# Bare expression: shows the re-indexed test frame as the cell output.
test_rating_data

Unnamed: 0,user_id,item_id,rating
0,419025,2878,1.0
1,419025,2873,1.0
2,419025,3599,1.0
3,419025,3890,1.0
4,419025,681,1.0
...,...,...,...
1983579,627567,2415,1.0
1983580,627567,2453,1.0
1983581,627568,2331,1.0
1983582,627569,3135,1.0


In [9]:
def _str2bool(value):
    """Convert a command-line token such as 'true'/'false' into a bool."""
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


# Hyper-parameters and evaluation settings, exposed as an argparse namespace
# so the same `args` object can be handed to CreateDataloader and metrics.
parser = argparse.ArgumentParser()
parser.add_argument("--seed",
    type=int,
    default=42,
    help="Seed")
parser.add_argument("--lr",
    type=float,
    default=0.001,
    help="learning rate")
parser.add_argument("--dropout",
    type=float,
    default=0.2,
    help="dropout rate")
parser.add_argument("--batch_size",
    type=int,
    default=256,
    help="batch size for training")
parser.add_argument("--epochs",
    type=int,
    default=10,
    help="training epoches")
parser.add_argument("--top_k",
    type=int,
    default=10,
    help="compute metrics@top_k")
parser.add_argument("--factor_num",
    type=int,
    default=32,
    help="predictive factors numbers in the model")
# type=int: without it, layer sizes given on the command line stay strings
# while the default is a list of ints.
parser.add_argument("--layers",
    nargs='+',
    type=int,
    default=[64,32,16,8],
    help="MLP layers. Note that the first layer is the concatenation of user \
    and item embeddings. So layers[0]/2 is the embedding size.")
parser.add_argument("--num_ng",
    type=int,
    default=4,
    help="Number of negative samples for training set")
parser.add_argument("--num_ng_test",
    type=int,
    default=100,
    help="Number of negative samples for test set")
# type=_str2bool: without it, any supplied string (including "False") is truthy.
parser.add_argument("--out",
    type=_str2bool,
    default=True,
    help="save model or not")
# Parse an explicitly empty argument list: inside a notebook sys.argv holds
# the kernel's own arguments, so only the defaults above should apply.
args = parser.parse_args([])
print(args)

Namespace(batch_size=256, dropout=0.2, epochs=10, factor_num=32, layers=[64, 32, 16, 8], lr=0.001, num_ng=4, num_ng_test=100, out=True, seed=42, top_k=10)


In [10]:
# Construct the train and test data loaders from the re-indexed rating frames.
# NOTE(review): the name `data` rebinds the `torch.utils.data as data` alias
# from the import cell; after this cell `data` is the CreateDataloader
# instance, not the torch module.

data = CreateDataloader(args, train_rating_data, test_rating_data)
print('Create Train Data Loader')
train_loader = data.get_train_instance()
print('Create Test Data Loader')
test_loader = data.get_test_instance()

negative sampling
done
Create Train Data Loader


100%|██████████| 6683574/6683574 [00:09<00:00, 708980.26it/s]


Create Test Data Loader


100%|██████████| 1983584/1983584 [00:35<00:00, 55529.27it/s]


In [11]:
import numpy as np
import torch
from tqdm import tqdm

def hit(ng_items, pred_items):
    """Return 1 if any entry of ``ng_items`` occurs in ``pred_items``, else 0.

    ``metrics`` passes the items whose label is non-zero as ``ng_items`` and
    the top-k recommended items as ``pred_items``.
    """
    return int(any(candidate in pred_items for candidate in ng_items))


def idcg(ng_items):
    """Ideal DCG for ``len(ng_items)`` relevant items ranked first.

    Returns the sum of ``1 / log2(rank + 2)`` for ranks
    ``0 .. len(ng_items) - 1``; 0 for an empty list.
    """
    ideal = 0
    for rank in range(len(ng_items)):
        ideal += np.reciprocal(np.log2(rank + 2))
    return ideal


def ndcg(ng_items, pred_items):
    """Normalized discounted cumulative gain of ``pred_items``.

    Args:
        ng_items: The relevant items (``metrics`` passes the items whose
            label is non-zero).
        pred_items: Ranked list of recommended items, best first.

    Returns:
        DCG of the relevant items found in ``pred_items`` divided by the
        ideal DCG for ``len(ng_items)`` items. 0.0 when ``ng_items`` is
        empty (previously a ZeroDivisionError).
    """
    ideal = idcg(ng_items)
    if ideal == 0:
        # No relevant items: there is nothing to normalize against.
        return 0.0

    dcg = 0
    for ng_item in ng_items:
        if ng_item in pred_items:
            position = pred_items.index(ng_item)
            dcg += np.reciprocal(np.log2(position + 2))
    return dcg / ideal


def mrr(ng_items, pred_items):
    """Reciprocal rank of the best-ranked relevant item.

    Args:
        ng_items: The relevant items (``metrics`` passes the items whose
            label is non-zero).
        pred_items: Ranked list of recommended items, best first.

    Returns:
        ``1 / (index + 1)`` for the smallest index in ``pred_items`` holding
        one of ``ng_items``, or 0 when none of them is recommended.
    """
    # Collect the position of every relevant item that was recommended. Using
    # the list's emptiness instead of a 999 sentinel keeps the result correct
    # for recommendation lists longer than 999 entries.
    positions = [pred_items.index(ng_item)
                 for ng_item in ng_items if ng_item in pred_items]
    if not positions:
        return 0
    return np.reciprocal(float(min(positions) + 1))


def metrics(model, test_loader, top_k, device):
    """Compute mean HR, NDCG and MRR at ``top_k`` over ``test_loader``.

    Every batch is scored as one ranking problem: the model scores all
    (user, item) pairs of the batch, the ``top_k`` best-scored items form the
    recommendation list, and the batch's items with a non-zero label are the
    relevant ones.

    Args:
        model: ``torch.nn.Module`` called as ``model(user, item)``.
            Assumed to return one score per pair -- TODO confirm shape.
        test_loader: Sized iterable of ``(user, item, label)`` tensor batches.
        top_k: Length of the recommendation list.
        device: Device the user/item tensors are moved to before scoring.

    Returns:
        Tuple ``(HR, NDCG, MRR)``, each the mean over all batches.
    """
    HR, NDCG, MRR = [], [], []

    print(len(test_loader))

    # Score in evaluation mode (dropout and similar layers disabled) and
    # restore the caller's mode afterwards.
    was_training = model.training
    model.eval()
    try:
        # No gradients are needed for ranking; skipping autograd saves memory
        # and time.
        with torch.no_grad():
            for user, item, label in tqdm(test_loader, total=len(test_loader)):
                user = user.to(device)
                item = item.to(device)

                predictions = model(user, item)
                # Clamp k so a batch with fewer than top_k candidates does
                # not make torch.topk raise.
                k = min(top_k, predictions.size(-1))
                _, indices = torch.topk(predictions, k)
                recommends = torch.take(
                    item, indices).cpu().numpy().tolist()

                # One host transfer per tensor instead of one .item() call
                # per element.
                item_ids = item.reshape(-1).tolist()
                label_values = label.reshape(-1).tolist()
                ng_items = [item_id
                            for item_id, label_value in zip(item_ids, label_values)
                            if label_value != 0]

                HR.append(hit(ng_items, recommends))
                NDCG.append(ndcg(ng_items, recommends))
                MRR.append(mrr(ng_items, recommends))
    finally:
        model.train(was_training)
    return np.mean(HR), np.mean(NDCG), np.mean(MRR)

In [12]:
# Start of the wall-clock interval reported by the evaluation cell.
start_time = time.time()

# Load the fully serialized model object from disk.
# map_location remaps tensors that were saved from a GPU onto the device
# selected earlier, so the checkpoint also loads on a CPU-only machine.
# NOTE(security): torch.load unpickles the file and can execute arbitrary
# code; only load checkpoints produced by this project.
model = torch.load(MODEL_PATH, map_location=device)
model = model.to(device)
print(model)


NeuMF(
  (embedding_user_mlp): Embedding(694530, 32)
  (embedding_item_mlp): Embedding(4701, 32)
  (embedding_user_mf): Embedding(694530, 32)
  (embedding_item_mf): Embedding(4701, 32)
  (fc_layers): ModuleList(
    (0): Linear(in_features=64, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=16, bias=True)
    (3): ReLU()
    (4): Linear(in_features=16, out_features=8, bias=True)
    (5): ReLU()
  )
  (affine_output): Linear(in_features=40, out_features=1, bias=True)
  (logistic): Sigmoid()
)


In [13]:
# Scratch cell: inspect the model's scores for one test batch.
# Take the first two batches; `cusers` (the second batch's users) is reused by
# the concatenation cell further down.
user_iter = iter(test_loader)
users, items, labels = next(user_iter)   
cusers, citems, clabels = next(user_iter)   

# Skip batches until the first user id of the batch is no longer 419025 (the
# user id on the first rows of the re-indexed test frame shown above), counting
# the skipped batches. next() raises StopIteration if the loader runs out.
count = 0
while users.numpy()[0] == 419025:
   # print(count)
   users, items, labels = next(user_iter)
   count += 1
print(users)
print(count)
user = users.to(device)
item = items.to(device)

# NOTE(review): this forward pass runs outside torch.no_grad(), so autograd
# state is presumably tracked for these predictions -- confirm if memory matters.
predictions = model(user, item)
print(predictions)

tensor([652087, 652087, 652087, 652087, 652087, 652087, 652087, 652087, 652087,
        652087, 652087, 652087, 652087, 652087, 652087, 652087, 652087, 652087,
        652087, 652087, 652087, 652087, 652087, 652087, 652087, 652087, 652087,
        652087, 652087, 652087, 652087, 652087, 652087, 652087, 652087, 652087,
        652087, 652087, 652087, 652087, 652087, 652087, 652087, 652087, 652087,
        652087, 652087, 652087, 652087, 652087, 652087, 652087, 652087, 652087,
        652087, 652087, 652087, 652087, 652087, 652087, 652087, 652087, 652087,
        652087, 652087, 652087, 652087, 652087, 652087, 652087, 652087, 652087,
        652087, 652087, 652087, 652087, 652087, 652087, 652087, 652087, 652087,
        652087, 652087, 652087, 652087, 652087, 652087, 652087, 652087, 652087,
        652087, 652087, 652087, 652087, 652087, 652087, 652087, 652087, 652087,
        652087, 652087])
12
tensor([6.4101e-01, 5.1673e-04, 1.3970e-01, 4.0576e-01, 1.0045e-01, 1.0794e-02,
        2.11

In [19]:
# Scratch cell: restart iteration, take the first batch and show the shape of
# its users concatenated with `cusers`.
# NOTE(review): depends on `cusers` from the batch-inspection cell, and this
# cell's execution count (19) is out of sequence with its neighbours.
user_iter = iter(test_loader)
users, items, labels = next(user_iter)   
torch.cat((cusers, users), 0).shape

torch.Size([202])

In [15]:


# Evaluate the loaded model on the test loader.
HR, NDCG, MRR = metrics(model, test_loader, args.top_k, device)

# Log the three metrics to TensorBoard at step args.epochs (default 10, the
# value that used to be hard-coded here).
# The tag interpolates args.top_k itself; the former `{args.top_k,}` formatted
# a one-element tuple and produced tags such as "HR@(10,)".
# NOTE(review): the "Perfomance" prefix is kept as-is so the tags still group
# with any existing runs logged under that spelling -- confirm before renaming.
writer.add_scalar(f'Perfomance/HR@{args.top_k}', HR, args.epochs)
writer.add_scalar(f'Perfomance/NDCG@{args.top_k}', NDCG, args.epochs)
writer.add_scalar(f'Perfomance/MRR@{args.top_k}', MRR, args.epochs)

# Wall-clock time since the model-loading cell set start_time.
elapsed_time = time.time() - start_time
print("The time elapse of epoch {:03d}".format(args.epochs) + " is: " +
        time.strftime("%H: %M: %S", time.gmtime(elapsed_time)))
# The third value is MRR; it used to be printed under a second "NDCG" label.
print("HR: {:.3f}\tNDCG: {:.3f}\tMRR: {:.3f}".format(HR, NDCG, MRR))


# Flush pending events and release the TensorBoard writer.
writer.close()

1983584


  3%|▎         | 51347/1983584 [01:09<43:38, 738.04it/s]


KeyboardInterrupt: 