In [1]:
import os, sys
dir2 = os.path.abspath('')
dir1 = os.path.dirname(dir2)
if not dir1 in sys.path: sys.path.append(dir1)

In [2]:
import os
import time
import random
import argparse
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from tqdm import tqdm
import re
from tensorboardX import SummaryWriter
from metrics.metrics import metrics
from architectures.BertMF.bert_mf import BertMF
from loaders.create_dataloader import CreateDataloader

### Setting Arguments

Here is the brief description of important ones:
- Learning rate is 0.001
- Dropout rate is 0.2
- Running for 10 epochs
- HitRate@10 and NDCG@10
- 4 negative samples for each positive one

In [3]:
DATASET_NAME = 'ml-1m'
MODEL_NAME = 'BertMF'
DATASET_FILE = 'ratings.dat'
TEXT_INFO_FILE = 'movies.dat'
MAIN_PATH = f'../data/{DATASET_NAME}/'
DATA_PATH = MAIN_PATH + DATASET_FILE
TEXT_INFO_PATH = MAIN_PATH + TEXT_INFO_FILE
DATASET_NAME = 'ml-1m'
MODEL_NAME = 'BertMF'
MODEL_PATH = f'../models/{DATASET_NAME}/'
MODEL = f'{DATASET_NAME}-{MODEL_NAME}'

In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

In [5]:
parser = argparse.ArgumentParser()
parser.add_argument("--seed",
	type=int,
	default=42,
	help="Seed")
parser.add_argument("--lr",
	type=float,
	default=0.001,
	help="learning rate")
parser.add_argument("--dropout",
	type=float,
	default=0.2,
	help="dropout rate")
parser.add_argument("--batch_size",
	type=int,
	default=256,
	help="batch size for training")
parser.add_argument("--epochs",
	type=int,
	default=10,
	help="training epoches")
parser.add_argument("--top_k",
	type=int,
	default=10,
	help="compute metrics@top_k")
parser.add_argument("--factor_num",
	type=int,
	default=32,
	help="predictive factors numbers in the model")
parser.add_argument("--layers",
    nargs='+',
    default=[64,32,16,8],
    help="MLP layers. Note that the first layer is the concatenation of user \
    and item embeddings. So layers[0]/2 is the embedding size.")
parser.add_argument("--num_ng",
	type=int,
	default=4,
	help="Number of negative samples for training set")
parser.add_argument("--num_ng_test",
	type=int,
	default=100,
	help="Number of negative samples for test set")
parser.add_argument("--out",
	default=True,
	help="save model or not")

_StoreAction(option_strings=['--out'], dest='out', nargs=None, const=None, default=True, type=None, choices=None, help='save model or not', metavar=None)

## Preparing text tokenization

In [6]:
text_info = pd.read_csv(
	TEXT_INFO_PATH,
	sep="::",
	names = ['item_id', 'name', 'genres'],
	engine='python',
    encoding='latin-1')

text_info['name'] = text_info['name'].apply(lambda x: 'Name: ' + x)
text_info['genres'] = text_info['genres'].apply(lambda x: 'Genres: ' + re.sub(r'\|', ', ', x))
text_info['text'] = (text_info['name'] + ' ' + text_info['genres']).astype("string")

text_info = text_info.drop('name', axis=1)
text_info = text_info.drop('genres', axis=1)
text_info['text']

0       Name: Toy Story (1995) Genres: Animation, Chil...
1       Name: Jumanji (1995) Genres: Adventure, Childr...
2       Name: Grumpier Old Men (1995) Genres: Comedy, ...
3       Name: Waiting to Exhale (1995) Genres: Comedy,...
4       Name: Father of the Bride Part II (1995) Genre...
                              ...                        
3878         Name: Meet the Parents (2000) Genres: Comedy
3879       Name: Requiem for a Dream (2000) Genres: Drama
3880                 Name: Tigerland (2000) Genres: Drama
3881          Name: Two Family House (2000) Genres: Drama
3882    Name: Contender, The (2000) Genres: Drama, Thr...
Name: text, Length: 3883, dtype: string

In [7]:
text_info.iloc[0]['text']

"Name: Toy Story (1995) Genres: Animation, Children's, Comedy"

## Training BertMF Model

In [8]:
# set device and parameters
args = parser.parse_args("")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
writer = SummaryWriter()

# seed for Reproducibility
seed_everything(args.seed)

# load data
ml_1m = pd.read_csv(
	DATA_PATH,
	sep="::",
	names = ['user_id', 'item_id', 'rating', 'timestamp'],
	engine='python')

ml_1m = ml_1m.merge(text_info, how='left', on='item_id')

# set the num_users, items
num_users = ml_1m['user_id'].nunique()+1
num_items = ml_1m['item_id'].nunique()+1

ml_1m.head()

Unnamed: 0,user_id,item_id,rating,timestamp,text
0,1,1193,5,978300760,Name: One Flew Over the Cuckoo's Nest (1975) G...
1,1,661,3,978302109,Name: James and the Giant Peach (1996) Genres:...
2,1,914,3,978301968,"Name: My Fair Lady (1964) Genres: Musical, Rom..."
3,1,3408,4,978300275,Name: Erin Brockovich (2000) Genres: Drama
4,1,2355,5,978824291,"Name: Bug's Life, A (1998) Genres: Animation, ..."


In [9]:
def _reindex(ratings):
    """
    Process dataset to reindex userID and itemID, also set rating as binary feedback
    """
    user_list = list(ratings['user_id'].drop_duplicates())
    user2id = {w: i for i, w in enumerate(user_list)}

    item_list = list(ratings['item_id'].drop_duplicates())
    item2id = {w: i for i, w in enumerate(item_list)}

    ratings['user_id'] = ratings['user_id'].apply(lambda x: user2id[x])
    ratings['item_id'] = ratings['item_id'].apply(lambda x: item2id[x])
    ratings['rating'] = ratings['rating'].apply(lambda x: float(x > 0))
    return ratings

def _leave_one_out(ratings):
    """
    leave-one-out evaluation protocol in paper https://www.comp.nus.edu.sg/~xiangnan/papers/ncf.pdf
    """
    ratings['rank_latest'] = ratings.groupby(['user_id'])['timestamp'].rank(method='first', ascending=False)
    test = ratings.loc[ratings['rank_latest'] == 1]
    train = ratings.loc[ratings['rank_latest'] > 1]
    assert train['user_id'].nunique()==test['user_id'].nunique(), 'Not Match Train User with Test User'
    return train[['user_id', 'item_id', 'text', 'rating']], test[['user_id', 'item_id', 'text','rating']]

In [10]:
ml_1m = _reindex(ml_1m)
train_ml_1m, test_ml_1m = _leave_one_out(ml_1m)
train_ml_1m.shape, test_ml_1m.shape, ml_1m.shape

((994169, 4), (6040, 4), (1000209, 6))

In [11]:
texts = ml_1m[['item_id', 'text']].drop_duplicates(subset=['item_id'])
texts.set_index('item_id', inplace=True)
texts

Unnamed: 0_level_0,text
item_id,Unnamed: 1_level_1
0,Name: One Flew Over the Cuckoo's Nest (1975) G...
1,Name: James and the Giant Peach (1996) Genres:...
2,"Name: My Fair Lady (1964) Genres: Musical, Rom..."
3,Name: Erin Brockovich (2000) Genres: Drama
4,"Name: Bug's Life, A (1998) Genres: Animation, ..."
...,...
3701,Name: Modulations (1998) Genres: Documentary
3702,Name: Broken Vessels (1998) Genres: Drama
3703,Name: White Boys (1999) Genres: Drama
3704,"Name: One Little Indian (1973) Genres: Comedy,..."


In [12]:
texts.iloc[2981]['text']

'Name: Cross of Iron (1977) Genres: War'

In [13]:
# construct the train and test datasets
data = CreateDataloader(args, train_ml_1m, test_ml_1m, MAIN_PATH, True, texts=texts)
train_loader = data.get_train_instance()
test_loader = data.get_test_instance()

# set model and loss, optimizer
model = BertMF(args, num_users, num_items)
model = model.to(device)
print(device)
loss_function = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=args.lr)

(994169, 4) (6040, 4) (1000209, 4)


Using cache found in /home/rctejon/.cache/torch/hub/huggingface_pytorch-transformers_main
  from .autonotebook import tqdm as notebook_tqdm
Using cache found in /home/rctejon/.cache/torch/hub/huggingface_pytorch-transformers_main


cuda:0


In [14]:
a = next(iter(test_loader))   
a[3], len(a[3])

(["Name: Pocahontas (1995) Genres: Animation, Children's, Musical, Romance",
  "Name: Herbie Goes to Monte Carlo (1977) Genres: Adventure, Children's, Comedy",
  'Name: Grease 2 (1982) Genres: Comedy, Musical, Romance',
  'Name: Mission, The (1986) Genres: Drama',
  'Name: Queen Margot (La Reine Margot) (1994) Genres: Drama, Romance',
  'Name: Jail Bait (1954) Genres: Crime, Drama',
  'Name: Happy, Texas (1999) Genres: Comedy',
  'Name: Battle for the Planet of the Apes (1973) Genres: Action, Sci-Fi',
  "Name: Adventures of Milo and Otis, The (1986) Genres: Children's",
  'Name: Alien³ (1992) Genres: Action, Horror, Sci-Fi, Thriller',
  'Name: Rosetta (1999) Genres: Drama',
  'Name: Mouth to Mouth (Boca a boca) (1995) Genres: Comedy',
  'Name: Midaq Alley (Callejón de los milagros, El) (1995) Genres: Drama',
  'Name: Homegrown (1998) Genres: Comedy, Thriller',
  'Name: Airport 1975 (1974) Genres: Drama',
  'Name: Meet Wally Sparks (1997) Genres: Comedy',
  'Name: Crimes and Misdemeanor

In [15]:

# train, evaluation
best_hr = 0
for epoch in range(1, 5+1):
	model.train() # Enable dropout (if have).
	start_time = time.time()

	for user, item, label, text in tqdm(train_loader, total=len(train_loader)):
		user = user.to(device)
		item = item.to(device)
		label = label.to(device)
		# print(user.shape, item.shape, len(tokenization), len(masks), label.shape)

		optimizer.zero_grad()
		prediction = model(user, item, text)
		loss = loss_function(prediction, label)
		loss.backward()
		optimizer.step()

	model.eval()
	HR, NDCG, MRR, RECALL, PRECISION = metrics(model, test_loader, args.top_k, device, args.num_ng_test)

	elapsed_time = time.time() - start_time
	print("The time elapse of epoch {:03d}".format(epoch) + " is: " +
			time.strftime("%H: %M: %S", time.gmtime(elapsed_time)))
	print("HR: {:.3f}\tNDCG: {:.3f}\tMRR: {:.3f}".format(np.mean(HR), np.mean(NDCG), np.mean(MRR)))

	if HR > best_hr:
		best_hr, best_ndcg, best_mrr, best_epoch = HR, NDCG, MRR, epoch
		if args.out:
			if not os.path.exists(MODEL_PATH):
				os.mkdir(MODEL_PATH)
			torch.save(model,
				'{}{}.pth'.format(MODEL_PATH, MODEL))

writer.close()

100%|██████████| 19418/19418 [3:24:10<00:00,  1.59it/s]  
  0%|          | 0/6040 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got 

The time elapse of epoch 001 is: 03: 47: 49
HR: 0.666	NDCG: 0.238	MRR: 0.671


  0%|          | 0/19418 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disablin

The time elapse of epoch 002 is: 03: 46: 17
HR: 0.948	NDCG: 0.338	MRR: 0.948


  0%|          | 0/19418 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disablin

The time elapse of epoch 003 is: 03: 48: 47
HR: 0.975	NDCG: 0.347	MRR: 0.975


  0%|          | 0/19418 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disablin

The time elapse of epoch 004 is: 03: 50: 38
HR: 0.978	NDCG: 0.348	MRR: 0.978


  0%|          | 0/19418 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disablin

The time elapse of epoch 005 is: 03: 51: 22
HR: 0.976	NDCG: 0.348	MRR: 0.976


## Final Output

In [16]:
print("Best epoch {:03d}: HR@10 = {:.3f}, NDCG@10 = {:.3f}, MRR@10 = {:.3f}".format(
									best_epoch, best_hr, best_ndcg))

IndexError: Replacement index 3 out of range for positional args tuple