In [35]:
import numpy as np 
import pandas as pd 
import scipy.sparse as sp

import torch.utils.data as data
import os
os.chdir('./')

import random as random

In [36]:
random.seed(0)

In [37]:
def load_all_custom(test_num=100):
	""" We load all the three file here to save time in each epoch. """
    
	'''train_data = pd.read_csv(
		'./data/train_df', header=None, names=['user', 'item'], 
		usecols=[0, 1], dtype={0: np.int32, 1: np.int32}) '''
	#train_data = pd.read_csv('./data/train_df')    
	#train_data = pd.read_csv('./data/synthetic/train_df')
	train_data = pd.read_csv('./train_df')    
    
	train_data = train_data[['uid', 'sid']]
	train_data['uid'] = train_data['uid'].apply(lambda x : int(x))
	train_data['sid'] = train_data['sid'].apply(lambda x : int(x))    
	train_data.columns = ['user', 'item']
    
	user_num = train_data['user'].max() + 1
	item_num = train_data['item'].max() + 1

	train_data = train_data.values.tolist()

	# load ratings as a dok matrix
	train_mat = sp.dok_matrix((user_num, item_num), dtype=np.float32)
	for x in train_data:
		train_mat[x[0], x[1]] = 1.0
	total_mat = train_mat
    
    
	'''test_data = []
	for user in list(range(6040)):
		tmp = test_df_true_neg[test_df_true_neg['user'] == user][['user', 'item', 'type']]
		tmp = tmp.values.tolist()
		test_data.append(tmp)
	'''

	return train_data, user_num, item_num, train_mat, total_mat

In [38]:
class BPRData(data.Dataset):
	def __init__(self, features, 
				num_item, train_mat=None, total_mat=None, num_ng=0, is_training=None, sample_mode = None):
		super(BPRData, self).__init__()
		""" Note that the labels are only useful when training, we thus 
			add them in the ng_sample() function.
		"""
		self.features = features
		self.num_item = num_item
		self.train_mat = train_mat
		self.total_mat = total_mat
		self.num_ng = num_ng
		self.is_training = is_training
		# self.labels = [0 for _ in range(len(features))]

	def ng_sample(self):

		if True:
			assert self.is_training, 'no need to sampling when testing'
			self.features_fill = []
			### self.features is train [user, pos item] list
			tmp = pd.DataFrame(self.features)
			tmp.columns = ['uid', 'sid']
            
			### [user pos] -> [user pos1 pos2] 
			### by groupby uid, then shuffling sid
			tmp = tmp.sort_values('uid')
			tmp_list = list(range(tmp.shape[0]))
			random.shuffle(tmp_list)
			tmp['rng'] = tmp_list
			sid2 = tmp.sort_values(['uid', 'rng']).sid
			tmp['sid2'] = sid2.reset_index().sid
			tmp = tmp[['uid', 'sid', 'sid2']]
			tmp = tmp.sort_index()
			self.features2 = tmp.values.tolist()         
                
		for x in self.features2:
			u, pos1, pos2 = x[0], x[1], x[2]
			for t in range(self.num_ng):
				if u == 0:
					neg1, neg2 = 199, 199                    
				elif u == 1:
					neg1, neg2 = 198, 199                    
				elif u <= 199:
					neg1, neg2 = np.random.randint(199-u + 1, 200, size = 2)                    
				elif u >= 200:
					neg1, neg2 = 199, 199                    
				else:
					# index가 998 이면 1 까지가 있고, 1번부터 999 까지 중에 하나
					# index가 997 이면 2 까지가 있고, 2번인덱스부터 999 까지 중에 하나
					# 즉 start index가 999 - u                    
					neg1, neg2 = np.random.randint(0, u-199, size = 2)
				self.features_fill.append([u, pos1, pos2, neg1, neg2])


	def __len__(self):
		return self.num_ng * len(self.features) if self.is_training \
					else len(self.features)

	def __getitem__(self, idx):
		features = self.features_fill if \
					self.is_training else self.features

		user = features[idx][0]
		item_i = features[idx][1]
		item_j = features[idx][2] if \
					self.is_training else features[idx][1]        
		return user, item_i, item_j

In [39]:
train_data, user_num, item_num, train_mat, total_mat = load_all_custom()

In [40]:
print('original user-pos tuple is')
train_data[0:10]

original user-pos tuple is


[[0, 0],
 [1, 0],
 [2, 0],
 [3, 0],
 [4, 0],
 [5, 0],
 [6, 0],
 [7, 0],
 [8, 0],
 [9, 0]]

In [41]:
train_dataset = BPRData(train_data, item_num, train_mat, total_mat, num_ng=1, is_training=True, sample_mode=None)

In [42]:
train_dataset.ng_sample()

In [43]:
negative_samples = train_dataset.features_fill

In [44]:
print('new (user, pos1, pos2, neg1, neg2) tuple is')
negative_samples[0:10]

new (user, pos1, pos2, neg1, neg2) tuple is


[[0, 0, 116, 199, 199],
 [1, 0, 88, 198, 199],
 [2, 0, 85, 198, 198],
 [3, 0, 40, 199, 197],
 [4, 0, 72, 196, 197],
 [5, 0, 193, 198, 195],
 [6, 0, 64, 195, 198],
 [7, 0, 9, 197, 196],
 [8, 0, 191, 194, 199],
 [9, 0, 190, 193, 194]]

In [45]:
tmp1 = np.array(negative_samples)[:, 1]
tmp2 = np.array(negative_samples)[:, 2]

In [46]:
print('ratio of pos1 > pos2')
print(np.mean(tmp1 > tmp2))

ratio of pos1 > pos2
0.49262981574539366


#### Generate Epoch Training Data for Faster, Reproducible Training

In [47]:
random.seed(0)

In [48]:
total_epochs = 10
num_ng = 3

In [49]:
import pickle
for i in range(total_epochs):
    print(i)
    train_list = []
    for j in range(num_ng):
        train_dataset.ng_sample()
        train_samples = train_dataset.features_fill
        train_list += train_samples
    with open(f'./train_samples/train_samples_{i}', 'wb') as fp:
        pickle.dump(train_list, fp)

0
1
2
3
4
5
6
7
8
9
