### Data evaluation (mainly sampling)

In [6]:
# Name of the dataset; later cells pass it to datapreproces().
dataset_name = "Beauty"
# Progress bar used by the sampler loops further down.
from tqdm import tqdm
# Project modules, star-imported for interactive use. Names such as `np`,
# `copy`, `datapreproces`, `WarpSampler`, `PytorchSampler`, `check_device`
# and `metrics_computation` are never imported explicitly in this notebook,
# so they presumably come from these wildcard imports -- TODO confirm.
from src.data_preprocessing import *
from src.config import *
from src.train import *
from src.sampler import *
from src.utils import *

In [7]:
def random_neq(l, r, s):
    """Draw a random integer from ``[l, r)`` that is not contained in ``s``.

    Candidates are drawn with ``np.random.randint(l, r)`` (upper bound
    exclusive) and rejected for as long as they are members of ``s``.
    The call never returns if ``s`` contains every integer of the range.
    """
    while True:
        candidate = np.random.randint(l, r)
        if candidate not in s:
            return candidate

In [10]:
def evaluate_valid(model, dataset, args, cxtdict, cxtsize):
    """Run ``validation_sample`` on the validation split of ``dataset``.

    ``dataset`` is deep-copied and unpacked into five elements: the train
    split, the validation split, a third element that is ignored here, the
    number of users and the number of items. The metric accumulators
    ``(NDCG, HT, valid_user, Auc)`` start at zero and the case label passed
    on is ``"validation"``.

    :return: whatever ``validation_sample`` returns for the validation case
    """
    train_split, valid_split, _unused, user_count, item_count = copy.deepcopy(dataset)
    # Accumulators in the order (NDCG, HT, valid_user, Auc).
    zeroed_metrics = (0.0, 0.0, 0.0, 0.0)
    return validation_sample(
        train_split, args, valid_split, user_count, item_count,
        cxtdict, cxtsize, zeroed_metrics, "validation", model,
    )


def validation_sample(train, args, validation, users_total_num, items_total_num, cxtdict,
                      cxtsize, metrics, case, model, negnum=100):
    """
    Build one ranking example per evaluated user and accumulate the metrics.

    For every evaluated user the training history is written into a
    zero-padded sequence of length ``args.maxlen`` (filled from the last
    position backwards), the held-out item ``validation[u][0]`` is placed
    first in the candidate list, ``negnum`` sampled negatives follow it, and
    the example is handed to ``metrics_computation``.

    :param train: reference dataset, Dict[int:List[int]]
    :param args: Args object with the configuration (``maxlen`` is read here)
    :param validation: validation/test dataset, Dict[int:List[int]]
    :param users_total_num: number of users
    :param items_total_num: number of items
    :param cxtdict: Dict[tuple: List[float]], keyed by (user, item)
    :param cxtsize: number of context features (amazon 6)
    :param metrics: running totals (NDCG, HT, valid_user, Auc)
    :param case: if it is validation or test, str
    :param model: model forwarded to ``metrics_computation``
    :param negnum: number of negative examples to be created
    :return: (NDCG, HT, Auc), each divided by the number of valid users;
             (0.0, 0.0, 0.0) when no user could be evaluated
    """
    # users to be used: 10000 random draws (with replacement) when there are
    # more than 10000 users, otherwise every user id once
    list_users = (np.random.randint(1, users_total_num + 1) for _ in range(10000)) if users_total_num > 10000 \
        else range(1, users_total_num + 1)

    for u in list_users:
        # Skip when it is too short
        if len(train[u]) < 1 or len(validation[u]) < 1:
            continue

        target = validation[u][0]
        target_cxt = cxtdict[(u, target)]

        # Init sequences with padding. The context buffer is float32 because
        # the context values are floats; an int32 buffer would truncate them
        # while the candidate contexts below keep their raw values.
        seq = np.zeros([args.maxlen], dtype=np.int32)
        seqcxt = np.zeros([args.maxlen, cxtsize], dtype=np.float32)
        idx = args.maxlen - 1

        # For every case other than "validation" the first item of
        # `validation[u]` is appended to the end of the input sequence.
        # NOTE(review): that item is also the ranking target below -- confirm
        # callers pass the intended split for the test case.
        if case.lower() != "validation":
            seq[idx] = target
            seqcxt[idx] = target_cxt
            idx -= 1

        for i in reversed(train[u]):
            # Stop once the buffer is full; checking before the write also
            # covers the case where the slot above used the only position.
            if idx < 0:
                break
            # MAIN
            seq[idx] = i
            # CXT
            seqcxt[idx] = cxtdict[(u, i)]
            idx -= 1

        # Items that must not be drawn as negatives (0 is the padding id)
        rated = set(train[u])
        rated.add(0)

        # Candidate list: the target first, then the sampled negatives.
        # Every candidate carries the target's context, as in the original.
        item_idx = [target]
        testitemscxt = [target_cxt]
        for _ in range(negnum):
            item_idx.append(random_neq(1, items_total_num + 1, rated))
            testitemscxt.append(target_cxt)

        # Score the complete candidate list once per user and carry the
        # totals forward to the next user.
        # Assumes metrics_computation returns the running
        # (NDCG, HT, valid_user, Auc) updated with this user -- TODO confirm.
        metrics = metrics_computation(model, metrics, u, seq, item_idx, seqcxt, testitemscxt)

    NDCG, HT, valid_user, Auc = metrics
    if not valid_user:
        # No user had both history and a held-out item: avoid dividing by zero.
        return 0.0, 0.0, 0.0
    return NDCG / valid_user, HT / valid_user, Auc / valid_user


In [11]:
# Load and preprocess the dataset selected by `dataset_name`.
(data, num_batch, args,
 ItemFeatures, CXTDict, UserFeatures) = datapreproces(dataset_name)
# `data` bundles the three per-user splits and the user/item counts.
user_train, user_valid, user_test, users_total_num, items_total_num = data

 The dataset Beauty contains 52204 users and 57289 items in total
average sequence length: {5.63}
ItemFeatures DF dimensions (57290, 6507)


In [12]:
# Inputs for the manual, single-user walk-through of the evaluation sampling.
train = user_train
dataset_to_compare = user_valid
cxtdict = CXTDict
cxtsize = args.cxt_size
negnum = 100
# Any value other than "validation" makes the walk-through cell place the
# held-out item at the end of the input sequence.
case = "test"

In [14]:
# Users to walk through: 10000 lazily drawn random ids when the dataset has
# more than 10000 users, otherwise every user id from 1 to users_total_num.
if users_total_num > 10000:
    list_users = (np.random.randint(1, users_total_num + 1) for _ in range(10000))
else:
    list_users = range(1, users_total_num + 1)

In [20]:
for u in list_users:
    # Debug override: whatever id the iterator yields, inspect user 34942.
    u = 34942
    # Nothing to inspect unless the user has history and a held-out item.
    if len(train[u]) == 0 or len(dataset_to_compare[u]) == 0:
        continue

    # Zero-filled buffers, written from the last slot backwards below.
    seq = np.zeros(args.maxlen, dtype=np.int32)
    seqcxt = np.zeros((args.maxlen, cxtsize), dtype=np.int32)

    # Context rows of the candidate items; stays empty while the candidate
    # construction at the bottom of this cell is disabled.
    testitemscxt = []
    print(dataset_to_compare[u])
    idx = args.maxlen - 1
    print("idx", idx)

    # Outside the "validation" case the held-out item takes the last slot.
    if case.lower() != "validation":
        seq[idx] = dataset_to_compare[u][0]
        seqcxt[idx] = cxtdict[(u, dataset_to_compare[u][0])]
        idx -= 1

    # Copy the training history, most recent interaction first, until the
    # buffer is full.
    for i in reversed(train[u]):
        seq[idx] = i
        seqcxt[idx] = cxtdict[(u, i)]
        idx -= 1
        if idx == -1:
            break

    # Items excluded from negative sampling (0 is the padding id).
    rated = set(train[u])
    rated.add(0)
    print("rated", rated)
    item_idx = [dataset_to_compare[u][0]]
    # Candidate construction, disabled for this walk-through:
    # testitemscxt.append(cxtdict[(u, dataset_to_compare[u][0])])
    # for _ in range(negnum):
    #     t = np.random.randint(1, items_total_num + 1)
    #     while t in rated: t = np.random.randint(1, items_total_num + 1)
    #     item_idx.append(t)
    #     testitemscxt.append(cxtdict[(u, dataset_to_compare[u][0])])
    # Only one user is inspected.
    break

[21853]
idx 74
rated {0, 16616, 19412, 15705, 11259}


#### SAMPLERS AND DATALOADERS

In [59]:
# Reload the preprocessed dataset before exercising the samplers.
(data, num_batch, args,
 ItemFeatures, CXTDict, UserFeatures) = datapreproces(dataset_name)
# `data` bundles the three per-user splits and the user/item counts.
user_train, user_valid, user_test, users_total_num, items_total_num = data

 The dataset Beauty contains 52204 users and 57289 items in total
average sequence length: {5.63}
ItemFeatures DF dimensions (57290, 6507)


In [60]:
# Display the dataset name stored on the configuration object.
args.dataset

'Beauty'

In [11]:
sampler = WarpSampler(
    user_train, users_total_num, args, items_total_num, CXTDict
)
# Old case with numpy: pull `num_batch` batches from the sampler and count
# how many were produced.
counter = 0
for step in tqdm(range(int(num_batch)), total=int(num_batch),
                 ncols=70, leave=False, unit='b'):
    # Each batch carries users, sequences, positives, negatives and the
    # context tensors for the sequence, positive and negative items.
    u, seq, pos, neg, seqcxt, poscxt, negcxt = sampler.next_batch()
    counter += 1

                                                                      

In [49]:
device = check_device()
Psampler = PytorchSampler(
    users_total_num, items_total_num, user_train,
    CXTDict, args, device,
    SEED=42, reverse=True, mask_user=False, only_finals=True,
)
# The original note calls this the "old case with tensorflow", although the
# sampler built here is PytorchSampler -- TODO confirm which one is meant.
# Batches are fetched by index rather than through next_batch().
counter = 0
for step in tqdm(range(int(num_batch)), total=int(num_batch),
                 ncols=70, leave=False, unit='b'):
    u, seq, pos, neg, seqcxt, poscxt, negcxt = Psampler[step]
    counter += 1

iterations_num 1469053


                                                                      

KeyboardInterrupt: 