In [1]:
import sys
import pickle
import random
import numpy as np
from collections import defaultdict

In [2]:
# Put the parent directory on the module search path. Presumably this is what
# makes the project-local `carca` package importable in the next cell -- TODO confirm.
sys.path.append("..")

In [3]:
from carca.data import CARCADataset, load_attrs, load_ctx, load_profiles

In [4]:
def load_data(filename):
    """Load a pickled object from ``filename``, falling back to an empty list.

    Parameters
    ----------
    filename : str or path-like
        Path of the pickle file to read.

    Returns
    -------
    object
        The unpickled object, or ``[]`` when the file cannot be opened or
        cannot be unpickled.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code while loading; only point this
    at files from a trusted source.
    """
    try:
        with open(filename, "rb") as f:
            return pickle.load(f)
    except Exception:
        # Deliberate best-effort fallback. Catching ``Exception`` instead of
        # using a bare ``except:`` lets KeyboardInterrupt / SystemExit propagate,
        # so a long load can still be interrupted from the notebook.
        return []

In [5]:
def data_partition(fname, data_dir="../../data"):
    """Read a ``user item`` interaction file and split it per user.

    The file ``<data_dir>/<fname>.txt`` is expected to contain one
    space-separated ``user item`` integer pair per line. Items are kept per
    user in file order.

    Parameters
    ----------
    fname : str
        Dataset name; the file read is ``<data_dir>/<fname>.txt``.
    data_dir : str, optional
        Directory containing the interaction file. Defaults to the location
        that was previously hard-coded.

    Returns
    -------
    tuple
        ``(user_train, user_valid, user_test, usernum, itemnum)`` where the
        first three are dicts keyed by user id. For a user with at least three
        interactions the second-to-last item goes to ``user_valid``, the last
        item to ``user_test`` and the rest to ``user_train``; users with fewer
        interactions keep everything in ``user_train`` and get empty
        validation/test lists. ``usernum`` / ``itemnum`` are the largest user
        and item ids seen.

    Raises
    ------
    OSError
        If the interaction file cannot be opened.
    ValueError
        If a non-blank line is not exactly two space-separated integers.
    """
    usernum = 0
    itemnum = 0
    User = defaultdict(list)
    user_train = {}
    user_valid = {}
    user_test = {}
    # assume user/item index starting from 1
    # ``with`` guarantees the handle is closed (the file used to be left open).
    with open("%s/%s.txt" % (data_dir, fname), "r") as f:
        for line in f:
            line = line.rstrip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            u, i = line.split(" ")
            u = int(u)
            i = int(i)
            usernum = max(u, usernum)
            itemnum = max(i, itemnum)
            User[u].append(i)

    for user, items in User.items():
        if len(items) < 3:
            # Too short to hold out anything: everything is training data.
            user_train[user] = items
            user_valid[user] = []
            user_test[user] = []
        else:
            user_train[user] = items[:-2]
            user_valid[user] = [items[-2]]
            user_test[user] = [items[-1]]
    return (user_train, user_valid, user_test, usernum, itemnum)

In [6]:
def get_ItemDataGames(itemnum):
    """Load the video-games item attribute matrix with a zero row prepended.

    The matrix read from ``../../data/video_games_attrs.dat`` gets one all-zero
    row stacked on top, so row 0 of the result is zeros and original row ``k``
    ends up at row ``k + 1``. ``itemnum`` is accepted but not used.
    """
    raw_features = load_data("../../data/video_games_attrs.dat")
    feature_dim = raw_features.shape[1]
    zero_row = np.zeros(feature_dim)
    return np.vstack((zero_row, raw_features))

In [7]:
def random_neq(l, r, s):
    """Draw a random integer from ``[l, r)`` that is not a member of ``s``.

    Candidates are drawn with ``np.random.randint`` and redrawn until one falls
    outside ``s``; the loop never ends if ``s`` covers the whole range.
    """
    while True:
        candidate = np.random.randint(l, r)
        if candidate not in s:
            return candidate

In [8]:
def sample_function(user_train, usernum, itemnum, cxtdict, cxtsize, maxlen):
    """Draw one random training sample (reference implementation).

    A user id is drawn uniformly from ``[1, usernum]`` and redrawn until the
    user has at least two training items. The user's history is then walked
    from newest to oldest, filling right-aligned arrays of length ``maxlen``
    (unused leading slots stay zero):

    * ``seq``  - the input item at each step,
    * ``pos``  - the item that followed it,
    * ``neg``  - a random item id in ``[1, itemnum]`` not in the user's history
      (left at 0 when the positive item id is 0),
    * ``seqcxt`` / ``poscxt`` - ``cxtdict[(user, item)]`` for the input and the
      positive item,
    * ``negcxt`` - the same context as ``poscxt`` (the negative item reuses the
      positive item's context).

    Returns
    -------
    tuple
        ``(uid, seq, pos, neg, seqcxt, poscxt, negcxt)`` where ``uid`` is a
        length-``maxlen`` float array filled with the user id.
    """
    # Rejection-sample a user that has more than one training item.
    while True:
        user = np.random.randint(1, usernum + 1)
        if len(user_train[user]) > 1:
            break

    history = user_train[user]
    seen_items = set(history)

    seq, pos, neg = (np.zeros([maxlen], dtype=np.int32) for _ in range(3))
    seqcxt, poscxt, negcxt = (
        np.zeros([maxlen, cxtsize], dtype=np.float32) for _ in range(3)
    )

    target = history[-1]
    for step, item in enumerate(reversed(history[:-1])):
        slot = maxlen - 1 - step  # fill from the right-hand end backwards

        seq[slot] = item
        pos[slot] = target
        if target != 0:
            neg[slot] = random_neq(1, itemnum + 1, seen_items)

        seqcxt[slot] = cxtdict[(user, item)]
        target_cxt = cxtdict[(user, target)]
        poscxt[slot] = target_cxt
        negcxt[slot] = target_cxt

        target = item
        if slot == 0:
            # Window is full; older history is dropped.
            break

    return (np.ones(maxlen) * user, seq, pos, neg, seqcxt, poscxt, negcxt)

In [19]:
def test_sample(train, valid, test, usernum, itemnum, seq_len, cxtdict, cxtsize, negnum=100):
    u = np.random.randint(1, usernum + 1)
    while len(train[u]) < 1 or len(test[u]) < 1:
        u = np.random.randint(1, usernum + 1)

    seq = np.zeros([seq_len], dtype=np.int32)
    seqcxt = np.zeros([seq_len, cxtsize], dtype=np.float32)
    testitemscxt = list()
    idx = seq_len - 1
    seq[idx] = valid[u][0]
    # Cxt
    seqcxt[idx] = cxtdict[(u, valid[u][0])]

    idx -= 1
    for i in reversed(train[u]):
        seq[idx] = i
        # Cxt
        seqcxt[idx] = cxtdict[(u, i)]

        idx -= 1
        if idx == -1:
            break

    rated = set(train[u])
    rated.add(0)
    item_idx = [test[u][0]]
    testitemscxt.append(cxtdict[(u, test[u][0])])
    for _ in range(negnum):
        t = np.random.randint(1, itemnum + 1)
        while t in rated:
            t = np.random.randint(1, itemnum + 1)
        item_idx.append(t)
        testitemscxt.append(cxtdict[(u, test[u][0])])
    
    return np.ones(seq_len) * u, seq, item_idx, seqcxt, np.array(testitemscxt)

In [20]:
def val_sample(train, valid, usernum, itemnum, seq_len, cxtdict, cxtsize, negnum=100):
    """Draw one random validation-time sample (reference implementation).

    A user id is drawn uniformly from ``[1, usernum]`` and redrawn until the
    user has at least one training item and one validation item. The user's
    training items are written newest-first into the right-hand end of a
    ``seq_len``-long sequence, together with their contexts.

    The candidate list starts with the user's validation item followed by
    ``negnum`` random item ids from ``[1, itemnum]`` that are not in the user's
    training set. Every candidate row in the returned context array is the
    context of the validation item.

    Returns
    -------
    tuple
        ``(uid, seq, item_idx, seqcxt, candidate_cxt)`` where ``uid`` is a
        length-``seq_len`` float array filled with the user id, ``item_idx`` is
        a Python list, and ``candidate_cxt`` is ``np.array`` of the per-candidate
        contexts.
    """
    # Rejection-sample an evaluable user.
    while True:
        u = np.random.randint(1, usernum + 1)
        if len(train[u]) >= 1 and len(valid[u]) >= 1:
            break

    seq = np.zeros([seq_len], dtype=np.int32)
    seqcxt = np.zeros([seq_len, cxtsize], dtype=np.float32)

    for step, item in enumerate(reversed(train[u])):
        slot = seq_len - 1 - step
        seq[slot] = item
        seqcxt[slot] = cxtdict[(u, item)]
        if slot == 0:
            break

    rated = set(train[u])
    rated.add(0)

    valid_item = valid[u][0]
    valid_item_cxt = cxtdict[(u, valid_item)]
    item_idx = [valid_item]
    testitemscxt = [valid_item_cxt]
    for _ in range(negnum):
        while True:
            candidate = np.random.randint(1, itemnum + 1)
            if candidate not in rated:
                break
        item_idx.append(candidate)
        # Negatives reuse the validation item's context.
        testitemscxt.append(valid_item_cxt)

    return np.ones(seq_len) * u, seq, item_idx, seqcxt, np.array(testitemscxt)

In [9]:
# Widen NumPy's print line width so long arrays are not wrapped when inspected.
np.set_printoptions(linewidth=500)

In [10]:
# Profile (history) length: handed to the reference samplers as the sequence
# length and to CARCADataset as `profile_seq_len`.
p_seq_len = 50
# Target length: handed to CARCADataset as `target_seq_len` and to the
# reference eval samplers as the number of negatives (`negnum`).
t_seq_len = 100

In [11]:
# Reference train/valid/test split of the video-games interactions, plus the
# largest user id and item id found in the file.
user_train, user_valid, user_test, usernum, itemnum = data_partition("video_games")

In [12]:
# Reference item attribute matrix (with a zero row prepended at index 0).
ItemFeatures = get_ItemDataGames(itemnum)
# Reference context lookup; the samplers index it with (user, item) tuples.
# NOTE: load_data returns [] if the file cannot be read.
CXTDict = load_data("../../data/video_games_ctx.dat")

In [13]:
# Smoke test: draw a single reference training sample (context size 6,
# sequence length p_seq_len).
uid, seq, pos, neg, seqcxt, poscxt, negcxt = sample_function(user_train, usernum, itemnum, CXTDict, 6, p_seq_len)

In [14]:
# Load the same dataset through the project's own loaders (carca.data); these
# are the inputs of the CARCADataset instances checked against the reference.
attrs = load_attrs("video_games")
ctx = load_ctx("video_games")
user_ids, item_ids, profiles = load_profiles("video_games")

In [15]:
# Number of items plus one -- presumably the extra slot is the padding id 0; TODO confirm.
n_items = len(item_ids) + 1
# Size of the first axis of one (arbitrary) context entry.
n_ctx = next(iter(ctx.values())).shape[0]
# Number of columns of the attribute matrix.
n_attrs = attrs.shape[1]

In [16]:
# All three splits are built from identical inputs; only `mode` differs.
dataset_kwargs = dict(
    user_ids=user_ids,
    item_ids=item_ids,
    profiles=profiles,
    attrs=attrs,
    ctx=ctx,
    profile_seq_len=p_seq_len,
    target_seq_len=t_seq_len,
)
train_data, val_data, test_data = (
    CARCADataset(mode=split, **dataset_kwargs) for split in ("train", "val", "test")
)

In [17]:
# Compare CARCADataset (train mode) against the reference `sample_function`
# on 10,000 randomly drawn users.
eps = 1e-4
# Counter of samples that passed every check. It used to be incremented without
# ever being initialised, which raises NameError on a fresh kernel.
checked = 0

for i in range(10_000):
    # Use p_seq_len (not a literal) so the reference sample always has the same
    # length as the dataset's profile sequence.
    uid, seq, pos, neg, seqcxt, poscxt, negcxt = sample_function(user_train, usernum, itemnum, CXTDict, 6, p_seq_len)
    idx = train_data.user_ids.index(int(uid[0]))

    p_x, p_q, o_x, o_q, y_true = train_data[idx]
    # The target arrays are split into two equal halves: positives, then negatives.
    pos_x, neg_x = np.split(o_x, 2)
    pos_q, neg_q = np.split(o_q, 2)

    # Feature rows are compared as [attributes | 6 context columns].
    p_a, p_c = p_q[:, :-6], p_q[:, -6:]
    pos_a, pos_c = pos_q[:, :-6], pos_q[:, -6:]
    neg_a, neg_c = neg_q[:, :-6], neg_q[:, -6:]

    assert np.all(p_x == seq), f"Profile sequence, UID: {int(uid[0])}"
    assert np.all(pos_x == pos), f"Target sequence, UID: {int(uid[0])}"

    assert np.all(seqcxt == p_c), f"Profile ctx, UID: {int(uid[0])}"
    assert np.all(poscxt == pos_c), f"Target positive ctx, UID: {int(uid[0])}"
    # The reference sampler gives negatives the positive item's context.
    assert np.all(negcxt == neg_c), f"Target negative ctx, UID: {int(uid[0])}"

    # Negative item ids/attributes are random on both sides, so only the profile
    # and positive attributes are compared.
    assert np.all(np.abs(ItemFeatures[seq] - p_a) < eps), f"Profile attrs, UID: {int(uid[0])}"
    assert np.all(np.abs(ItemFeatures[pos] - pos_a) < eps), f"Target attrs, UID: {int(uid[0])}"

    checked += 1

print(checked)

10000


In [21]:
# Compare CARCADataset (test mode) against the reference `test_sample`
# on 10,000 randomly drawn users.
eps = 1e-4

for i in range(10_000):
    # Reference sample: p_seq_len history slots, t_seq_len negatives, context size 6.
    uid, seq, target, seqcxt, targetcxt = test_sample(user_train, user_valid, user_test, usernum, itemnum, p_seq_len, CXTDict, 6, negnum=t_seq_len)
    # Position of the sampled user in the dataset; raises ValueError if absent.
    idx = test_data.user_ids.index(int(uid[0]))

    # Disabled fallback that skips users missing from the dataset.
    # NOTE: `not_checked` is only defined in a later cell, so it would have to
    # be initialised before re-enabling this.
    # try:
    #     idx = test_data.user_ids.index(int(uid[0]))
    # except:
    #     not_checked.append(int(uid[0]))
    #     continue

    p_x, p_q, o_x, o_q, y_true = test_data[idx]
    # Feature rows are compared as [attributes | 6 context columns].
    p_a, p_c = p_q[:, :-6], p_q[:, -6:]
    # Only row 0 (the true test item) has deterministic attributes; contexts
    # are compared for every candidate row.
    t_a, t_c = o_q[0, :-6], o_q[:, -6:]

    assert np.all(p_x == seq), f"Profile sequence, UID: {int(uid[0])}"
    # Negatives are random on the reference side, so only the first candidate is compared.
    assert target[0] == o_x[0], f"Target sequence, UID: {int(uid[0])}"

    assert np.all(seqcxt == p_c), f"Profile ctx, UID: {int(uid[0])}"
    assert np.all(np.abs(targetcxt - t_c) < eps), f"Target ctx, UID: {int(uid[0])}"

    assert np.all(np.abs(ItemFeatures[seq] - p_a) < eps), f"Profile attrs, UID: {int(uid[0])}"
    assert np.all(np.abs(ItemFeatures[target[0]] - t_a) < eps), f"Target attrs, UID: {int(uid[0])}"

In [22]:
# Compare CARCADataset (val mode) against the reference `val_sample`
# on 10,000 randomly drawn users.
# Only used by the disabled fallback below (collects user ids that were skipped).
not_checked = []
eps = 1e-4

for i in range(10_000):
    # Reference sample: p_seq_len history slots, t_seq_len negatives, context size 6.
    uid, seq, target, seqcxt, targetcxt = val_sample(user_train, user_valid, usernum, itemnum, p_seq_len, CXTDict, 6, negnum=t_seq_len)
    # Position of the sampled user in the dataset; raises ValueError if absent.
    idx = val_data.user_ids.index(int(uid[0]))

    # Disabled fallback that skips users missing from the dataset.
    # try:
    #     idx = val_data.user_ids.index(int(uid[0]))
    # except:
    #     not_checked.append(int(uid[0]))
    #     continue

    p_x, p_q, o_x, o_q, y_true = val_data[idx]
    # Feature rows are compared as [attributes | 6 context columns].
    p_a, p_c = p_q[:, :-6], p_q[:, -6:]
    # Only row 0 (the true validation item) has deterministic attributes;
    # contexts are compared for every candidate row.
    t_a, t_c = o_q[0, :-6], o_q[:, -6:]

    assert np.all(p_x == seq), f"Profile sequence, UID: {int(uid[0])}"
    # Negatives are random on the reference side, so only the first candidate is compared.
    assert target[0] == o_x[0], f"Target sequence, UID: {int(uid[0])}"

    assert np.all(seqcxt == p_c), f"Profile ctx, UID: {int(uid[0])}"
    assert np.all(np.abs(targetcxt - t_c) < eps), f"Target ctx, UID: {int(uid[0])}"

    assert np.all(np.abs(ItemFeatures[seq] - p_a) < eps), f"Profile attrs, UID: {int(uid[0])}"
    assert np.all(np.abs(ItemFeatures[target[0]] - t_a) < eps), f"Target attrs, UID: {int(uid[0])}"