In [174]:
import numpy as np
# Seed NumPy's global RNG so that every np.random draw below is reproducible.
np.random.seed(5)

# Dataset Generation using rules over the latent factors.

## Define the parameters
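The generator is controlled by `num_items`, `num_users`,
`num_user_attr`, `num_item_attr`, `num_rules`, `per_rule_sparsity`, `attribute_sparsity` and the output directory `data_dir`.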

In [175]:
# Size of the synthetic universe.
num_items = 1000
num_users = 200
# Number of binary attributes per user / per item.
num_user_attr = 30
num_item_attr = 100
# Number of (user-attribute indices, item-attribute indices) rules to generate.
num_rules = 40
# Fraction of a rule's matching items that each matching user interacts with.
per_rule_sparsity = 0.30
# Probability that any single attribute is set to 1.
attribute_sparsity = 0.20
# Output directory for the generated CSV files. The trailing slash matters:
# file paths are built by plain string concatenation (data_dir + 'name.csv').
data_dir = '../data/intersection_only/'


## Generate Attributes

In [176]:
# Draw a (num_users x num_user_attr) matrix of uniform samples and binarise it:
# an attribute becomes 1 when its sample falls below attribute_sparsity.
user_attr = (np.random.rand(num_users, num_user_attr) < attribute_sparsity).astype(int)
# Same procedure for the (num_items x num_item_attr) item-attribute matrix.
# The user matrix is drawn first so the RNG stream is consumed in the same order.
item_attr = (np.random.rand(num_items, num_item_attr) < attribute_sparsity).astype(int)

In [177]:
# Sanity-check the generated matrices: shapes, then the first row and the
# first column of each matrix.
for label, value in (
    ('user_attr shape: ', user_attr.shape),
    ('item_attr shape: ', item_attr.shape),
    ('user_attr[0]: ', user_attr[0]),
    ('item_attr[0]: ', item_attr[0]),
    ('user_attr[:,0]: ', user_attr[:,0]),
    ('item_attr[:,0]: ', item_attr[:,0]),
):
    print(label, value)

user_attr shape:  (200, 30)
item_attr shape:  (1000, 100)
user_attr[0]:  [0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1]
item_attr[0]:  [0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 1 0 1 1 0 0 0
 0 0 1 0 0 0 0 0 0 1 0 0 1 0 1 0 1 0 0 0 0 0 0 0 1 1]
user_attr[:,0]:  [0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1
 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 1
 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0
 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 1 0 0 0
 1 0 1 0 0 1 0 1 1 0 0 0 0 0 0]
item_attr[:,0]:  [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 1 0 0 1 0 0 0 0 0 1 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 1 0 0
 0 0 0 0 1 0 0 1 0 0 1 0 0 0 1 0 0 0 1 1 0 0 1 1 0 1 0 

## Generate the rules

In [178]:
def _sample_attr_idx(num_attr):
    """Pick between 1 and 4 distinct attribute indices out of ``range(num_attr)``.

    The upper bound of 4 attributes per side is a design choice that still
    needs more thought.
    """
    # randint's upper bound is exclusive, so this yields 1, 2, 3 or 4.
    num_sample = np.random.randint(1, 5)
    return np.random.choice(range(num_attr), num_sample, replace=False).tolist()


# Each rule is a pair (user attribute indices, item attribute indices).
# The user side is always sampled before the item side so the RNG stream is
# consumed in a fixed order.
rules = []
for _ in range(num_rules):
    user_side = _sample_attr_idx(num_user_attr)
    item_side = _sample_attr_idx(num_item_attr)
    rules.append((user_side, item_side))

In [179]:
# Show the generated rules; each entry is (user attribute indices, item attribute indices).
rules

[([21, 4, 23, 24], [75, 60, 51]),
 ([29, 27, 26, 25], [53]),
 ([21, 6, 10], [68, 13, 32]),
 ([24, 1, 17, 16], [69, 59, 1]),
 ([17, 15, 20], [40, 58, 28]),
 ([3, 28, 27, 25], [77, 38]),
 ([10, 25], [43, 9, 64]),
 ([29], [87, 93, 45, 91]),
 ([10, 0, 16], [44, 59, 81, 48]),
 ([15], [95, 53]),
 ([12, 14, 24, 10], [10, 61, 82, 60]),
 ([2, 17, 16, 7], [50, 51]),
 ([19, 27, 1], [76, 97]),
 ([28, 5], [58, 90, 25, 70]),
 ([22, 21, 16, 14], [17, 77, 97]),
 ([13, 7], [42]),
 ([26, 12, 1, 21], [27, 88, 48, 91]),
 ([24, 3, 18, 11], [26, 28, 82, 42]),
 ([14], [85, 68, 17, 24]),
 ([10, 28, 11, 29], [46, 92, 87, 96]),
 ([23, 29, 3, 5], [81, 82, 16, 73]),
 ([7], [39, 1, 72]),
 ([0], [41]),
 ([8, 12, 4], [21, 48, 4]),
 ([22, 6, 2, 11], [32, 70]),
 ([11, 9], [96]),
 ([19, 23], [26, 7, 6]),
 ([23], [50]),
 ([12, 23, 24], [93, 6, 29, 38]),
 ([14, 26, 5], [42]),
 ([2, 21, 8], [35]),
 ([21], [43, 26, 85, 60]),
 ([7, 10], [2, 3, 33, 91]),
 ([2, 7, 20, 24], [76]),
 ([12, 0, 15], [79]),
 ([6, 7], [48]),
 ([12, 

## Look up function for user & item pairs that satisfy the rules

In [180]:
### Rules are conjunctions only: every attribute listed in a rule must be 1
### (no negated attributes, no unions).
def rule_look_up(rule):
    """Return the users and items that satisfy ``rule``.

    Parameters
    ----------
    rule : sequence
        ``rule[0]`` holds user-attribute indices and ``rule[1]`` holds
        item-attribute indices.

    Returns
    -------
    (list, list)
        Indices of the users whose attributes at ``rule[0]`` are all 1, and
        indices of the items whose attributes at ``rule[1]`` are all 1.
        Reads the module-level ``user_attr``, ``item_attr``, ``num_users``
        and ``num_items``.
    """
    def _rows_matching(attr_matrix, n_rows, attr_idx):
        # Keep a row only when every attribute named by attr_idx equals 1.
        return [
            row for row in range(n_rows)
            if (np.array(attr_matrix[row])[attr_idx] == 1).all()
        ]

    matching_users = _rows_matching(user_attr, num_users, rule[0])
    matching_items = _rows_matching(item_attr, num_items, rule[1])
    return matching_users, matching_items

## Generate the user-vs-item co-watch matrix

In [181]:
# Binary interaction matrix: one row per user, one column per item.
user_item = np.zeros((num_users, num_items))
user_item_tuple = []
for rule in rules:
    matched_users, matched_items = rule_look_up(rule)
    # Every matching user gets the same number of items for this rule.
    num_to_sample = int(len(matched_items) * per_rule_sparsity)
    for user in matched_users:
        # Each user draws an independent subset of the matching items
        # (a per_rule_sparsity fraction of them, without repeats).
        sampled_items = np.random.choice(matched_items, num_to_sample, replace=False)
        # Element-wise loop on purpose: when no item matches, the sample is an
        # empty float array that could not be used as a fancy index.
        for item in sampled_items:
            user_item[user, item] = 1.0

In [182]:
# Density of the interaction matrix: fraction of (user, item) cells set to 1.
user_item.sum() / (num_users * num_items)

0.034565

In [183]:
# Total number of positive (user, item) interactions.
np.count_nonzero(np.array(user_item) == 1.0)

6913

In [184]:
# Row (user) and column (item) indices of every positive interaction.
positive_rows, positive_cols = np.where(np.array(user_item) == 1.0)
# Users / items that take part in at least one interaction.
final_vocab_user = set(positive_rows)
final_vocab_item = set(positive_cols)
print(len(final_vocab_user), len(final_vocab_item))

146 688


### Save the data

In [185]:
def np2csv(np_array, filename, headers):
    """Write the coordinates of every entry equal to 1 in ``np_array`` as a two-column CSV.

    Parameters
    ----------
    np_array : array-like
        2-D matrix; each entry equal to 1 becomes one ``row,col`` line.
    filename : str
        Output path. It is compared against ``data_dir + <name>`` to decide
        which rows are kept:

        * ``user_item.csv`` / ``train.csv`` -- every positive entry is written;
        * ``user_attr.csv`` -- only rows whose id is in ``final_vocab_user``;
        * ``item_attr.csv`` -- only rows whose id is in ``final_vocab_item``;
        * any other path -- only the header line is written.
    headers : sequence of str
        The two column names written on the first line.

    Notes
    -----
    Reads the module-level ``data_dir``, ``final_vocab_user`` and
    ``final_vocab_item``. The parent directory of ``filename`` is created if
    it does not exist, so the notebook also runs on a fresh checkout.
    """
    import os  # local import: `os` is not imported before this cell runs

    # (row, col) pairs of all positive entries, in row-major order.
    positive_list = list(zip(*np.where(np.array(np_array) == 1.0)))

    # Resolve the row filter once instead of re-comparing paths for every row.
    # None means "keep everything"; an empty tuple means "keep nothing".
    if filename in (data_dir + 'user_item.csv', data_dir + 'train.csv'):
        allowed_ids = None
    elif filename == data_dir + 'user_attr.csv':
        allowed_ids = final_vocab_user
    elif filename == data_dir + 'item_attr.csv':
        allowed_ids = final_vocab_item
    else:
        allowed_ids = ()

    # open() does not create missing directories; do it here.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # save the positive list as csv
    with open(filename, 'w') as f:
        f.write(headers[0] + ',' + headers[1] + '\n')
        for ele in positive_list:
            if allowed_ids is None or ele[0] in allowed_ids:
                f.write(str(ele[0]) + ',' + str(ele[1]) + '\n')

In [186]:
# Dump the three matrices as (row id, column id) CSVs. user_item.csv is written
# here, before the train/test split cell zeroes held-out entries of user_item.
for matrix, csv_name, header_row in (
    (item_attr, 'item_attr.csv', ['item_id', 'attr_id']),
    (user_attr, 'user_attr.csv', ['user_id', 'attr_id']),
    (user_item, 'user_item.csv', ['user_id', 'item_id']),
):
    np2csv(matrix, data_dir + csv_name, headers=header_row)

## Test/train split


In [187]:
# Hold out 20% of the interactions of every user that has more than 10 of them.
# test_dict maps user index -> array of held-out item indices; count is the
# total number of held-out interactions.
test_dict = {}
count = 0
for user_idx, profile in enumerate(user_item):
    seen_items = np.flatnonzero(profile == 1.0)
    if len(seen_items) > 10:
        held_out = np.random.choice(seen_items, int(len(seen_items) * 0.2), replace=False)
        # `profile` is a view of the user's row, so this removes the held-out
        # interactions from user_item itself; what remains is the training set.
        profile[held_out] = 0.0
        test_dict[user_idx] = held_out
        count += len(held_out)

np2csv(user_item, data_dir + 'train.csv', headers=['user_id', 'item_id'])

In [188]:
# Held-out interactions plus remaining training interactions; should equal the
# number of positives counted before the split.
count + np.count_nonzero(np.array(user_item) == 1.0)

6913

In [189]:
def save_test_file(test_dict):
    """Write the held-out interactions to ``data_dir + 'test.csv'``.

    Parameters
    ----------
    test_dict : dict
        Maps a user id to an iterable of item ids. One ``user,item`` line is
        written per pair, after a ``user_id,item_id`` header line.
    """
    with open(data_dir + 'test.csv', 'w') as out:
        out.write('user_id,item_id\n')
        out.writelines(
            ','.join((str(user), str(item))) + '\n'
            for user, items in test_dict.items()
            for item in items
        )
# Persist the user -> held-out items mapping as test.csv.
save_test_file(test_dict)

## Save rules for eval

In [190]:
def save_rules(rules):
    """Write the generated rules to ``data_dir + 'rules.csv'``.

    Parameters
    ----------
    rules : iterable
        Each rule is indexable; ``str(rule[0])`` and ``str(rule[1])`` are
        written on one line, separated by a comma, after a
        ``user_attr_ids,item_attr_ids`` header line.

    Notes
    -----
    NOTE(review): when the two sides are lists, their string forms contain
    commas and are not quoted, so a data line does not split into exactly two
    comma-separated fields. Confirm how the reader of rules.csv parses it
    before changing the format.
    """
    with open(data_dir + 'rules.csv', 'w') as out:
        out.write('user_attr_ids,item_attr_ids\n')
        out.writelines(
            ','.join((str(rule[0]), str(rule[1]))) + '\n'
            for rule in rules
        )
# Persist the generated rules as rules.csv.
save_rules(rules)

In [191]:
# NOTE(review): `dir` shadows the Python builtin of the same name and repeats the
# location already held in `data_dir`; left unchanged because later cells may use it.
dir = '../data/intersection_only'
import pandas as pd
import os

# Read back the full interaction list from user_item.csv.
full_data = pd.read_csv(os.path.join(dir, 'user_item.csv'))

In [192]:
# Number of distinct users and distinct items in the reloaded interaction list.
tuple(len(set(full_data[column].values)) for column in ('user_id', 'item_id'))

(146, 688)

In [193]:
# Distinct user ids that appear in the interaction list, in ascending order.
sorted(full_data['user_id'].unique().tolist())

[0,
 1,
 2,
 3,
 6,
 7,
 8,
 9,
 10,
 11,
 13,
 14,
 15,
 17,
 18,
 19,
 23,
 26,
 27,
 28,
 29,
 30,
 32,
 35,
 36,
 38,
 40,
 42,
 43,
 44,
 45,
 47,
 49,
 50,
 51,
 52,
 54,
 57,
 58,
 59,
 61,
 62,
 63,
 64,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 81,
 82,
 83,
 84,
 86,
 87,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 98,
 99,
 100,
 102,
 103,
 106,
 107,
 109,
 110,
 111,
 113,
 114,
 115,
 117,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 127,
 128,
 129,
 130,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 141,
 142,
 143,
 144,
 145,
 147,
 148,
 149,
 150,
 151,
 154,
 156,
 159,
 160,
 161,
 163,
 164,
 165,
 166,
 167,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 180,
 181,
 185,
 186,
 187,
 188,
 190,
 191,
 192,
 193,
 194,
 195,
 196]

In [194]:
def get_id_dict(df, field='id'):
    """Map the distinct values of ``df[field]`` to contiguous indices ``0..n-1``.

    The previous version ignored both arguments and always read
    ``full_data['user_id']``; it also carried a stray ``self`` parameter even
    though it is a plain function. Calls of the form
    ``get_id_dict(frame, 'column')`` keep working.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the id column.
    field : str, default 'id'
        Name of the column whose distinct values are re-indexed.

    Returns
    -------
    dict
        ``{original_id: new_index}``, with new indices assigned in ascending
        order of the original ids.
    """
    ids = sorted(df[field].unique().tolist())
    id2id = {original: position for position, original in enumerate(ids)}
    return id2id
# Show the mapping from the user ids present in user_item.csv to contiguous indices.
get_id_dict(full_data, 'user_id')

{0: 0,
 1: 1,
 2: 2,
 3: 3,
 6: 4,
 7: 5,
 8: 6,
 9: 7,
 10: 8,
 11: 9,
 13: 10,
 14: 11,
 15: 12,
 17: 13,
 18: 14,
 19: 15,
 23: 16,
 26: 17,
 27: 18,
 28: 19,
 29: 20,
 30: 21,
 32: 22,
 35: 23,
 36: 24,
 38: 25,
 40: 26,
 42: 27,
 43: 28,
 44: 29,
 45: 30,
 47: 31,
 49: 32,
 50: 33,
 51: 34,
 52: 35,
 54: 36,
 57: 37,
 58: 38,
 59: 39,
 61: 40,
 62: 41,
 63: 42,
 64: 43,
 67: 44,
 68: 45,
 69: 46,
 70: 47,
 71: 48,
 72: 49,
 73: 50,
 74: 51,
 75: 52,
 76: 53,
 77: 54,
 78: 55,
 79: 56,
 81: 57,
 82: 58,
 83: 59,
 84: 60,
 86: 61,
 87: 62,
 89: 63,
 90: 64,
 91: 65,
 92: 66,
 93: 67,
 94: 68,
 95: 69,
 96: 70,
 98: 71,
 99: 72,
 100: 73,
 102: 74,
 103: 75,
 106: 76,
 107: 77,
 109: 78,
 110: 79,
 111: 80,
 113: 81,
 114: 82,
 115: 83,
 117: 84,
 119: 85,
 120: 86,
 121: 87,
 122: 88,
 123: 89,
 124: 90,
 125: 91,
 127: 92,
 128: 93,
 129: 94,
 130: 95,
 132: 96,
 133: 97,
 134: 98,
 135: 99,
 136: 100,
 137: 101,
 138: 102,
 139: 103,
 141: 104,
 142: 105,
 143: 106,
 144: 107,
 14