In [1]:
import csv
import os
import random
from collections import defaultdict

import pandas as pd

In [2]:
# Seed Python's global `random` generator so that every random.shuffle / random.sample /
# random.choice call made later in the notebook starts from the same fixed state.
SEED = 2022
random.seed(SEED)

In [3]:
# Location of the raw interaction CSV; read_csv unpacks each row as (item, user, rating, time).
path = './Books.csv'

In [4]:
def read_csv(path):
    """Lazily yield ``(item, user, rating, time)`` tuples from a CSV file.

    The first row is treated as a header and discarded. Every yielded field is
    the raw string produced by ``csv.reader``; no type conversion is done.

    Args:
        path: Path of the CSV file to read.

    Yields:
        A 4-tuple ``(item, user, rating, time)`` for each non-blank data row.

    Raises:
        ValueError: If a non-blank row does not contain exactly four fields.
    """
    # newline='' hands newline handling to the csv module, as its documentation
    # requires; otherwise newlines inside quoted fields are parsed incorrectly.
    with open(path, 'rt', newline='') as f:
        reader = csv.reader(f)
        # Skip the header. The default keeps an empty file from raising
        # StopIteration inside the generator (which surfaces as RuntimeError).
        next(reader, None)
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing newline).
                continue
            item, user, rating, time = row
            yield item, user, rating, time

In [5]:
def get_data_from_file(data_path):
    """Load all interactions from a CSV file and build both adjacency tables.

    Rows are obtained from ``read_csv``; only the item and user fields are used.
    Summary statistics (user/item/instance counts and average degrees) are printed.

    Args:
        data_path: Path of the CSV file, passed through to ``read_csv``.

    Returns:
        A tuple ``(interactions, users_per_item, items_per_user)`` where
        ``interactions`` is a list of ``(user, item)`` pairs in file order,
        ``users_per_item`` maps item -> list of users, and
        ``items_per_user`` maps user -> list of items (both ``defaultdict(list)``).
    """
    users_per_item = defaultdict(list)
    items_per_user = defaultdict(list)
    interactions = []
    for item, user, _rating, _time in read_csv(data_path):
        users_per_item[item].append(user)
        items_per_user[user].append(item)
        interactions.append((user, item))

    user_num, item_num, instance_num = len(items_per_user), len(users_per_item), len(interactions)
    print(f"user num: {user_num}, item num: {item_num}, instances: {instance_num}")
    # An empty (or header-only) file has no users/items; report 0.0 instead of
    # dying with ZeroDivisionError after the whole file has been read.
    print(f"average user degree: {instance_num / user_num if user_num else 0.0}")
    print(f"average item degree: {instance_num / item_num if item_num else 0.0}")
    return interactions, users_per_item, items_per_user

In [6]:
# Read every (user, item) interaction from `path` and build the item -> users and
# user -> items adjacency tables for the full, unfiltered dataset.
interactions, users_per_item, items_per_user = get_data_from_file(path)

user num: 15362619, item num: 2930451, instances: 51311620
average user degree: 3.3400307590782536
average item degree: 17.50980309856742


In [7]:
# Order the (item, users) pairs from the most- to the least-interacted item.
item_ranked_list = list(users_per_item.items())
item_ranked_list.sort(key=lambda entry: len(entry[1]), reverse=True)

In [8]:
# Window into the degree-ranked item list: skip the ITEM_START_IDX highest-degree
# items, then keep the next SELECTED_ITEM_NUM items.
ITEM_START_IDX = 2000
SELECTED_ITEM_NUM = 25000

In [9]:
# Keep only the item ids that fall inside the configured rank window.
item_candidates = [
    entry[0]
    for entry in item_ranked_list[ITEM_START_IDX:ITEM_START_IDX + SELECTED_ITEM_NUM]
]

In [10]:
# Invert the item -> users table for the selected items only, giving, for every user
# who touched at least one selected item, the list of selected items they interacted with.
items_per_user_selected = defaultdict(list)
for item in item_candidates:
    for user in users_per_item[item]:
        items_per_user_selected[user].append(item)

# Average number of selected-item interactions per user.
avg_user_degree = sum(map(len, items_per_user_selected.values())) / len(items_per_user_selected)
print(f"user num: {len(items_per_user_selected)}, item num: {len(item_candidates)}, average user degree: {avg_user_degree}")

user num: 5441387, item num: 25000, average user degree: 2.289087138995995


In [11]:
# Order the (user, items) pairs from the most- to the least-active user on the selected items.
user_ranked_list = list(items_per_user_selected.items())
user_ranked_list.sort(key=lambda entry: len(entry[1]), reverse=True)

In [12]:
# Window into the degree-ranked user list: skip the USER_START_IDX highest-degree
# users, then keep the next SELECTED_USER_NUM users.
USER_START_IDX = 1000
SELECTED_USER_NUM = 100000

In [13]:
# Keep only the user ids that fall inside the configured rank window.
user_candidates = [
    entry[0]
    for entry in user_ranked_list[USER_START_IDX:USER_START_IDX + SELECTED_USER_NUM]
]

In [14]:
# Set views of the selected ids, for constant-time membership tests when filtering.
item_candidate_set, user_candidate_set = set(item_candidates), set(user_candidates)
print(f"user candidate num: {len(user_candidate_set)}, item candidate num: {len(item_candidate_set)}")

user candidate num: 100000, item candidate num: 25000


In [15]:
# generate the adjacency tables for the instances within our refined user-item space
def get_refined_data(interactions_ori, user_set, item_set):
    """Keep only the interactions whose user AND item were both selected.

    Summary statistics (user/item/instance counts and average degrees) are printed.

    Args:
        interactions_ori: Iterable of ``(user, item)`` pairs.
        user_set: Container of users to keep (membership-tested with ``in``).
        item_set: Container of items to keep (membership-tested with ``in``).

    Returns:
        A tuple ``(interactions_processed, usersPerItem, itemsPerUser)`` where
        ``interactions_processed`` is the filtered list of ``(user, item)`` pairs
        in input order, ``usersPerItem`` maps item -> list of users and
        ``itemsPerUser`` maps user -> list of items (both ``defaultdict(list)``).
    """
    itemsPerUser = defaultdict(list)
    usersPerItem = defaultdict(list)
    interactions_processed = []
    for user, item in interactions_ori:
        if user not in user_set or item not in item_set:
            continue
        itemsPerUser[user].append(item)
        usersPerItem[item].append(user)
        interactions_processed.append((user, item))

    user_num, item_num, instance_num = len(itemsPerUser), len(usersPerItem), len(interactions_processed)
    print(f"user num: {user_num}, item num: {item_num}, instances: {instance_num}")
    # Nothing may survive the filter; report 0.0 rather than raising ZeroDivisionError.
    print(f"average user degree: {instance_num / user_num if user_num else 0.0}")
    print(f"average item degree: {instance_num / item_num if item_num else 0.0}")
    return interactions_processed, usersPerItem, itemsPerUser

In [16]:
# Restrict the raw interactions to the selected users and items and rebuild the
# adjacency tables for that refined user-item space.
new_interactions, new_users_per_item, new_items_per_user = get_refined_data(interactions, user_candidate_set, item_candidate_set)

user num: 100000, item num: 24972, instances: 2652446
average user degree: 26.52446
average item degree: 106.21680281915745


In [21]:
def generate_cross_domain_data(items_per_user, users_per_item, source_num=600000, test_data_num=20000,
                               output_dir='.'):
    """Split interactions into source / target / test sets and write them as CSV files.

    Pipeline:
      1. Target: for every item, sort its users by ascending degree, take the
         lowest-degree candidates, shuffle them, keep half, and emit at most ``k``
         ``(user, item)`` target instances (see ``pick_target``).
      2. Source: every interaction on a target item made by a user that is NOT a
         target user, shuffled and down-sampled to at most ``source_num`` instances.
      3. The target set is restricted to items that survived the source sampling.
      4. Test positives: for every remaining target user, up to two of their
         interactions with target items that are not already target instances.
      5. Test negatives: for each positive, one randomly drawn target item the user
         never interacted with (the positive is kept even if no negative is found).
      6. At most ``test_data_num`` test instances are sampled, and the three sets are
         written to ``data_source.csv``, ``data_target.csv`` and ``data_test.csv``
         with columns ``user, item, label``.

    All randomness comes from the global ``random`` module; seed it beforehand for
    reproducible output. Progress statistics are printed along the way.

    Args:
        items_per_user: Mapping user -> list of items. Every user that appears in
            ``users_per_item`` must be a key. The lists are not modified.
        users_per_item: Mapping item -> list of users. The lists are not modified.
        source_num: Maximum number of source instances to keep.
        test_data_num: Maximum number of test instances (positives + negatives) to keep.
        output_dir: Directory the CSV files are written to (created if missing).
            Defaults to the current working directory.

    Returns:
        None. The results are the three CSV files.
    """

    def avg(total, count):
        # Degree statistics must not crash the run when a split ends up empty.
        return total / count if count else 0.0

    def banner(title):
        print(10 * "*" + f" {title} " + 10 * "*")

    # the lists contain source, target, test instances
    source, target, test = [], [], []
    # adjacency tables for the target/test domains, used during generation
    usersPerItemTarget = defaultdict(set)
    itemsPerUserTarget = defaultdict(set)
    usersPerItemTest = defaultdict(set)
    itemsPerUserTest = defaultdict(set)
    user_degree_dict = {k: len(v) for k, v in items_per_user.items()}

    # pick at most k users for a selected item to generate target domain data
    def pick_target(item, users, k=3, p=0.1):
        users = list(users)  # work on a copy; the caller's list is left untouched
        if p * len(users) > 1:
            # popular items may contribute more users: a fraction p of them, capped at 5 * k
            k = min(int(5 * k), int(p * len(users)))
        users.sort(key=lambda x: user_degree_dict[x])
        # candidates are the lowest-degree users of this item
        candidate_users = users[:min(5 * k, len(users))]
        random.shuffle(candidate_users)
        if len(candidate_users) >= 2:
            candidate_users = candidate_users[:len(candidate_users) // 2]
        for user in candidate_users[:min(k, len(candidate_users))]:
            if user in usersPerItemTarget[item]:
                # the same (user, item) interaction was recorded more than once
                continue
            target.append((user, item))
            usersPerItemTarget[item].add(user)
            itemsPerUserTarget[user].add(item)

    for item, users in users_per_item.items():
        pick_target(item, users)

    target_user_set = set(itemsPerUserTarget)
    target_item_set = set(usersPerItemTarget)
    banner("target domain")
    print(f"user num: {len(target_user_set)}, item num: {len(target_item_set)}, instances: {len(target)}")
    print(f"average user degree: {avg(len(target), len(target_user_set))}")
    print(f"average item degree: {avg(len(target), len(target_item_set))}")

    # source domain: interactions on target items by users outside the target domain
    for item, users in users_per_item.items():
        if item not in target_item_set:
            continue
        for user in users:
            if user not in target_user_set:
                source.append((user, item, 1))
    banner("source domain")
    print(f"total instances: {len(source)}")
    # The shuffle is redundant before sample() but is kept so that the random stream
    # (and therefore the output for a given seed) matches earlier runs.
    random.shuffle(source)
    # Never ask for more than is available: random.sample raises ValueError otherwise.
    source = random.sample(source, k=min(source_num, len(source)))
    print(f"sampled instances: {len(source)}")
    source_item_set = {v for (_, v, _) in source}
    source_user_set = {u for (u, _, _) in source}
    print(f"source user num: {len(source_user_set)}")
    print(f"source item num: {len(source_item_set)}")
    print(f"average user degree: {avg(len(source), len(source_user_set))}")
    print(f"average item degree: {avg(len(source), len(source_item_set))}")

    # keep only target instances whose item also appears in the sampled source,
    # so that both domains share the same items
    common_items = source_item_set
    target_processed = []
    usersPerItemTarget = defaultdict(set)
    itemsPerUserTarget = defaultdict(set)
    for u, v in target:
        if v in common_items:
            target_processed.append((u, v))
            usersPerItemTarget[v].add(u)
            itemsPerUserTarget[u].add(v)
    banner("processed target domain")
    print(f"user num: {len(itemsPerUserTarget)}, item num: {len(usersPerItemTarget)}, instances: {len(target_processed)}")
    print(f"average user degree: {avg(len(target_processed), len(itemsPerUserTarget))}")
    print(f"average item degree: {avg(len(target_processed), len(usersPerItemTarget))}")
    target = target_processed

    target_item_set = set(usersPerItemTarget)
    # Ordered views (dict insertion order). Iterating/indexing a set of strings while
    # drawing random numbers would make the output depend on the interpreter's hash
    # seed, defeating the fixed random seed.
    target_user_list = list(itemsPerUserTarget)
    target_item_list = list(usersPerItemTarget)

    # pick at most k items for a selected user to generate positive test data
    def pick_test(user, items, k=2):
        items = list(items)  # shuffle a copy so the caller's list is not reordered
        random.shuffle(items)
        num = 0
        for item in items:
            if item in itemsPerUserTarget[user] or item not in target_item_set:
                continue
            if item in itemsPerUserTest.get(user, ()):
                # repeated interaction; do not emit the same positive twice
                continue
            test.append((user, item))
            usersPerItemTest[item].add(user)
            itemsPerUserTest[user].add(item)
            num += 1
            if num >= k:
                break

    for user in target_user_list:
        pick_test(user, items_per_user[user])

    banner("test positive data in target domain")
    print(f"user num: {len(itemsPerUserTest)}, item num: {len(usersPerItemTest)}, instances: {len(test)}")
    print(f"average user degree: {avg(len(test), len(itemsPerUserTest))}")
    print(f"average item degree: {avg(len(test), len(usersPerItemTest))}")

    # generate negative samples for the test data
    test_all = []
    max_try = 10
    for (user, item) in test:
        test_all.append((user, item, 1))
        interacted_item_set = set(items_per_user[user])
        neg = random.choice(target_item_list)
        try_num = 0
        while neg in interacted_item_set and try_num < max_try:
            neg = random.choice(target_item_list)
            try_num += 1
        # Decide on the draw itself, not on the retry counter: a valid negative
        # found on the very last retry must not be thrown away.
        if neg in interacted_item_set:
            continue
        test_all.append((user, neg, 0))
    print(f"total test candidate num: {len(test_all)}")
    test_sampled = random.sample(test_all, k=min(test_data_num, len(test_all)))
    print(f"sampled test num: {len(test_sampled)}")

    # add labels to the target data
    target = [(u, v, 1) for u, v in target]

    def save_data(data, filename):
        # Passing the columns to the constructor also works for an empty split.
        df = pd.DataFrame(data, columns=['user', 'item', 'label'])
        df.to_csv(os.path.join(output_dir, filename), index=False)

    os.makedirs(output_dir, exist_ok=True)
    save_data(source, 'data_source.csv')
    save_data(target, 'data_target.csv')
    save_data(test_sampled, 'data_test.csv')

In [22]:
# Build the source / target / test splits from the refined adjacency tables and write
# them to data_source.csv, data_target.csv and data_test.csv.
generate_cross_domain_data(new_items_per_user, new_users_per_item)

********** target domain **********
user num: 75730, item num: 24972, instances: 210079
average user degree: 2.7740525551300674
average item degree: 8.412582091942976
********** source domain **********
total instances: 1229554
sampled instances: 600000
source user num: 24269
source item num: 23871
average user degree: 24.722897523589765
average item degree: 25.135101168782203
********** processed target domain **********
user num: 75260, item num: 23871, instances: 207902
average user degree: 2.7624501727345203
average item degree: 8.709396338653596
********** test positive data in target domain **********
user num: 75253, item num: 22642, instances: 150498
average user degree: 1.999893691945836
average item degree: 6.646850984895328
total test candidate num: 300996
sampled test num: 20000
