In [1]:
import numpy as np
import pandas as pd

from collections import defaultdict

from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

In [2]:
# target domain data processing
target_ratings = pd.read_csv(f'/content/data_target.csv', sep=',',
                      encoding='latin-1')

target_users = target_ratings.user.unique()
target_items = target_ratings.item.unique()

print("Target domain unique users: ", len(target_users))
print("Target domain unique items: ", len(target_items))
print("Target domain ratings: ", len(target_ratings.index))
print('Avg user degree in target domain: ', len(target_ratings.index) / len(target_users))
print('Avg item degree in target domain: ', len(target_ratings.index) / len(target_items))

Target domain unique users:  10660
Target domain unique items:  3589
Target domain ratings:  19334
Avg user degree in target domain:  1.8136960600375234
Avg item degree in target domain:  5.387015881861243


In [3]:
# test domain data processing
test_ratings = pd.read_csv(f'/content/data_test.csv', sep=',',
                      encoding='latin-1')

test_users = test_ratings.user.unique()
test_items = test_ratings.item.unique()

print("Target domain unique users: ", len(test_users))
print("Target domain unique items: ", len(test_items))
print("Target domain ratings: ", len(test_ratings.index))

Target domain unique users:  5019
Target domain unique items:  2158
Target domain ratings:  5019


In [4]:
# verify test domain data is subset of target domain data
print("Test user set is subset of target user set: ", set(test_users).issubset(set(target_users)))
print("Test item set is subset of target item set: ", set(test_items).issubset(set(target_items)))

Test user set is subset of target user set:  True
Test item set is subset of target item set:  True


In [5]:
# get useful lookup tables 
user_idx = range(len(target_users))
item_idx = range(len(target_items))
user_id = dict(zip(target_users, user_idx))
item_id = dict(zip(target_items, item_idx))
user_name = dict(zip(user_idx, target_users))
item_name = dict(zip(item_idx, target_items))

In [6]:
items_of_user = defaultdict(set)
item_popularity = defaultdict(int)
rows, cols, vals = [], [], []

for index, row in target_ratings.iterrows():
    u, i = row['user'], row['item'] 
    r, c = user_id[u], item_id[i]
    items_of_user[u].add(i)
    item_popularity[i] += 1
    rows.append(r)
    cols.append(c)
    vals.append(1.0)

# sparse rating matrix in target domain
Rt = sparse.csr_matrix((vals, (rows, cols)), shape=(len(target_users), len(target_items)))

In [7]:
SEED = 2022
np.random.seed(SEED)

def negative_sampling(pos_data, item_set, sample_size):
    user, item = pos_data
    pos_items = items_of_user[user] 
    pos_items.add(item)
    neg_items = [i for i in item_set if i not in pos_items]
    sampled_neg_items = np.random.choice(neg_items, size=(sample_size - 1), replace=False)
    sampled_data = [(user, i, 0) for i in sampled_neg_items]
    sampled_data.append((user, item, 1))
    return sampled_data

In [8]:
def generate_full_test_data(pos_test_data):
    full_test_data = []
    # for each positive test data, sample 99 negative data in target domain
    for index, row in pos_test_data.iterrows():
        u, i = row['user'], row['item']
        full_test_data.append(negative_sampling((u, i), target_items, 100))
    return full_test_data # shape: (len(test data), 100)

In [9]:
# change the problem to 'detect interaction', ratings = 1 means interacted
test_ratings['rating'] = 1
target_ratings['rating'] = 1

# negative sampling test data
full_test_data = generate_full_test_data(test_ratings)

In [10]:
# Baseline 1: naive ranked by popularity for all test users
def eval_baseline_popularity(item_popularity, test_data, k_list):

    hit_count = [0.0] * len(k_list)

    for data in test_data:
        pos_item = None
        pred_item_score = []
        for _, item, rating in data: # 1 pos with 99 sampled neg 
            score = item_popularity[item]
            pred_item_score.append((item, score))
            if rating != 0.0:
                pos_item = item

        pred_item_score = sorted(pred_item_score, key=lambda t: t[1], reverse=True)
        ranked_items = [i for i, _ in pred_item_score]

        for c, k in enumerate(k_list):
            if pos_item in ranked_items[:k]:
                hit_count[c] += 1
    
    print("Baseline: rank by popularity")
    for c, k in enumerate(k_list):
        print("hit@" + str(k), ":", hit_count[c] / len(test_data))

In [11]:
eval_baseline_popularity(item_popularity, full_test_data, [1, 5, 10, 15])

Baseline: rank by popularity
hit@1 : 0.02171747360031879
hit@5 : 0.10958358238692967
hit@10 : 0.21797170751145648
hit@15 : 0.3189878461844989


In [18]:
def eval_baseline_similarity(items_of_user, item_popularity, similarities, test_data, k_list):

    hit_count = [0.0] * len(k_list)

    for data in test_data:
        # get similar users of the current user in target domain
        cur_user_id = user_id[data[0][0]]
        similar_users = []
        for c in range(len(target_users)):
            sim = similarities[cur_user_id, c]
            if sim > 0.0 and c != cur_user_id:
                similar_user_name = user_name[c]
                similar_users.append((similar_user_name, sim))

        pos_item = None
        pred_item_score = []

        for _, item, rating in data:
            score = 0.0
            if len(similar_users) == 0: 
                # no similar user: use popularity
                score = item_popularity[item]
            else:
                # score = avg similarity among users who interact with same item
                score, count = 0.0, 0.0
                for user, sim in similar_users:
                    if item in items_of_user[user]:
                        score += sim
                        count += 1
                score = score / count if count > 0 else 0

            pred_item_score.append((item, score))
            if rating != 0.0:
                pos_item = item

        # sort 100 items by predicted score
        pred_item_score = sorted(pred_item_score, key=lambda t: t[1], reverse=True)

        # all scores are 0 (all items are not interacted by similar users)
        if pred_item_score[0][1] == 0.0: 
            # use popularity
            pred_item_score = []
            for _, item, rating in data: 
                score = item_popularity[item]
                pred_item_score.append((item, score))
            pred_item_score = sorted(pred_item_score, key=lambda t: t[1], reverse=True)

        ranked_items = [i for i, _ in pred_item_score]

        for c, k in enumerate(k_list):
            if pos_item in ranked_items[:k]:
                hit_count[c] += 1

    print("Baseline: rank by similarity")
    for c, k in enumerate(k_list):
        print("hit@" + str(k), ":", hit_count[c] / len(test_data))

In [13]:
similarities = cosine_similarity(Rt)
eval_baseline_similarity(items_of_user, item_popularity, similarities, full_test_data, [1, 5, 10, 15])

Baseline: rank by similarity
hit@1 : 0.07352062163777645
hit@5 : 0.1528192867105001
hit@10 : 0.19884439131301057
hit@15 : 0.2442717672843196


In [14]:
def jaccard_similarity(items_of_user):
    n = len(target_users)
    similarities = np.identity(n)
    for i in range(n):
        for j in range(i + 1, n):
            ui, uj = user_name[i], user_name[j]
            items_i, items_j = items_of_user[ui], items_of_user[uj]
            sim = len(items_i.intersection(items_j)) / len(items_i.union(items_j))
            similarities[i, j] = similarities[j, i] = sim
    return similarities

similarities = jaccard_similarity(items_of_user)

In [19]:
eval_baseline_similarity(items_of_user, item_popularity, similarities, full_test_data, [1, 5, 10, 15])

Baseline: rank by similarity
hit@1 : 0.755130504084479
hit@5 : 0.9996015142458657
hit@10 : 1.0
hit@15 : 1.0
