In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF, TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from itertools import combinations
from scipy import sparse
from scipy.sparse.linalg import svds
import implicit

import random
my_seed = 0
random.seed(my_seed)
np.random.seed(my_seed)

import matplotlib.pyplot as plt
import json

In [2]:
import math
# ground_truth: list of items ordered by time
def nDCG_Time(ground_truth, _recList):
    rec_num = len(_recList) # topK
    # ground_truth is already sorted by time
    idealOrder = ground_truth
    idealDCG = 0.0
    for j in range(min(rec_num, len(idealOrder))):
        idealDCG += ((math.pow(2.0, len(idealOrder) - j) - 1) / math.log(2.0 + j))

    recDCG = 0.0
    for j in range(rec_num):
        item = _recList[j]
        if item in ground_truth:
            rank = len(ground_truth) - ground_truth.index(item) # why ground truth?
            recDCG += ((math.pow(2.0, rank) - 1) / math.log(1.0 + j + 1))

    return (recDCG / idealDCG)


def Recall(_test_set, _recList):
    hit = len(set(_recList).intersection(set(_test_set)))
    # return hit / float(len(_test_set))
    return hit / min(float(len(_test_set)), float(len(_recList)))

def Precision(_test_set, _recList):
    hit = len(set(_recList).intersection(set(_test_set)))
    return hit / float(len(_recList))

In [3]:
# local data
listening_df = pd.read_csv('./data/lastfm_2020/listening_events_2020.tsv', header=1, sep='\t',
                           names=['user_id', 'track_id', 'album_id', 'timestamp'])
user_df = pd.read_csv('./data/lastfm_2020/users_2020.tsv', header=1, sep='\t',
                     names=['user_id', 'country', 'age', 'gender', 'creation_time'])

In [4]:
listening_users = listening_df['user_id'].unique()
filed_users = user_df['user_id'].unique()

for id in listening_users:
    if id not in filed_users:
        print(id)

2


In [5]:
# user with id 2 is not in the `user_df`, so we delete their record from `listening_df` as well.
listening_df = listening_df[listening_df['user_id'] != 2]

In [6]:
# get the users with interaction number > 10
user_counts = listening_df['user_id'].value_counts()

In [11]:
filtered_users = user_counts[user_counts > 10]

In [12]:
filtered_user_df = filtered_users.to_frame().reset_index()

In [39]:
# group the users into three levels of interactions:
# (10, 1000], (1000, 5000], (5000, -)
level_one_user_df = filtered_user_df[filtered_user_df['count'] < 1001]
level_two_user_df = filtered_user_df[(filtered_user_df['count'] < 5001) & (filtered_user_df['count'] > 1000)]
level_thr_user_df = filtered_user_df[filtered_user_df['count'] > 5000]

In [41]:
print('Number of users in each group:')
print(f'10-1000: {level_one_user_df.shape[0]}')
print(f'1001-5000: {level_two_user_df.shape[0]}')
print(f'5001-: {level_thr_user_df.shape[0]}')

Number of users in each group:
10-1000: 6084
1001-5000: 7523
5001-: 1157


In [30]:
# only keep the records from filtered users
filtered_listening_df = listening_df.merge(filtered_user_df, on='user_id')

In [31]:
filtered_listening_df.shape[0]

30354942

### Collaborative filtering

In [32]:
def df_to_mat(df, user_n, item_n, user_id_to_iid, item_id_to_iid):
    """
    Convert DataFrame to sparse matrix.

    Arg:
        df: DataFrame, ratings dataframe with user_id, movie_id and rating

    Return:
        mat: scipy.sparse.csr_matrix, sparse ratings matrix with rows being users and cols being items
    """
    
    mat = sparse.lil_matrix((user_n, item_n))
    for _, row in df.iterrows():
        user_id = int(row[0])
        item_id = int(row[1])
        user_iid = user_id_to_iid[user_id]
        item_iid = item_id_to_iid[item_id]
        mat[user_iid, item_iid] = 1
    
    return mat

In [33]:
user_n = filtered_listening_df['user_id'].nunique()
item_n = filtered_listening_df['track_id'].nunique()

In [37]:
user_ids = filtered_listening_df['user_id'].unique()
item_ids = filtered_listening_df['track_id'].unique()

user_id_to_iid = {user_ids[i]:i for i in range(len(user_ids))}
user_iid_to_id = {i:user_ids[i] for i in range(len(user_ids))}

item_id_to_iid = {item_ids[i]:i for i in range(len(item_ids))}
item_iid_to_id = {i:item_ids[i] for i in range(len(item_ids))}

In [43]:
level_one_user_ids = level_one_user_df['user_id'].unique()
level_two_user_ids = level_two_user_df['user_id'].unique()
level_thr_user_ids = level_thr_user_df['user_id'].unique()

In [44]:
train_df, test_df = train_test_split(filtered_listening_df, test_size=0.2)

In [45]:
train_mat = df_to_mat(train_df, user_n, item_n, user_id_to_iid, item_id_to_iid)
train_mat = train_mat.tocsr()

test_mat = df_to_mat(test_df, user_n, item_n, user_id_to_iid, item_id_to_iid)
test_mat = test_mat.tocsr()

In [46]:
mf = implicit.als.AlternatingLeastSquares(factors=100, regularization=0.01, alpha=1.0)
mf.fit(train_mat)

  check_blas_config()


  0%|          | 0/15 [00:00<?, ?it/s]

In [55]:
level_one_recall = []
for user_id in level_one_user_ids:
    user_iid = user_id_to_iid[user_id]
    
    test_item_iids = list(np.argwhere(test_mat[user_iid] > 0)[:, 1])
    test_item_ids = [item_iid_to_id[iid] for iid in test_item_iids]

    if len(test_item_ids) > 0:
        top_item_iids = list(mf.recommend(user_iid, train_mat[user_iid], N=10, filter_already_liked_items=True)[0])
        top_item_ids = [item_iid_to_id[iid] for iid in top_item_iids]

        recall = Recall(test_item_ids, top_item_ids)
        level_one_recall.append(recall)

In [56]:
np.average(level_one_recall)

0.047330185917142435

In [57]:
level_two_recall = []
for user_id in level_two_user_ids:
    user_iid = user_id_to_iid[user_id]
    
    test_item_iids = list(np.argwhere(test_mat[user_iid] > 0)[:, 1])
    test_item_ids = [item_iid_to_id[iid] for iid in test_item_iids]

    if len(test_item_ids) > 0:
        top_item_iids = list(mf.recommend(user_iid, train_mat[user_iid], N=10, filter_already_liked_items=True)[0])
        top_item_ids = [item_iid_to_id[iid] for iid in top_item_iids]

        recall = Recall(test_item_ids, top_item_ids)
        level_two_recall.append(recall)

In [58]:
np.average(level_two_recall)

0.10817493021401037

In [59]:
level_thr_recall = []
for user_id in level_thr_user_ids:
    user_iid = user_id_to_iid[user_id]
    
    test_item_iids = list(np.argwhere(test_mat[user_iid] > 0)[:, 1])
    test_item_ids = [item_iid_to_id[iid] for iid in test_item_iids]

    if len(test_item_ids) > 0:
        top_item_iids = list(mf.recommend(user_iid, train_mat[user_iid], N=10, filter_already_liked_items=True)[0])
        top_item_ids = [item_iid_to_id[iid] for iid in top_item_iids]

        recall = Recall(test_item_ids, top_item_ids)
        level_thr_recall.append(recall)

In [60]:
np.average(level_thr_recall)

0.13336214347450304

In [None]:
cf_scores = [level_one_recall, level_two_recall, level_thr_recall]

### Content-based filtering

In [62]:
track_json_lst = []
with open('./data/lastfm/tags.json', 'r', encoding='utf-8') as f:
    for obj in f:
        track_dict = json.loads(obj)
        track_json_lst.append(track_dict)

In [63]:
track_tags_lst = []
for obj in track_json_lst:
    track_id = obj['i']
    tags = list(obj['tags'].keys())[:10]    # use the first 10 tags
    track_tags_lst.append([track_id, tags])

In [64]:
tag_df = pd.DataFrame(track_tags_lst, columns=['track_id', 'tags'])

In [109]:
user_id = 45536
np.asarray(train_df[(train_df['user_id']==user_id)]['track_id'].value_counts().index)

array([18604619,  8722087, 23875642, ..., 15335759, 21859388, 24796385])

In [124]:
def cb_recommend(user_id, topk, knn, train_df, X, item_n, item_id_to_iid, item_iid_to_id):
    sorted_rated_before = np.asarray(train_df[(train_df['user_id']==user_id)]['track_id'].value_counts().index)
    
    if sorted_rated_before.size > 0:

        raw_recommends = {}
        for item_id in sorted_rated_before:
            item_iid = item_id_to_iid[item_id]
            distances, indices = knn.kneighbors(X[item_iid], 
                                                n_neighbors=topk+1)
            sorted_pairs = sorted(list(zip(indices.squeeze().tolist(),
                                           distances.squeeze().tolist())),
                                  key=lambda x: x[1])
            raw_recommends[item_iid] = sorted_pairs 
        
        top_item_ids = []
        pos = 0
        while True:
            for item_id in sorted_rated_before:
                item_iid = item_id_to_iid[item_id]
                next_neighbor_iid = raw_recommends[item_iid][pos][0]
                next_neighbor_id = item_iid_to_id[next_neighbor_iid]
                if next_neighbor_id not in sorted_rated_before:
                    top_item_ids.append(next_neighbor_id)
                if len(top_item_ids) > topk - 1:
                    return (user_id, np.array(top_item_ids))
            
            pos += 1
    else:

        top_item_iids = random.sample(list(range(0, item_n)), topk)
        top_item_ids = [item_iid_to_id[iid] for iid in top_item_iids]
        return (user_id, np.asarray(top_item_ids))

In [133]:
def sample_evaluate(test_user_ids, knn, X, train_df, test_df, item_n):

    r = []
    for user_id in test_user_ids:
        test_item_ids = np.asarray(test_df[test_df['user_id']==user_id]['track_id'].unique())
        
        if len(test_item_ids) > 0:
            top_item_ids = list(cb_recommend(user_id, 10, knn, train_df, X, item_n, item_id_to_iid, item_iid_to_id)[1])
            recall = Recall(test_item_ids, top_item_ids)
            r.append(recall)
    
    return np.average(r)

In [137]:
tagged_listening_df = pd.merge(listening_df, tag_df, on='track_id')
tagged_listening_df = tagged_listening_df.sample(frac=0.1, ignore_index=True)

In [146]:
item_n

424016

In [138]:
user_counts = tagged_listening_df['user_id'].value_counts()

In [139]:
filtered_users = user_counts[user_counts > 10]
filtered_user_df = filtered_users.to_frame().reset_index()

In [142]:
# group the users into three levels of interactions:
# (10, 1000], (1000, 5000], (5000, -)
level_one_user_df = filtered_user_df[filtered_user_df['count'] < 101]
level_two_user_df = filtered_user_df[(filtered_user_df['count'] < 501) & (filtered_user_df['count'] > 100)]
level_thr_user_df = filtered_user_df[filtered_user_df['count'] > 500]

In [143]:
print('Number of users in each group:')
print(f'10-1000: {level_one_user_df.shape[0]}')
print(f'1001-5000: {level_two_user_df.shape[0]}')
print(f'5001-: {level_thr_user_df.shape[0]}')

Number of users in each group:
10-1000: 7093
1001-5000: 5126
5001-: 330


In [144]:
filtered_tagged_listening_df = tagged_listening_df.merge(filtered_user_df, on='user_id')

In [145]:
user_n = filtered_tagged_listening_df['user_id'].nunique()
item_n = filtered_tagged_listening_df['track_id'].nunique()

user_ids = filtered_tagged_listening_df['user_id'].unique()
item_ids = filtered_tagged_listening_df['track_id'].unique()

item_id_to_iid = {item_ids[i]:i for i in range(len(item_ids))}
item_iid_to_id = {i:item_ids[i] for i in range(len(item_ids))}

level_one_user_ids = level_one_user_df['user_id'].unique()
level_two_user_ids = level_two_user_df['user_id'].unique()
level_thr_user_ids = level_thr_user_df['user_id'].unique()

In [147]:
filtered_tag_df = filtered_tagged_listening_df.drop_duplicates(subset=['track_id'])[['track_id', 'tags']]

In [148]:
tf = TfidfVectorizer(analyzer = lambda x: (g for g in x))
X_tfidf = tf.fit_transform(filtered_tag_df['tags'])

In [149]:
knn = NearestNeighbors(metric='cosine', algorithm='auto', n_neighbors=10, n_jobs=-1)
knn.fit(X_tfidf)

In [150]:
train_df, test_df = train_test_split(filtered_tagged_listening_df, test_size=0.2)

# train_mat = df_to_mat(train_df, user_n, item_n, user_id_to_iid, item_id_to_iid)
# train_mat = train_mat.transpose().tocsr()

# test_mat = df_to_mat(test_df, user_n, item_n, user_id_to_iid, item_id_to_iid)
# test_mat= test_mat.transpose().tocsr()

In [153]:
all_level_one_recall = []
all_level_two_recall = []
all_level_thr_recall = []

n_iters = 1
for _ in range(n_iters):

    test_level_one_user_ids = np.random.choice(level_one_user_ids, size=50, replace=False)
    test_level_two_user_ids = np.random.choice(level_two_user_ids, size=50, replace=False)
    test_level_thr_user_ids = np.random.choice(level_thr_user_ids, size=50, replace=False)

    level_one_recall = sample_evaluate(test_level_one_user_ids, knn, X_tfidf, train_df, test_df, item_n)
    level_two_recall = sample_evaluate(test_level_two_user_ids, knn, X_tfidf, train_df, test_df, item_n)
    level_thr_recall = sample_evaluate(test_level_thr_user_ids, knn, X_tfidf, train_df, test_df, item_n)

    all_level_one_recall.append(level_one_recall)
    all_level_two_recall.append(level_two_recall)
    all_level_thr_recall.append(level_thr_recall)
    
    print(f'Level one user recall: {level_one_recall}')
    print(f'Level two user recall: {level_two_recall}')
    print(f'Level thr user recall: {level_thr_recall}')

Level one user recall: 0.008
Level two user recall: 0.013999999999999999
Level thr user recall: 0.014000000000000002


In [154]:
cb_scores = [np.mean(all_level_one_recall),
            np.mean(all_level_two_recall),
            np.mean(all_level_thr_recall)]

### Fairness metric

In [155]:
def ugf(scores):
    return np.mean([abs(i[0] - i[1]) for i in combinations(scores, 2)])

In [156]:
ugf(cb_scores)

0.004000000000000001

In [157]:
cf_scores = [0.047330185917142435, 0.10817493021401037, 0.13336214347450304]
ugf(cf_scores)

0.05735463837157373