In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

import copy
import os
import time
from math import sqrt

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from tqdm.notebook import tqdm
from implicit.bpr import BayesianPersonalizedRanking
from implicit.evaluation import train_test_split, precision_at_k

In [None]:
# directory that holds the csv files read by this notebook
data_dir = 'data/25m'

# we are working with movie data, but we'll name
# the movie as item to make it more generic to
# all use-cases
user_col = 'userId'
item_col = 'movieId'
value_col = 'rating'
time_col = 'timestamp'

# load the raw ratings, print their (n_rows, n_columns) and preview the first rows
rating_path = os.path.join(data_dir, 'ratings.csv')
df_raw = pd.read_csv(rating_path)
print('dimension: ', df_raw.shape)
df_raw.head()

In [None]:
title_col = 'title'
genre_col = 'genres'

# load the item metadata and drop the rows whose genre column only holds the
# '(no genres listed)' placeholder, they carry no genre information
item_info_path = os.path.join(data_dir, 'movies.csv')
df_item = pd.read_csv(item_info_path)
df_item = df_item[df_item[genre_col] != '(no genres listed)']
print('dimension: ', df_item.shape)
df_item.head()

In [None]:
class Item:
    """
    Lightweight record describing a single item (a movie in this notebook).

    Parameters
    ----------
    _id : int
        Identifier of the item, stored on the instance as ``id``.

    title : str
        Title of the item, this is also what ``repr`` displays.

    genres : dict[str, float]
        The item/movie's genre distribution, where the key
        represents the genre and value corresponds to the
        ratio of that genre.

    score : float, default None
        Score for the item, potentially generated by some
        recommendation algorithm.
    """
    def __init__(self, _id, title, genres, score=None):
        self.id, self.title = _id, title
        self.genres, self.score = genres, score

    def __repr__(self):
        # showing only the title keeps lists of items readable
        return self.title


def create_item_mapping(df_item, item_col, title_col, genre_col):
    """
    Create a dictionary of item id to Item lookup.

    Parameters
    ----------
    df_item : pd.DataFrame
        Item metadata, one row per item.

    item_col, title_col, genre_col : str
        Names of the columns holding the item id, the title and the
        '|' separated genre string.

    Returns
    -------
    item_mapping : dict
        Maps every item id to its Item. Each genre listed for an item
        receives an equal share, 1 / (number of genres listed for the item).
    """
    item_mapping = {}

    # iterate over the three columns directly instead of going through
    # itertuples + getattr: itertuples renames fields whose column name is
    # not a valid identifier, which would make the attribute lookup fail
    rows = zip(df_item[item_col], df_item[title_col], df_item[genre_col])
    for item_id, item_title, genre_string in rows:
        genres = genre_string.split('|')
        genre_ratio = 1. / len(genres)
        item_genre = {genre: genre_ratio for genre in genres}
        item_mapping[item_id] = Item(item_id, item_title, item_genre)

    return item_mapping
    

# build the id -> Item lookup once and peek at the entry stored under id 1
item_mapping = create_item_mapping(df_item, item_col, title_col, genre_col)
item_mapping[1]

In [None]:
# convert to implicit feedback data by keeping only the ratings of at least
# 4.0, and filter out movies that don't have any genre by merging with
# df_item, from which those movies were already removed
df_rating = df_raw[df_raw[value_col] >= 4.0].copy()
df_rating = df_rating.merge(df_item, on=item_col)

for col in (user_col, item_col):
    df_rating[col] = df_rating[col].astype('category')

# the original ids are converted to indices to create
# the sparse matrix, so we keep track of the mappings here
# e.g. a userId 1 will correspond to index 0 in our sparse matrix
index2user = df_rating[user_col].cat.categories
index2item = df_rating[item_col].cat.categories

print('dimension: ', df_rating.shape)
df_rating.head()

In [None]:
def create_user_item_csr_matrix(data, user_col, item_col, value_col):
    """
    Build the user-item sparse matrix from categorical user/item columns.

    Parameters
    ----------
    data : pd.DataFrame
        One row per interaction. ``user_col`` and ``item_col`` must be of
        category dtype, their category codes become the row/column indices.

    user_col, item_col, value_col : str
        Names of the user, item and value columns.

    Returns
    -------
    scipy.sparse.csr_matrix
        float32 matrix with one row per user category and one column per
        item category.
    """
    users = data[user_col].cat
    items = data[item_col].cat
    values = data[value_col].astype(np.float32)

    # pass the shape explicitly: when it is inferred from the largest code,
    # trailing categories without any interaction are silently dropped and
    # the matrix no longer lines up with the category -> id lookups
    shape = (len(users.categories), len(items.categories))
    return csr_matrix((values, (users.codes, items.codes)), shape=shape)


# rows are users and columns are items, both addressed by category code
user_item = create_user_item_csr_matrix(df_rating, user_col, item_col, value_col)
user_item

In [None]:
# hold out 20% of the interactions for evaluation.
# NOTE(review): the seed only makes the split repeatable if implicit's
# train_test_split draws from numpy's global RNG - confirm for the
# installed implicit version
np.random.seed(1234)
user_item_train, user_item_test = train_test_split(user_item, train_percentage=0.8)
user_item_train

In [None]:
user_item_test

In [None]:
# the model expects an item-user sparse matrix,
# i.e. the rows represent items and the columns
# represent users, hence the transpose of the training matrix
np.random.seed(1234)
bpr = BayesianPersonalizedRanking(iterations=70)
bpr.fit(user_item_train.T.tocsr())

In [None]:
# precision@10 of the plain BPR model on the held-out interactions; the result
# gets its own name so that re-running this cell can never overwrite the
# `precision` helper function that is defined later in the notebook
bpr_precision_at_10 = precision_at_k(bpr, user_item_train, user_item_test, K=10)
bpr_precision_at_10

In [None]:
user_item_test

In [None]:
# look at the user stored at row index 1 of the user-item matrix (this is an
# index into the sparse matrix, not a raw userId)
user_id = 1

# find the item indices that the user interacted with; these are column
# indices of the sparse matrix, so each one is first mapped back to the actual
# itemId/movieId through index2item before looking up its Item
interacted_ids = user_item_train[user_id].nonzero()[1]
interacted_items = [item_mapping[index2item[index]] for index in interacted_ids]
interacted_items[:10]

In [None]:
# ask the model for the top-n items of this user; it returns the recommended
# item indices together with their corresponding scores
topn = 30
reco = bpr.recommend(user_id, user_item_train, N=topn)
reco[:10]

In [None]:
# map each recommended index to its Item (index -> itemId -> Item), the
# score returned by the model is not needed here
reco_items = [item_mapping[index2item[index]] for index, _ in reco]
reco_items[:10]

In [None]:
def compute_genre_distr(items):
    """
    Compute the genre distribution for a given list of Items.

    The genre ratios of all the items are summed per genre and divided by the
    number of items. Every value is rounded to three decimal places, adding
    more precision doesn't add much value and clutters the output.
    """
    totals = {}
    for entry in items:
        for genre, ratio in entry.genres.items():
            totals[genre] = totals.get(genre, 0.) + ratio

    n_items = len(items)
    return {genre: round(total / n_items, 3) for genre, total in totals.items()}

In [None]:
# genre distribution of the items the user interacted with; once it is
# computed we can check that the probability does in fact add up to 1
# (up to the rounding of each value) with
# np.array(list(interacted_distr.values())).sum()
interacted_distr = compute_genre_distr(interacted_items)
interacted_distr

In [None]:
# Best-case calibration: for each of the first `n_eval_users` users, split
# `n_slots` recommendation slots across genres as proportionally as possible
# to the user's historical genre distribution, and accumulate the KL
# divergence between that distribution and the resulting allocation in `summ`.
# NOTE(review): compute_kl_divergence is defined in a later cell; that cell has
# to be executed before this one, otherwise this cell raises a NameError.
n_eval_users = 2400
n_slots = 30
summ = 0
for eval_user_id in range(n_eval_users):
    # loop-local names are used on purpose so that the `user_id`,
    # `interacted_items` and `interacted_distr` inspected by the surrounding
    # cells are not overwritten by this aggregate computation
    eval_interacted_ids = user_item_train[eval_user_id].nonzero()[1]
    eval_items = [item_mapping[index2item[index]] for index in eval_interacted_ids]
    eval_distr = compute_genre_distr(eval_items)
    if not eval_distr:
        # no training interactions, there is nothing to calibrate against
        continue

    # highest-averages allocation with odd divisors (1, 3, 5, ...): in every
    # round the genre with the largest quotient share / (2 * slots + 1) wins
    slots = {genre: 0 for genre in eval_distr}
    quotients = dict(eval_distr)
    for _ in range(n_slots):
        winner = max(quotients, key=quotients.get)
        slots[winner] += 1
        # always derive the quotient from the genre's original share; dividing
        # the running quotient would compound the divisors (1/3, 1/15, ...)
        # and push the allocation towards uniform instead of proportional
        quotients[winner] = eval_distr[winner] / (2 * slots[winner] + 1)

    ideal_distr = {genre: count / n_slots for genre, count in slots.items()}
    summ += compute_kl_divergence(eval_distr, ideal_distr)

In [None]:
summ / 2400

In [None]:
# genre distribution of the recommended items
reco_distr = compute_genre_distr(reco_items)
reco_distr

In [None]:
# change the default figure size and font size used by the plots
plt.rcParams['figure.figsize'] = 10, 8
plt.rcParams['font.size'] = 12


def distr_comparison_plot(interacted_distr, reco_distr, width=0.3):
    """
    Draw a horizontal grouped bar chart that puts the genre distribution of
    the user's historical interactions next to the one of a recommendation
    list.

    Parameters
    ----------
    interacted_distr, reco_distr : dict[str, float]
        Genre to ratio mappings to compare.

    width : float, default 0.3
        Height of each bar, also the offset between the two bars of a genre.
    """
    # each dictionary turns into a one-column frame; after the join the two
    # value columns are addressed as '0_interacted' and '0'
    history = pd.DataFrame.from_dict(interacted_distr, orient='index')
    recommended = pd.DataFrame.from_dict(reco_distr, orient='index')
    merged = history.join(recommended, how='outer', lsuffix='_interacted')

    positions = np.arange(len(merged))
    bar_specs = (
        (positions, '0_interacted', 'interacted distr'),
        (positions + width, '0', 'reco distr'),
    )
    for ypos, column, label in bar_specs:
        plt.barh(ypos, merged[column], height=width, label=label)

    plt.yticks(positions, merged.index)
    plt.legend(bbox_to_anchor=(1, 0.5))
    plt.title('Genre Distribution between User Historical Interaction v.s. Recommendation')
    plt.ylabel('Genre')
    plt.show()
    
    
distr_comparison_plot(interacted_distr, reco_distr)

In [None]:
interacted_distr

In [None]:
def compute_kl_divergence(interacted_distr, reco_distr, alpha=0.02):
    """
    KL (p || q), the lower the better.

    Parameters
    ----------
    interacted_distr : dict[str, float]
        Target distribution p, genre to ratio.

    reco_distr : dict[str, float]
        Distribution q that is compared against p. Genres missing from it
        are treated as having a ratio of 0.

    alpha : float, default 0.02
        q is smoothed as (1 - alpha) * q + alpha * p before taking the log.
        alpha is not really a tuning parameter, it's just there to make the
        computation more numerically stable.

    Returns
    -------
    kl_div : float
        Sum over the genres of p of p * log2(p / smoothed q).
    """
    kl_div = 0.
    for genre, score in interacted_distr.items():
        # a genre without any mass in p contributes nothing (0 * log 0 := 0);
        # evaluating the formula literally would yield 0 * -inf = nan, which
        # then poisons every comparison made on the result
        if score <= 0:
            continue

        reco_score = (1 - alpha) * reco_distr.get(genre, 0.) + alpha * score
        if reco_score == 0:
            # with alpha inside [0, 1] this is only reachable for alpha == 0
            # and a genre that is absent from q; keep the result finite by
            # adding the genre's own mass instead of infinity
            kl_div += score
        else:
            kl_div += score * np.log2(score / reco_score)

    return kl_div


compute_kl_divergence(interacted_distr, reco_distr)

In [None]:
def generate_item_candidates(model, user_item, user_id, index2item, item_mapping,
                             filter_already_liked_items=True):
    """
    For a given user, generate the list of items that we can recommend, during this
    step, we will also attach the recommender's score to each item.

    Parameters
    ----------
    model
        Fitted model exposing ``user_factors`` and ``item_factors``.

    user_item : scipy.sparse.csr_matrix
        User-item matrix, its number of columns is the number of items.

    user_id : int
        Row index of the user in ``user_item``.

    index2item
        Lookup from item index (matrix column) to item id.

    item_mapping : dict
        Lookup from item id to Item.

    filter_already_liked_items : bool, default True
        Whether the items found in the user's row of ``user_item`` are
        left out of the candidates.

    Returns
    -------
    items : list
        Candidates in ascending item index order. Each one is a shallow copy
        of the entry in ``item_mapping`` carrying this user's score, so the
        shared entries of ``item_mapping`` are left untouched.
    """
    n_items = user_item.shape[1]

    # this is how implicit's matrix factorization generates
    # the scores for each item for a given user, modify this
    # part of the logic if we were to use a completely different
    # algorithm to generate the ranked items
    user_factor = model.user_factors[user_id]
    scores = model.item_factors.dot(user_factor)

    # a boolean mask keeps the candidates in a deterministic, ascending order
    is_candidate = np.ones(n_items, dtype=bool)
    if filter_already_liked_items:
        is_candidate[user_item[user_id].indices] = False

    items = []
    for item_index in np.flatnonzero(is_candidate):
        # copy before attaching the score: the Item objects in item_mapping
        # are shared, writing a user specific score on them would leak into
        # every other list that references the same objects
        item = copy.copy(item_mapping[index2item[item_index]])
        item.score = scores[item_index]
        items.append(item)

    return items

In [None]:
# score every candidate item for the current user_id
items = generate_item_candidates(bpr, user_item_train, user_id, index2item, item_mapping)
print('number of item candidates:', len(items))
items[:5]

In [None]:
def compute_utility(reco_items, interacted_distr, lmbda=0.5):
    """
    Objective function that scores a list of recommended items by trading
    off the summed item scores against how well the list is calibrated.

    Parameters
    ----------
    reco_items : list
        Items of the recommendation list, each one exposing ``score``
        and ``genres``.

    interacted_distr : dict[str, float]
        Genre distribution of the user's historical interactions.

    lmbda : float, 0.0 ~ 1.0, default 0.5
        Lambda term controls the score and calibration tradeoff,
        the higher the lambda the higher the resulting recommendation
        will be calibrated. Lambda is keyword in Python, so it's
        lmbda instead ^^

    Returns
    -------
    utility : float
        (1 - lmbda) * summed item score - lmbda * kl divergence.
    """
    # relevance part: add up the recommender score of every item in the list
    relevance = 0.0
    for candidate in reco_items:
        relevance += candidate.score

    # calibration part: divergence between the user's history and the list
    divergence = compute_kl_divergence(interacted_distr, compute_genre_distr(reco_items))

    # the score is the higher the better while the kl divergence is the
    # lower the better, hence the divergence enters with a negative sign
    return (1 - lmbda) * relevance - lmbda * divergence

In [None]:
def calib_recommend(items, interacted_distr, topn, lmbda=0.5):
    """
    Greedily build a calibrated recommendation list.

    Start with an empty recommendation list, loop over the topn cardinality,
    during each iteration update the list with the item that maximizes the
    utility function.

    Parameters
    ----------
    items : list
        Candidate items, each one carrying a ``score`` and ``genres``.

    interacted_distr : dict[str, float]
        Genre distribution of the user's historical interactions.

    topn : int
        Maximum number of items to recommend.

    lmbda : float, default 0.5
        Score versus calibration tradeoff forwarded to compute_utility.

    Returns
    -------
    calib_reco : list
        The selected items in the order they were picked. It holds fewer than
        topn items when the candidates run out, or when no remaining
        candidate yields a utility that compares greater than -inf
        (e.g. every utility is nan).
    """
    calib_reco = []
    for _ in range(topn):
        # reset the winner every round, otherwise a round without any eligible
        # candidate would append the previous round's winner a second time
        # (or fail on an unbound name during the very first round)
        best_item = None
        max_utility = -np.inf
        for item in items:
            if item in calib_reco:
                continue

            utility = compute_utility(calib_reco + [item], interacted_distr, lmbda)
            if utility > max_utility:
                max_utility = utility
                best_item = item

        if best_item is None:
            break

        calib_reco.append(best_item)

    return calib_reco

In [None]:
# time the greedy calibrated re-ranking; with lmbda=0.99 nearly all of the
# weight of the utility goes to the calibration (kl divergence) term
start = time.time()
calib_reco_items = calib_recommend(items, interacted_distr, topn, lmbda=0.99)
elapsed = time.time() - start
print('elapsed: ', elapsed)
calib_reco_items

In [None]:
# compare how far the original and the calibrated recommendation list are
# from the user's historical genre distribution, the lower the better
calib_reco_distr = compute_genre_distr(calib_reco_items)
calib_reco_kl_div = compute_kl_divergence(interacted_distr, calib_reco_distr)
reco_kl_div = compute_kl_divergence(interacted_distr, reco_distr)
print('\noriginal reco kl-divergence score:', reco_kl_div)
print('calibrated reco kl-divergence score:', calib_reco_kl_div)

distr_comparison_plot(interacted_distr, calib_reco_distr)

In [None]:
def precision(user_item, user_id, reco_items, index2item):
    """
    Share of a user's held-out items that shows up in the recommendation list.

    Parameters
    ----------
    user_item : scipy.sparse.csr_matrix
        User-item matrix holding the interactions to evaluate against.

    user_id : int
        Row index of the user in ``user_item``.

    reco_items : list
        Recommended items, each one exposing ``id``.

    index2item
        Lookup from item index (matrix column) to item id.

    Returns
    -------
    float
        Number of recommended ids found among the user's items, divided by
        min(number of recommended items, number of the user's items).

    Raises
    ------
    ZeroDivisionError
        When the user has no item in ``user_item`` or ``reco_items`` is empty.
    """
    # the column indices of a csr row live in indices[indptr[row]:indptr[row + 1]]
    row_start, row_end = user_item.indptr[user_id], user_item.indptr[user_id + 1]
    liked_ids = {index2item[col] for col in user_item.indices[row_start:row_end]}
    recommended_ids = {item.id for item in reco_items}

    hits = len(recommended_ids & liked_ids)
    return hits / min(len(reco_items), len(liked_ids))

In [None]:
# precision of both lists against the user's held-out test interactions
reco_precision = precision(user_item_test, user_id, reco_items, index2item)
calib_reco_precision = precision(user_item_test, user_id, calib_reco_items, index2item)
print('original reco precision score:', reco_precision)
print('calibrated reco precision score:', calib_reco_precision)

In [None]:
# same re-ranking with lmbda=0.5, i.e. the summed item score and the
# kl divergence receive the same weight in the utility
start = time.time()
calib_reco_items = calib_recommend(items, interacted_distr, topn, lmbda=0.5)
elapsed = time.time() - start
print('elapsed: ', elapsed)
calib_reco_items

In [None]:
# calibration and precision of the lmbda=0.5 list
calib_reco_distr = compute_genre_distr(calib_reco_items)
calib_reco_kl_div = compute_kl_divergence(interacted_distr, calib_reco_distr)
calib_reco_precision = precision(user_item_test, user_id, calib_reco_items, index2item)
print('calibrated reco kl-divergence score:', calib_reco_kl_div)
print('calibrated reco precision score:', calib_reco_precision)

In [None]:
# plot the user's history against the current calibrated list
calib_reco_distr = compute_genre_distr(calib_reco_items)
distr_comparison_plot(interacted_distr, calib_reco_distr)

In [None]:
reco_items

In [None]:
# Compare the original and the calibrated recommendations, averaged over the
# users at row index 1 to 10 of the user-item matrix.
topn = 30
lmbda = 0.9

# running sums over the users that could be evaluated, the averages are
# reported after the loop
prec_cal = 0
kl_cal = 0
kl_rec = 0
prec_rec = 0
n = 0
skipped_users = []

for user_id in tqdm(range(1, 11)):
    try:
        reco = bpr.recommend(user_id, user_item_train, N=topn)
        reco_items = [item_mapping[index2item[index]] for index, _ in reco]
        reco_distr = compute_genre_distr(reco_items)

        interacted_ids = user_item_train[user_id].nonzero()[1]
        interacted_items = [item_mapping[index2item[index]] for index in interacted_ids]
        interacted_distr = compute_genre_distr(interacted_items)

        items = generate_item_candidates(bpr, user_item_train, user_id, index2item, item_mapping)
        calib_reco_items = calib_recommend(items, interacted_distr, topn, lmbda)
        calib_reco_distr = compute_genre_distr(calib_reco_items)

        calib_reco_kl_div = compute_kl_divergence(interacted_distr, calib_reco_distr)
        calib_reco_precision = precision(user_item_test, user_id, calib_reco_items, index2item)

        reco_kl_div = compute_kl_divergence(interacted_distr, reco_distr)
        reco_precision = precision(user_item_test, user_id, reco_items, index2item)
    except Exception as err:
        # best-effort evaluation: a user that cannot be scored (e.g. precision
        # divides by zero for a user without any test interaction) is skipped,
        # but recorded so that the failure stays visible. Exception is caught
        # instead of using a bare except so that interrupting the kernel
        # still stops the loop.
        skipped_users.append((user_id, repr(err)))
        continue

    prec_rec += reco_precision
    prec_cal += calib_reco_precision
    kl_rec += reco_kl_div
    kl_cal += calib_reco_kl_div
    n += 1

if skipped_users:
    print('skipped users (user_id, error):', skipped_users)

if n == 0:
    # nothing to average, avoid dividing by zero
    print('no user could be evaluated')
else:
    print('calibrated reco kl-divergence score:', kl_cal/n)
    print('calibrated reco precision score:', prec_cal/n)
    print('original reco kl-divergence score:', kl_rec/n)
    print('original reco precision score:', prec_rec/n)