In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display, HTML
from collections import defaultdict
from sklearn.model_selection import KFold
import scipy.special as scisp

# Fix the dying kernel problem (only a problem in some installations - you can remove it, if it works without it)
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# Load data

In [2]:
# Read the MovieLens ratings and movie metadata, renaming the id columns to user_id / item_id
ratings_path = os.path.join("data", "movielens_small", "ratings.csv")
movies_path = os.path.join("data", "movielens_small", "movies.csv")

ml_ratings_df = pd.read_csv(ratings_path).rename(columns={'userId': 'user_id', 'movieId': 'item_id'})
ml_movies_df = pd.read_csv(movies_path).rename(columns={'movieId': 'item_id'})
ml_df = ml_ratings_df.merge(ml_movies_df, on='item_id')

display(ml_movies_df.head(10))

# Keep a seeded random sample of 100 movies so that the walkthrough below stays small
seed = 6789
rng = np.random.RandomState(seed=seed)
left_ids = rng.choice(ml_movies_df['item_id'], size=100, replace=False)

ml_ratings_df = ml_ratings_df[ml_ratings_df['item_id'].isin(left_ids)]
ml_movies_df = ml_movies_df[ml_movies_df['item_id'].isin(left_ids)]
ml_df = ml_df[ml_df['item_id'].isin(left_ids)]

print(f"Number of interactions left: {len(ml_ratings_df)}")

Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


Number of interactions left: 1170


# Inner workings of the Amazon recommender fit method

## Shift item ids and user ids so that they are consecutive

In [3]:
interactions_df = ml_ratings_df.copy()

# Original id -> consecutive index (in order of first appearance) and the inverse lookups
unique_item_ids = interactions_df['item_id'].unique()
item_id_mapping = {item_id: idx for idx, item_id in enumerate(unique_item_ids)}
item_id_reverse_mapping = dict(enumerate(unique_item_ids))
unique_user_ids = interactions_df['user_id'].unique()
user_id_mapping = {user_id: idx for idx, user_id in enumerate(unique_user_ids)}
user_id_reverse_mapping = dict(enumerate(unique_user_ids))

# Series.map does a direct per-value lookup. DataFrame.replace was avoided here because the
# mapping's keys (original ids) and values (0..n-1) overlap, and replace is slow on big dicts.
interactions_df['item_id'] = interactions_df['item_id'].map(item_id_mapping)
interactions_df['user_id'] = interactions_df['user_id'].map(user_id_mapping)

print("Item mapping")
print(item_id_mapping)
print()

print("Item reverse mapping")
print(item_id_reverse_mapping)
print()

print("User mapping")
print(user_id_mapping)
print()

print("User reverse mapping")
print(user_id_reverse_mapping)
print()

display(interactions_df.head(10))

Item mapping
{780: 0, 1500: 1, 3479: 2, 171: 3, 1914: 4, 4896: 5, 145: 6, 267: 7, 355: 8, 435: 9, 6502: 10, 73323: 11, 112421: 12, 1783: 13, 2806: 14, 3040: 15, 3551: 16, 2135: 17, 39715: 18, 41566: 19, 5673: 20, 7064: 21, 481: 22, 6537: 23, 44761: 24, 2690: 25, 228: 26, 4890: 27, 3614: 28, 3507: 29, 3628: 30, 5954: 31, 8605: 32, 3786: 33, 6755: 34, 3468: 35, 50601: 36, 3089: 37, 55444: 38, 118270: 39, 124404: 40, 3768: 41, 233: 42, 3687: 43, 171749: 44, 104218: 45, 182749: 46, 3342: 47, 65130: 48, 84952: 49, 152970: 50, 3067: 51, 4031: 52, 1107: 53, 47382: 54, 3801: 55, 5155: 56, 5612: 57, 5214: 58, 67295: 59, 3165: 60, 1752: 61, 31223: 62, 6713: 63, 66783: 64, 2043: 65, 2903: 66, 3313: 67, 4009: 68, 91842: 69, 2190: 70, 7282: 71, 4483: 72, 2275: 73, 3567: 74, 190207: 75, 4505: 76, 95147: 77, 4552: 78, 6033: 79, 2521: 80, 4397: 81, 151315: 82, 156706: 83, 151311: 84, 959: 85, 3714: 86, 4164: 87, 4796: 88, 31260: 89, 6927: 90, 126142: 91, 73804: 92, 26357: 93, 82684: 94, 6342: 95, 3279

Unnamed: 0,user_id,item_id,rating,timestamp
42,0,0,3.0,964984086
97,0,1,4.0,964980985
216,0,2,4.0,964981725
310,1,3,3.0,945078428
398,1,1,4.0,964622830
416,1,4,4.0,964622714
513,1,5,4.0,1007574532
616,2,6,4.0,845553966
629,2,3,3.0,845555402
677,2,7,3.0,845554376


## Get the number of items and users

In [4]:
# Ids were remapped to 0..n-1 above, so the largest id plus one is the number of distinct ids
n_items = interactions_df['item_id'].max() + 1
n_users = interactions_df['user_id'].max() + 1

print(f"n_items={n_items}\nn_users={n_users}")

n_items=100
n_users=378


## Get the maximal number of interactions

In [5]:
# Number of interaction rows per user, kept as a one-column frame named 'n_items'
n_user_interactions = (
    interactions_df
    .groupby("user_id")['item_id']
    .count()
    .to_frame(name='n_items')
)
max_interactions = n_user_interactions['n_items'].max()

print(f"max_interaction={max_interactions}")

max_interaction=31


## Calculate P_Y's

In [6]:
# P_Y = share of all interaction rows that involve item Y.
# The share is computed as a new float Series rather than written back into the integer count
# column with .loc, which is an incompatible-dtype assignment that recent pandas deprecates.
n_interactions = len(interactions_df)
item_share = interactions_df.groupby("item_id")['user_id'].count() / n_interactions
p_y = dict(zip(item_share.index.tolist(), item_share.tolist()))

print(p_y)

{0: 0.17264957264957265, 1: 0.05042735042735043, 2: 0.015384615384615385, 3: 0.005128205128205128, 4: 0.007692307692307693, 5: 0.09145299145299145, 6: 0.04358974358974359, 7: 0.01452991452991453, 8: 0.035897435897435895, 9: 0.05384615384615385, 10: 0.04957264957264957, 11: 0.004273504273504274, 12: 0.002564102564102564, 13: 0.004273504273504274, 14: 0.007692307692307693, 15: 0.007692307692307693, 16: 0.011111111111111112, 17: 0.009401709401709401, 18: 0.005982905982905983, 19: 0.05299145299145299, 20: 0.028205128205128206, 21: 0.005128205128205128, 22: 0.01623931623931624, 23: 0.038461538461538464, 24: 0.010256410256410256, 25: 0.008547008547008548, 26: 0.002564102564102564, 27: 0.026495726495726495, 28: 0.006837606837606838, 29: 0.01282051282051282, 30: 0.0017094017094017094, 31: 0.018803418803418803, 32: 0.0017094017094017094, 33: 0.003418803418803419, 34: 0.011965811965811967, 35: 0.015384615384615385, 36: 0.007692307692307693, 37: 0.013675213675213675, 38: 0.002564102564102564, 39:

## For every X calculate the E[Y $\cap$ X]

In [7]:
# e_xy[x][y] will hold the expected number of users who bought both X and Y.
# Every entry starts at a huge negative sentinel and is overwritten in the loop below.
e_xy = np.zeros(shape=(n_items, n_items))
e_xy[:][:] = -1e100
    
items = interactions_df['item_id'].unique()
    
# p_y_powers[y][k - 1] = P_Y ** k for k = 1..max_interactions (precomputed once per item)
p_y_powers = {}
for y in items:
    p_y_powers[y] = np.array([p_y[y]**k for k in range(1, max_interactions + 1)])
    
print("p_y_powers for the first item")
print(p_y_powers[0])

for x in items:
    # Get users who bought X
    c_x = interactions_df.loc[interactions_df['item_id'] == x]['user_id'].unique()

    # Get users who bought only X
    # (first: all users with any non-X interaction; then: X-buyers that are not among them)
    c_only_x = interactions_df.loc[interactions_df['item_id'] != x]['user_id'].unique()
    c_only_x = list(set(c_x.tolist()) - set(c_only_x.tolist()))

    # Calculate the number of non-X interactions for each user who bought X
    # (at this point the frame still covers every user with a non-X interaction;
    # it is restricted to X-buyers a few lines below)
    n_non_x_interactions = interactions_df.loc[interactions_df['item_id'] != x, ['user_id', 'item_id']]
    n_non_x_interactions = n_non_x_interactions.groupby("user_id").count()
    # Unnecessary, but added for readability
    n_non_x_interactions = n_non_x_interactions.rename(columns={'item_id': 'n_items'})

    # Include users with zero non-X interactions
    zero_non_x_interactions = pd.DataFrame([[0]]*len(c_only_x), columns=["n_items"], index=c_only_x)
    n_non_x_interactions = pd.concat([n_non_x_interactions, zero_non_x_interactions])

    # Keep only the rows of users who bought X
    c_non_x = n_non_x_interactions.index.unique()
    c_x_and_non_x = list(set.intersection(set(c_x.tolist()), set(c_non_x.tolist())))
    n_non_x_interactions = n_non_x_interactions.loc[c_x_and_non_x]

    # Calculate the expected numbers of Y products bought by clients who bought X
    # alpha_k[k - 1] = sum over X-buyers of (-1)^(k + 1) * binom(c, k), where c is the user's
    # number of non-X interactions; combined with P_Y^k below this is the binomial expansion
    # of sum over X-buyers of 1 - (1 - P_Y)^c.
    alpha_k = np.array([np.sum([(-1)**(k + 1) * scisp.binom(abs_c, k)
                                for abs_c in n_non_x_interactions["n_items"]])
                        for k in range(1, max_interactions + 1)])
    
    # Debug print for the first item only
    if x == 0:
        print("alpha_k")
        print(alpha_k)
        print()

    for y in items:  # Optimize to use only those Y's which have at least one client who bought both X and Y
        if y != x:
            e_xy[x][y] = np.sum(alpha_k * p_y_powers[y])
        else:
            # The diagonal uses n_users * P_X instead of the series above
            e_xy[x][y] = n_users * p_y[x]

print("E[Y|X]")
print(np.around(e_xy[:10, :10], 3))

p_y_powers for the first item
[1.72649573e-01 2.98078749e-02 5.14631687e-03 8.88509408e-04
 1.53400770e-04 2.64845773e-05 4.57255096e-06 7.89448968e-07
 1.36298027e-07 2.35317961e-08 4.06275454e-09 7.01432836e-10
 1.21102079e-10 2.09082222e-11 3.60979564e-12 6.23229674e-13
 1.07600337e-13 1.85771522e-14 3.20733738e-15 5.53745429e-16
 9.56039116e-17 1.65059745e-17 2.84974944e-18 4.92008023e-19
 8.49449749e-20 1.46657136e-20 2.53202919e-21 4.37153757e-22
 7.54744094e-23 1.30306245e-23 2.24973176e-24]
alpha_k
[ 6.29000000e+02 -2.78500000e+03  1.40760000e+04 -6.93680000e+04
  3.01779000e+05 -1.11984600e+06  3.53047000e+06 -9.50680900e+06
  2.20226720e+07 -4.41781690e+07  7.71648950e+07 -1.17857585e+08
  1.57903510e+08 -1.85975447e+08  1.92784504e+08 -1.75939393e+08
  1.41266349e+08 -9.96249800e+07  6.15359500e+07 -3.31533720e+07
  1.54912130e+07 -6.22966600e+06  2.13408000e+06 -6.14250000e+05
  1.45782000e+05 -2.77830000e+04  4.08800000e+03 -4.36000000e+02
  3.00000000e+01 -1.00000000e+00 

## Get the user-item interaction matrix

In [8]:
# Binary user-item interaction matrix: r[u, i] = 1 if user u has an interaction with item i.
# A single vectorised index assignment replaces the row-by-row iterrows loop and its int()
# casts; duplicate (user, item) pairs simply set the same cell to 1 again.
r = np.zeros(shape=(n_users, n_items))
r[interactions_df['user_id'].to_numpy(dtype=int), interactions_df['item_id'].to_numpy(dtype=int)] = 1

print(r[:10, :10])

[[1. 1. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 1. 1. 1. 0. 0. 0. 0.]
 [1. 0. 0. 1. 0. 0. 1. 1. 1. 1.]
 [1. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 1. 1. 0. 0. 1.]
 [0. 1. 1. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]]


## Calculate the number of users who bought both X and Y

In [9]:
# Simple and slow method (commented out)

# n_xy = np.zeros(shape=(n_items, n_items))

# for x in items:
#     for y in items:
#         users_x = set(interactions_df.loc[interactions_df['item_id'] == x]['user_id'].tolist())
#         users_y = set(interactions_df.loc[interactions_df['item_id'] == y]['user_id'].tolist())
#         users_x_and_y = users_x & users_y
#         n_xy[x][y] = len(users_x_and_y)

# Optimized method (can be further optimized by using sparse matrices):
# entry (x, y) of r^T r sums r[u, x] * r[u, y] over users, i.e. counts users having both items

n_xy = r.T @ r

print(n_xy[:10, :10])

[[202.  34.  15.   3.   3.  66.  36.  10.  25.  34.]
 [ 34.  59.   6.   2.   5.  24.  12.   4.   8.  12.]
 [ 15.   6.  18.   1.   2.   7.   3.   4.   6.   5.]
 [  3.   2.   1.   6.   1.   1.   1.   1.   2.   2.]
 [  3.   5.   2.   1.   9.   3.   2.   1.   1.   0.]
 [ 66.  24.   7.   1.   3. 107.  20.   5.  16.  18.]
 [ 36.  12.   3.   1.   2.  20.  51.   8.  16.  17.]
 [ 10.   4.   4.   1.   1.   5.   8.  17.   8.  10.]
 [ 25.   8.   6.   2.   1.  16.  16.   8.  42.  23.]
 [ 34.  12.   5.   2.   0.  18.  17.  10.  23.  63.]]


## Calculate the scores

In [10]:
# Score = (observed - expected) / sqrt(expected); entries whose expectation is 0 stay at 0
has_expectation = e_xy != 0
scores = np.zeros_like(n_xy)
np.divide(n_xy - e_xy, np.sqrt(e_xy), out=scores, where=has_expectation)

print(np.around(scores[:10, :10], 3))

[[16.926  1.552  1.971 -0.087 -0.777  3.789  2.689  0.48   1.235  1.235]
 [ 1.071  9.148  0.827  0.408  1.863  1.15   0.376 -0.033 -0.38  -0.218]
 [ 1.497  0.411  5.053  0.341  0.932 -0.142 -0.737  1.555  1.023 -0.134]
 [ 0.451  1.23   1.349  2.917  2.259 -0.361  0.284  1.417  1.724  1.141]
 [-0.717  1.61   1.002  1.048  3.573 -0.244 -0.164  0.051 -0.687 -1.604]
 [ 2.601  0.765 -0.103 -0.97  -0.399 12.319  0.412 -0.724  0.125 -0.782]
 [ 2.127  0.237 -0.522 -0.359 -0.077  0.658  8.505  2.121  2.561  1.518]
 [ 0.3   -0.061  1.952  0.585  0.192 -0.484  2.235  4.91   2.697  2.728]
 [ 0.724 -0.582  1.265  0.641 -0.644  0.27   2.439  2.479  7.718  3.946]
 [ 1.793  0.544  0.756  0.679 -1.358  0.413  2.627  3.596  5.52   9.453]]


## Final comparison

In [11]:
# Show the top-left 10x10 corner of each matrix: expected counts, observed counts, scores
for title, matrix in [
    ("E[Y|X]", np.around(e_xy[:10, :10], 3)),
    ("N(X, Y)", n_xy[:10, :10]),
    ("Scores", np.around(scores[:10, :10], 3)),
]:
    print(title)
    print(matrix)
    print()

E[Y|X]
[[65.262 26.076  9.065  3.154  4.68  41.571 23.082  8.592 19.542 27.522]
 [28.303 19.062  4.288  1.5    2.223 18.99  10.768  4.066  9.15  12.778]
 [10.216  5.074  5.815  0.712  1.046  7.386  4.577  1.872  3.964  5.308]
 [ 2.315  0.859  0.283  1.938  0.144  1.433  0.754  0.267  0.631  0.911]
 [ 4.526  2.47   0.999  0.366  2.908  3.453  2.245  0.951  1.962  2.574]
 [47.984 20.534  7.279  2.549  3.776 34.569 18.241  6.902 15.507 21.636]
 [25.303 11.206  4.05   1.429  2.112 17.265 16.477  3.843  8.524 11.789]
 [ 9.094  4.124  1.561  0.561  0.826  6.205  3.701  5.492  3.186  4.326]
 [21.633  9.823  3.601  1.276  1.884 14.955  8.776  3.417 13.569 10.322]
 [25.03  10.257  3.571  1.243  1.844 16.332  9.082  3.385  7.691 20.354]]

N(X, Y)
[[202.  34.  15.   3.   3.  66.  36.  10.  25.  34.]
 [ 34.  59.   6.   2.   5.  24.  12.   4.   8.  12.]
 [ 15.   6.  18.   1.   2.   7.   3.   4.   6.   5.]
 [  3.   2.   1.   6.   1.   1.   1.   1.   2.   2.]
 [  3.   5.   2.   1.   9.   3.   2.   1.

# Inner workings of the Amazon recommender recommend method

In [12]:
user_id = 1
should_recommend_already_bought = False
n_recommendations = 10

mapped_user_id = user_id_mapping[user_id]

# Sum the score rows of every item the user interacted with into one score per candidate item
bought_mask = interactions_df['user_id'] == mapped_user_id
x_list = interactions_df.loc[bought_mask, 'item_id'].tolist()
final_scores = scores[x_list].sum(axis=0)

# Choose n recommendations based on highest scores
if not should_recommend_already_bought:
    # Push already bought items to the very end of the ranking
    final_scores[x_list] = -1e100

chosen_ids = np.argsort(-final_scores)[:n_recommendations]

for item_id in chosen_ids:
    title = ml_movies_df.loc[ml_movies_df['item_id'] == item_id_reverse_mapping[item_id], 'title'].iloc[0]
    print(f"Recommendation: {user_id_reverse_mapping[mapped_user_id]}, {title}, {final_scores[item_id]}")

Recommendation: 1, Brick (2005), 6.122652596595853
Recommendation: 1, Oh, God! (1977), 5.908857666844879
Recommendation: 1, Bubba Ho-tep (2002), 5.830666625469312
Recommendation: 1, Meatballs (1979), 5.56930833865894
Recommendation: 1, Millennium Actress (Sennen joyû) (2001), 5.502504256363742
Recommendation: 1, Honeymoon in Vegas (1992), 5.387478215471393
Recommendation: 1, Six-String Samurai (1998), 5.225652131462832
Recommendation: 1, Grass Is Greener, The (1960), 5.144470412494206
Recommendation: 1, Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001), 4.796473011676857
Recommendation: 1, Clara's Heart (1988), 4.608515964550741


# Amazon recommender

In [13]:
from recommenders.recommender import Recommender

class AmazonRecommender(Recommender):
    """
    Basic item-to-item collaborative filtering algorithm used in Amazon.com as described in:
    - Linden G., Smith B., York Y., Amazon.com Recommendations. Item-to-Item Collaborative Filtering,
        IEEE Internet Computing, 2003,
    - Smith B., Linden G., Two Decades of Recommender Systems at Amazon.com, IEEE Internet Computing, 2017.

    After fit() the instance holds:
    - item/user id mappings between original ids and consecutive indices (and their inverses),
    - e_xy: expected number of users who bought both X and Y,
    - n_xy: observed number of users who bought both X and Y,
    - scores: (n_xy - e_xy) / sqrt(e_xy),
    - most_popular_items: mapped item ids sorted by decreasing number of interactions.
    """

    def __init__(self):
        super().__init__()
        self.recommender_df = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
        self.interactions_df = None
        self.item_id_mapping = None
        self.user_id_mapping = None
        self.item_id_reverse_mapping = None
        self.user_id_reverse_mapping = None
        self.e_xy = None
        self.n_xy = None
        self.scores = None
        self.most_popular_items = None
        self.should_recommend_already_bought = False

    def initialize(self, **params):
        """
        Sets optional parameters of the recommender.

        :param params: Keyword parameters; only 'should_recommend_already_bought' is read.
        """
        if 'should_recommend_already_bought' in params:
            self.should_recommend_already_bought = params['should_recommend_already_bought']

    def fit(self, interactions_df, users_df, items_df):
        """
        Training of the recommender.

        :param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
            defined by user_id, item_id and features of the interaction.
        :param pd.DataFrame users_df: DataFrame with users and their features defined by
            user_id and the user feature columns. Not used by this recommender.
        :param pd.DataFrame items_df: DataFrame with items and their features defined
            by item_id and the item feature columns. Not used by this recommender.
        """

        # Shift item ids and user ids so that they are consecutive

        unique_item_ids = interactions_df['item_id'].unique()
        self.item_id_mapping = dict(zip(unique_item_ids, list(range(len(unique_item_ids)))))
        self.item_id_reverse_mapping = dict(zip(list(range(len(unique_item_ids))), unique_item_ids))
        unique_user_ids = interactions_df['user_id'].unique()
        self.user_id_mapping = dict(zip(unique_user_ids, list(range(len(unique_user_ids)))))
        self.user_id_reverse_mapping = dict(zip(list(range(len(unique_user_ids))), unique_user_ids))

        # Series.map performs a direct lookup per value; DataFrame.replace is avoided because
        # the mapping keys (original ids) and values (0..n-1) overlap and replace is slow.
        interactions_df = interactions_df.copy()
        interactions_df['item_id'] = interactions_df['item_id'].map(self.item_id_mapping)
        interactions_df['user_id'] = interactions_df['user_id'].map(self.user_id_mapping)
        self.interactions_df = interactions_df

        # Get the number of items and users (mapped ids are 0..n-1, so this equals max id + 1)

        n_items = len(unique_item_ids)
        n_users = len(unique_user_ids)

        # Get maximal number of interactions of a single user

        max_interactions = interactions_df.groupby("user_id")['item_id'].count().max()

        # Calculate P_Y's (share of all interactions that involve item Y).
        # Computed as a new float Series instead of writing floats into the integer count column.

        n_interactions = len(interactions_df)
        item_share = interactions_df.groupby("item_id")['user_id'].count() / n_interactions
        p_y = dict(zip(item_share.index.tolist(), item_share.tolist()))

        # Get the series of all items

        items = interactions_df['item_id'].unique()

        # For every X calculate the E[Y|X]

        # Sentinel value; every entry is overwritten in the loop below
        e_xy = np.full((n_items, n_items), -1e100)

        # p_y_powers[y][k - 1] = P_Y ** k
        exponents = range(1, max_interactions + 1)
        p_y_powers = {}
        for y in items:
            p_y_powers[y] = np.array([p_y[y]**k for k in exponents])

        for x in items:
            is_x = interactions_df['item_id'] == x

            # Get users who bought X
            c_x = interactions_df.loc[is_x, 'user_id'].unique()

            # Number of non-X interactions for each user who bought X
            # (0 for users whose only interactions are with X)
            non_x_counts = interactions_df.loc[~is_x].groupby("user_id")['item_id'].count()
            n_non_x_interactions = non_x_counts.reindex(c_x, fill_value=0).to_numpy()

            # Calculate the expected numbers of Y products bought by clients who bought X:
            # alpha_k[k - 1] = sum over X-buyers of (-1)^(k + 1) * binom(c, k), where c is the
            # user's number of non-X interactions.
            alpha_k = np.array([(-1)**(k + 1) * scisp.binom(n_non_x_interactions, k).sum()
                                for k in exponents])

            for y in items:  # Optimize to use only those Y's which have at least one client who bought both X and Y
                if y != x:
                    e_xy[x][y] = np.sum(alpha_k * p_y_powers[y])
                else:
                    # The diagonal uses n_users * P_X instead of the series
                    e_xy[x][y] = n_users * p_y[x]

        self.e_xy = e_xy

        # Get the user-item interaction matrix: r[u, i] = 1 if user u interacted with item i.
        # Vectorised assignment instead of a row-by-row iterrows loop.

        r = np.zeros(shape=(n_users, n_items))
        r[interactions_df['user_id'].to_numpy(dtype=int), interactions_df['item_id'].to_numpy(dtype=int)] = 1

        # Get the number of users who bought both X and Y
        # (can be further optimized by using sparse matrices)

        n_xy = np.matmul(r.T, r)

        self.n_xy = n_xy

        # Calculate the scores; entries with a zero expectation are left at 0

        self.scores = np.divide(n_xy - e_xy, np.sqrt(e_xy), out=np.zeros_like(n_xy), where=e_xy != 0)

        # Find the most popular items for the cold start problem

        offers_count = interactions_df.loc[:, ['item_id', 'user_id']].groupby(by='item_id').count()
        offers_count = offers_count.sort_values('user_id', ascending=False)
        self.most_popular_items = offers_count.index

    def recommend(self, users_df, items_df, n_recommendations=1):
        """
        Serving of recommendations. Returns top n_recommendations for each user in users_df.

        Users seen in fit() are scored by summing the score rows of the items they interacted with.
        Users not seen in fit() receive the most popular items with a constant score of 1.0.

        :param pd.DataFrame users_df: DataFrame with users and their features for which
            recommendations should be generated.
        :param pd.DataFrame items_df: DataFrame with items and their features. It is not used to
            restrict the candidates: all items seen in fit() are ranked.
        :param int n_recommendations: Number of recommendations to be returned for each user.
        :return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations
            for each user.
        :rtype: pd.DataFrame
        """

        result_columns = ['user_id', 'item_id', 'score']

        # Per-user frames are collected and concatenated once at the end instead of growing
        # self.recommender_df inside the loop (quadratic, and it started from an empty frame).
        per_user_frames = []

        for user_id in users_df['user_id']:
            if user_id in self.user_id_mapping:
                mapped_user_id = self.user_id_mapping[user_id]

                x_list = self.interactions_df.loc[
                    self.interactions_df['user_id'] == mapped_user_id, 'item_id'].tolist()
                final_scores = np.sum(self.scores[x_list], axis=0)

                # Choose n recommendations based on highest scores.
                # NOTE: already bought items are only pushed to the end of the ranking; if
                # n_recommendations exceeds the number of remaining items they are still returned
                # with a score of -1e100.
                if not self.should_recommend_already_bought:
                    final_scores[x_list] = -1e100

                chosen_ids = np.argsort(-final_scores)[:n_recommendations]

                recommendations = [
                    {
                        'user_id': self.user_id_reverse_mapping[mapped_user_id],
                        'item_id': self.item_id_reverse_mapping[item_id],
                        'score': final_scores[item_id]
                    }
                    for item_id in chosen_ids
                ]
            else:  # For new users recommend most popular items
                # Slicing (instead of indexing position by position) avoids an IndexError when
                # n_recommendations is larger than the number of known items.
                recommendations = [
                    {
                        'user_id': user_id,
                        'item_id': self.item_id_reverse_mapping[item_id],
                        'score': 1.0
                    }
                    for item_id in self.most_popular_items[:n_recommendations]
                ]

            if recommendations:
                per_user_frames.append(pd.DataFrame(recommendations, columns=result_columns))

        if per_user_frames:
            # Like the original per-user concatenation, each user's rows keep their own 0..k-1 index
            self.recommender_df = pd.concat(per_user_frames)
        else:
            self.recommender_df = pd.DataFrame(columns=result_columns)

        return self.recommender_df

In [14]:
# Quick test of the recommender: fit on the filtered ratings and ask for 10 items for 3 users

amazon_recommender = AmazonRecommender()
amazon_recommender.fit(ml_ratings_df, None, ml_movies_df)

test_users_df = pd.DataFrame({'user_id': [1, 4, 6]})
recommendations = amazon_recommender.recommend(test_users_df, ml_movies_df, 10)

# Attach titles and genres to the recommended item ids
recommendations = recommendations.merge(ml_movies_df, on='item_id', how='left')
print("Recommendations")
display(HTML(recommendations.to_html()))

Recommendations


Unnamed: 0,user_id,item_id,score,title,genres
0,1,44761,6.122653,Brick (2005),Crime|Drama|Film-Noir|Mystery
1,1,5214,5.908858,"Oh, God! (1977)",Comedy|Fantasy
2,1,6755,5.830667,Bubba Ho-tep (2002),Comedy|Horror
3,1,3040,5.569308,Meatballs (1979),Comedy
4,1,6713,5.502504,Millennium Actress (Sennen joyû) (2001),Animation|Drama|Romance
5,1,3614,5.387478,Honeymoon in Vegas (1992),Comedy|Romance
6,1,2275,5.225652,Six-String Samurai (1998),Action|Adventure|Sci-Fi
7,1,4796,5.14447,"Grass Is Greener, The (1960)",Comedy|Romance
8,1,4896,4.796473,Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001),Adventure|Children|Fantasy
9,1,3714,4.608516,Clara's Heart (1988),Drama


# Training-test split evaluation

In [15]:
from evaluation_and_testing.testing import evaluate_train_test_split_implicit

amazon_recommender = AmazonRecommender()

# Only the user_id and item_id columns of the ratings are passed to the evaluator
amazon_tts_metrics = evaluate_train_test_split_implicit(
    amazon_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df)

amazon_tts_results = pd.DataFrame(
    [['AmazonRecommender', *amazon_tts_metrics]],
    columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(amazon_tts_results.to_html()))

Unnamed: 0,Recommender,HR@1,HR@3,HR@5,HR@10,NDCG@1,NDCG@3,NDCG@5,NDCG@10
0,AmazonRecommender,0.181818,0.311688,0.402597,0.551948,0.181818,0.257806,0.294682,0.34147


In [16]:
from recommenders.tfidf_recommender import TFIDFRecommender

tfidf_recommender = TFIDFRecommender()

# Same train-test split evaluation as for the Amazon recommender, for comparison
tfidf_tts_metrics = evaluate_train_test_split_implicit(
    tfidf_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df)

tfidf_tts_results = pd.DataFrame(
    [['TFIDFRecommender', *tfidf_tts_metrics]],
    columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(tfidf_tts_results.to_html()))

Unnamed: 0,Recommender,HR@1,HR@3,HR@5,HR@10,NDCG@1,NDCG@3,NDCG@5,NDCG@10
0,TFIDFRecommender,0.025974,0.090909,0.136364,0.318182,0.025974,0.064393,0.083685,0.140799


In [17]:
# Stack both train-test split result rows into one table with a fresh 0..n-1 index
tts_results = pd.concat([amazon_tts_results, tfidf_tts_results], ignore_index=True)
display(HTML(tts_results.to_html()))

Unnamed: 0,Recommender,HR@1,HR@3,HR@5,HR@10,NDCG@1,NDCG@3,NDCG@5,NDCG@10
0,AmazonRecommender,0.181818,0.311688,0.402597,0.551948,0.181818,0.257806,0.294682,0.34147
1,TFIDFRecommender,0.025974,0.090909,0.136364,0.318182,0.025974,0.064393,0.083685,0.140799


# Leave-one-out evaluation

In [18]:
from evaluation_and_testing.testing import evaluate_leave_one_out_implicit

amazon_recommender = AmazonRecommender()

# Leave-one-out evaluation, called with max_evals=300 and a fixed seed
amazon_loo_metrics = evaluate_leave_one_out_implicit(
    amazon_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789)

amazon_loo_results = pd.DataFrame(
    [['AmazonRecommender', *amazon_loo_metrics]],
    columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(amazon_loo_results.to_html()))

Unnamed: 0,Recommender,HR@1,HR@3,HR@5,HR@10,NDCG@1,NDCG@3,NDCG@5,NDCG@10
0,AmazonRecommender,0.166667,0.256667,0.32,0.426667,0.166667,0.219086,0.245486,0.279978


In [19]:
tfidf_recommender = TFIDFRecommender()

# Same leave-one-out settings as for the Amazon recommender, for comparison
tfidf_loo_metrics = evaluate_leave_one_out_implicit(
    tfidf_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789)

tfidf_loo_results = pd.DataFrame(
    [['TFIDFRecommender', *tfidf_loo_metrics]],
    columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(tfidf_loo_results.to_html()))

Unnamed: 0,Recommender,HR@1,HR@3,HR@5,HR@10,NDCG@1,NDCG@3,NDCG@5,NDCG@10
0,TFIDFRecommender,0.006667,0.053333,0.123333,0.233333,0.006667,0.033491,0.062178,0.096151


In [20]:
# Stack both leave-one-out result rows into one table with a fresh 0..n-1 index
loo_results = pd.concat([amazon_loo_results, tfidf_loo_results], ignore_index=True)
display(HTML(loo_results.to_html()))

Unnamed: 0,Recommender,HR@1,HR@3,HR@5,HR@10,NDCG@1,NDCG@3,NDCG@5,NDCG@10
0,AmazonRecommender,0.166667,0.256667,0.32,0.426667,0.166667,0.219086,0.245486,0.279978
1,TFIDFRecommender,0.006667,0.053333,0.123333,0.233333,0.006667,0.033491,0.062178,0.096151
