In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display, HTML
from collections import defaultdict
from sklearn.model_selection import KFold
import scipy.special as scisp

# Fix the dying kernel problem (only a problem in some installations - you can remove it, if it works without it)
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# Load data

In [None]:
# Read the MovieLens ratings and movie metadata and rename the id columns to the
# user_id / item_id convention used in the rest of the notebook
data_dir = os.path.join("data", "movielens_small")

ml_ratings_df = pd.read_csv(os.path.join(data_dir, "ratings.csv"))
ml_ratings_df = ml_ratings_df.rename(columns={'userId': 'user_id', 'movieId': 'item_id'})

ml_movies_df = pd.read_csv(os.path.join(data_dir, "movies.csv"))
ml_movies_df = ml_movies_df.rename(columns={'movieId': 'item_id'})

# Ratings joined with the movie metadata
ml_df = ml_ratings_df.merge(ml_movies_df, on='item_id')

display(ml_movies_df.head(10))

# Keep a seeded random sample of 100 movies to reduce the size of the dataset
seed = 6789
rng = np.random.RandomState(seed=seed)
left_ids = rng.choice(ml_movies_df['item_id'], size=100, replace=False)

ml_ratings_df = ml_ratings_df[ml_ratings_df['item_id'].isin(left_ids)]
ml_movies_df = ml_movies_df[ml_movies_df['item_id'].isin(left_ids)]
ml_df = ml_df[ml_df['item_id'].isin(left_ids)]

print("Number of interactions left: {}".format(len(ml_ratings_df)))

# Inner workings of the Amazon recommender fit method

## Shift item ids and user ids so that they are consecutive

**Task 1.** Create a mapping from item ids in ml_ratings_df DataFrame into consecutive natural numbers starting from 0. Example:

    {780: 0, 1500: 1, 3479: 2, 171: 3, 1914: 4, 4896: 5, ...}

Name this mapping item_id_mapping. Also create the reverse mapping (from the consecutive numbers back to the original item ids) and name it item_id_reverse_mapping.

Create analogous mappings for user ids. Name them user_id_mapping and user_id_reverse_mapping, respectively.

Copy ml_ratings_df into interactions_df and apply the mappings to the user_id and item_id columns.

In [None]:
# Work on a copy so that ml_ratings_df keeps the original ids and this cell can be re-run safely
interactions_df = ml_ratings_df.copy()

# Enumerate the ids in order of first appearance; .tolist() converts them to plain
# Python ints so the printed dictionaries look like {780: 0, 1500: 1, ...}
unique_item_ids = interactions_df['item_id'].unique().tolist()
item_id_mapping = dict(zip(unique_item_ids, range(len(unique_item_ids))))
item_id_reverse_mapping = dict(zip(range(len(unique_item_ids)), unique_item_ids))

unique_user_ids = interactions_df['user_id'].unique().tolist()
user_id_mapping = dict(zip(unique_user_ids, range(len(unique_user_ids))))
user_id_reverse_mapping = dict(zip(range(len(unique_user_ids)), unique_user_ids))

# Every id in the column is a key of its mapping, so map() cannot produce missing values here
interactions_df['item_id'] = interactions_df['item_id'].map(item_id_mapping)
interactions_df['user_id'] = interactions_df['user_id'].map(user_id_mapping)

print("Item mapping")
print(item_id_mapping)
print()

print("Item reverse mapping")
print(item_id_reverse_mapping)
print()

print("User mapping")
print(user_id_mapping)
print()

print("User reverse mapping")
print(user_id_reverse_mapping)
print()

display(interactions_df.head(10))

## Get the number of items and users

In [None]:
# Ids are consecutive and start at 0, so the largest id plus one is the number of distinct ids
n_items = interactions_df['item_id'].max() + 1
n_users = interactions_df['user_id'].max() + 1

print("n_items={}\nn_users={}".format(n_items, n_users))

## Get the maximal number of interactions

**Task 2.** Calculate the maximal number of interactions for a single user and set the max_interactions variable to this value.

In [None]:
# Number of interactions recorded for every user
n_user_interactions = interactions_df.groupby('user_id')['item_id'].count()

# The largest of those counts bounds the exponent k needed for the powers of P(Y)
max_interactions = int(n_user_interactions.max())

print("max_interactions={}".format(max_interactions))

## Calculate P_Y's

**Task 3.** For every movie calculate the prior probability of interaction (the number of users who rated a given movie divided by the number of all users) and put those probabilities in a dictionary as follows:

    {0: 0.17264957264957265, 1: 0.05042735042735043, 2: 0.015384615384615385, ...}
    
Set the result to p_y and print it.

In [None]:
# For every item count the distinct users who rated it (nunique ignores repeated ratings
# of the same item by the same user) and divide by the number of all users
n_users_per_item = interactions_df.groupby('item_id')['user_id'].nunique()
p_y = (n_users_per_item / n_users).to_dict()

print(p_y)

## For every X calculate the E[Y $\cap$ X]

**Task 4.** Calculate $E_{XY}$ for all pairs of items as described in the Amazon paper (see the lecture 5 OneNote notes). To do that first calculate powers of $P(Y)$ up to $k$ equal to $max\_interactions$. Then calculate $\alpha_k$ for every item and every $k$ between 1 and $max\_interactions$. Finally, calculate $E_{XY}$ from those values.

Print the submatrix of $E_{XY}$ of the first ten rows and ten columns.

Note that for large datasets $E_{XY}$ are never calculated all at once, but only the needed ones are calculated on the fly from the powers of $P(Y)$ and $\alpha_k$'s. But for smaller datasets it is more efficient to calculate all $E_{XY}$ once, sacrificing memory for a speed up in processing time.

In [None]:
# Start from a sentinel value so that any entry left unfilled would be easy to spot
e_xy = np.zeros(shape=(n_items, n_items))
e_xy[:, :] = -1e100

# Powers of P(Y): p_y_powers[y, k - 1] = P(Y)^k for k = 1, ..., max_interactions
ks = np.arange(1, max_interactions + 1)
p_y_vector = np.array([p_y[y] for y in range(n_items)])
p_y_powers = p_y_vector[:, np.newaxis] ** ks[np.newaxis, :]

# Number of interactions of every user with every item and per-user totals
interaction_counts = np.zeros(shape=(n_users, n_items))
np.add.at(
    interaction_counts,
    (interactions_df['user_id'].to_numpy(dtype=int), interactions_df['item_id'].to_numpy(dtype=int)),
    1)
n_user_interactions = interaction_counts.sum(axis=1)

# alpha[x, k - 1] = sum over users c who bought X of (-1)^(k + 1) * binom(|c|, k),
# where |c| is the number of non-X interactions of user c
signs = (-1.0) ** (ks + 1)
alpha = np.zeros(shape=(n_items, max_interactions))
for x in range(n_items):
    bought_x = interaction_counts[:, x] > 0
    n_non_x = n_user_interactions[bought_x] - interaction_counts[bought_x, x]
    # binom(n, k) is 0 for k > n, so users with few interactions add nothing to high k
    alpha[x] = signs * scisp.binom(n_non_x[:, np.newaxis], ks[np.newaxis, :]).sum(axis=0)

# E_XY = sum_k alpha_k(X) * P(Y)^k for all pairs at once
e_xy[:, :] = np.matmul(alpha, p_y_powers.T)

# On the diagonal use the expected number of users who bought X itself
np.fill_diagonal(e_xy, n_users * p_y_vector)

print("E[Y|X]")
print(np.around(e_xy[:10, :10], 3))

## Get the user-item interaction matrix

In [None]:
# Binary user-item interaction matrix: r[u, i] = 1 if user u interacted with item i.
# The id columns are used as integer positions directly, which avoids the slow row-by-row
# iterrows loop (and the float upcasting of ids that iterrows causes).
user_positions = interactions_df['user_id'].to_numpy(dtype=int)
item_positions = interactions_df['item_id'].to_numpy(dtype=int)

r = np.zeros(shape=(n_users, n_items))
r[user_positions, item_positions] = 1

print(r[:10, :10])

## Calculate the number of users who bought both X and Y

**Task 5.** Calculate the number of users who bought both X and Y ($N_{XY}$). Use the interaction matrix and matrix multiplication to achieve that. Print the submatrix of $N_{XY}$ of the first ten rows and ten columns.

In [None]:
# (r^T r)[x, y] = sum over users of r[u, x] * r[u, y], i.e. the number of users who
# interacted with both X and Y (the diagonal holds the number of users per item)
n_xy = np.matmul(r.T, r)

print(n_xy[:10, :10])

## Calculate the scores

**Task 6.** Calculate the "Chi-squared" scores for all pairs of items as described in the Amazon paper (see lecture 5 one note). Print the submatrix of the first ten rows and ten columns.

In [None]:
# score_XY = (N_XY - E_XY) / sqrt(E_XY); pairs with a non-positive expectation get score 0
# so that no division by zero or square root of a negative number takes place
has_expectation = e_xy > 0
scores = np.zeros(shape=e_xy.shape)
scores[has_expectation] = (n_xy[has_expectation] - e_xy[has_expectation]) / np.sqrt(e_xy[has_expectation])

print(np.around(scores[:10, :10], 3))

## Final comparison

In [None]:
# Show the top-left 10x10 corner of every matrix under its own heading
comparison = (
    ("E[Y|X]", np.around(e_xy[:10, :10], 3)),
    ("N(X, Y)", n_xy[:10, :10]),
    ("Scores", np.around(scores[:10, :10], 3)),
)

for heading, submatrix in comparison:
    print(heading)
    print(submatrix)
    print()

# Inner workings of the Amazon recommender recommend method

**Task 7.** Using the scores for all pairs of items generate recommendations for the user with original user_id=1. To do that you have to take all movies this user has already rated ($X$) and for every movie in the dataset ($Y$) sum up all the scores $score_{XY}$. Then you have to return items $Y$ with the highest score. Do not recommend movies already rated.

Print the first ten recommendations in the following form:

Recommendation: 1, Honeymoon in Vegas (1992), 5.387478215471393

using code in the form:

    print("Recommendation: {}, {}, {}".format(user_id, movie_title, score))

In [None]:
user_id = 1
should_recommend_already_bought = False
n_recommendations = 10

# Items (in consecutive ids) already rated by the user
mapped_user_id = user_id_mapping[user_id]
x_list = interactions_df.loc[interactions_df['user_id'] == mapped_user_id, 'item_id'].tolist()

# For every candidate Y sum score_XY over all X rated by the user
final_scores = np.sum(scores[x_list], axis=0)

# Push already rated items to the very end of the ranking
if not should_recommend_already_bought:
    final_scores[x_list] = -1e100

chosen_ids = np.argsort(-final_scores)[:n_recommendations]

# Title lookup by original item id (assumes movies.csv provides a 'title' column - TODO confirm)
titles = ml_movies_df.set_index('item_id')['title']

for item_id in chosen_ids:
    movie_title = titles.loc[item_id_reverse_mapping[item_id]]
    print("Recommendation: {}, {}, {}".format(user_id, movie_title, final_scores[item_id]))

# Amazon recommender

In [None]:
from recommenders.recommender import Recommender

class AmazonRecommender(Recommender):
    """
    Basic item-to-item collaborative filtering algorithm used in Amazon.com as described in:
    - Linden G., Smith B., York Y., Amazon.com Recommendations. Item-to-Item Collaborative Filtering,
        IEEE Internet Computing, 2003,
    - Smith B., Linden G., Two Decades of Recommender Systems at Amazon.com, IEEE Internet Computing, 2017.

    For every ordered pair of items (X, Y) the recommender compares N_XY, the observed number of users
    who interacted with both items, with E_XY, the number expected if users who interacted with X picked
    their remaining items at random, and scores the pair with (N_XY - E_XY) / sqrt(E_XY).
    """

    def __init__(self):
        super().__init__()
        self.recommender_df = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
        self.interactions_df = None
        self.item_id_mapping = None
        self.user_id_mapping = None
        self.item_id_reverse_mapping = None
        self.user_id_reverse_mapping = None
        self.e_xy = None
        self.n_xy = None
        self.scores = None
        self.most_popular_items = None
        self.should_recommend_already_bought = False

    def initialize(self, **params):
        """
        Sets the parameters of the recommender.

        :param params: Keyword parameters. Only should_recommend_already_bought is read; when present
            it decides whether items the user already interacted with may be recommended.
        """
        if 'should_recommend_already_bought' in params:
            self.should_recommend_already_bought = params['should_recommend_already_bought']

    def fit(self, interactions_df, users_df, items_df):
        """
        Training of the recommender.

        :param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
            defined by user_id, item_id and features of the interaction.
        :param pd.DataFrame users_df: DataFrame with users and their features defined by
            user_id and the user feature columns. Not used by this recommender.
        :param pd.DataFrame items_df: DataFrame with items and their features defined
            by item_id and the item feature columns. Not used by this recommender.
        :raises ValueError: If interactions_df contains no interactions.
        """

        if len(interactions_df) == 0:
            raise ValueError("interactions_df must contain at least one interaction")

        # Shift item ids and user ids so that they are consecutive

        unique_item_ids = interactions_df['item_id'].unique()
        n_items = len(unique_item_ids)
        self.item_id_mapping = dict(zip(unique_item_ids, range(n_items)))
        self.item_id_reverse_mapping = dict(zip(range(n_items), unique_item_ids))
        unique_user_ids = interactions_df['user_id'].unique()
        n_users = len(unique_user_ids)
        self.user_id_mapping = dict(zip(unique_user_ids, range(n_users)))
        self.user_id_reverse_mapping = dict(zip(range(n_users), unique_user_ids))

        # map() is a plain per-value dictionary lookup; every id is a key of its mapping
        interactions_df = interactions_df.copy()
        interactions_df['item_id'] = interactions_df['item_id'].map(self.item_id_mapping)
        interactions_df['user_id'] = interactions_df['user_id'].map(self.user_id_mapping)

        self.interactions_df = interactions_df

        # Count the interactions of every user with every item (repeated interactions are counted
        # separately) and derive the binary user-item interaction matrix from the counts

        user_positions = interactions_df['user_id'].to_numpy()
        item_positions = interactions_df['item_id'].to_numpy()

        interaction_counts = np.zeros(shape=(n_users, n_items))
        # np.add.at accumulates repeated (user, item) positions instead of overwriting them
        np.add.at(interaction_counts, (user_positions, item_positions), 1)
        r = (interaction_counts > 0).astype(float)

        n_user_interactions = interaction_counts.sum(axis=1)

        # Calculate P_Y's (the share of all interactions which involve Y)

        n_interactions = len(interactions_df)
        p_y = interaction_counts.sum(axis=0) / n_interactions
        one_minus_p_y = 1.0 - p_y

        # For every X calculate the E[Y|X]
        #
        # A user c with |c| non-X interactions is expected to have interacted with Y with probability
        # 1 - (1 - P_Y)^|c|. This is the closed form of the alternating binomial series
        # sum_k (-1)^(k + 1) * binom(|c|, k) * P_Y^k; it is evaluated directly because the series
        # overflows to inf * 0 = NaN for users with many interactions.

        e_xy = np.zeros(shape=(n_items, n_items))

        for x in range(n_items):
            bought_x = r[:, x] > 0

            # Number of non-X interactions of every user who bought X (0 for users who bought only X)
            n_non_x = n_user_interactions[bought_x] - interaction_counts[bought_x, x]

            # Rows are users who bought X, columns are candidate items Y
            p_bought_y = 1.0 - np.power(one_minus_p_y[np.newaxis, :], n_non_x[:, np.newaxis])
            e_xy[x] = p_bought_y.sum(axis=0)

            e_xy[x, x] = n_users * p_y[x]

        self.e_xy = e_xy

        # Calculate the number of users who bought both X and Y

        n_xy = np.matmul(r.T, r)

        self.n_xy = n_xy

        # Calculate the scores (pairs without a positive expectation get score 0; the square root
        # is evaluated only where it is defined)

        has_expectation = e_xy > 0
        scores = np.zeros_like(n_xy)
        scores[has_expectation] = \
            (n_xy[has_expectation] - e_xy[has_expectation]) / np.sqrt(e_xy[has_expectation])
        self.scores = scores

        # Find the most popular items for the cold start problem

        offers_count = interactions_df.loc[:, ['item_id', 'user_id']].groupby(by='item_id').count()
        offers_count = offers_count.sort_values('user_id', ascending=False)
        self.most_popular_items = offers_count.index

    def recommend(self, users_df, items_df, n_recommendations=1):
        """
        Serving of recommendations. Scores all items seen during training for each user in users_df
        and returns top n_recommendations for each user. Users not seen during training get the most
        popular items with a constant score of 1.0.

        :param pd.DataFrame users_df: DataFrame with users and their features for which
            recommendations should be generated.
        :param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
            Currently not used - the candidates are always all items seen during training.
        :param int n_recommendations: Number of recommendations to be returned for each user.
        :return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations
            for each user.
        :rtype: pd.DataFrame
        """

        columns = ['user_id', 'item_id', 'score']

        # Generate recommendations

        # One frame per user, concatenated once at the end instead of growing a frame inside the loop
        user_frames = []

        for user_id in users_df['user_id']:
            recommendations = []

            if user_id in self.user_id_mapping:
                mapped_user_id = self.user_id_mapping[user_id]

                bought_mask = self.interactions_df['user_id'] == mapped_user_id
                x_list = self.interactions_df.loc[bought_mask, 'item_id'].tolist()
                final_scores = np.sum(self.scores[x_list], axis=0)

                # Choose n recommendations based on highest scores
                if not self.should_recommend_already_bought:
                    final_scores[x_list] = -1e100

                chosen_ids = np.argsort(-final_scores)[:n_recommendations]

                for item_id in chosen_ids:
                    recommendations.append(
                        {
                            'user_id': self.user_id_reverse_mapping[mapped_user_id],
                            'item_id': self.item_id_reverse_mapping[item_id],
                            'score': final_scores[item_id]
                        }
                    )
            else:  # For new users recommend most popular items
                # Slicing returns at most as many items as exist, so a large n_recommendations is safe
                for item_id in self.most_popular_items[:n_recommendations]:
                    recommendations.append(
                        {
                            'user_id': user_id,
                            'item_id': self.item_id_reverse_mapping[item_id],
                            'score': 1.0
                        }
                    )

            user_frames.append(pd.DataFrame(recommendations, columns=columns))

        # Replace the previous recommendations with the new ones
        if user_frames:
            self.recommender_df = pd.concat(user_frames)
        else:
            self.recommender_df = pd.DataFrame(columns=columns)

        return self.recommender_df

In [None]:
# Quick test of the recommender: fit on the filtered ratings and show the top 10
# recommendations for three users together with the movie metadata

test_users_df = pd.DataFrame({'user_id': [1, 4, 6]})

amazon_recommender = AmazonRecommender()
amazon_recommender.fit(ml_ratings_df, None, ml_movies_df)
recommendations = amazon_recommender.recommend(test_users_df, ml_movies_df, 10)

recommendations = recommendations.merge(ml_movies_df, on='item_id', how='left')
print("Recommendations")
display(HTML(recommendations.to_html()))

# Training-test split evaluation

In [None]:
from evaluation_and_testing.testing import evaluate_train_test_split_implicit

amazon_recommender = AmazonRecommender()

# Train-test split evaluation on the implicit (user_id, item_id) interactions only
amazon_tts_metrics = evaluate_train_test_split_implicit(
    amazon_recommender, ml_ratings_df[['user_id', 'item_id']], ml_movies_df)

# One result row: the recommender name followed by its metrics
amazon_tts_results = pd.DataFrame(
    [['AmazonRecommender', *amazon_tts_metrics]],
    columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(amazon_tts_results.to_html()))

In [None]:
from recommenders.tfidf_recommender import TFIDFRecommender

tfidf_recommender = TFIDFRecommender()

# Train-test split evaluation on the implicit (user_id, item_id) interactions only
tfidf_tts_metrics = evaluate_train_test_split_implicit(
    tfidf_recommender, ml_ratings_df[['user_id', 'item_id']], ml_movies_df)

# One result row: the recommender name followed by its metrics
tfidf_tts_results = pd.DataFrame(
    [['TFIDFRecommender', *tfidf_tts_metrics]],
    columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(tfidf_tts_results.to_html()))

In [None]:
# Stack the train-test split results of both recommenders into one table with a fresh 0..n-1 index
tts_results = pd.concat([amazon_tts_results, tfidf_tts_results], ignore_index=True)
display(HTML(tts_results.to_html()))

# Leave-one-out evaluation

In [None]:
from evaluation_and_testing.testing import evaluate_leave_one_out_implicit

amazon_recommender = AmazonRecommender()

# Leave-one-out evaluation on the implicit (user_id, item_id) interactions only
amazon_loo_metrics = evaluate_leave_one_out_implicit(
    amazon_recommender, ml_ratings_df[['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789)

# One result row: the recommender name followed by its metrics
amazon_loo_results = pd.DataFrame(
    [['AmazonRecommender', *amazon_loo_metrics]],
    columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(amazon_loo_results.to_html()))

In [None]:
tfidf_recommender = TFIDFRecommender()

# Leave-one-out evaluation on the implicit (user_id, item_id) interactions only
tfidf_loo_metrics = evaluate_leave_one_out_implicit(
    tfidf_recommender, ml_ratings_df[['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789)

# One result row: the recommender name followed by its metrics
tfidf_loo_results = pd.DataFrame(
    [['TFIDFRecommender', *tfidf_loo_metrics]],
    columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(tfidf_loo_results.to_html()))

In [None]:
# Stack the leave-one-out results of both recommenders into one table with a fresh 0..n-1 index
loo_results = pd.concat([amazon_loo_results, tfidf_loo_results], ignore_index=True)
display(HTML(loo_results.to_html()))