# Collaborative filtering - neighborhood methods

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display, HTML
from collections import defaultdict

np.set_printoptions(edgeitems=10, linewidth=500)

# Fix the dying kernel problem (only a problem in some installations - you can remove it, if it works without it)
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# Load data

In [2]:
ml_ratings_df = pd.read_csv(os.path.join("data", "movielens_small", "ratings.csv")).rename(columns={'userId': 'user_id', 'movieId': 'item_id'})
ml_movies_df = pd.read_csv(os.path.join("data", "movielens_small", "movies.csv")).rename(columns={'movieId': 'item_id'})
ml_df = pd.merge(ml_ratings_df, ml_movies_df, on='item_id')

# Filter the data to reduce the number of movies
left_ids = [1, 318, 1193, 1208, 1214, 1721, 2959, 3578, 4306, 109487]

ml_ratings_df = ml_ratings_df.loc[ml_ratings_df['item_id'].isin(left_ids)]
ml_movies_df = ml_movies_df.loc[ml_movies_df['item_id'].isin(left_ids)]
ml_df = ml_df.loc[ml_df['item_id'].isin(left_ids)]

display(ml_movies_df.head(10))

print("Number of interactions left: {}".format(len(ml_ratings_df)))

Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
277,318,"Shawshank Redemption, The (1994)",Crime|Drama
896,1193,One Flew Over the Cuckoo's Nest (1975),Drama
909,1208,Apocalypse Now (1979),Action|Drama|War
915,1214,Alien (1979),Horror|Sci-Fi
1291,1721,Titanic (1997),Drama|Romance
2226,2959,Fight Club (1999),Action|Crime|Drama|Thriller
2674,3578,Gladiator (2000),Action|Adventure|Drama
3194,4306,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Ro...
8376,109487,Interstellar (2014),Sci-Fi|IMAX


Number of interactions left: 1689


# Shift item ids and user ids so that they are consecutive

In [3]:
interactions_df = ml_ratings_df.copy()

unique_item_ids = interactions_df['item_id'].unique()
item_id_mapping = dict(zip(unique_item_ids, list(range(len(unique_item_ids)))))
item_id_reverse_mapping = dict(zip(list(range(len(unique_item_ids))), unique_item_ids))
unique_user_ids = interactions_df['user_id'].unique()
user_id_mapping = dict(zip(unique_user_ids, list(range(len(unique_user_ids)))))
user_id_reverse_mapping = dict(zip(list(range(len(unique_user_ids))), unique_user_ids))

interactions_df.replace({'item_id': item_id_mapping, 'user_id': user_id_mapping}, inplace=True)

display(interactions_df.head(10))

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,0,4.0,964982703
72,0,1,4.0,964983250
75,0,2,4.0,964981855
192,0,3,5.0,964983282
219,0,4,5.0,964980668
232,1,5,3.0,1445714835
235,1,4,4.0,1445714885
255,1,6,3.0,1445715145
458,2,3,2.0,945078528
516,3,0,4.0,847434962


# Get the number of items and users

In [4]:
n_items = np.max(interactions_df['item_id']) + 1
n_users = np.max(interactions_df['user_id']) + 1

print("n_items={}\nn_users={}".format(n_items, n_users))

n_items=10
n_users=521


# Get the user-item interaction matrix

In [5]:
# mapping to int is necessary because of how iterrows works
r = np.zeros(shape=(n_users, n_items))
for idx, interaction in interactions_df.iterrows():
    r[int(interaction['user_id'])][int(interaction['item_id'])] = 1
    
print(r)

[[1. 1. 1. 1. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 1. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [1. 1. 0. 0. 1. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 0. 1. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 ...
 [1. 0. 0. 1. 1. 1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [1. 1. 1. 1. 1. 0. 0. 0. 1. 1.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [1. 1. 1. 1. 1. 1. 0. 1. 1. 1.]
 [1. 0. 1. 0. 0. 1. 0. 0. 0. 0.]
 [1. 1. 1. 1. 1. 1. 0. 1. 1. 1.]
 [1. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 0.]]


# Calculate cosine similarities of users

<center>
$$
    \text{Sim}(\vec{u}, \vec{v}) = \text{Cos}(\vec{u}, \vec{v}) = \frac{\vec{u} \cdot \vec{v}}{\lVert u \rVert \lVert v \rVert} = \frac{\sum_{i = 1}^n u_i v_i}{\sqrt{\sum_{i = 1}^n u_i^2} \sqrt{\sum_{i = 1}^n v_i^2}}
$$
</center>

For interaction vectors cosine similarity changes from 0 to 1. 1 means that both vectors are identical. 0 means that they have no 1's in common.

**Task 1.** Code the cosine method calculating the cosine similarity with above formula for two vectors (numpy arrays) $u$ and $v$.

In [6]:
def cosine(u, v):
    # Write your code here
    sumaUV = np.sum(u*v)
    sumaU2 = np.sum(u*u)
    sumaV2 = np.sum(v*v)
    sqrt_sumaU2 = np.sqrt(sumaU2)
    sqrt_sumaV2 = np.sqrt(sumaV2)
    mianownik = sqrt_sumaU2 * sqrt_sumaV2
    return sumaUV/mianownik

print(cosine(np.array([1, 0, 1, 0]), np.array([1, 0, 0, 0])))
print(cosine(np.array([1, 0, 1, 0]), np.array([1, 0, 1, 0])))
print(cosine(np.array([1, 0, 1, 0]), np.array([0, 1, 0, 1])))

0.7071067811865475
0.9999999999999998
0.0


In [7]:
print("Cosine similarity between user 0 and 1")
user_id_1 = 0
user_id_2 = 1
print(r[user_id_1])
print(r[user_id_2])
print(cosine(r[user_id_1], r[user_id_2]))

Cosine similarity between user 0 and 1
[1. 1. 1. 1. 1. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 1. 1. 1. 0. 0. 0.]
0.2581988897471611


In [8]:
print("Cosine similarity between user 0 and 5")
user_id_1 = 0
user_id_2 = 5
print(r[user_id_1])
print(r[user_id_2])
print(cosine(r[user_id_1], r[user_id_2]))

Cosine similarity between user 0 and 5
[1. 1. 1. 1. 1. 0. 0. 0. 0. 0.]
[1. 1. 0. 0. 1. 0. 0. 1. 0. 0.]
0.6708203932499369


# Calculate Pearson similarities of users

<center>
$$
    \text{Sim}(\vec{u}, \vec{v}) = \text{Pearson}(\vec{u}, \vec{v}) = \frac{\sum_{i = 1}^n (u_i - \bar{u}) (v_i - \bar{v})}{\sqrt{\sum_{i = 1}^n (u_i - \bar{u})^2} \sqrt{\sum_{i = 1}^n (v_i - \bar{v})^2}}
$$
</center>

Correlation changes from -1 to 1. Correlation of 1 means that vectors are identical, -1 means they are opposites.

**Task 2.** Code the pearson method calculating the Peason similarity with the above formula for two vectors (numpy arrays) $u$ and $v$.

In [9]:
def pearson(u, v):
    # Write your code here
    meanU = np.mean(u)
    meanV = np.mean(v)
    sumaUV = np.sum((u - meanU)*(v - meanV))
    sumaU2 = np.sum((u - meanU)*(u - meanU))
    sumaV2 = np.sum((v - meanV)*(v - meanV))
    sqrt_sumaU2 = np.sqrt(sumaU2)
    sqrt_sumaV2 = np.sqrt(sumaV2)
    mianownik = sqrt_sumaU2 * sqrt_sumaV2
    return sumaUV/mianownik

print(pearson(np.array([1, 0, 1, 0]), np.array([1, 0, 0, 0])))
print(pearson(np.array([1, 0, 1, 0]), np.array([1, 0, 1, 0])))
print(pearson(np.array([1, 0, 1, 0]), np.array([0, 1, 0, 1])))

0.5773502691896258
1.0
-1.0


In [10]:
print("Pearson similarity between user 0 and 1")
user_id_1 = 0
user_id_2 = 1
print(r[user_id_1])
print(r[user_id_2])
print(pearson(r[user_id_1], r[user_id_2]))

Pearson similarity between user 0 and 1
[1. 1. 1. 1. 1. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 1. 1. 1. 0. 0. 0.]
-0.21821789023599233


In [11]:
print("Pearson similarity between user 0 and 5")
user_id_1 = 0
user_id_2 = 5
print(r[user_id_1])
print(r[user_id_2])
print(pearson(r[user_id_1], r[user_id_2]))

Pearson similarity between user 0 and 5
[1. 1. 1. 1. 1. 0. 0. 0. 0. 0.]
[1. 1. 0. 0. 1. 0. 0. 1. 0. 0.]
0.40824829046386296


# All cosine similarities

**Task 3.** Calculate the entire matrix of cosine similarities between all users and print the first 15 rows and columns. Call the resulting matrix cos_sim.

In [22]:
# Write your code here
n_uv = np.matmul(r, r.T)

print("Scalar products")
print(n_uv[:15, :15])
print()

norms = np.sqrt(np.diag(n_uv))

print("Norms")
print(np.around(norms[:15], 3))
print()

cos_sim = n_uv / norms.reshape(-1, 1) / norms.reshape(1, -1)

print("Cosine similarities")
print(np.around(cos_sim[:15, :15], 3))

Scalar products
[[5. 1. 1. 1. 0. 3. 0. 2. 0. 0. 1. 0. 4. 3. 2.]
 [1. 3. 0. 1. 1. 1. 1. 2. 1. 0. 1. 1. 3. 1. 1.]
 [1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 1.]
 [1. 1. 0. 2. 1. 1. 1. 0. 1. 0. 0. 1. 2. 1. 2.]
 [0. 1. 0. 1. 1. 0. 1. 0. 1. 0. 0. 1. 1. 1. 1.]
 [3. 1. 0. 1. 0. 4. 0. 2. 0. 0. 1. 0. 3. 1. 1.]
 [0. 1. 0. 1. 1. 0. 1. 0. 1. 0. 0. 1. 1. 1. 1.]
 [2. 2. 1. 0. 0. 2. 0. 4. 0. 0. 1. 0. 4. 1. 1.]
 [0. 1. 0. 1. 1. 0. 1. 0. 2. 1. 1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0.]
 [1. 1. 0. 0. 0. 1. 0. 1. 1. 1. 2. 0. 1. 0. 0.]
 [0. 1. 0. 1. 1. 0. 1. 0. 1. 0. 0. 1. 1. 1. 1.]
 [4. 3. 1. 2. 1. 3. 1. 4. 1. 0. 1. 1. 7. 3. 3.]
 [3. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 3. 5. 3.]
 [2. 1. 1. 2. 1. 1. 1. 1. 1. 0. 0. 1. 3. 3. 4.]]

Norms
[2.236 1.732 1.    1.414 1.    2.    1.    2.    1.414 1.    1.414 1.    2.646 2.236 2.   ]

Cosine similarities
[[1.    0.258 0.447 0.316 0.    0.671 0.    0.447 0.    0.    0.316 0.    0.676 0.6   0.447]
 [0.258 1.    0.    0.408 0.577 0.289 0.577 0.577 

# All Pearson similarities

**Task 4.** Calculate the entire matrix of Pearson similarities between all users and print the first 15 rows and columns. Call the resulting matrix pearson_sim.

In [24]:
# Write your code here

r_shifted = r - np.mean(r, axis=1).reshape(-1, 1)

n_uv = np.matmul(r_shifted, r_shifted.T)

norms = np.sqrt(np.diag(n_uv))

norms[norms == 0] = 0.000001

person_sim = n_uv / norms.reshape(1, -1) / norms.reshape(-1, 1)

print("Scalar products")
print(np.around(n_uv[:15, :15], 3))
print()

print("Norms")
print(np.around(norms[:15], 3))
print()

print("Pearson similarities")
print(np.around(person_sim[:15, :15], 3))

Scalar products
[[ 2.5 -0.5  0.5 -0.  -0.5  1.  -0.5 -0.  -1.  -0.5  0.  -0.5  0.5  0.5  0. ]
 [-0.5  2.1 -0.3  0.4  0.7 -0.2  0.7  0.8  0.4 -0.3  0.4  0.7  0.9 -0.5 -0.2]
 [ 0.5 -0.3  0.9 -0.2 -0.1 -0.4 -0.1  0.6 -0.2 -0.1 -0.2 -0.1  0.3  0.5  0.6]
 [-0.   0.4 -0.2  1.6  0.8  0.2  0.8 -0.8  0.6 -0.2 -0.4  0.8  0.6  0.   1.2]
 [-0.5  0.7 -0.1  0.8  0.9 -0.4  0.9 -0.4  0.8 -0.1 -0.2  0.9  0.3  0.5  0.6]
 [ 1.  -0.2 -0.4  0.2 -0.4  2.4 -0.4  0.4 -0.8 -0.4  0.2 -0.4  0.2 -1.  -0.6]
 [-0.5  0.7 -0.1  0.8  0.9 -0.4  0.9 -0.4  0.8 -0.1 -0.2  0.9  0.3  0.5  0.6]
 [-0.   0.8  0.6 -0.8 -0.4  0.4 -0.4  2.4 -0.8 -0.4  0.2 -0.4  1.2 -1.  -0.6]
 [-1.   0.4 -0.2  0.6  0.8 -0.8  0.8 -0.8  1.6  0.8  0.6  0.8 -0.4 -0.   0.2]
 [-0.5 -0.3 -0.1 -0.2 -0.1 -0.4 -0.1 -0.4  0.8  0.9  0.8 -0.1 -0.7 -0.5 -0.4]
 [ 0.   0.4 -0.2 -0.4 -0.2  0.2 -0.2  0.2  0.6  0.8  1.6 -0.2 -0.4 -1.  -0.8]
 [-0.5  0.7 -0.1  0.8  0.9 -0.4  0.9 -0.4  0.8 -0.1 -0.2  0.9  0.3  0.5  0.6]
 [ 0.5  0.9  0.3  0.6  0.3  0.2  0.3  1.2 -0.4 -

# Calculate scores of all items for user 0

## Find n closest neighbors

**Task 5.** Generate a numpy array with n_neighbors ids of the closest neighbors of user 0 sorted decreasingly by the cosine similarity.

In [35]:
np.fill_diagonal(cos_sim, -1)

user_id = 0
n_neighbors = 10

# Write your code here
neighbours0 = np.argsort(-cos_sim[user_id])
#print(neighbours0)
neighbor_ids = neighbours0[:n_neighbors]

print("Nearest neighbors")
print(neighbor_ids)
print()

print("User {}".format(user_id))
print(r[user_id])
print()
for i in range(3):
    print("User {}".format(neighbor_ids[i]))
    print(r[neighbor_ids[i]])
    print()

Nearest neighbors
[138 387 240 399 513 285 473 172  24 270]

User 0
[1. 1. 1. 1. 1. 0. 0. 0. 0. 0.]

User 138
[1. 1. 1. 1. 1. 0. 0. 1. 0. 0.]

User 387
[1. 1. 1. 1. 1. 0. 1. 0. 0. 1.]

User 240
[1. 1. 1. 1. 1. 1. 1. 0. 0. 0.]



## Score all items

<center>
$$
    \text{score(i)} = \frac{\sum_{v \in N(u)} \text{Sim}(u, v) \cdot v(i)}{\sum_{v \in N(u)} |\text{Sim}(u, v)|}
$$
</center>

**Task 6.** Code the score method to calculate the score of item $i$ for user $u$ based on a vector (numpy array) of similarities of this user to other users and an interaction vector (numpy array) of item $i$ with user's $u$ neighbors. Use the above formula. Print the score for user_id=0 and item_id=7.

In [44]:
def score(similarities, v_i):
    # Write your code here
    print()

item_id = 7

print("Interactions for nearest neighbors")
print(r[neighbor_ids])
print()

user_id = 7

# Get similarities for the chosen user and his neighbors
similarities = cos_sim[user_id][:neighbor_ids]

# Write your code here

print("similarities")
print(similarities)
print()

# Get the interaction vector of the chosen item and the active user's neighbors


# Write your code here

print("v_i")
print(v_i)
print()

print("score for user_id={} and item_id={}".format(user_id, item_id))
print(score(similarities, v_i))


# [1. 1. 1. 1. 1. 0. 0. 0. 0. 0.]

Interactions for nearest neighbors
[[1. 1. 1. 1. 1. 0. 0. 1. 0. 0.]
 [1. 1. 1. 1. 1. 0. 1. 0. 0. 1.]
 [1. 1. 1. 1. 1. 1. 1. 0. 0. 0.]
 [1. 1. 1. 1. 1. 1. 0. 0. 1. 0.]
 [1. 1. 1. 1. 1. 0. 0. 0. 1. 1.]
 [1. 1. 1. 1. 1. 1. 0. 1. 0. 0.]
 [1. 1. 1. 1. 0. 1. 0. 0. 0. 0.]
 [1. 1. 1. 1. 0. 1. 0. 0. 0. 0.]
 [0. 1. 1. 1. 1. 1. 0. 0. 0. 0.]
 [0. 1. 1. 1. 1. 1. 0. 0. 0. 0.]]



TypeError: only integer scalar arrays can be converted to a scalar index

### Calculate and print scores for the first 10 items

**Task 7.** Calculate and print scores for user_id=0 for the first 10 items.

In [None]:
print(r[user_id])

for i in range(10):
    # Write your code here
    print("score for user_id={} and item_id={}".format(user_id, i))
    print(round(score(similarities, v_i), 2))

### The same scoring with a single operation

**Task 8.** Calculate scores for user_id=0 for the first 10 items in one operation using matrix multiplication. Print the resulting vector of scores.

In [None]:
item_ids = list(range(10))
print("Neighbor ids")
print(neighbor_ids)
print()
print("Item ids")
print(item_ids)
print()
print("similarities")
print(similarities)
print()

# Get the interaction matrix of the chosen items and the active user's neighbors

# Write your code here

print("v_i")
print(v_i)
print()

# Calculate scores

# Write your code here

print("scores")
print(np.around(scores, 2))

# Load a bigger dataset

In [None]:
ml_ratings_df = pd.read_csv(os.path.join("data", "movielens_small", "ratings.csv")).rename(columns={'userId': 'user_id', 'movieId': 'item_id'})
ml_movies_df = pd.read_csv(os.path.join("data", "movielens_small", "movies.csv")).rename(columns={'movieId': 'item_id'})
ml_df = pd.merge(ml_ratings_df, ml_movies_df, on='item_id')

# Filter the data to reduce the number of movies
seed = 6789
rng = np.random.RandomState(seed=seed)
left_ids = rng.choice(ml_movies_df['item_id'], size=100, replace=False)

ml_ratings_df = ml_ratings_df.loc[ml_ratings_df['item_id'].isin(left_ids)]
ml_movies_df = ml_movies_df.loc[ml_movies_df['item_id'].isin(left_ids)]
ml_df = ml_df.loc[ml_df['item_id'].isin(left_ids)]

display(ml_movies_df.head(10))

print("Number of interactions left: {}".format(len(ml_ratings_df)))

**Task 9.** Fill in the code in the item-based version of the recommend method. Generate a numpy array of scores and a numpy array of chosen item ids (chosen_ids) with recommended item ids sorted by decreasing score.

In [None]:
from recommenders.recommender import Recommender

class NearestNeighborsRecommender(Recommender):
    """
    Nearest neighbors recommender allowing to do user-based or item-based collaborative filtering.

    Possible similarity measures:
        - 'cosine',
        - 'pearson'.
    """

    def __init__(self):
        super().__init__()
        self.recommender_df = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
        self.interactions_df = None
        self.item_id_mapping = None
        self.user_id_mapping = None
        self.item_id_reverse_mapping = None
        self.user_id_reverse_mapping = None
        self.r = None
        self.similarities = None
        self.most_popular_items = None

        self.collaboration_type = 'user'
        self.similarity_measure = 'cosine'
        self.n_neighbors = 10
        self.should_recommend_already_bought = False

    def initialize(self, **params):
        if 'n_neighbors' in params:
            self.n_neighbors = params['n_neighbors']
        if 'should_recommend_already_bought' in params:
            self.should_recommend_already_bought = params['should_recommend_already_bought']

    def fit(self, interactions_df, users_df, items_df):
        """
        Training of the recommender.

        :param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
            defined by user_id, item_id and features of the interaction.
        :param pd.DataFrame users_df: DataFrame with users and their features defined by
            user_id and the user feature columns.
        :param pd.DataFrame items_df: DataFrame with items and their features defined
            by item_id and the item feature columns.
        """

        del users_df, items_df

        # Shift item ids and user ids so that they are consecutive

        unique_item_ids = interactions_df['item_id'].unique()
        self.item_id_mapping = dict(zip(unique_item_ids, list(range(len(unique_item_ids)))))
        self.item_id_reverse_mapping = dict(zip(list(range(len(unique_item_ids))), unique_item_ids))
        unique_user_ids = interactions_df['user_id'].unique()
        self.user_id_mapping = dict(zip(unique_user_ids, list(range(len(unique_user_ids)))))
        self.user_id_reverse_mapping = dict(zip(list(range(len(unique_user_ids))), unique_user_ids))

        interactions_df = interactions_df.copy()
        interactions_df.replace({'item_id': self.item_id_mapping, 'user_id': self.user_id_mapping}, inplace=True)

        # Get the number of items and users

        self.interactions_df = interactions_df
        n_items = np.max(interactions_df['item_id']) + 1
        n_users = np.max(interactions_df['user_id']) + 1

        # Get the user-item interaction matrix (mapping to int is necessary because of how iterrows works)
        r = np.zeros(shape=(n_users, n_items))
        for idx, interaction in interactions_df.iterrows():
            r[int(interaction['user_id'])][int(interaction['item_id'])] = 1

        if self.collaboration_type == 'item':
            r = r.T

        self.r = r

        # Calculate all similarities

        similarities = None
        if self.similarity_measure == 'cosine':
            n_uv = np.matmul(r, r.T)
            norms = np.sqrt(np.diag(n_uv))
            similarities = n_uv / norms[:, np.newaxis] / norms[np.newaxis, :]
        elif self.similarity_measure == 'pearson':
            r_shifted = r - np.mean(r, axis=1).reshape(-1, 1)
            n_uv = np.matmul(r_shifted, r_shifted.T)
            norms = np.sqrt(np.diag(n_uv))
            norms[norms == 0] = 0.000001
            similarities = n_uv / norms[:, np.newaxis] / norms[np.newaxis, :]

        np.fill_diagonal(similarities, -1000)

        self.similarities = similarities

        # Find the most popular items for the cold start problem

        offers_count = interactions_df.loc[:, ['item_id', 'user_id']].groupby(by='item_id').count()
        offers_count = offers_count.sort_values('user_id', ascending=False)
        self.most_popular_items = offers_count.index

    def recommend(self, users_df, items_df, n_recommendations=1):
        """
        Serving of recommendations. Scores items in items_df for each user in users_df and returns
        top n_recommendations for each user.

        :param pd.DataFrame users_df: DataFrame with users and their features for which
            recommendations should be generated.
        :param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
        :param int n_recommendations: Number of recommendations to be returned for each user.
        :return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations
            for each user.
        :rtype: pd.DataFrame
        """

        # Clean previous recommendations (iloc could be used alternatively)
        self.recommender_df = self.recommender_df[:0]

        # Handle users not in the training data

        # Map item ids

        items_df = items_df.copy()
        items_df = items_df.loc[items_df['item_id'].isin(self.item_id_mapping)]
        items_df.replace({'item_id': self.item_id_mapping}, inplace=True)

        # Generate recommendations

        for idx, user in users_df.iterrows():
            recommendations = []

            user_id = user['user_id']

            if user_id in self.user_id_mapping:
                chosen_ids = []
                scores = []
                mapped_user_id = self.user_id_mapping[user_id]

                if self.collaboration_type == 'user':
                    neighbor_ids = np.argsort(-self.similarities[mapped_user_id])[:self.n_neighbors]
                    user_similarities = self.similarities[mapped_user_id][neighbor_ids]

                    item_ids = items_df['item_id'].tolist()

                    v_i = self.r[neighbor_ids][:, item_ids]

                    scores = np.matmul(user_similarities, v_i) / np.maximum(np.sum(user_similarities), 0.0001)

                    # Choose n recommendations based on highest scores
                    if not self.should_recommend_already_bought:
                        x_list = self.interactions_df.loc[
                            self.interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()
                        scores[x_list] = -1e100

                    chosen_ids = np.argsort(-scores)[:n_recommendations]

                elif self.collaboration_type == 'item':
                    
                    # Write your code here

                for item_id in chosen_ids:
                    recommendations.append(
                        {
                            'user_id': self.user_id_reverse_mapping[mapped_user_id],
                            'item_id': self.item_id_reverse_mapping[item_id],
                            'score': scores[item_id]
                        }
                    )
            else:  # For new users recommend most popular items
                for i in range(n_recommendations):
                    recommendations.append(
                        {
                            'user_id': user['user_id'],
                            'item_id': self.item_id_reverse_mapping[self.most_popular_items[i]],
                            'score': 1.0
                        }
                    )

            user_recommendations = pd.DataFrame(recommendations)

            self.recommender_df = pd.concat([self.recommender_df, user_recommendations])

        return self.recommender_df
    

class UserBasedCosineNearestNeighborsRecommender(NearestNeighborsRecommender):
    
    def __init__(self):
        super().__init__()
        
        self.collaboration_type = 'user'
        self.similarity_measure = 'cosine'
        
        
class UserBasedPearsonNearestNeighborsRecommender(NearestNeighborsRecommender):
    
    def __init__(self):
        super().__init__()
        
        self.collaboration_type = 'user'
        self.similarity_measure = 'pearson'
        
        
class ItemBasedCosineNearestNeighborsRecommender(NearestNeighborsRecommender):
    
    def __init__(self):
        super().__init__()
        
        self.collaboration_type = 'item'
        self.similarity_measure = 'cosine'
        

class ItemBasedPearsonNearestNeighborsRecommender(NearestNeighborsRecommender):
    
    def __init__(self):
        super().__init__()
        
        self.collaboration_type = 'item'
        self.similarity_measure = 'pearson'

In [None]:
# Quick test of the recommender

nearest_neighbors_recommender = ItemBasedCosineNearestNeighborsRecommender()
nearest_neighbors_recommender.initialize(n_neighbors=20)
nearest_neighbors_recommender.fit(ml_ratings_df, None, ml_movies_df)
recommendations = nearest_neighbors_recommender.recommend(pd.DataFrame([[1], [4], [6]], columns=['user_id']), ml_movies_df, 10)

recommendations = pd.merge(recommendations, ml_movies_df, on='item_id', how='left')
print("Recommendations")
display(recommendations)

# Training-test split evaluation

In [None]:
from evaluation_and_testing.testing import evaluate_train_test_split_implicit

In [None]:
ub_cos_nn_recommender = UserBasedCosineNearestNeighborsRecommender()
ub_cos_nn_recommender.initialize(n_neighbors=30)

ub_cos_nn_tts_results = [['UserBasedCosineNearestNeighborsRecommender'] + list(evaluate_train_test_split_implicit(
    ub_cos_nn_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]

ub_cos_nn_tts_results = pd.DataFrame(
    ub_cos_nn_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(ub_cos_nn_tts_results)

In [None]:
ub_pearson_nn_recommender = UserBasedPearsonNearestNeighborsRecommender()
ub_pearson_nn_recommender.initialize(n_neighbors=30)

ub_pearson_nn_tts_results = [['UserBasedPearsonNearestNeighborsRecommender'] + list(evaluate_train_test_split_implicit(
    ub_pearson_nn_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]

ub_pearson_nn_tts_results = pd.DataFrame(
    ub_pearson_nn_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(ub_pearson_nn_tts_results)

In [None]:
ib_cos_nn_recommender = ItemBasedCosineNearestNeighborsRecommender()
ib_cos_nn_recommender.initialize(n_neighbors=30)

ib_cos_nn_tts_results = [['ItemBasedCosineNearestNeighborsRecommender'] + list(evaluate_train_test_split_implicit(
    ib_cos_nn_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]

ib_cos_nn_tts_results = pd.DataFrame(
    ib_cos_nn_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(ib_cos_nn_tts_results)

In [None]:
ib_pearson_nn_recommender = ItemBasedPearsonNearestNeighborsRecommender()
ib_pearson_nn_recommender.initialize(n_neighbors=30)

ib_pearson_nn_tts_results = [['ItemBasedPearsonNearestNeighborsRecommender'] + list(evaluate_train_test_split_implicit(
    ib_pearson_nn_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]

ib_pearson_nn_tts_results = pd.DataFrame(
    ib_pearson_nn_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(ib_pearson_nn_tts_results)

In [None]:
from recommenders.amazon_recommender import AmazonRecommender

amazon_recommender = AmazonRecommender()

amazon_tts_results = [['AmazonRecommender'] + list(evaluate_train_test_split_implicit(
    amazon_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]

amazon_tts_results = pd.DataFrame(
    amazon_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(amazon_tts_results)

In [None]:
from recommenders.tfidf_recommender import TFIDFRecommender

tfidf_recommender = TFIDFRecommender()

tfidf_tts_results = [['TFIDFRecommender'] + list(evaluate_train_test_split_implicit(
    tfidf_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]

tfidf_tts_results = pd.DataFrame(
    tfidf_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(tfidf_tts_results)

In [None]:
tts_results = pd.concat([ub_cos_nn_tts_results, ub_pearson_nn_tts_results, ib_cos_nn_tts_results, 
                         ib_pearson_nn_tts_results, amazon_tts_results, tfidf_tts_results]).reset_index(drop=True)
display(tts_results)

# Leave-one-out evaluation

In [None]:
from evaluation_and_testing.testing import evaluate_leave_one_out_implicit

In [None]:
ub_cos_nn_recommender = UserBasedCosineNearestNeighborsRecommender()
ub_cos_nn_recommender.initialize(n_neighbors=30)

ub_cos_nn_loo_results = [['UserBasedCosineNearestNeighborsRecommender'] + list(evaluate_leave_one_out_implicit(
    ub_cos_nn_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]

ub_cos_nn_loo_results = pd.DataFrame(
    ub_cos_nn_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(ub_cos_nn_loo_results)

In [None]:
ub_pearson_nn_recommender = UserBasedPearsonNearestNeighborsRecommender()
ub_pearson_nn_recommender.initialize(n_neighbors=30)

ub_pearson_nn_loo_results = [['UserBasedPearsonNearestNeighborsRecommender'] + list(evaluate_leave_one_out_implicit(
    ub_pearson_nn_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]

ub_pearson_nn_loo_results = pd.DataFrame(
    ub_pearson_nn_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(ub_pearson_nn_loo_results)

In [None]:
ib_cos_nn_recommender = ItemBasedCosineNearestNeighborsRecommender()
ib_cos_nn_recommender.initialize(n_neighbors=30)

ib_cos_nn_loo_results = [['ItemBasedCosineNearestNeighborsRecommender'] + list(evaluate_leave_one_out_implicit(
    ib_cos_nn_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]

ib_cos_nn_loo_results = pd.DataFrame(
    ib_cos_nn_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(ib_cos_nn_loo_results)

In [None]:
ib_pearson_nn_recommender = ItemBasedPearsonNearestNeighborsRecommender()
ib_pearson_nn_recommender.initialize(n_neighbors=30)

ib_pearson_nn_loo_results = [['ItemBasedPearsonNearestNeighborsRecommender'] + list(evaluate_leave_one_out_implicit(
    ib_pearson_nn_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]

ib_pearson_nn_loo_results = pd.DataFrame(
    ib_pearson_nn_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(ib_pearson_nn_loo_results)

In [None]:
from recommenders.amazon_recommender import AmazonRecommender

amazon_recommender = AmazonRecommender()

amazon_loo_results = [['AmazonRecommender'] + list(evaluate_leave_one_out_implicit(
    amazon_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]

amazon_loo_results = pd.DataFrame(
    amazon_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(amazon_loo_results)

In [None]:
tfidf_recommender = TFIDFRecommender()

tfidf_loo_results = [['TFIDFRecommender'] + list(evaluate_leave_one_out_implicit(
    tfidf_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]

tfidf_loo_results = pd.DataFrame(
    tfidf_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(tfidf_loo_results)

In [None]:
loo_results = pd.concat([ub_cos_nn_loo_results, ub_pearson_nn_loo_results, ib_cos_nn_loo_results, 
                         ib_pearson_nn_loo_results, amazon_loo_results, tfidf_loo_results]).reset_index(drop=True)
display(loo_results)

**Task 10.** Add inverse of the euclidean distance as an eligible similarity measure in the nearest neighbors recommender and compare results of both the user- and item-based recommenders with this measure to other recommenders tested in this notebook. Create two new classes inheriting from the NearestNeighborsRecommender and name them:

- UserBasedInvEuclideanNearestNeighborsRecommender,
- ItemBasedInvEuclideanNearestNeighborsRecommender.

In [None]:
# Write your code here

**Task 11.** Find the optimal number of neighbors for the Item-Based Cosine Nearest Neighbors Recommender for $1 \leq \text{n_neighbors} \leq 100$ and the train-test split testing scheme. Use seed=6789 and a set of interactions for a chosen subset of 100 movies. Use grid search to test all possibilities and compare it with the result of tuning with hyperopt.

In [None]:
# Write your code here