In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display, HTML
from collections import defaultdict
from sklearn.model_selection import KFold

# Workaround for the kernel dying in some installations. The setting is optional:
# remove these two lines if the notebook runs fine without them.
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# Raise the pandas display limits so that the previews shown in this notebook
# (up to 300 rows / 50 columns) are not truncated.
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns', 50)

# Movielens data

In [None]:
# Read the raw MovieLens tables and rename the id columns to snake_case.
ml_dir = os.path.join("data", "movielens_small")

ml_ratings_df = pd.read_csv(os.path.join(ml_dir, "ratings.csv"))
ml_ratings_df = ml_ratings_df.rename(columns={'userId': 'user_id', 'movieId': 'item_id'})

ml_movies_df = pd.read_csv(os.path.join(ml_dir, "movies.csv"))
ml_movies_df = ml_movies_df.rename(columns={'movieId': 'item_id'})

# Ratings joined with the movie metadata on item_id.
ml_df = ml_ratings_df.merge(ml_movies_df, on='item_id')

# Basic size summary of the dataset.
n_movies = ml_movies_df.shape[0]
n_users = len(ml_ratings_df['user_id'].unique())
n_interactions = ml_ratings_df.shape[0]

print("Number of movies: {}".format(n_movies))
print("Number of users: {}".format(n_users))
print("Number of interactions: {}".format(n_interactions))
print()

# Preview both tables.
print("Movies")
display(ml_movies_df.head(10))
print("Interactions")
display(ml_ratings_df.head(30))

# Steam data

In [None]:
# The Steam file is read without a header row, so the column names are supplied here.
# The fifth column ('zero') is dropped right after loading.
steam_columns = ['user_id', 'game_title', 'behavior_name', 'value', 'zero']
steam_path = os.path.join("data", "steam", "steam-200k.csv")

steam_df = pd.read_csv(steam_path, header=None, names=steam_columns)
steam_df = steam_df.drop(columns='zero')

print("Number of records: {}".format(steam_df.shape[0]))
display(steam_df.head(10))

# Hotel data

In [None]:
# Load the original hotel data and preview its first 30 rows.
hotel_original_path = os.path.join("data", "hotel_data", "hotel_data_original.csv")
hotel_original_data = pd.read_csv(hotel_original_path)

print("Number of records: {}".format(hotel_original_data.shape[0]))
display(hotel_original_data.head(30))

In [None]:
# Load the hotel interactions table and preview its first 30 rows.
hotel_interactions_path = os.path.join("data", "hotel_data", "hotel_data_interactions_df.csv")
hotel_data_interactions_df = pd.read_csv(hotel_interactions_path)

print("Number of records: {}".format(hotel_data_interactions_df.shape[0]))
display(hotel_data_interactions_df.head(30))

# Test recommenders

## Load a sample of Movielens data

In [None]:
# Restrict the MovieLens data to a reproducible random sample of 1000 movies.
# The tables are reloaded from disk first, so re-running this cell always starts
# from the unfiltered data.

ml_dir = os.path.join("data", "movielens_small")
ml_ratings_df = pd.read_csv(os.path.join(ml_dir, "ratings.csv"))
ml_ratings_df = ml_ratings_df.rename(columns={'userId': 'user_id', 'movieId': 'item_id'})
ml_movies_df = pd.read_csv(os.path.join(ml_dir, "movies.csv"))
ml_movies_df = ml_movies_df.rename(columns={'movieId': 'item_id'})
ml_df = ml_ratings_df.merge(ml_movies_df, on='item_id')

# Fixed seed -> the same 1000 item ids are drawn on every run.
seed = 6789
rng = np.random.RandomState(seed=seed)
left_ids = rng.choice(ml_movies_df['item_id'], size=1000, replace=False)


def _keep_sampled_items(frame):
    """Return the rows of `frame` whose item_id is among the sampled ids."""
    return frame.loc[frame['item_id'].isin(left_ids)]


# Apply the same item filter to all three tables.
ml_ratings_df = _keep_sampled_items(ml_ratings_df)
ml_movies_df = _keep_sampled_items(ml_movies_df)
ml_df = _keep_sampled_items(ml_df)

print("Number of left interactions: {}".format(ml_ratings_df.shape[0]))

## Train several recommenders

In [None]:
from recommenders.basic_recommenders import RandomRecommender
from recommenders.basic_recommenders import MostPopularRecommender
from recommenders.basic_recommenders import HighestRatedRecommender
from recommenders.tfidf_recommender import TFIDFRecommender
from recommenders.nearest_neighbors_recommender import ItemBasedCosineNearestNeighborsRecommender
from recommenders.amazon_recommender import AmazonRecommender
from recommenders.netflix_recommender import NetflixRecommender
from recommenders.gmf_recommenders import GMFRecommender

# Recommenders constructed with their default settings.
random_recommender = RandomRecommender()
most_popular_recommender = MostPopularRecommender()
highest_rated_recommender = HighestRatedRecommender()
tfidf_recommender = TFIDFRecommender()
amazon_recommender = AmazonRecommender()

# Recommenders constructed with explicit hyperparameters.
ibcnn_recommender = ItemBasedCosineNearestNeighborsRecommender(n_neighbors=30)
netflix_recommender = NetflixRecommender(
    print_type='live',
    embedding_dim=8,
    n_epochs=20,
)
gmf_recommender = GMFRecommender(
    print_type='live',
    n_neg_per_pos=10,
    batch_size=16,
    embedding_dim=6,
    lr=0.001,
    weight_decay=0.0001,
    n_epochs=5,
    seed=1,
)

In [None]:
# Fit the random recommender on the sampled ratings and movie metadata.
# None is passed as the second positional argument (presumably a users table — TODO confirm).
random_recommender.fit(ml_ratings_df, None, ml_movies_df)

In [None]:
# Fit the most-popular recommender created in the setup cell on the sampled
# ratings and movie metadata.
# The recommender objects are intentionally NOT re-created in this cell: doing so
# would also replace highest_rated_recommender with a fresh, unfitted instance
# every time this cell is re-run, silently discarding a fit performed in a later cell.
most_popular_recommender.fit(ml_ratings_df, None, ml_movies_df)

In [None]:
# Fit the highest-rated recommender on the sampled ratings and movie metadata.
# None is passed as the second positional argument (presumably a users table — TODO confirm).
highest_rated_recommender.fit(ml_ratings_df, None, ml_movies_df)

In [None]:
# Fit the TFIDFRecommender instance on the sampled ratings and movie metadata.
# None is passed as the second positional argument (presumably a users table — TODO confirm).
tfidf_recommender.fit(ml_ratings_df, None, ml_movies_df)

In [None]:
# Fit the item-based cosine nearest-neighbours recommender (created with n_neighbors=30)
# on the sampled ratings and movie metadata.
# None is passed as the second positional argument (presumably a users table — TODO confirm).
ibcnn_recommender.fit(ml_ratings_df, None, ml_movies_df)

In [None]:
# Fit the AmazonRecommender instance on the sampled ratings and movie metadata.
# None is passed as the second positional argument (presumably a users table — TODO confirm).
amazon_recommender.fit(ml_ratings_df, None, ml_movies_df)

In [None]:
# Fit the NetflixRecommender instance (created with print_type='live', embedding_dim=8,
# n_epochs=20) on the sampled ratings and movie metadata.
# None is passed as the second positional argument (presumably a users table — TODO confirm).
netflix_recommender.fit(ml_ratings_df, None, ml_movies_df)

In [None]:
# Fit the GMFRecommender instance (created with print_type='live', n_epochs=5, seed=1)
# on the sampled ratings and movie metadata.
# None is passed as the second positional argument (presumably a users table — TODO confirm).
gmf_recommender.fit(ml_ratings_df, None, ml_movies_df)

In [None]:
# All recommenders in one list, in the order their results are shown below.
recommenders = [
    random_recommender,
    most_popular_recommender,
    highest_rated_recommender,
    tfidf_recommender,
    ibcnn_recommender,
    amazon_recommender,
    netflix_recommender,
    gmf_recommender,
]

## Take a look at user 6's preferences

In [None]:
# Show the rows of the merged ratings/movies table that belong to user 6,
# ordered from the highest rating to the lowest.

is_active_user = ml_df['user_id'] == 6
active_user_movies = ml_df.loc[is_active_user]

print("Active user history")
display(active_user_movies.sort_values(by='rating', ascending=False))

## Generate recommendations

In [None]:
# Query every recommender for user 6 and show the result together with the movie metadata.
for recommender in recommenders:
    # One-row frame holding the user to recommend for. It is rebuilt on each pass so
    # every recommender receives its own copy.
    active_user_df = pd.DataFrame({'user_id': [6]})

    # The trailing 5 is presumably the number of recommendations requested — TODO confirm.
    recommendations = recommender.recommend(active_user_df, ml_movies_df, 5)

    # Left join: keep every recommended item, attaching metadata where it exists.
    recommendations = pd.merge(left=recommendations, right=ml_movies_df, how='left', on='item_id')

    print("Recommendations for {}".format(type(recommender).__name__))
    display(recommendations)

### Train-test split test

In [None]:
from evaluation_and_testing.testing import evaluate_train_test_split_implicit

# Fresh, unfitted recommender instances for the evaluation run.
# The two recommenders that take print_type are created with print_type=None here.
random_recommender = RandomRecommender()
most_popular_recommender = MostPopularRecommender()
highest_rated_recommender = HighestRatedRecommender()
tfidf_recommender = TFIDFRecommender()
amazon_recommender = AmazonRecommender()
ibcnn_recommender = ItemBasedCosineNearestNeighborsRecommender(n_neighbors=30)
netflix_recommender = NetflixRecommender(
    print_type=None,
    embedding_dim=8,
    n_epochs=20,
)
gmf_recommender = GMFRecommender(
    print_type=None,
    n_neg_per_pos=10,
    batch_size=16,
    embedding_dim=6,
    lr=0.001,
    weight_decay=0.0001,
    n_epochs=5,
    seed=1,
)

recommenders = [
    random_recommender,
    most_popular_recommender,
    highest_rated_recommender,
    tfidf_recommender,
    ibcnn_recommender,
    amazon_recommender,
    netflix_recommender,
    gmf_recommender,
]

# Column layout of one result row: recommender name followed by the metric values.
result_columns = ['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10']

results_per_recommender = []

for recommender in recommenders:
    metrics = evaluate_train_test_split_implicit(recommender, ml_ratings_df, ml_movies_df)
    row = [type(recommender).__name__, *metrics]

    results = pd.DataFrame([row], columns=result_columns)
    results_per_recommender.append(results)

    # Show each row as soon as it is available.
    display(results)

# Stack the one-row frames into a single comparison table with a clean 0..n-1 index.
all_results = pd.concat(results_per_recommender, ignore_index=True)
display(all_results)