In [1]:
!cp -r ../input/recsys-repo/RecSys_Course_AT_PoliMi-master/* ./

# Loading Data

In [2]:
%config Completer.use_jedi = False
import pandas as pd
import numpy as np
import scipy.sparse as sps
import matplotlib.pyplot as plt
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

In [3]:
#load data_train, matrix of users interaction
URM_path = "../input/recommender-system-2021-challenge-polimi/data_train.csv"
URM_all_dataframe = pd.read_csv(filepath_or_buffer=URM_path, 
                                sep=",",
                                dtype={0:int, 1:int, 2:float},
                                header=0)
URM_all_dataframe.columns = ["UserID", "ItemID", "Interaction"]

In [4]:
URM_all_dataframe.head()

Unnamed: 0,UserID,ItemID,Interaction
0,0,53,1.0
1,0,209,1.0
2,0,223,1.0
3,0,249,1.0
4,0,435,1.0


In [5]:
userID_unique = URM_all_dataframe["UserID"].unique()
itemID_unique = URM_all_dataframe["ItemID"].unique()

n_users = len(userID_unique)
n_items = len(itemID_unique)
n_interactions = len(URM_all_dataframe)

print ("Number of items\t {}, Number of users\t {}".format(n_items, n_users))
print ("Max ID items\t {}, Max Id users\t {}\n".format(max(itemID_unique), max(userID_unique)))
print ("Average interactions per user {:.2f}".format(n_interactions/n_users))
print ("Average interactions per item {:.2f}\n".format(n_interactions/n_items))

print ("Sparsity {:.2f} %".format((1-float(n_interactions)/(n_items*n_users))*100))

Number of items	 18059, Number of users	 13650
Max ID items	 18058, Max Id users	 13649

Average interactions per user 387.23
Average interactions per item 292.69

Sparsity 97.86 %


In [6]:
URM_all = sps.coo_matrix((URM_all_dataframe["Interaction"].values, 
                          (URM_all_dataframe["UserID"].values, URM_all_dataframe["ItemID"].values)))
URM_all = URM_all.tocsr() # to obtain fast access to rows (users)
URM_all

<13650x18059 sparse matrix of type '<class 'numpy.float64'>'
	with 5285664 stored elements in Compressed Sparse Row format>

In [7]:
#now load ICM matrices and create a single ICM dataframe
ICM_genre_df = pd.read_csv("../input/recommender-system-2021-challenge-polimi/data_ICM_genre.csv")
ICM_genre_df

Unnamed: 0,row,col,data
0,0,3,1.0
1,1,3,1.0
2,2,6,1.0
3,3,0,1.0
4,3,1,1.0
...,...,...,...
17986,18055,3,1.0
17987,18056,2,1.0
17988,18056,3,1.0
17989,18057,3,1.0


In [8]:
items = ICM_genre_df.row
features = ICM_genre_df.col
data = ICM_genre_df.data
ICM_genre = sps.coo_matrix((data, (items, features)))
ICM_genre = ICM_genre.tocsr() #fast row (items) access
ICM_genre.shape

(18059, 8)

In [9]:
ICM_subgenre_df = pd.read_csv("../input/recommender-system-2021-challenge-polimi/data_ICM_subgenre.csv")
ICM_subgenre_df

Unnamed: 0,row,col,data
0,0,70,1.0
1,1,27,1.0
2,2,40,1.0
3,3,50,1.0
4,4,62,1.0
...,...,...,...
18052,18054,90,1.0
18053,18055,85,1.0
18054,18056,103,1.0
18055,18057,90,1.0


In [10]:
items = ICM_subgenre_df.row
features = ICM_subgenre_df.col
data = ICM_subgenre_df.data
ICM_subgenre = sps.coo_matrix((data, (items, features)))
ICM_subgenre = ICM_subgenre.tocsr() #fast row (items) access
ICM_subgenre.shape

(18059, 113)

In [11]:
# Combine the 2 matrices in one
ICM_combined = sps.hstack((ICM_genre, ICM_subgenre))
ICM_combined.shape

(18059, 121)

# Data processing and basic tuning setup

In [12]:
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

# split data into train and validation data 85/15
URM_train, URM_valid = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.80)



In [13]:
from Evaluation.Evaluator import EvaluatorHoldout

#create an evaluator object to evaluate validation set
#we will use it for hyperparameter tuning
evaluator_valid = EvaluatorHoldout(URM_valid, cutoff_list=[10])

EvaluatorHoldout: Ignoring 13646 ( 0.0%) Users that have less than 1 test interactions


In [14]:
from Recommenders.KNN.ItemKNNCBFRecommender import ItemKNNCBFRecommender

#try a simple CF model based on item-item similarity, here just select the class of model to perform tuning
recommender_class = ItemKNNCBFRecommender

In [15]:
from skopt.space import Real, Integer, Categorical

#define hyperparameter set for our model
#ItemKNNCF uses topK (K param), shrink term (to consider support of similarity), similarity type (we consider cosine one), normalization of data (true, false)
hyperparameters_range_dictionary = {
    "topK": Integer(5, 1000),
    "shrink": Integer(0, 1000),
    "similarity": Categorical(["cosine"]),
    "normalize": Categorical([True, False]),
    "feature_weighting": Categorical(["TF-IDF"])
}

In [16]:
from HyperparameterTuning.SearchBayesianSkopt import SearchBayesianSkopt

#create a bayesian optimizer object, we pass the recommender and the evaluator
hyperparameterSearch = SearchBayesianSkopt(recommender_class,
                                         evaluator_validation=evaluator_valid)

In [17]:
from HyperparameterTuning.SearchAbstractClass import SearchInputRecommenderArgs
  
#provide data needed to create instance of model (one on URM_train, the other on URM_all)
recommender_input_args = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS = [URM_train, ICM_combined],     # For a CBF model simply put [URM_train, ICM_train]
    CONSTRUCTOR_KEYWORD_ARGS = {},
    FIT_POSITIONAL_ARGS = [],
    FIT_KEYWORD_ARGS = {}
)

In [18]:
recommender_input_args_last_test = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS = [URM_all, ICM_combined],
    CONSTRUCTOR_KEYWORD_ARGS = {},
    FIT_POSITIONAL_ARGS = [],
    FIT_KEYWORD_ARGS = {}
)

In [19]:
import os

output_folder_path = "result_experiments/"

# If directory does not exist, create
if not os.path.exists(output_folder_path):
    os.makedirs(output_folder_path)
    
n_cases = 50
n_random_starts = int(n_cases*0.3)
metric_to_optimize = "MAP"   
cutoff_to_optimize = 10

In [20]:
#let's run the bayesian search
hyperparameterSearch.search(recommender_input_args,
                       recommender_input_args_last_test = recommender_input_args_last_test,
                       hyperparameter_search_space = hyperparameters_range_dictionary,
                       n_cases = n_cases,
                       n_random_starts = n_random_starts,
                       save_model = "last",
                       output_folder_path = output_folder_path, # Where to save the results
                       output_file_name_root = recommender_class.RECOMMENDER_NAME, # How to call the files
                       metric_to_optimize = metric_to_optimize,
                       cutoff_to_optimize = cutoff_to_optimize,
                      )

Iteration No: 1 started. Evaluating function at random point.
SearchBayesianSkopt: Testing config: {'topK': 162, 'shrink': 951, 'similarity': 'cosine', 'normalize': False, 'feature_weighting': 'TF-IDF'}
ItemKNNCBFRecommender: ICM Detected 13 ( 0.1%) items with no features.
Unable to load Cython Compute_Similarity, reverting to Python
Similarity column 18059 (100.0%), 2480.01 column/sec. Elapsed time 7.28 sec
EvaluatorHoldout: Processed 13646 (100.0%) in 18.06 sec. Users per second: 756
SearchBayesianSkopt: New best config found. Config 0: {'topK': 162, 'shrink': 951, 'similarity': 'cosine', 'normalize': False, 'feature_weighting': 'TF-IDF'} - results: PRECISION: 0.0278616, PRECISION_RECALL_MIN_DEN: 0.0278735, RECALL: 0.0040153, MAP: 0.0084174, MAP_MIN_DEN: 0.0084214, MRR: 0.0661837, NDCG: 0.0265837, F1: 0.0070191, HIT_RATE: 0.2273926, ARHR_ALL_HITS: 0.0742851, NOVELTY: 0.0080601, AVERAGE_POPULARITY: 0.0666970, DIVERSITY_MEAN_INTER_LIST: 0.9702159, DIVERSITY_HERFINDAHL: 0.9970145, COVER

In [21]:
from Recommenders.DataIO import DataIO

#explore the results of the search
data_loader = DataIO(folder_path = output_folder_path)
search_metadata = data_loader.load_data(recommender_class.RECOMMENDER_NAME + "_metadata.zip")

search_metadata.keys()

dict_keys(['cutoff_to_optimize', 'time_df', 'result_on_test_df', 'time_on_train_total', 'result_on_validation_df', 'algorithm_name_recommender', 'time_on_validation_avg', 'time_on_last_df', 'result_on_validation_best', 'time_on_validation_total', 'time_on_test_avg', 'result_on_last', 'metric_to_optimize', 'time_on_test_total', 'time_on_train_avg', 'algorithm_name_search', 'hyperparameters_best_index', 'exception_list', 'hyperparameters_df', 'result_on_test_best', 'hyperparameters_best'])

In [22]:
hyperparameters_df = search_metadata["hyperparameters_df"]
hyperparameters_df

Unnamed: 0,topK,shrink,similarity,normalize,feature_weighting
0,162,951,cosine,False,TF-IDF
1,715,133,cosine,True,TF-IDF
2,49,448,cosine,False,TF-IDF
3,873,806,cosine,False,TF-IDF
4,118,254,cosine,False,TF-IDF
5,363,513,cosine,False,TF-IDF
6,124,928,cosine,True,TF-IDF
7,927,701,cosine,True,TF-IDF
8,621,123,cosine,True,TF-IDF
9,329,208,cosine,True,TF-IDF


In [23]:
result_on_validation_df = search_metadata["result_on_validation_df"]
result_on_validation_df

Unnamed: 0_level_0,Unnamed: 1_level_0,PRECISION,PRECISION_RECALL_MIN_DEN,RECALL,MAP,MAP_MIN_DEN,MRR,NDCG,F1,HIT_RATE,ARHR_ALL_HITS,...,COVERAGE_ITEM_CORRECT,COVERAGE_USER,COVERAGE_USER_CORRECT,DIVERSITY_GINI,SHANNON_ENTROPY,RATIO_DIVERSITY_HERFINDAHL,RATIO_DIVERSITY_GINI,RATIO_SHANNON_ENTROPY,RATIO_AVERAGE_POPULARITY,RATIO_NOVELTY
Unnamed: 0_level_1,cutoff,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,10,0.027862,0.027874,0.004015,0.008417,0.008421,0.066184,0.026584,0.007019,0.227393,0.074285,...,0.05399,0.999707,0.227326,0.057781,10.083061,0.997399,0.232941,0.81377,0.329218,0.037921
1,10,0.023919,0.023951,0.003286,0.006716,0.006727,0.055457,0.022373,0.005779,0.204016,0.060957,...,0.041198,0.999707,0.203956,0.044674,9.440635,0.992825,0.180101,0.761922,0.301031,0.038457
2,10,0.046871,0.046911,0.006588,0.014839,0.014861,0.109003,0.045145,0.011553,0.356735,0.126926,...,0.031286,0.999707,0.35663,0.021961,8.455357,0.994367,0.088533,0.682404,0.565096,0.035845
3,10,0.022226,0.022329,0.003272,0.005749,0.005775,0.048249,0.020121,0.005704,0.194416,0.052601,...,0.02514,0.999707,0.194359,0.017948,7.451787,0.980081,0.072357,0.601409,0.283667,0.038849
4,10,0.03272,0.032813,0.00479,0.009641,0.009655,0.0754,0.030907,0.008356,0.262788,0.085169,...,0.049726,0.999707,0.262711,0.051988,9.795213,0.996692,0.209587,0.790539,0.373508,0.037409
5,10,0.023919,0.023976,0.003487,0.00721,0.00723,0.058849,0.023072,0.006087,0.201744,0.064931,...,0.042195,0.999707,0.201685,0.045766,9.794472,0.996156,0.184503,0.790479,0.305736,0.038327
6,10,0.030302,0.030322,0.004335,0.009229,0.009232,0.072936,0.029096,0.007585,0.246373,0.081778,...,0.053381,0.999707,0.2463,0.057823,10.111145,0.997733,0.233109,0.816037,0.367563,0.037728
7,10,0.022395,0.022497,0.003319,0.005639,0.005663,0.047428,0.020049,0.005781,0.194636,0.051722,...,0.030179,0.999707,0.194579,0.022783,7.817698,0.981132,0.09185,0.63094,0.286998,0.038715
8,10,0.02315,0.023181,0.003179,0.006884,0.006896,0.058011,0.022383,0.005591,0.198959,0.063109,...,0.043801,0.999707,0.198901,0.05186,9.864098,0.995577,0.209073,0.796098,0.296262,0.03843
9,10,0.023362,0.023421,0.003312,0.00714,0.007158,0.058066,0.02268,0.005801,0.196468,0.064199,...,0.049726,0.999707,0.19641,0.060558,10.373479,0.998258,0.244139,0.837209,0.30536,0.038087


In [24]:
best_hyperparameters = search_metadata["hyperparameters_best"]
best_hyperparameters

{'topK': 41,
 'shrink': 1000,
 'similarity': 'cosine',
 'normalize': True,
 'feature_weighting': 'TF-IDF'}

This are the best hyperparameters found by the bayesian search, we will use them in our model

In [25]:
#let's fit the model with the hyperparamethers obtained from the previous search and evaluate them on validation set

recommender = ItemKNNCBFRecommender(URM_all)
recommender.fit()
#evaluator_valid.evaluateRecommender(recommender)

TypeError: __init__() missing 1 required positional argument: 'ICM_train'

# Create final recommendations

In [None]:
test_users = pd.read_csv('../input/recommender-system-2021-challenge-polimi/data_target_users_test.csv')
test_users

In [None]:
user_id = test_users['user_id']
recommendations = []
for user in user_id:
    recommendations.append(recommender.recommend(user,cutoff = 10))

In [None]:
for index in range(len(recommendations)):
    recommendations[index]=np.array(recommendations[index])
    
test_users['item_list']= recommendations
test_users['item_list'] = pd.DataFrame([str(line).strip('[').strip(']').replace("'","") for line in test_users['item_list']])
test_users.to_csv('submission.csv', index=False)