In [1]:
!cp -r ../input/recsys-repo/RecSys_Course_AT_PoliMi-master/* ./

# Loading Data

In [2]:
%config Completer.use_jedi = False
import pandas as pd
import numpy as np
import scipy.sparse as sps
import matplotlib.pyplot as pyplot
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

In [3]:
#load data_train, matrix of users interaction
URM_path = "../input/recommender-system-2021-challenge-polimi/data_train.csv"
URM_all_dataframe = pd.read_csv(filepath_or_buffer=URM_path, 
                                sep=",",
                                dtype={0:int, 1:int, 2:float},
                                header=0)
URM_all_dataframe.columns = ["UserID", "ItemID", "Interaction"]

In [4]:
URM_all_dataframe.head()

Unnamed: 0,UserID,ItemID,Interaction
0,0,53,1.0
1,0,209,1.0
2,0,223,1.0
3,0,249,1.0
4,0,435,1.0


In [5]:
userID_unique = URM_all_dataframe["UserID"].unique()
itemID_unique = URM_all_dataframe["ItemID"].unique()

n_users = len(userID_unique)
n_items = len(itemID_unique)
n_interactions = len(URM_all_dataframe)

print ("Number of items\t {}, Number of users\t {}".format(n_items, n_users))
print ("Max ID items\t {}, Max Id users\t {}\n".format(max(itemID_unique), max(userID_unique)))
print ("Average interactions per user {:.2f}".format(n_interactions/n_users))
print ("Average interactions per item {:.2f}\n".format(n_interactions/n_items))

print ("Sparsity {:.2f} %".format((1-float(n_interactions)/(n_items*n_users))*100))

Number of items	 18059, Number of users	 13650
Max ID items	 18058, Max Id users	 13649

Average interactions per user 387.23
Average interactions per item 292.69

Sparsity 97.86 %


In [6]:
URM_all = sps.coo_matrix((URM_all_dataframe["Interaction"].values, 
                          (URM_all_dataframe["UserID"].values, URM_all_dataframe["ItemID"].values)))
URM_all = URM_all.tocsr() # to obtain fast access to rows (users)
URM_all

<13650x18059 sparse matrix of type '<class 'numpy.float64'>'
	with 5285664 stored elements in Compressed Sparse Row format>

# Data processing and basic tuning setup

In [7]:
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

# split data into train and validation data 80/20
URM_train, URM_valid = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.80)



In [8]:
from Evaluation.Evaluator import EvaluatorHoldout

#create an evaluator object to evaluate validation set
#we will use it for hyperparameter tuning
evaluator_valid = EvaluatorHoldout(URM_valid, cutoff_list=[10])

EvaluatorHoldout: Ignoring 13645 ( 0.0%) Users that have less than 1 test interactions


In [9]:
from Recommenders.SLIM.Cython.SLIM_BPR_Cython import SLIM_BPR_Cython

#try a SLIM BPR model
recommender_class = SLIM_BPR_Cython

In [10]:
import os

output_folder_path = "result_experiments/"

# If directory does not exist, create
if not os.path.exists(output_folder_path):
    os.makedirs(output_folder_path)
    
n_cases = 10
n_random_starts = int(n_cases*0.3)
metric_to_optimize = "MAP"   
cutoff_to_optimize = 10

In [11]:
from skopt.space import Real, Integer, Categorical
#SLIM BPR is machine learning-based technique
#to tuning hyperparam are typical of ML models to drive the learning process

hyperparameters_range_dictionary = {
    "epochs": Categorical([700]),
    "sgd_mode": Categorical(["sgd", "adagrad", "adam"]),
    "topK": Integer(5, 700),
    "lambda_i": Real(low = 1e-4, high = 1e-1, prior = 'log-uniform'),
    "lambda_j": Real(low = 1e-4, high = 1e-1, prior = 'log-uniform'),
    "learning_rate": Real(low = 1e-4, high = 1e-1, prior = 'log-uniform')
}

In [12]:
#We also setup the early stopping 
earlystopping_keywargs = {"validation_every_n": 15,
                          "stop_on_validation": True,
                          "evaluator_object": evaluator_valid,
                          "lower_validations_allowed": 5,
                          "validation_metric": metric_to_optimize,
                          }

In [13]:
from HyperparameterTuning.SearchBayesianSkopt import SearchBayesianSkopt

#create a bayesian optimizer object, we pass the recommender and the evaluator
hyperparameterSearch = SearchBayesianSkopt(recommender_class,
                                         evaluator_validation=evaluator_valid)

In [14]:
from HyperparameterTuning.SearchAbstractClass import SearchInputRecommenderArgs
  
#provide data needed to create instance of model (one on URM_train, the other on URM_all)
recommender_input_args = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS = [URM_train],     # For a CBF model simply put [URM_train, ICM_train]
    CONSTRUCTOR_KEYWORD_ARGS = {},
    FIT_POSITIONAL_ARGS = [],
    FIT_KEYWORD_ARGS = earlystopping_keywargs
)

In [15]:
recommender_input_args_last_test = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS = [URM_all],
    CONSTRUCTOR_KEYWORD_ARGS = {},
    FIT_POSITIONAL_ARGS = [],
    FIT_KEYWORD_ARGS = earlystopping_keywargs
)

In [16]:
import pyximport
pyximport.install()

(None, <pyximport.pyximport.PyxImporter at 0x7f0f8cff00d0>)

In [17]:
#prepare the environment to run Cython code
!python run_compile_all_cython.py

run_compile_all_cython: Found 10 Cython files in 4 folders...
run_compile_all_cython: All files will be compiled using your current python environment: '/opt/conda/bin/python'
Compiling [1/10]: MatrixFactorizationImpressions_Cython_Epoch.pyx... 
In file included from [01m[K/opt/conda/lib/python3.7/site-packages/numpy/core/include/numpy/ndarraytypes.h:1822:0[m[K,
                 from [01m[K/opt/conda/lib/python3.7/site-packages/numpy/core/include/numpy/ndarrayobject.h:12[m[K,
                 from [01m[K/opt/conda/lib/python3.7/site-packages/numpy/core/include/numpy/arrayobject.h:4[m[K,
                 from [01m[KMatrixFactorizationImpressions_Cython_Epoch.c:620[m[K:
  [01;35m[K^~~~~~~[m[K
[01m[KMatrixFactorizationImpressions_Cython_Epoch.c:[m[K In function ‘[01m[K__pyx_f_43MatrixFactorizationImpressions_Cython_Epoch_32MatrixFactorization_Cython_Epoch_sampleBPR_Cython[m[K’:
       [01;35m[K__pyx_t_4 = (__pyx_v_start_pos_impression_items + __pyx_v

In [18]:
from Recommenders.MatrixFactorization.Cython.MatrixFactorization_Cython_Epoch import MatrixFactorization_Cython_Epoch
#let's run the bayesian search
hyperparameterSearch.search(recommender_input_args,
                       recommender_input_args_last_test = recommender_input_args_last_test,
                       hyperparameter_search_space = hyperparameters_range_dictionary,
                       n_cases = n_cases,
                       n_random_starts = n_random_starts,
                       save_model = "last",
                       output_folder_path = output_folder_path, # Where to save the results
                       output_file_name_root = recommender_class.RECOMMENDER_NAME, # How to call the files
                       metric_to_optimize = metric_to_optimize,
                       cutoff_to_optimize = cutoff_to_optimize,
                      )

Iteration No: 1 started. Evaluating function at random point.
SearchBayesianSkopt: Testing config: {'epochs': 700, 'sgd_mode': 'adam', 'topK': 16, 'lambda_i': 0.020829291641260773, 'lambda_j': 0.007734003896155046, 'learning_rate': 0.000468523028239338}
SLIM_BPR_Recommender: Automatic selection of fastest train mode. Available RAM is 16784.00 MB (93.24%) of 18001.00 MB, required is 1304.51 MB. Using dense matrix.
Processed 13650 (100.0%) in 0.77 sec. BPR loss is 7.66E-01. Sample per second: 17627
SLIM_BPR_Recommender: Epoch 1 of 700. Elapsed time 0.60 sec
Processed 13650 (100.0%) in 1.36 sec. BPR loss is 4.61E+00. Sample per second: 10039
SLIM_BPR_Recommender: Epoch 2 of 700. Elapsed time 1.18 sec
Processed 13650 (100.0%) in 0.94 sec. BPR loss is 9.25E+00. Sample per second: 14517
SLIM_BPR_Recommender: Epoch 3 of 700. Elapsed time 1.76 sec
Processed 13650 (100.0%) in 1.50 sec. BPR loss is 1.29E+01. Sample per second: 9083
SLIM_BPR_Recommender: Epoch 4 of 700. Elapsed time 2.33 sec
Proc

In [19]:
from Recommenders.DataIO import DataIO

#explore the results of the search
data_loader = DataIO(folder_path = output_folder_path)
search_metadata = data_loader.load_data(recommender_class.RECOMMENDER_NAME + "_metadata.zip")

search_metadata.keys()

dict_keys(['algorithm_name_search', 'algorithm_name_recommender', 'result_on_test_df', 'hyperparameters_best_index', 'metric_to_optimize', 'result_on_validation_df', 'result_on_last', 'time_df', 'time_on_last_df', 'hyperparameters_df', 'exception_list', 'time_on_validation_avg', 'time_on_train_avg', 'time_on_train_total', 'time_on_test_avg', 'result_on_test_best', 'time_on_validation_total', 'hyperparameters_best', 'time_on_test_total', 'cutoff_to_optimize', 'result_on_validation_best'])

In [20]:
hyperparameters_df = search_metadata["hyperparameters_df"]
hyperparameters_df

Unnamed: 0,epochs,sgd_mode,topK,lambda_i,lambda_j,learning_rate
0,285,adam,16,0.020829,0.007734,0.000469
1,690,adagrad,135,0.000483,0.003285,0.000524
2,30,adam,505,0.00035,0.008732,0.000451
3,690,adagrad,149,0.003476,0.003283,0.003664
4,690,adagrad,106,0.000108,0.014982,0.008845
5,15,adagrad,5,0.1,0.0001,0.1
6,15,adagrad,5,0.1,0.1,0.1
7,435,sgd,700,0.0001,0.0001,0.006785
8,690,adagrad,700,0.0001,0.0001,0.00105
9,180,adagrad,700,0.1,0.0001,0.006118


In [21]:
result_on_validation_df = search_metadata["result_on_validation_df"]
result_on_validation_df

Unnamed: 0_level_0,Unnamed: 1_level_0,PRECISION,PRECISION_RECALL_MIN_DEN,RECALL,MAP,MAP_MIN_DEN,MRR,NDCG,F1,HIT_RATE,ARHR_ALL_HITS,...,COVERAGE_ITEM_CORRECT,COVERAGE_USER,COVERAGE_USER_CORRECT,DIVERSITY_GINI,SHANNON_ENTROPY,RATIO_DIVERSITY_HERFINDAHL,RATIO_DIVERSITY_GINI,RATIO_SHANNON_ENTROPY,RATIO_AVERAGE_POPULARITY,RATIO_NOVELTY
Unnamed: 0_level_1,cutoff,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,10,0.324397,0.325415,0.054589,0.193441,0.194014,0.603041,0.344917,0.093453,0.939538,1.062778,...,0.011573,0.999634,0.939194,0.002084,5.583664,0.972149,0.008405,0.450656,3.475954,0.024567
1,10,0.342741,0.344168,0.05918,0.208171,0.208917,0.61925,0.363672,0.100932,0.949285,1.116805,...,0.014065,0.999634,0.948938,0.002322,5.750905,0.974876,0.009367,0.464154,3.479642,0.024543
2,10,0.320799,0.321942,0.053696,0.191862,0.19247,0.600067,0.341854,0.091994,0.936534,1.054534,...,0.009469,0.999634,0.93619,0.00174,5.319708,0.968631,0.00702,0.429353,3.585404,0.024392
3,10,0.367988,0.369732,0.065189,0.225554,0.226517,0.635369,0.386759,0.110758,0.961524,1.177025,...,0.025084,0.999634,0.961172,0.004072,6.51063,0.982287,0.016425,0.525472,3.142988,0.02517
4,10,0.378358,0.380181,0.067512,0.232376,0.233363,0.63666,0.395301,0.11458,0.966728,1.196438,...,0.033667,0.999634,0.966374,0.005804,6.982673,0.986025,0.023411,0.56357,2.909878,0.025644
5,10,0.017823,0.017846,0.002334,0.006585,0.006597,0.053947,0.019106,0.004127,0.147893,0.059488,...,0.062074,0.999634,0.147839,0.042184,9.507766,0.996288,0.170153,0.76737,0.069823,0.040908
6,10,0.012679,0.012689,0.001663,0.004299,0.004302,0.036395,0.013117,0.002941,0.109564,0.03955,...,0.046348,0.999634,0.109524,0.026102,8.725268,0.993613,0.105285,0.704215,0.055454,0.041571
7,10,0.384969,0.387002,0.069619,0.236558,0.23772,0.641263,0.401372,0.117914,0.972151,1.211612,...,0.040202,0.999634,0.971795,0.006797,7.189164,0.987326,0.027416,0.580236,2.898706,0.025584
8,10,0.335471,0.33684,0.057402,0.201943,0.202678,0.611837,0.355951,0.09803,0.948553,1.093202,...,0.01185,0.999634,0.948205,0.001955,5.501457,0.971497,0.007884,0.444022,3.540049,0.024467
9,10,0.366046,0.367572,0.064194,0.222213,0.22291,0.624633,0.382409,0.109232,0.961085,1.15897,...,0.039537,0.999634,0.960733,0.008322,7.571927,0.991292,0.03357,0.611129,2.592814,0.026191


In [22]:
best_hyperparameters = search_metadata["hyperparameters_best"]
best_hyperparameters

{'epochs': 435,
 'sgd_mode': 'sgd',
 'topK': 700,
 'lambda_i': 0.0001,
 'lambda_j': 0.0001,
 'learning_rate': 0.006784502931291649}

This are the best hyperparameters found by the bayesian search, we will use them in our model

In [23]:
#let's fit the model with the hyperparamethers obtained from the previous search and evaluate them on validation set

recommender = SLIM_BPR_Cython(URM_all)
recommender.fit(epochs=650, sgd_mode = "sgd", topK = 483, lambda_i = 0.0006712905081189398, lambda_j = 0.06584150350451998, learning_rate = 0.0036482363905043207)
#evaluator_valid.evaluateRecommender(recommender)

SLIM_BPR_Recommender: Automatic selection of fastest train mode. Available RAM is 15211.00 MB (84.50%) of 18001.00 MB, required is 1304.51 MB. Using dense matrix.
Processed 13650 (100.0%) in 1.32 sec. BPR loss is 5.24E-01. Sample per second: 10351
SLIM_BPR_Recommender: Epoch 1 of 650. Elapsed time 0.66 sec
Processed 13650 (100.0%) in 0.98 sec. BPR loss is 1.90E+00. Sample per second: 13933
SLIM_BPR_Recommender: Epoch 2 of 650. Elapsed time 1.32 sec
Processed 13650 (100.0%) in 1.63 sec. BPR loss is 3.14E+00. Sample per second: 8390
SLIM_BPR_Recommender: Epoch 3 of 650. Elapsed time 1.97 sec
Processed 13650 (100.0%) in 1.35 sec. BPR loss is 4.32E+00. Sample per second: 10105
SLIM_BPR_Recommender: Epoch 4 of 650. Elapsed time 2.70 sec
Processed 13650 (100.0%) in 1.07 sec. BPR loss is 5.11E+00. Sample per second: 12748
SLIM_BPR_Recommender: Epoch 5 of 650. Elapsed time 3.42 sec
Processed 13650 (100.0%) in 0.76 sec. BPR loss is 6.35E+00. Sample per second: 17988
SLIM_BPR_Recommender: Epoch 

In [24]:
recommender.save_model(output_folder_path, file_name = recommender.RECOMMENDER_NAME + "_my_own_save.zip" )

SLIM_BPR_Recommender: Saving model in file 'result_experiments/SLIM_BPR_Recommender_my_own_save.zip'
SLIM_BPR_Recommender: Saving complete


# Create final recommendations

In [25]:
test_users = pd.read_csv('../input/recommender-system-2021-challenge-polimi/data_target_users_test.csv')
test_users

Unnamed: 0,user_id
0,0
1,1
2,2
3,3
4,4
...,...
13645,13645
13646,13646
13647,13647
13648,13648


In [26]:
user_id = test_users['user_id']
recommendations = []
for user in user_id:
    recommendations.append(recommender.recommend(user,cutoff = 10))

In [27]:
for index in range(len(recommendations)):
    recommendations[index]=np.array(recommendations[index])
    
test_users['item_list']= recommendations
test_users['item_list'] = pd.DataFrame([str(line).strip('[').strip(']').replace("'","") for line in test_users['item_list']])
test_users.to_csv('submission.csv', index=False)