In [1]:
!cp -r ../input/recsys-repo/RecSys_Course_AT_PoliMi-master/* ./

In [2]:
%config Completer.use_jedi = False
import pandas as pd
import numpy as np
import scipy.sparse as sps
import matplotlib.pyplot as pyplot
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

URM_path = "../input/recommender-system-2021-challenge-polimi/data_train.csv"
URM_all_dataframe = pd.read_csv(filepath_or_buffer=URM_path, 
                                sep=",",
                                dtype={0:int, 1:int, 2:float},
                                header=0)
URM_all_dataframe.columns = ["UserID", "ItemID", "Interaction"]

userID_unique = URM_all_dataframe["UserID"].unique()
itemID_unique = URM_all_dataframe["ItemID"].unique()

n_users = len(userID_unique)
n_items = len(itemID_unique)
n_interactions = len(URM_all_dataframe)

URM_all = sps.coo_matrix((URM_all_dataframe["Interaction"].values, 
                          (URM_all_dataframe["UserID"].values, URM_all_dataframe["ItemID"].values)))
URM_all = URM_all.tocsr() # to obtain fast access to rows (users)

In [3]:
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

# split data into train and validation data 80/20
URM_train, URM_valid = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.80)



In [4]:
from Evaluation.Evaluator import EvaluatorHoldout

#create an evaluator object to evaluate validation set
#we will use it for hyperparameter tuning
evaluator_valid = EvaluatorHoldout(URM_valid, cutoff_list=[10])

EvaluatorHoldout: Ignoring 13646 ( 0.0%) Users that have less than 1 test interactions


In [5]:
import numpy as np
import scipy.sparse as sps
from Recommenders.Recommender_utils import check_matrix
from sklearn.linear_model import ElasticNet
from Recommenders.BaseSimilarityMatrixRecommender import BaseItemSimilarityMatrixRecommender
from Utils.seconds_to_biggest_unit import seconds_to_biggest_unit
import time, sys
from tqdm import tqdm
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

# os.environ["PYTHONWARNINGS"] = ('ignore::exceptions.ConvergenceWarning:sklearn.linear_model')
# os.environ["PYTHONWARNINGS"] = ('ignore:Objective did not converge:ConvergenceWarning:')

class SLIMElasticNetRecommender(BaseItemSimilarityMatrixRecommender):
    """
    Train a Sparse Linear Methods (SLIM) item similarity model.
    NOTE: ElasticNet solver is parallel, a single intance of SLIM_ElasticNet will
          make use of half the cores available
    See:
        Efficient Top-N Recommendation by Linear Regression,
        M. Levy and K. Jack, LSRS workshop at RecSys 2013.
        SLIM: Sparse linear methods for top-n recommender systems,
        X. Ning and G. Karypis, ICDM 2011.
        http://glaros.dtc.umn.edu/gkhome/fetch/papers/SLIM2011icdm.pdf
    """

    RECOMMENDER_NAME = "SLIMElasticNetRecommender"

    def __init__(self, URM_train, verbose = True):
        super(SLIMElasticNetRecommender, self).__init__(URM_train, verbose = verbose)

    @ignore_warnings(category=ConvergenceWarning)
    def fit(self, l1_ratio=0.1, alpha = 1.0, positive_only=True, topK = 100,**earlystopping_kwargs):

        assert l1_ratio>= 0 and l1_ratio<=1, "{}: l1_ratio must be between 0 and 1, provided value was {}".format(self.RECOMMENDER_NAME, l1_ratio)

        self.l1_ratio = l1_ratio
        self.positive_only = positive_only
        self.topK = topK


        # initialize the ElasticNet model
        self.model = ElasticNet(alpha=alpha,
                                l1_ratio=self.l1_ratio,
                                positive=self.positive_only,
                                fit_intercept=False,
                                copy_X=False,
                                precompute=True,
                                selection='random',
                                max_iter=100,
                                tol=1e-4)

        URM_train = check_matrix(self.URM_train, 'csc', dtype=np.float32)

        n_items = URM_train.shape[1]

        # Use array as it reduces memory requirements compared to lists
        dataBlock = 10000000

        rows = np.zeros(dataBlock, dtype=np.int32)
        cols = np.zeros(dataBlock, dtype=np.int32)
        values = np.zeros(dataBlock, dtype=np.float32)

        numCells = 0

        start_time = time.time()
        start_time_printBatch = start_time

        # fit each item's factors sequentially (not in parallel)
        for currentItem in range(n_items):

            # get the target column
            y = URM_train[:, currentItem].toarray()

            # set the j-th column of X to zero
            start_pos = URM_train.indptr[currentItem]
            end_pos = URM_train.indptr[currentItem + 1]

            current_item_data_backup = URM_train.data[start_pos: end_pos].copy()
            URM_train.data[start_pos: end_pos] = 0.0

            # fit one ElasticNet model per column
            self.model.fit(URM_train, y)

            # self.model.coef_ contains the coefficient of the ElasticNet model
            # let's keep only the non-zero values

            # Select topK values
            # Sorting is done in three steps. Faster then plain np.argsort for higher number of items
            # - Partition the data to extract the set of relevant items
            # - Sort only the relevant items
            # - Get the original item index

            nonzero_model_coef_index = self.model.sparse_coef_.indices
            nonzero_model_coef_value = self.model.sparse_coef_.data

            local_topK = min(len(nonzero_model_coef_value)-1, self.topK)

            relevant_items_partition = (-nonzero_model_coef_value).argpartition(local_topK)[0:local_topK]
            relevant_items_partition_sorting = np.argsort(-nonzero_model_coef_value[relevant_items_partition])
            ranking = relevant_items_partition[relevant_items_partition_sorting]

            for index in range(len(ranking)):

                if numCells == len(rows):
                    rows = np.concatenate((rows, np.zeros(dataBlock, dtype=np.int32)))
                    cols = np.concatenate((cols, np.zeros(dataBlock, dtype=np.int32)))
                    values = np.concatenate((values, np.zeros(dataBlock, dtype=np.float32)))


                rows[numCells] = nonzero_model_coef_index[ranking[index]]
                cols[numCells] = currentItem
                values[numCells] = nonzero_model_coef_value[ranking[index]]

                numCells += 1

            # finally, replace the original values of the j-th column
            URM_train.data[start_pos:end_pos] = current_item_data_backup

            elapsed_time = time.time() - start_time
            new_time_value, new_time_unit = seconds_to_biggest_unit(elapsed_time)


            if time.time() - start_time_printBatch > 300 or currentItem == n_items-1:
                self._print("Processed {} ({:4.1f}%) in {:.2f} {}. Items per second: {:.2f}".format(
                    currentItem+1,
                    100.0* float(currentItem+1)/n_items,
                    new_time_value,
                    new_time_unit,
                    float(currentItem)/elapsed_time))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_printBatch = time.time()

        # generate the sparse weight matrix
        self.W_sparse = sps.csr_matrix((values[:numCells], (rows[:numCells], cols[:numCells])),
                                       shape=(n_items, n_items), dtype=np.float32)

In [6]:
recommender_SLIMElasticNet = SLIMElasticNetRecommender(URM_train)
recommender_SLIMElasticNet.fit(l1_ratio = 0.004529437143089812, alpha = 0.0647636014497447, positive_only = True, topK = 332)

SLIMElasticNetRecommender: Processed 1584 ( 8.8%) in 5.00 min. Items per second: 5.28
SLIMElasticNetRecommender: Processed 3200 (17.7%) in 10.00 min. Items per second: 5.33
SLIMElasticNetRecommender: Processed 4806 (26.6%) in 15.00 min. Items per second: 5.34
SLIMElasticNetRecommender: Processed 6367 (35.3%) in 20.01 min. Items per second: 5.30
SLIMElasticNetRecommender: Processed 7943 (44.0%) in 25.01 min. Items per second: 5.29
SLIMElasticNetRecommender: Processed 9521 (52.7%) in 30.01 min. Items per second: 5.29
SLIMElasticNetRecommender: Processed 11123 (61.6%) in 35.01 min. Items per second: 5.29
SLIMElasticNetRecommender: Processed 12748 (70.6%) in 40.01 min. Items per second: 5.31
SLIMElasticNetRecommender: Processed 14390 (79.7%) in 45.02 min. Items per second: 5.33
SLIMElasticNetRecommender: Processed 16025 (88.7%) in 50.02 min. Items per second: 5.34
SLIMElasticNetRecommender: Processed 17575 (97.3%) in 55.02 min. Items per second: 5.32
SLIMElasticNetRecommender: Processed 18

In [7]:
import pyximport
pyximport.install()
!python run_compile_all_cython.py

run_compile_all_cython: Found 10 Cython files in 4 folders...
run_compile_all_cython: All files will be compiled using your current python environment: '/opt/conda/bin/python'
Compiling [1/10]: MatrixFactorizationImpressions_Cython_Epoch.pyx... 
In file included from [01m[K/opt/conda/lib/python3.7/site-packages/numpy/core/include/numpy/ndarraytypes.h:1822[m[K,
                 from [01m[K/opt/conda/lib/python3.7/site-packages/numpy/core/include/numpy/ndarrayobject.h:12[m[K,
                 from [01m[K/opt/conda/lib/python3.7/site-packages/numpy/core/include/numpy/arrayobject.h:4[m[K,
                 from [01m[KMatrixFactorizationImpressions_Cython_Epoch.c:620[m[K:
      |  [01;35m[K^~~~~~~[m[K
[01m[KMatrixFactorizationImpressions_Cython_Epoch.c:[m[K In function ‘[01m[K__pyx_f_43MatrixFactorizationImpressions_Cython_Epoch_32MatrixFactorization_Cython_Epoch_sampleBPR_Cython[m[K’:
12608 |       [01;35m[K__pyx_t_4 = (__pyx_v_start_pos_impression_ite

In [8]:
from Recommenders.SLIM.Cython.SLIM_BPR_Cython import SLIM_BPR_Cython
recommender_SLIM_BPR_Cython = SLIM_BPR_Cython(URM_train)
recommender_SLIM_BPR_Cython.fit(epochs=650, sgd_mode = "sgd", topK = 483, lambda_i = 0.0006712905081189398, 
                lambda_j = 0.06584150350451998, learning_rate = 0.0036482363905043207)

SLIM_BPR_Recommender: Automatic selection of fastest train mode. Available RAM is 16722.00 MB (92.89%) of 18001.00 MB, required is 1304.51 MB. Using dense matrix.
Processed 13650 (100.0%) in 0.88 sec. BPR loss is 2.75E-01. Sample per second: 15596
SLIM_BPR_Recommender: Epoch 1 of 650. Elapsed time 0.60 sec
Processed 13650 (100.0%) in 1.48 sec. BPR loss is 1.10E+00. Sample per second: 9229
SLIM_BPR_Recommender: Epoch 2 of 650. Elapsed time 1.21 sec
Processed 13650 (100.0%) in 1.07 sec. BPR loss is 1.91E+00. Sample per second: 12731
SLIM_BPR_Recommender: Epoch 3 of 650. Elapsed time 1.80 sec
Processed 13650 (100.0%) in 0.66 sec. BPR loss is 2.71E+00. Sample per second: 20596
SLIM_BPR_Recommender: Epoch 4 of 650. Elapsed time 2.39 sec
Processed 13650 (100.0%) in 1.25 sec. BPR loss is 3.45E+00. Sample per second: 10922
SLIM_BPR_Recommender: Epoch 5 of 650. Elapsed time 2.98 sec
Processed 13650 (100.0%) in 0.84 sec. BPR loss is 4.05E+00. Sample per second: 16213
SLIM_BPR_Recommender: Epoch 

In [9]:
from Recommenders.MatrixFactorization.IALSRecommender import IALSRecommender

recommender_mf_ials = IALSRecommender(URM_train)
recommender_mf_ials.fit(num_factors = 48, confidence_scaling= 'linear', alpha = 1.0, epsilon = 0.10715471885641545, reg = 0.0005294631576919714, epochs = 30)

IALSRecommender: Epoch 1 of 30. Elapsed time 11.55 sec
IALSRecommender: Epoch 2 of 30. Elapsed time 22.84 sec
IALSRecommender: Epoch 3 of 30. Elapsed time 35.42 sec
IALSRecommender: Epoch 4 of 30. Elapsed time 47.78 sec
IALSRecommender: Epoch 5 of 30. Elapsed time 58.74 sec
IALSRecommender: Epoch 6 of 30. Elapsed time 1.19 min
IALSRecommender: Epoch 7 of 30. Elapsed time 1.39 min
IALSRecommender: Epoch 8 of 30. Elapsed time 1.57 min
IALSRecommender: Epoch 9 of 30. Elapsed time 1.77 min
IALSRecommender: Epoch 10 of 30. Elapsed time 1.98 min
IALSRecommender: Epoch 11 of 30. Elapsed time 2.16 min
IALSRecommender: Epoch 12 of 30. Elapsed time 2.36 min
IALSRecommender: Epoch 13 of 30. Elapsed time 2.56 min
IALSRecommender: Epoch 14 of 30. Elapsed time 2.77 min
IALSRecommender: Epoch 15 of 30. Elapsed time 2.95 min
IALSRecommender: Epoch 16 of 30. Elapsed time 3.15 min
IALSRecommender: Epoch 17 of 30. Elapsed time 3.36 min
IALSRecommender: Epoch 18 of 30. Elapsed time 3.54 min
IALSRecommende

In [10]:
from Recommenders.MatrixFactorization.NMFRecommender import NMFRecommender

recommender_NMF = NMFRecommender(URM_train)
recommender_NMF.fit(num_factors = 36, beta_loss = 'frobenius', l1_ratio = 1.0)

NMFRecommender: Computing NMF decomposition...
NMFRecommender: Computing NMF decomposition... done in 1.17 min


In [11]:
print("MAP of the starting models")

result_df, _ = evaluator_valid.evaluateRecommender(recommender_SLIMElasticNet)
print("SLIM ElasticNet - MAP: {}".format(result_df.loc[10]["MAP"]))

result_df, _ = evaluator_valid.evaluateRecommender(recommender_SLIM_BPR_Cython)
print("SLIM BPR - MAP: {}".format(result_df.loc[10]["MAP"]))

result_df, _ = evaluator_valid.evaluateRecommender(recommender_mf_ials)
print("MF IALS - MAP: {}".format(result_df.loc[10]["MAP"]))

result_df, _ = evaluator_valid.evaluateRecommender(recommender_NMF)
print("NMF - MAP: {}".format(result_df.loc[10]["MAP"]))

MAP of the starting models
EvaluatorHoldout: Processed 13646 (100.0%) in 24.34 sec. Users per second: 561
SLIM ElasticNet - MAP: 0.24700765559533663
EvaluatorHoldout: Processed 13646 (100.0%) in 23.81 sec. Users per second: 573
SLIM BPR - MAP: 0.2395518280838154
EvaluatorHoldout: Processed 13646 (100.0%) in 14.58 sec. Users per second: 936
MF IALS - MAP: 0.23390040165267473
EvaluatorHoldout: Processed 13646 (100.0%) in 25.24 sec. Users per second: 541
NMF - MAP: 0.21594444211804648


In [12]:
from numpy import linalg as LA
from Recommenders.BaseRecommender import BaseRecommender

class DifferentLossScoresHybridRecommender(BaseRecommender):
    """ ScoresHybridRecommender
    Hybrid of four predictions scores
    R = R1*alpha + R2*beta + R3*theta + R3*(1-alpha-beta-theta)
    
    Class from Dacrema exercise modified by Antonio Ercolani
    The original took as input 2 recommender

    """

    RECOMMENDER_NAME = "DifferentLossScoresHybridRecommender"


    def __init__(self, URM_train, recommender_1, recommender_2, recommender_3, recommender_4):
        super(DifferentLossScoresHybridRecommender, self).__init__(URM_train)

        self.URM_train = sps.csr_matrix(URM_train)
        self.recommender_1 = recommender_1
        self.recommender_2 = recommender_2
        self.recommender_3 = recommender_3
        self.recommender_4 = recommender_4
        
        
        
    def fit(self, norm, alpha = 0.5, beta = 0.5, theta = 0):

        self.alpha = alpha
        self.beta = beta
        self.theta = theta
        self.norm = norm


    def _compute_item_score(self, user_id_array, items_to_compute):
        
        item_weights_1 = self.recommender_1._compute_item_score(user_id_array)
        item_weights_2 = self.recommender_2._compute_item_score(user_id_array)
        item_weights_3 = self.recommender_3._compute_item_score(user_id_array)
        item_weights_4 = self.recommender_4._compute_item_score(user_id_array)

        norm_item_weights_1 = LA.norm(item_weights_1, self.norm)
        norm_item_weights_2 = LA.norm(item_weights_2, self.norm)
        norm_item_weights_3 = LA.norm(item_weights_3, self.norm)
        norm_item_weights_4 = LA.norm(item_weights_4, self.norm)
        
        
        if norm_item_weights_1 == 0:
            raise ValueError("Norm {} of item weights for recommender 1 is zero. Avoiding division by zero".format(self.norm))
        
        if norm_item_weights_2 == 0:
            raise ValueError("Norm {} of item weights for recommender 2 is zero. Avoiding division by zero".format(self.norm))
            
        if norm_item_weights_3 == 0:
            raise ValueError("Norm {} of item weights for recommender 3 is zero. Avoiding division by zero".format(self.norm))
            
        if norm_item_weights_4 == 0:
            raise ValueError("Norm {} of item weights for recommender 4 is zero. Avoiding division by zero".format(self.norm))
        
        item_weights = item_weights_1 / norm_item_weights_1 * self.alpha + item_weights_2 / norm_item_weights_2 * self.beta + item_weights_3 / norm_item_weights_3 * self.theta + item_weights_4 / norm_item_weights_4 * (1-self.alpha-self.beta-self.theta)

        return item_weights

In [13]:
recommender_object = DifferentLossScoresHybridRecommender(URM_train, recommender_SLIMElasticNet, recommender_SLIM_BPR_Cython, recommender_mf_ials, recommender_NMF)

best_model = {
    "MAP" : 0,
    "alpha" : 0,
    "beta" : 0,
    "theta" : 0,
    "norm" : 0
}

for norm in [2]:
    for alpha in np.arange(0.2, 1.1, 0.1):
        for beta in np.arange(0.3, 1.1, 0.1):
            for theta in np.arange(0.4, 1.1, 0.1):
            
                #truncate digits since np.arange sometimes doesn't
                alpha = round(alpha,1)
                beta = round(beta,1)
                theta = round(theta,1)


                #discard cases in which the sum is greater than 1 
                if ( (alpha+beta+theta) <= 1): 
                    gamma = round(1-alpha-beta-theta,1)

                    print("----")
                    recommender_object.fit(norm, alpha, beta, theta)
                    result_df, _ = evaluator_valid.evaluateRecommender(recommender_object)
                    print("Norm: {}, Alpha: {}, Beta: {}, Theta: {}, Gamma: {}, Result: {}".format(norm, alpha, beta, theta, gamma, result_df.loc[10]["MAP"]))

                    if result_df.loc[10]["MAP"] > best_model["MAP"]:
                        best_model["MAP"] = result_df.loc[10]["MAP"]
                        best_model["alpha"] = alpha
                        best_model["beta"] = beta
                        best_model["theta"] = theta
                        best_model["norm"] = norm

                        print("*** New best model found! ")
                        print("New best model has MAP: {} with alpha: {}, beta: {}, theta: {}, gamma: {}, norm: {}".format(best_model["MAP"], best_model["alpha"], best_model["beta"],
                                                                                                                           best_model["theta"], gamma,
                                                                                                                           best_model["norm"]))

print("----")
print("Best model has MAP: {} with alpha: {}, beta: {}, theta: {}, norm: {}".format(best_model["MAP"], best_model["alpha"], best_model["beta"], best_model["theta"], best_model["norm"]))

----
EvaluatorHoldout: Processed 13646 (100.0%) in 3.96 min. Users per second: 57
Norm: 2, Alpha: 0.2, Beta: 0.3, Theta: 0.4, Gamma: 0.1, Result: 0.24864367777987173
*** New best model found! 
New best model has MAP: 0.24864367777987173 with alpha: 0.2, beta: 0.3, theta: 0.4, gamma: 0.1, norm: 2
----
EvaluatorHoldout: Processed 13646 (100.0%) in 3.56 min. Users per second: 64
Norm: 2, Alpha: 0.2, Beta: 0.3, Theta: 0.5, Gamma: 0.0, Result: 0.24907279940165017
*** New best model found! 
New best model has MAP: 0.24907279940165017 with alpha: 0.2, beta: 0.3, theta: 0.5, gamma: 0.0, norm: 2
----
EvaluatorHoldout: Processed 13646 (100.0%) in 3.23 min. Users per second: 70
Norm: 2, Alpha: 0.2, Beta: 0.4, Theta: 0.4, Gamma: 0.0, Result: 0.2501788738603555
*** New best model found! 
New best model has MAP: 0.2501788738603555 with alpha: 0.2, beta: 0.4, theta: 0.4, gamma: 0.0, norm: 2
----
EvaluatorHoldout: Processed 13646 (100.0%) in 3.21 min. Users per second: 71
Norm: 2, Alpha: 0.3, Beta: 0.