In [1]:
import pandas as pd
import numpy as np
import pickle
from scipy.sparse.linalg import norm
from scipy.sparse import csr_matrix, csc_matrix, coo_matrix, issparse, vstack
from pandas.api.types import CategoricalDtype
from IPython.display import clear_output
import timeit, json, warnings
from tqdm import notebook

# Prediction
----

In [2]:
def _load_pickle(path):
    """Unpickle and return the object stored at `path`, closing the file handle afterwards."""
    # NOTE(security): pickle.load can execute arbitrary code; only load files produced by this project.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


def _predict_weighted(R, batchsize, similar_items_path, similarity_path):
    """
    Weighted-sum item-kNN prediction (equation 2.8) used for the 'rating' and 'count' modes.

    :param R: scipy.sparse CSR matrix (user/item) whose rows are the users to predict for
    :param batchsize: number of rows of `R` processed per batch
    :param similar_items_path: CSV listing the nearest neighbours of every item
    :param similarity_path: pickle holding the full item/item similarity matrix

    :returns: scipy.sparse coo_matrix with one row per row of `R`
    """
    df_s = pd.read_csv(similar_items_path)
    sim = _load_pickle(similarity_path)

    # After the two leading columns the CSV is read as three equally wide column groups:
    # the middle group is used as neighbour indices, the last group as their similarities.
    df_ix = df_s.iloc[:, 2:]
    k = df_ix.shape[1] // 3
    s_ix_np = df_ix.iloc[:, k:-k].to_numpy()
    sim_product = df_s.iloc[:, -k:].to_numpy()

    # Build a sparse similarity matrix in which column i only keeps the similarities of
    # the k nearest neighbours of item i; everything else is zero for the dot product.
    col_ix = np.repeat(np.arange(s_ix_np.shape[0]), s_ix_np.shape[1])
    row_ix = s_ix_np.astype(int).ravel()
    A = np.zeros(sim.shape)
    A[row_ix, col_ix] = 1
    # assumes `sim` is a dense array so that `*` is the element-wise (Hadamard) product - TODO confirm
    S = csr_matrix(A * sim)

    # One normalisation factor per item: 1 / sum_j |s(i, j)| over the k neighbours.
    denominators = 1 / np.sum(np.absolute(sim_product), axis=1)
    if denominators.shape[0] != S.shape[1]:
        raise ValueError(
            'inconsistent shapes: %d rows in the similar-items file but %d columns in the similarity matrix'
            % (denominators.shape[0], S.shape[1])
        )

    batches = []
    for start in range(0, R.shape[0], batchsize):
        # numerators: sum_j s(i, j) * r_{u, j}
        numerators = R[start:start + batchsize].dot(S).tocoo()
        # Scale every stored value by the factor of its column. This equals the Hadamard
        # product with a dense (batch x items) factor matrix without materialising it.
        batches.append(
            coo_matrix(
                (numerators.data * denominators[numerators.col], (numerators.row, numerators.col)),
                shape=numerators.shape,
            )
        )

    if not batches:
        # No users given: return an empty matrix of the documented type instead of None.
        return coo_matrix((0, S.shape[1]))
    # Stack once at the end; stacking inside the loop re-copies all previous batches each time.
    return vstack(batches, format='coo')


def predict(R, mode, batchsize=10000,
            similar_items_path='../data/similar_items/freq_rating_item_similar_objects.csv',
            rating_similarity_path='../data/similarity/freq_rating_item_similarity.pkl',
            binary_similarity_path='../data/similarity/freq_binary_item_similarity.pkl'):
    """
    Generates the predictions for each given user in `R`

    :param R: users to predict recommendations, usually the interaction matrix (user/item), either: pd.DataFrame, np.ndarray, np.matrix or scipy.sparse matrix (any format)
    :param mode: how `R` was created. 'rating'/'count' will result in equation 2.8, 'binary' will result in equation 2.10
    :param batchsize: number of users predicted per batch in 'rating'/'count' mode
    :param similar_items_path: CSV with the nearest neighbours per item ('rating'/'count' mode)
    :param rating_similarity_path: pickled item/item similarity matrix ('rating'/'count' mode)
    :param binary_similarity_path: pickled item/item similarity matrix ('binary' mode)

    :returns: - scipy.sparse coo_matrix if `mode` was 'rating' or 'count'
              - np.array if `mode` was 'binary' (float32)
              - None (after emitting a warning) if `mode` is not one of the above

    Both equations are taken from:
    Collaborative Filtering Recommender Systems 2010
    By Michael D. Ekstrand, John T. Riedl and Joseph A. Konstan
    """
    if mode not in ('rating', 'count', 'binary'):
        # Still returns None for backward compatibility, but no longer silently.
        warnings.warn(
            "predict: unknown mode %r (expected 'rating', 'count' or 'binary'); returning None" % (mode,),
            stacklevel=2,
        )
        return None

    # Normalise R to CSR: it supports the row slicing used for batching (COO does not)
    # and is the format used for the dot product in both modes.
    if isinstance(R, pd.DataFrame):
        R = R.values
    R = csr_matrix(R)

    if mode == 'binary':
        # This mode works only for unary scores.
        # Formula: p_{u,i} = sum_{j in I_u}(s(i,j)) | I_u is the user's purchase history (equation 2.10)
        # The dot product works because non-bought items are weighted as zero.
        S = _load_pickle(binary_similarity_path)
        # np.float32 doubles execution time, but reduces memory requirements by half
        return np.float32(R.dot(S))

    # 'rating' / 'count': works if the rating is NOT unary AND similarity scores cannot be
    # negative when ratings are constrained to be nonnegative.
    # Formula: p_{u,i} = (sum_{j in S}(s(i,j)*r_{u,j})) / (sum_{j in S}(abs(s(i,j)))) | S is a set of items similar to i (equation 2.8)
    return _predict_weighted(R, batchsize, similar_items_path, rating_similarity_path)

In [3]:
# Load the rating-based user/item interaction matrix; the context manager closes the
# file handle, which a bare `pickle.load(open(...))` would leave open.
with open('../data/interaction/freq_rating_item_interaction.pkl', 'rb') as fh:
    R = pickle.load(fh)
#predict(R[:10000], 'rating')

<10000x9936 sparse matrix of type '<class 'numpy.float64'>'
	with 39591080 stored elements in COOrdinate format>

In [4]:
# Load the binary user/item interaction matrix; the context manager closes the
# file handle, which a bare `pickle.load(open(...))` would leave open.
with open('../data/interaction/freq_binary_item_interaction.pkl', 'rb') as fh:
    R = pickle.load(fh)
#p = predict(R, 'binary')
#p