In [1]:
import pandas as pd
import numpy as np
import pickle
from scipy.sparse.linalg import norm
from scipy.sparse import csr_matrix, csc_matrix, coo_matrix, issparse, vstack
from pandas.api.types import CategoricalDtype
from IPython.display import clear_output
import timeit, json, warnings
from tqdm import notebook

# Prediction
----

In [2]:
# Table of the most similar items per product for the rating-based item model.
df_s = pd.read_csv('../data/similar_items/freq_rating_item_similar_objects.csv')

# NOTE(review): the slicing below presumes the similarity values occupy the
# last `s_width` columns of the CSV -- confirm against the file that is loaded.
s_width = int(df_s.shape[1] / 2) # size of similarity values
s_products = df_s.iloc[:, 1:s_width+1]
sim_product = df_s.iloc[:, -s_width:].to_numpy()

# Context managers make sure the file handles are closed again
# (the bare `open(...)` calls used before leaked them).
with open('../data/interaction/products/freq_rating_item_products.json') as products_file:
    products = json.load(products_file)

# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open('../data/similarity/freq_rating_item_similarity.pkl', 'rb') as sim_file:
    sim = pickle.load(sim_file)

df_s

Unnamed: 0,id,Recommendation for product:,Product 1.,Product 2.,Product 3.,Product 4.,Product 5.,Product 6.,Product 7.,Product 8.,...,Similarity 6.,Similarity 7.,Similarity 8.,Similarity 9.,Similarity 10.,Similarity 11.,Similarity 12.,Similarity 13.,Similarity 14.,Similarity 15.
0,0,#2 Coffee Filters,Compostable Coffee Filters,Organic Hass Avocado,Organic Garlic,Bag of Organic Bananas,Organic Yellow Onion,Major Dickason's Blend Ground Coffee Dark Roast,Organic Strawberries,Organic Lemon,...,0.049973,0.048253,0.047569,0.047076,0.046623,0.046103,0.045945,0.044290,0.043183,0.043164
1,1,0% Fat Blueberry Greek Yogurt,Greek 0% Fat Strawberry on the Bottom Yogurt,0% Fat Superfruits Greek Yogurt,0% Fat Organic Greek Vanilla Yogurt,"0% Greek, Blueberry on the Bottom Yogurt",Nonfat Strawberry With Fruit On The Bottom Gre...,"Organic Nonfat Yogurt, Peach",YoKids Strawberry Banana Organic Lowfat Yogurt,Vegetable Dumpling,...,0.039299,0.037897,0.035791,0.032983,0.032233,0.032127,0.031653,0.031442,0.030699,0.030687
2,2,0% Fat Free Organic Milk,Vitamin D Organic Whole Milk,Organic Skim Milk,Organic 1% Milk,2% Reduced Fat Organic Milk,Banana,Organic Skim Milk with DHA Omega-3,Large Lemon,Organic Large Grade A Brown Eggs,...,0.063104,0.059423,0.055346,0.054124,0.053103,0.051740,0.049951,0.048660,0.047975,0.047058
3,3,0% Fat Organic Greek Vanilla Yogurt,Vanilla Greek Yogurt 0% Fat,0% Fat Blueberry Greek Yogurt,Greek 0% Fat Strawberry on the Bottom Yogurt,Organic Vanilla Bean Greek Yogurt,YoKids Strawberry Banana Organic Lowfat Yogurt,0% Fat Superfruits Greek Yogurt,Organic Strawberries,Nonfat Plain Greek Yogurt,...,0.077823,0.056596,0.053974,0.051517,0.046818,0.046076,0.045680,0.045299,0.045136,0.043073
4,4,0% Fat Superfruits Greek Yogurt,Greek 0% Fat Strawberry on the Bottom Yogurt,0% Fat Blueberry Greek Yogurt,0% Fat Organic Greek Vanilla Yogurt,Nonfat Strawberry With Fruit On The Bottom Gre...,YoKids Strawberry Banana Organic Lowfat Yogurt,"0% Greek, Blueberry on the Bottom Yogurt",Cherry Pomegranate Greek Yogurt,Vanilla Greek Yogurt 0% Fat,...,0.060427,0.058599,0.058476,0.057264,0.052049,0.048396,0.043193,0.042597,0.042537,0.042423
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9931,9931,of Hanover 100 Calorie Pretzels Mini,100 Calorie Pretzels Packs,Goldfish Cheddar Baked Snack Crackers Multi Packs,Cheez-It Packs,Classic Mix Variety,Variety Pack Snack Stacks,Pretzel Snaps Pretzel Packs,100 Calorie Per Bag Popcorn,Oreo Mini/Nutter Butter Bites/Mini Chips Ahoy!...,...,0.111210,0.110836,0.104913,0.103571,0.101299,0.099143,0.094165,0.090169,0.089389,0.088879
9932,9932,smartwater® Electrolyte Enhanced Water,Smartwater Electrolyte Enhanced Water,Electrolyte Enhanced Water,Sensitive Toilet Paper,Ultra Soft Facial Tissues,Cola Soft Drink,Original Glass Cleaner,Classic Hummus,Banana,...,0.060350,0.059629,0.057630,0.057304,0.057138,0.055620,0.055537,0.054315,0.053404,0.052331
9933,9933,vitaminwater® XXX Acai Blueberry Pomegranate,Power-C Dragonfruit Vitamin Water Drink,Vitaminwater Revive Fruit Punch,Energy Tropical Citrus Vitamin Water Drink,Vitaminwater® Focus Kiwi-Strawberry,Essential Orange-Orange Vitamin Water Drink,VitaminWater Zero™ XXX Acai Blueberry Pomegranate,Vitamin Water Zero Squeezed Lemonade,Zero Vitamin Water,...,0.102554,0.078562,0.077014,0.074649,0.070355,0.060045,0.048015,0.043550,0.042980,0.042882
9934,9934,with Crispy Almonds Cereal,Cherrios Honey Nut,Cinnamon Toast Crunch,Raisin Bran,Honey Bunches of Oats Roasted Oats,0% Greek Strained Yogurt,Clementines,Trail Mix,Cheez-It Cheddar Cracker,...,0.200865,0.196640,0.167157,0.165352,0.162647,0.159786,0.158736,0.155532,0.155194,0.154455


In [3]:
def predict(R, method):
    """
    Generates the predictions for each given user in `R`

    :param R: users to predict recommendations, usually the interaction matrix (user/item), either: pd.DataFrame, np.ndarray, np.matrix or scipy.sparse matrix
    :param method: method chosen of how `R` was created. 'rating'/'count' will result in equation 2.8, 'binary' will result in equation 2.10

    :returns: - scipy.sparse matrix if `method` was 'rating' or 'count'
              - np.array if `method` was 'binary' (float32)
    :raises ValueError: if `method` is not one of 'rating', 'count' or 'binary'

    Note: the 'rating'/'count' branch reads the notebook-level variables `sim`
    and `sim_product`; they must be defined before this function is called.
    """
    # Fail loudly on a misspelled method instead of silently returning None.
    if method not in ('rating', 'count', 'binary'):
        raise ValueError(
            f"unknown method {method!r}; expected 'rating', 'count' or 'binary'"
        )

    predictions = None

    # convert R to sparse column matrix if not already done;
    # scipy.sparse inputs (and any other type) are passed through unchanged
    if isinstance(R, pd.DataFrame):
        R = csc_matrix(R.values)
    elif isinstance(R, (np.matrix, np.ndarray)):
        R = csc_matrix(R)

    if method in ('rating', 'count'):
        # This method works if the rating is NOT unary AND
        # when it is NOT possible for similarity scores to be negative when
        # ratings are constrained to be nonnegative.
        #
        # Formula: p_{u,i} = (sum_{j∈S}(s(i,j)*r_{u,j})) / (sum_{j∈S}(abs(s(i,j))))
        #          | S is a set of items similar to i
        #
        # Equation 2.8 shown in:
        # Collaborative Filtering Recommender Systems 2010
        # By Michael D. Ekstrand, John T. Riedl and Joseph A. Konstan

        batchsize = 10000 # tests have shown that this is a good batch size to avoid performance issues
        df_s = pd.read_csv('../data/similar_items/freq_rating_item_similar_objects.csv')

        # select item similarity matrix
        # NOTE(review): presumes that, after the first two columns, the CSV holds
        # three equally wide column groups and that the middle group contains the
        # integer indices of the similar items -- confirm against the file.
        df_ix = df_s.iloc[:, 2:]
        num_items = int(df_ix.shape[1] / 3)
        s_ix_np = df_ix.iloc[:, num_items:-num_items].to_numpy()

        # create sparse similarity matrix where for each column the item_i just contains the k nearest similarities
        # rest is zero for matrix dot product
        n_rows, n_neighbours = s_ix_np.shape
        col_ix = np.repeat(np.arange(n_rows), n_neighbours)  # column i repeated once per neighbour
        row_ix = s_ix_np.astype(int).ravel()
        A = np.zeros(sim.shape)
        A[row_ix, col_ix] = 1
        S = A * sim # hadamard product to just keep k similarities
        S = csr_matrix(S)

        # One reciprocal denominator per item, kept as a (1, n_items) row vector so
        # that the sparse `multiply` broadcasts it over every row of a batch
        # instead of materialising a dense (batch x items) matrix per batch.
        denominators = 1 / np.sum(np.absolute(sim_product), axis = 1)
        scale = np.asarray(denominators).reshape(1, -1)
        if scale.shape[1] != S.shape[1]:
            raise ValueError('inconsistent shapes')

        # perform batchwise predictions
        batches = []
        for start in range(0, R.shape[0], batchsize):
            stop = min(start + batchsize, R.shape[0])
            # numerators (dot product), then division by the denominators
            batches.append(R[start:stop].dot(S).multiply(scale))

        # stack all batches once; repeated vstack inside the loop is quadratic
        if len(batches) == 1:
            predictions = batches[0]
        elif batches:
            predictions = vstack(batches)
        else:
            # no users at all: return an empty sparse matrix rather than None
            predictions = csr_matrix((0, S.shape[1]))

    else:  # method == 'binary'
        # This method works only for unary scores.
        #
        # Formula: p_{u,i} = sum_{j∈I_u}(s(i,j)) | I_u is the user's purchase history
        #
        # Equation 2.10 shown in:
        # Collaborative Filtering Recommender Systems 2010
        # By Michael D. Ekstrand, John T. Riedl and Joseph A. Konstan

        # NOTE: unpickling can execute arbitrary code -- only load trusted files.
        with open('../data/similarity/freq_binary_item_similarity.pkl', 'rb') as sim_file:
            S = pickle.load(sim_file)
        # dot product works because summation of similarities which are in I_u is given if rating is unary
        # and non bought-items are weighted as zero
        interactions = coo_matrix(R).tocsr()
        predictions = np.float32(interactions.dot(S)) #np.float32 doubles execution time, but reduces memory requirements by half

    return predictions

In [4]:
# Load the rating-based user/item interaction matrix. The context manager closes
# the file handle again (the bare `open(...)` used before leaked it).
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open('../data/interaction/freq_rating_item_interaction.pkl', 'rb') as interaction_file:
    R = pickle.load(interaction_file)
# Rating-based prediction on the first 10000 rows, currently disabled:
#predict(R[:10000], 'rating')

In [5]:
# Load the binary user/item interaction matrix and predict scores for all users.
# The context manager closes the file handle again (the bare `open(...)` leaked it).
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open('../data/interaction/freq_binary_item_interaction.pkl', 'rb') as interaction_file:
    R = pickle.load(interaction_file)
p = predict(R, 'binary')
p

array([[0.26325935, 0.12898889, 0.1897966 , ..., 0.22534288, 1.7317654 ,
        0.08050919],
       [1.7627888 , 1.1670631 , 1.5974131 , ..., 1.8012952 , 0.4000131 ,
        0.72464716],
       [0.69137955, 0.46592778, 0.5581075 , ..., 0.6489711 , 0.31144032,
        0.2646716 ],
       ...,
       [1.9820381 , 1.004596  , 1.0656419 , ..., 1.5270361 , 0.28962445,
        0.4797298 ],
       [4.03237   , 1.7629918 , 1.8797815 , ..., 3.0527122 , 0.5316414 ,
        0.8208857 ],
       [0.6821633 , 0.44322035, 1.0733685 , ..., 0.8086056 , 0.30947533,
        0.7012919 ]], dtype=float32)