Implementing Recommender System

In [2]:
from google.colab import drive
drive.mount('drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at drive


In [0]:
import pandas as pd
tb_final = pd.read_csv('/content/drive/My Drive/tb_final.csv')

In [4]:
tb_final.head()

Unnamed: 0,user_id,item_id,category_id,behavior_type,timestamp
0,1,3219016,3002561,pv,02/12/2017
1,1,4954999,411153,pv,03/12/2017
2,1,818610,411153,pv,03/12/2017
3,1,568695,1320293,pv,03/12/2017
4,1,2278603,3002561,pv,02/12/2017


In [0]:
tb_final.behavior_type.unique()

array(['pv', 'fav', 'cart', 'buy'], dtype=object)

In [0]:
#Assign weights to each behavior type, according to their importance. It is reasonable to assume that the order of importance would be: buy>cart>fav>pv.
#Merge the preference (p) for an item with the confidence (c) we have for that preference
#Instead of representing an explicit rating, the behavior_strength can represent the confidence  in terms of how strong the user interaction was.

behavior_type_strength = {
   'pv': 1.0,
   'fav': 2.0, 
   'cart': 3.0, 
   'buy': 4.0 
}

tb_final['behavior_strength'] = tb_final['behavior_type'].apply(lambda x: behavior_type_strength[x])

In [0]:
#Group behavior_strength together with user_id and item_id and category_id
grouped_tb = tb_final.groupby(['user_id', 'item_id', 'category_id']).sum().reset_index()


In [7]:
grouped_tb.head()

Unnamed: 0,user_id,item_id,category_id,behavior_strength
0,1,271696,411153,1.0
1,1,568695,1320293,1.0
2,1,818610,411153,1.0
3,1,929177,4801426,1.0
4,1,1323189,3524510,1.0


In [0]:
# merge the preference (p) for an item with the confidence (c) we have for that preference
# convert user id and item id into numeric

grouped_tb['user'] = grouped_tb['user_id'].astype("category").cat.codes
grouped_tb['item'] = grouped_tb['item_id'].astype("category").cat.codes
#grouped_tb['item_category'] = grouped_tb['category_id'].astype("category").cat.codes

In [9]:
grouped_tb.head()

Unnamed: 0,user_id,item_id,category_id,behavior_strength,user,item
0,1,271696,411153,1.0,0,5812
1,1,568695,1320293,1.0,0,12218
2,1,818610,411153,1.0,0,17461
3,1,929177,4801426,1.0,0,19788
4,1,1323189,3524510,1.0,0,28166


In [10]:
!pip install implicit

Collecting implicit
[?25l  Downloading https://files.pythonhosted.org/packages/5a/d8/6b4f1374ffa2647b72ac76960c71b984c6f3238090359fb419d03827d87a/implicit-0.4.2.tar.gz (1.1MB)
[K     |████████████████████████████████| 1.1MB 2.8MB/s 
Building wheels for collected packages: implicit
  Building wheel for implicit (setup.py) ... [?25l[?25hdone
  Created wheel for implicit: filename=implicit-0.4.2-cp36-cp36m-linux_x86_64.whl size=3471606 sha256=8ece062dea3c1a613dbc29697f036ecd418d23f11cdcb0715759da0c98391b3d
  Stored in directory: /root/.cache/pip/wheels/1b/48/b1/1aebe3acc3afb5589e72d3e7c3ffc3f637dc4721c1a974dff7
Successfully built implicit
Installing collected packages: implicit
Successfully installed implicit-0.4.2


In [0]:
import scipy.sparse as sparse
import numpy as np
import implicit

In [12]:
#Create two matrices:
#Matrix 1: model fitting (item-user) 
#Matrix 2: recommendation (user-item)


sparse_item_user = sparse.csr_matrix((grouped_tb['behavior_strength'].astype(float), (grouped_tb['item'], grouped_tb['user'])))
sparse_user_item = sparse.csr_matrix((grouped_tb['behavior_strength'].astype(float), (grouped_tb['user'], grouped_tb['item'])))

print(sparse_user_item.shape[0])

4826


In [13]:
#Alternating Least Squares Method
model = implicit.als.AlternatingLeastSquares(factors=32, regularization=0.1, iterations=10, use_gpu=True)



In [14]:
#Calculate the confidence for each value in our data
#Alpha value between 15 and 40 is advisable. Ref: https://medium.com/radon-dev/als-implicit-collaborative-filtering-5ed653ba39fe
alpha = 20
data = (sparse_item_user * alpha).astype('double')
model.fit(data)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




In [0]:
from sklearn.preprocessing import MinMaxScaler

In [0]:
#Function for recommender 

def recommend(user, sparse_user_item, user_vecs, item_vecs, num_items=10):

    #Interactions scores from the sparse user-item matrix
    user_interactions = sparse_user_item[user,:].toarray()
   

    # We don't want to recommend items the user has interacted with. 
    # Set user interactions for these to be 0 and unknowns to 1.
    
    user_interactions = user_interactions.reshape(-1) + 1

    user_interactions[user_interactions > 1] = 0
    

    #Dot product of user vector and all item vectors
    rec_vector = user_vecs[user,:].dot(item_vecs.T).toarray()
    

    #Scale the recommendation vector between 0 and 1
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1,1))[:,0]
    

    # Multiply the recommendation of those item already interacted by zero 
    recommend_vector = user_interactions * rec_vector_scaled


    # Sort the indices of the items by order of recommendation
    item_index = np.argsort(recommend_vector)[::-1][:num_items]
    
    # Store item IDs, category IDs and scores
    items = []
    #categories =[]
    scores = []

    for index in item_index:
        items.append(grouped_tb.item.loc[grouped_tb.item == index].iloc[0])
        scores.append(recommend_vector[index])
        #categories.append(grouped_tb.item_category.loc[grouped_tb.item == index].iloc[0])

    recommendations = pd.DataFrame({'Item ID': items, 'Score': scores})

    return recommendations
    


In [0]:
# Get the trained user and item vectors. Convert to csr_matrix.
user_vecs = sparse.csr_matrix(model.user_factors)
item_vecs = sparse.csr_matrix(model.item_factors)

In [18]:
# Example: Create recommendations for user with id 0
user = 0

recommendations = recommend(user, sparse_user_item, user_vecs, item_vecs)

print(recommendations)

   Item ID     Score
0   109880  1.000000
1    32707  0.982904
2    29837  0.980566
3    82477  0.957674
4    49843  0.953020
5    65038  0.938433
6    35936  0.930571
7   102264  0.930136
8    99213  0.898736
9    82629  0.894418


In [20]:
#There are 4826 users
grouped_tb.user.unique()

array([   0,    1,    2, ..., 4823, 4824, 4825])

In [24]:
len(grouped_tb.user.unique())

4826

In [26]:
#Recommender for all users
i = 0

for i in range(len(grouped_tb.user.unique())):
  print("User:", i, "\n")
  print(recommend(i, sparse_user_item, user_vecs, item_vecs))
  print("\n\n")
  i+=1

User: 0 

   Item ID     Score
0   109880  1.000000
1    32707  0.982904
2    29837  0.980566
3    82477  0.957674
4    49843  0.953020
5    65038  0.938433
6    35936  0.930571
7   102264  0.930136
8    99213  0.898736
9    82629  0.894418



User: 1 

   Item ID     Score
0    18466  0.828280
1    30758  0.786492
2    81327  0.744933
3    19042  0.741534
4    54744  0.737843
5    37634  0.736062
6    31892  0.714401
7    39690  0.711446
8   105865  0.686611
9   106995  0.684264



User: 2 

   Item ID     Score
0   108478  1.000000
1    54327  0.913226
2    52474  0.874192
3    74417  0.872666
4    87327  0.833507
5    34154  0.830439
6    45626  0.823039
7    87743  0.805908
8   107191  0.783353
9    50709  0.779425



User: 3 

   Item ID     Score
0    71688  0.802012
1    32707  0.776735
2    83174  0.768910
3    33745  0.763697
4    49697  0.747479
5     8255  0.740618
6    67519  0.735109
7     5015  0.731254
8   102613  0.718915
9    30564  0.711255



User: 4 

   Item ID    

Evaluate Recommender System

In [0]:
#Ref: https://jessesw.com/Rec-System/
import random

def make_train(sparse_mtx, pct_test = 0.2):

#sparse_mtx
#he original matrix from which you want to generate a train/test set. Test is just a complete copy of the original set. This is in the form of a sparse csr_matrix. 

#pct_test:
#The percentage of user-item interactions where an interaction took place that you want to mask in the training set for later comparison to the test set, which contains all of the original ratings. 

    test_set = sparse_mtx.copy() # Make a copy of the original set to be the test set. 
    test_set[test_set != 0] = 1 # Store the test set as a binary preference matrix
    
    training_set = sparse_mtx.copy() # Make a copy of the original data we can alter as our training set. 
    
    nonzero_indices = training_set.nonzero() # Find the indices in the data where an interaction exists
    nonzero_pairs = list(zip(nonzero_indices[0], nonzero_indices[1])) # Zip item, user into list

    
    random.seed(42) # Set the random seed to 42 for reproducibility
    
    num_samples = int(np.ceil(pct_test*len(nonzero_pairs))) # Round the number of samples needed to the nearest integer
    samples = random.sample(nonzero_pairs, num_samples) # Sample a random number of item-user pairs without replacement

    item_indices = [index[0] for index in samples] # Get the item row indices

    user_indices = [index[1] for index in samples] # Get the user column indices

    
    training_set[item_indices, user_indices] = 0 # Assign all of the randomly chosen user-item pairs to zero
    training_set.eliminate_zeros() # Get rid of zeros in sparse array storage after update to save space
    
    return training_set, test_set, list(set(user_indices))

In [0]:
item_train, item_test, item_users_altered = make_train(sparse_item_user, pct_test = 0.2)

In [0]:
from sklearn import metrics

In [0]:
def auc_score(predictions, test):
    fpr, tpr, thresholds = metrics.roc_curve(test, predictions)
    return metrics.auc(fpr, tpr)

In [0]:
def calc_mean_auc(training_set, altered_users, predictions, test_set):
    store_auc = [] # An empty list to store the AUC for each user that had an item removed from the training set
    popularity_auc = [] # To store popular AUC scores
    pop_items = np.array(test_set.sum(axis = 1)).reshape(-1) # Get sum of item iteractions to find most popular
    item_vecs = predictions[1]

    count = 0

    for user in altered_users: # Iterate through each user that had an item altered
        training_column = training_set[:,user].toarray().reshape(-1) # Get the training set column
        zero_inds = np.where(training_column == 0) # Find where the interaction had not yet occurred
        
        # Get the predicted values based on our user/item vectors
        user_vec = predictions[0][user,:]
        pred = user_vec.dot(item_vecs).toarray()[0,zero_inds].reshape(-1)
        
        # Get only the items that were originally zero
        # Select all ratings from the MF prediction for this user that originally had no iteraction
        actual = test_set[:,user].toarray()[zero_inds,0].reshape(-1)
        
        # Select the binarized yes/no interaction pairs from the original full data
        # that align with the same pairs in training 
        pop = pop_items[zero_inds] # Get the item popularity for our chosen items
        
        store_auc.append(auc_score(pred, actual)) # Calculate AUC for the given user and store
        
        popularity_auc.append(auc_score(pop, actual)) # Calculate AUC using most popular and score
        
        
        if (count%10==0):
          print("User:", count, store_auc[count])
        count+=1
    # End users iteration

    return float('%.3f'%np.mean(store_auc)), float('%.3f'%np.mean(popularity_auc))

In [34]:
calc_mean_auc(item_train, item_users_altered,
              [user_vecs, item_vecs.T], item_test)

User: 0 0.7030446745347911
User: 10 0.9887622754220029
User: 20 0.6850274687977989
User: 30 0.9903511947863867
User: 40 0.9981918345785223
User: 50 0.9932583286568551
User: 60 0.9737629736905623
User: 70 0.9803257714289163
User: 80 0.9617786546955326
User: 90 0.9933410152850146
User: 100 0.9632169430945613
User: 110 0.9973934528604139
User: 120 0.880331185170435
User: 130 0.4596003475238922
User: 140 0.8245180210351007
User: 150 0.9984188363142404
User: 160 0.9991070862412342
User: 170 0.9453676550450745
User: 180 0.9610311045134583
User: 190 0.8462327537569883
User: 200 0.9985989030895197
User: 210 0.9437143038169459
User: 220 0.9772169772893906
User: 230 0.9058941728064299
User: 240 0.990560317129902
User: 250 0.9993514004916974
User: 260 0.9116019224155097
User: 270 0.9999456989004027
User: 280 0.9817879245988999
User: 290 0.9996003534450439
User: 300 0.852830171602346
User: 310 0.9833087403249897
User: 320 0.9985825807237564
User: 330 0.7749248039774698
User: 340 0.683585088378239


(0.922, 0.601)

As a benchmark, I calculated the mean AUC for most popular items, to compare it with the recommender for top 10 items. My recommender system performed well (mean AUC score of 0.922) vis-a-vis the benchmark recommender (mean AUC score of 0.601).