In [2]:
# Reference -- https://ethen8181.github.io/machine-learning/recsys/4_bpr.html
# Make the parent directory of the notebook the working directory and put it
# on sys.path so that the project's `src` package can be imported.
import sys
import os

# Resolve the parent directory only once per kernel session: os.chdir below
# changes what os.getcwd() returns, so recomputing it on a re-run of this cell
# would climb one directory higher every time.
if '_PROJECT_ROOT' not in globals():
    _PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Normalised absolute path (no trailing '/../' component)
cwd = _PROJECT_ROOT

# Avoid stacking duplicate sys.path entries when the cell is re-run
if cwd not in sys.path:
    sys.path.append(cwd)
os.chdir(cwd)

In [4]:
# Enable the IPython autoreload extension; mode 2 reloads imported modules
# before each cell executes, so edits to the project's source files are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [5]:
import sys
import numpy as np
import pandas as pd
from subprocess import call
from src.examples_src.BPR import BPR, auc_score, create_train_test, create_matrix

In [6]:
# Download the MovieLens 100K archive and extract it under datasets/
data_url = 'https://files.grouplens.org/datasets/movielens/ml-100k.zip'
data_dir = 'datasets'
file_path = os.path.join(data_dir, 'ml-100k.zip')

# curl cannot write its output file when the target directory is missing
os.makedirs(data_dir, exist_ok=True)

# Skip the download when the archive is already present so that a full
# re-run of the notebook does not fetch the file again.
if not os.path.exists(file_path):
    # Download to a temporary name first so that a failed transfer never
    # leaves a truncated archive that would later be mistaken for a cached one.
    partial_path = file_path + '.part'
    status = call(['curl', '--fail', '-L', '-o', partial_path, data_url])
    if status != 0:
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise RuntimeError(f'curl exited with status {status} while downloading {data_url}')
    os.replace(partial_path, file_path)

# -o overwrites previously extracted files without prompting
status = call(['unzip', '-o', file_path, '-d', data_dir])
if status != 0:
    raise RuntimeError(f'unzip exited with status {status} while extracting {file_path}')

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0 4808k    0 15649    0     0  26256      0  0:03:07 --:--:--  0:03:07 26212

Archive:  datasets/ml-100k.zip
   creating: datasets/ml-100k/
  inflating: datasets/ml-100k/allbut.pl  
  inflating: datasets/ml-100k/mku.sh  
  inflating: datasets/ml-100k/README  
  inflating: datasets/ml-100k/u.data  
  inflating: datasets/ml-100k/u.genre  
  inflating: datasets/ml-100k/u.info  
  inflating: datasets/ml-100k/u.item  
  inflating: datasets/ml-100k/u.occupation  
  inflating: datasets/ml-100k/u.user  
  inflating: datasets/ml-100k/u1.base  
  inflating: datasets/ml-100k/u1.test  
  inflating: datasets/ml-100k/u2.base  
  inflating: datasets/ml-100k/u2.test  
  inflating: datasets/ml-100k/u3.base  
  inflating: datasets/ml-100k/u3.test  
  inflating: datasets/ml-100k/u4.base  
  inflating: datasets/ml-100k/u4.test  
  inflating: datasets/ml-100k/u5.base  
  inflating: datasets/ml-100k/u5.test  
  inflating: datasets/ml-100k/ua.base  
  inflating: datasets/ml-100k/ua.test  
  inflating: datasets/ml-100k/ub.base  
  inflating: datasets/ml-100k/ub.test  


100 4808k  100 4808k    0     0  3454k      0  0:00:01  0:00:01 --:--:-- 3454k


0

In [7]:
# Load the tab-separated ratings file into a DataFrame
read_file_path = os.path.join('datasets', 'ml-100k', 'u.data')

# Column names are supplied explicitly through `names`;
# the timestamp column is read but will not be used
names = 'user_id item_id rating timestamp'.split()
df = pd.read_csv(read_file_path, sep='\t', header=None, names=names)

print('data dimension: \n', df.shape)
df.head(5)

data dimension: 
 (100000, 4)


Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [8]:
# Count the distinct user ids present in the ratings data
# (dropna=False keeps the count identical to len(unique()), which includes NaN)
df['user_id'].nunique(dropna=False)

943

Because BPR assumes binary implicit feedback (meaning there are only positive and negative items), here we'll assume that an item is positive only if the user gave it a rating of 3 or higher, i.e. at or above the `threshold` set below (feel free to experiment and change the threshold). The next few code chunks create the sparse interaction matrix and split it into train and test sets.

In [9]:
# Column names and the minimum rating handed to create_matrix
items_col = 'item_id'
users_col = 'user_id'
ratings_col = 'rating'
threshold = 3

# Bind the returned frame to a new name instead of overwriting `df`: feeding
# the processed frame back into create_matrix on a re-run of this cell would
# apply the threshold to already-processed ratings, and the raw ratings frame
# would be lost.
# NOTE(review): assumes create_matrix does not modify its input frame in
# place -- confirm against src/examples_src/BPR.py.
X, df_implicit = create_matrix(df, users_col, items_col, ratings_col, threshold)
X

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[ratings_col] = 1


<943x1574 sparse matrix of type '<class 'numpy.int64'>'
	with 82520 stored elements in Compressed Sparse Row format>

In [10]:
# Split the interaction matrix into train and test matrices; the fixed seed
# is passed so the split can be reproduced.
# NOTE(review): test_size = 0.2 presumably means 20% of the interactions are
# held out -- confirm against create_train_test.
X_train, X_test = create_train_test(X, test_size = 0.2, seed = 1234)

In [11]:
# Display the training matrix summary (shape, dtype, number of stored elements)
X_train

<943x1574 sparse matrix of type '<class 'numpy.int64'>'
	with 65641 stored elements in Compressed Sparse Row format>

In [12]:
# Display the test matrix summary (shape, dtype, number of stored elements)
X_test

<943x1574 sparse matrix of type '<class 'numpy.float64'>'
	with 16879 stored elements in Compressed Sparse Row format>

In [13]:
# Hyperparameters for the BPR model; the values were picked arbitrarily
bpr_params = dict(
    reg=0.01,
    learning_rate=0.1,
    n_iters=160,
    n_factors=15,
    batch_size=100,
)

# Build the model from the parameter dict and train it on the training matrix
bpr = BPR(**bpr_params)
bpr.fit(X_train)

BPR: 100%|████████████████████████████████████| 160/160 [00:03<00:00, 41.34it/s]


<src.examples_src.BPR.BPR at 0x7fc2e23919a0>

In [14]:
# Print the AUC of the fitted model, first on the training matrix and then
# on the test matrix
for interactions in (X_train, X_test):
    print(auc_score(bpr, interactions))

0.8964786017422693
0.8269677516209023


### Item Recommendations

In [15]:
# Given the trained model, get_similar_items returns the most similar items
# for each item; the N argument sets how many similar items are returned.
# This corresponds to "people who like/buy this also like/buy this"
# functionality, since it recommends similar items for a given item.
bpr.get_similar_items(N = 5)

array([[ 110,  180,   99,  469,   49],
       [  66,  362,  208,   98,  433],
       [ 400,  390,  582,  463,    1],
       ...,
       [ 447, 1253,  350, 1016,  863],
       [ 243,  114,  279, 1141,  536],
       [ 809,  750,  697,  660, 1062]], dtype=uint32)

In [16]:
# On the other hand, we can also generate the top-N recommended items for
# each user by passing the sparse rating data and N to the recommend method.
bpr.recommend(X_train, N = 5)

array([[473,   6, 274, 193, 275],
       [281, 469, 403, 317, 123],
       [301, 309, 268, 305, 675],
       ...,
       [403,  99,  12, 465, 274],
       [ 63, 890, 287,  68,   6],
       [233, 401, 153, 202, 264]], dtype=uint32)