<a href="https://colab.research.google.com/github/samohtwal/BPR/blob/20221026/BPR_20221026.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
import torch
import copy
from itertools import islice

from google.colab import files

from scipy.sparse import csr_matrix, dok_matrix

In [30]:
# load data
raw_data_url = 'https://raw.githubusercontent.com/samohtwal/BPR/main/data/vali-20221026.csv'
user_column_name = 'user_id'
item_column_name = 'item_id'
column_names = [user_column_name, item_column_name]
raw_data = pd.read_csv(raw_data_url, names = column_names, header=0)

print('raw_data dimension:', raw_data.shape)
print('raw_data head: \n', raw_data.head())
print('raw_data.iloc[0][user_column_name]:', raw_data.iloc[0][user_column_name])
print('raw_data.iloc[0][item_column_name]:', raw_data.iloc[0][item_column_name])
print('raw_data.iloc[25054][user_column_name]:', raw_data.iloc[25054][user_column_name])
print('raw_data.iloc[25054][item_column_name]:', raw_data.iloc[25054][item_column_name])

structured_data = raw_data.groupby(user_column_name)[item_column_name].apply(list).reset_index(name=item_column_name).sort_values(by=[user_column_name])
print(structured_data)
print(structured_data.iloc[0][user_column_name])
print(structured_data.iloc[0][item_column_name])

user_ids = sorted(raw_data[user_column_name].unique())
item_ids = sorted(raw_data[item_column_name].unique())

print(len(user_ids))
print(len(item_ids))

raw_data dimension: (25055, 2)
raw_data head: 
    user_id  item_id
0        6        0
1        7        0
2        9        0
3       15        1
4       28        2
raw_data.iloc[0][user_column_name]: 6
raw_data.iloc[0][item_column_name]: 0
raw_data.iloc[25054][user_column_name]: 2148
raw_data.iloc[25054][item_column_name]: 3057
      user_id                                            item_id
0           0                             [128, 319, 2352, 2688]
1           1  [109, 985, 1201, 1294, 1324, 1345, 1427, 1664,...
2           2                                             [2633]
3           3  [142, 150, 237, 252, 320, 333, 461, 477, 479, ...
4           4                                             [1779]
...       ...                                                ...
3859     3919                                       [1883, 2002]
3860     3920                                 [1805, 1845, 1907]
3861     3921                                       [1884, 2135]
3862     3922   

In [24]:
def create_user_item_rating_matrix(raw_data, user_column_name, item_column_name):

    user_ids = sorted(raw_data[user_column_name].unique())
    n_user = len(user_ids)

    item_ids = sorted(raw_data[item_column_name].unique())
    n_item = len(item_ids)

    user_item_rating_matrix = np.zeros((n_user, n_item), dtype = np.int32)

    n_raw_data = raw_data.shape[0]-1
    for x in range(n_raw_data):
        user_item_rating_matrix[user_ids.index(raw_data.iloc[x][user_column_name]), item_ids.index(raw_data.iloc[x][item_column_name])] = 1

    return user_item_rating_matrix, user_ids, item_ids

In [25]:
user_item_rating_matrix, user_ids, item_ids = create_user_item_rating_matrix(raw_data, user_column_name, item_column_name)
print('user_item_rating_matrix:\n', user_item_rating_matrix)

'''
df = pd.DataFrame(user_item_rating_matrix)
df.to_csv('output.csv', encoding = 'utf-8-sig') 
files.download('output.csv')
'''

user_item_rating_matrix:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


"\ndf = pd.DataFrame(user_item_rating_matrix)\ndf.to_csv('output.csv', encoding = 'utf-8-sig') \nfiles.download('output.csv')\n"

In [26]:
class BPR:

    def __init__(
        self, 
        user_ids,
        item_ids,
        learning_rate = 0.01,
        regularization_constant = 0.01,  
        n_factor = 15, 
        n_iteration = 10, 
        n_batch = 100, 
        random_seed = 1234):

        self.user_ids = user_ids
        self.item_ids = item_ids
        self.learning_rate = learning_rate
        self.regularization_constant = regularization_constant
        self.n_factor = n_factor
        self.n_iteration = n_iteration
        self.n_batch = n_batch
        self.random_seed = random_seed

        self.user_latent_factor_matrix = []
        self.item_latent_factor_matrix = []
    
    def train(self, user_item_rating_matrix):

        n_user = user_item_rating_matrix.shape[0]
        n_item = user_item_rating_matrix.shape[1]

        rstate = np.random.RandomState(self.random_seed)
        self.user_latent_factor_matrix = rstate.normal(size = (n_user, self.n_factor))
        self.item_latent_factor_matrix = rstate.normal(size = (n_item, self.n_factor))

        
        for _ in range(self.n_iteration):
            sample_tuples = self._create_sample_tuples(user_item_rating_matrix, self.user_latent_factor_matrix, self.item_latent_factor_matrix, self.n_batch)
            user_latent_matrix_tuples, item_latent_matrix_positive_item_tuples, item_latent_matrix_negative_item_tuples = sample_tuples
            self._update_sample_tuples(user_latent_matrix_tuples, item_latent_matrix_positive_item_tuples, item_latent_matrix_negative_item_tuples, self.learning_rate, self.regularization_constant)

        return self

    def _create_sample_tuples(self, user_item_rating_matrix, user_latent_factor_matrix, item_latent_factor_matrix, n_batch):

        n_user = user_item_rating_matrix.shape[0]
        n_item = user_item_rating_matrix.shape[1]

        sample_user_indexes = np.random.choice(n_user, size = n_batch, replace = False)
        sample_positive_item_indexes = np.zeros(n_batch, dtype = np.int32)
        sample_negative_item_indexes = np.zeros(n_batch, dtype = np.int32)

        for index, sampled_user_index in enumerate(sample_user_indexes):

            possible_positive_item_indexes = []
            possible_negative_item_indexes = []

            for x in range(n_item):
                if (user_item_rating_matrix[sampled_user_index,x]) == 1:
                    possible_positive_item_indexes.append(x)
                else:
                    possible_negative_item_indexes.append(x)

            sample_positive_item_indexes[index] = np.random.choice(possible_positive_item_indexes)
            sample_negative_item_indexes[index] = np.random.choice(possible_negative_item_indexes)

        user_latent_matrix_tuples = user_latent_factor_matrix[sample_user_indexes]
        item_latent_matrix_positive_item_tuples = item_latent_factor_matrix[sample_positive_item_indexes]
        item_latent_matrix_negative_item_tuples = item_latent_factor_matrix[sample_negative_item_indexes]

        return user_latent_matrix_tuples, item_latent_matrix_positive_item_tuples, item_latent_matrix_negative_item_tuples
    
    def _update_sample_tuples(self, user_latent_matrix_tuples, item_latent_matrix_positive_item_tuples, item_latent_matrix_negative_item_tuples, learning_rate, regularization_constant):

        r_uij = np.sum(user_latent_matrix_tuples * (item_latent_matrix_positive_item_tuples - item_latent_matrix_negative_item_tuples), axis = 1)
        sigmoid = np.exp(-r_uij) / (1.0 + np.exp(-r_uij))
        sigmoid_tiled = np.tile(sigmoid, (self.n_factor, 1)).T

        gradient_user = sigmoid_tiled * (item_latent_matrix_negative_item_tuples - item_latent_matrix_positive_item_tuples) + regularization_constant * user_latent_matrix_tuples
        gradient_positive_item = sigmoid_tiled * -user_latent_matrix_tuples + regularization_constant * item_latent_matrix_positive_item_tuples
        gradient_negative_item = sigmoid_tiled * user_latent_matrix_tuples + regularization_constant * item_latent_matrix_negative_item_tuples

        user_latent_matrix_tuples -= learning_rate * gradient_user
        item_latent_matrix_positive_item_tuples -= learning_rate * gradient_positive_item
        item_latent_matrix_negative_item_tuples -= learning_rate * gradient_negative_item

        return user_latent_matrix_tuples, item_latent_matrix_positive_item_tuples, item_latent_matrix_negative_item_tuples
    
    def predict(self):
        return self.user_latent_factor_matrix.dot(self.item_latent_factor_matrix.T)
    
    def _predict_user(self, user_index):
        return self.user_latent_factor_matrix[user_index].dot(self.item_latent_factor_matrix.T)
    
    def _recommend_user(self, user_item_rating_matrix, user_index, n_best_recommendation, item_ids):

        scores = self._predict_user(user_index)

        user_latent_factor_dictionary = {item_ids[i]: scores[i] for i in range(len(item_ids))}

        print('len(user_latent_factor_dictionary):\n', len(user_latent_factor_dictionary))

        n_item = user_item_rating_matrix.shape[1]
        positive_item_ids = []
        for x in range(n_item):
            if (user_item_rating_matrix[user_index,x]) == 1:
                positive_item_ids.append(item_ids[x])

        user_latent_factor_dictionary = {key: user_latent_factor_dictionary[key] for key in item_ids if key not in positive_item_ids}
        user_latent_factor_dictionary = dict(sorted(user_latent_factor_dictionary.items(), key=lambda item: item[1], reverse=True))
        best_recommendition_items = dict(islice(user_latent_factor_dictionary.items(), n_best_recommendation))

        print('best_recommendition_items:\n', best_recommendition_items)

        return list(best_recommendition_items.keys())[0:n_best_recommendation]

In [27]:
bpr_params = {
    'user_ids': user_ids,
    'item_ids': item_ids,
    'learning_rate': 0.1,
    'regularization_constant': 0.01,
    'n_iteration': 100,
    'n_factor': 15,
    'n_batch': 10
}
bpr = BPR(**bpr_params)
bpr.train(user_item_rating_matrix)

<__main__.BPR at 0x7f0ee6433f90>

In [11]:
# print(bpr.predict())
print(bpr._recommend_user(user_item_rating_matrix, 0, 3, bpr.item_ids))

len(user_latent_factor_dictionary):
 2997
best_recommendition_items:
 {3001: 17.772410672646984, 2617: 15.093194414299836, 2260: 14.841441566852485}
[3001, 2617, 2260]
