<a href="https://colab.research.google.com/github/samohtwal/BPR/blob/20221027/BPR_20221028.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [82]:
import numpy as np
import pandas as pd
import torch
import copy
from itertools import islice
from google.colab import files

In [103]:
global raw_data_user_column_name
global raw_data_item_column_name

global raw_data_group_by_user_id_dictionary_key
global raw_data_group_by_item_id_dictionary_key

global user_latent_factor_matrix_dictionary_key
global item_latent_factor_matrix_dictionary_key

global user_latent_factor_matrix_file_name
global item_latent_factor_matrix_file_name

raw_data_user_column_name = 'user_id'
raw_data_item_column_name = 'item_id'

raw_data_group_by_user_id_dictionary_key = 'raw_data_group_by_user_id'
raw_data_group_by_item_id_dictionary_key = 'raw_data_group_by_item_id'

user_latent_factor_matrix_dictionary_key = 'user_latent_factor_matrix'
item_latent_factor_matrix_dictionary_key = 'item_latent_factor_matrix'

user_latent_factor_matrix_file_name = 'user_latent_factor_matrix.csv'
item_latent_factor_matrix_file_name = 'item_latent_factor_matrix.csv'

In [104]:
class DataHelper:

    def load_raw_data_dictionary(self, raw_data_url):

        column_names = [raw_data_user_column_name, raw_data_item_column_name]
        raw_data = pd.read_csv(raw_data_url, names = column_names, header=0)

        raw_data_group_by_user_id = raw_data.groupby(raw_data_user_column_name)[raw_data_item_column_name].apply(list).reset_index(name=raw_data_item_column_name).sort_values(by=[raw_data_user_column_name])
        raw_data_group_by_item_id = raw_data.groupby(raw_data_item_column_name)[raw_data_user_column_name].apply(list).reset_index(name=raw_data_user_column_name).sort_values(by=[raw_data_item_column_name])

        raw_data_dictionary = {
            raw_data_group_by_user_id_dictionary_key: raw_data_group_by_user_id, 
            raw_data_group_by_item_id_dictionary_key: raw_data_group_by_item_id
        }

        return raw_data_dictionary

    def save_latent_factor_matrix_dictionary(self, latent_factor_matrix_dictionary):

        user_latent_factor_matrix = latent_factor_matrix_dictionary[user_latent_factor_matrix_dictionary_key]
        item_latent_factor_matrix = latent_factor_matrix_dictionary[item_latent_factor_matrix_dictionary_key]
        
        pd.DataFrame(user_latent_factor_matrix).to_csv(user_latent_factor_matrix_file_name)
        pd.DataFrame(item_latent_factor_matrix).to_csv(item_latent_factor_matrix_file_name)

        return None

    def load_latent_factor_matrix_dictionary(self):

        user_latent_factor_matrix = pd.read_csv(user_latent_factor_matrix_file_name).values
        item_latent_factor_matrix = pd.read_csv(item_latent_factor_matrix_file_name).values

        latent_factor_matrix_dictionary = {
            user_latent_factor_matrix_dictionary_key: user_latent_factor_matrix, 
            item_latent_factor_matrix_dictionary_key: item_latent_factor_matrix
        }

        return latent_factor_matrix_dictionary

In [105]:
class BayseianPersonalizedRankingModel:

    def __init__(self, random_seed = 1234, n_latent_factor = 15, n_batch = 10, learning_rate = 0.01, regularization_constant = 0.01):
        self.random_seed = random_seed
        self.n_latent_factor = n_latent_factor
        self.n_batch = n_batch
        self.learning_rate = learning_rate
        self.regularization_constant = regularization_constant

    def fit(self, raw_data_dictionary):

        raw_data_group_by_user_id = raw_data_dictionary[raw_data_group_by_user_id_dictionary_key]
        raw_data_group_by_item_id = raw_data_dictionary[raw_data_group_by_item_id_dictionary_key]

        n_user = len(raw_data_group_by_user_id[raw_data_user_column_name])
        n_item = len(raw_data_group_by_item_id[raw_data_item_column_name])

        rstate = np.random.RandomState(self.random_seed)

        user_latent_factor_matrix = rstate.normal(size = (n_user, self.n_latent_factor))
        item_latent_factor_matrix = rstate.normal(size = (n_item, self.n_latent_factor))

        user_item_rating_matrix = self._create_user_item_rating_matrix(raw_data_group_by_user_id, raw_data_group_by_item_id)

        user_latent_factor_matrixes = []
        item_latent_factor_matrixes = []
        sum_of_square_errors = []

        is_converge = True
        counter = 1
        while is_converge:
            
            sample_latent_factor_matrix_row_indexes = self._create_sample_latent_factor_matrix_row_indexes(raw_data_group_by_user_id, raw_data_group_by_item_id)
            sample_user_id_indexes, sample_positive_item_id_indexes, sample_negative_item_id_indexes = sample_latent_factor_matrix_row_indexes
            user_latent_factor_matrix, item_latent_factor_matrix = self._update_sample_latent_factor_matrix_rows(user_latent_factor_matrix, item_latent_factor_matrix, sample_user_id_indexes, sample_positive_item_id_indexes, sample_negative_item_id_indexes)
            sum_of_square_error = np.sum((user_item_rating_matrix - self._predict(user_latent_factor_matrix, item_latent_factor_matrix))**2)
            
            user_latent_factor_matrixes.append(user_latent_factor_matrix)
            item_latent_factor_matrixes.append(item_latent_factor_matrix)
            sum_of_square_errors.append(sum_of_square_error)
            
            print('debug - iteration:', counter, end='\n')
            print('debug - sum_of_square_error:', sum_of_square_error, end='\n')
            print('', end='\n')
            
            n_sum_of_square_error = len(sum_of_square_errors)
            # if (sum_of_square_errors[n_sum_of_square_error-1] > sum_of_square_errors[n_sum_of_square_error-2]):
            if counter == 2:
                is_converge = False

            counter = counter + 1

        latent_factor_matrix_dictionary = {
            user_latent_factor_matrix_dictionary_key: user_latent_factor_matrix, 
            item_latent_factor_matrix_dictionary_key: item_latent_factor_matrix
        }

        return latent_factor_matrix_dictionary

    def _create_user_item_rating_matrix(self, raw_data_group_by_user_id, raw_data_group_by_item_id):

        n_user = len(raw_data_group_by_user_id[raw_data_user_column_name])
        n_item = len(raw_data_group_by_item_id[raw_data_item_column_name])

        user_ids = raw_data_group_by_user_id[raw_data_user_column_name].tolist()
        item_ids = raw_data_group_by_item_id[raw_data_item_column_name].tolist()

        user_item_rating_matrix = np.zeros((n_user, n_item), dtype = np.int64)
        for user_id_index, user_id in enumerate(user_ids):
            item_id_indexes = np.where(np.isin(raw_data_group_by_user_id.iloc[user_id_index][raw_data_item_column_name],item_ids))[0]
            for item_id_index in item_id_indexes:
                user_item_rating_matrix[user_id_index, item_id_index] = 1

        return user_item_rating_matrix

    def _create_sample_latent_factor_matrix_row_indexes(self, raw_data_group_by_user_id, raw_data_group_by_item_id):

        n_user = len(raw_data_group_by_user_id[raw_data_user_column_name])
        n_item = len(raw_data_group_by_item_id[raw_data_item_column_name])

        item_ids = raw_data_group_by_item_id[raw_data_item_column_name].tolist()

        sample_user_id_indexes = np.random.choice(n_user, size = self.n_batch, replace = False)
        sample_positive_item_id_indexes = np.zeros(self.n_batch, dtype = np.int64)
        sample_negative_item_id_indexes = np.zeros(self.n_batch, dtype = np.int64)

        for index, sampled_user_id_index in enumerate(sample_user_id_indexes):
            possible_positive_item_ids = raw_data_group_by_user_id.iloc[sampled_user_id_index][raw_data_item_column_name]
            possible_negative_item_ids = list(set(item_ids).difference(possible_positive_item_ids))
            sample_positive_item_id = np.random.choice(possible_positive_item_ids)
            sample_negative_item_id = np.random.choice(possible_negative_item_ids)
            sample_positive_item_id_indexes[index] = item_ids.index(sample_positive_item_id)
            sample_negative_item_id_indexes[index] = item_ids.index(sample_negative_item_id)

        return sample_user_id_indexes, sample_positive_item_id_indexes, sample_negative_item_id_indexes

    def _update_sample_latent_factor_matrix_rows(self, user_latent_factor_matrix, item_latent_factor_matrix, sample_user_id_indexes, sample_positive_item_id_indexes, sample_negative_item_id_indexes):

        sample_user_latent_factor_matrix_tuple = user_latent_factor_matrix[sample_user_id_indexes]
        sample_item_latent_factor_matrix_positive_item_tuple = item_latent_factor_matrix[sample_positive_item_id_indexes]
        sample_item_latent_factor_matrix_negative_item_tuple = item_latent_factor_matrix[sample_negative_item_id_indexes]

        r_uij = np.sum(sample_user_latent_factor_matrix_tuple * (sample_item_latent_factor_matrix_positive_item_tuple - sample_item_latent_factor_matrix_negative_item_tuple), axis = 1)
        sigmoid = np.exp(-r_uij) / (1.0 + np.exp(-r_uij))
        sigmoid_tiled = np.tile(sigmoid, (self.n_latent_factor, 1)).T

        gradient_user = sigmoid_tiled * (sample_item_latent_factor_matrix_negative_item_tuple - sample_item_latent_factor_matrix_positive_item_tuple) + self.regularization_constant * sample_user_latent_factor_matrix_tuple
        gradient_positive_item = sigmoid_tiled * -sample_user_latent_factor_matrix_tuple + self.regularization_constant * sample_item_latent_factor_matrix_positive_item_tuple
        gradient_negative_item = sigmoid_tiled * sample_user_latent_factor_matrix_tuple + self.regularization_constant * sample_item_latent_factor_matrix_negative_item_tuple

        user_latent_factor_matrix[sample_user_id_indexes] -= self.learning_rate * gradient_user
        item_latent_factor_matrix[sample_positive_item_id_indexes] -= self.learning_rate * gradient_positive_item
        item_latent_factor_matrix[sample_negative_item_id_indexes] -= self.learning_rate * gradient_negative_item

        return user_latent_factor_matrix, item_latent_factor_matrix

    def recommend_user(self, raw_data_dictionary, latent_factor_matrix_dictionary, user_id, n_best_recommendation):

        raw_data_group_by_user_id = raw_data_dictionary[raw_data_group_by_user_id_dictionary_key]
        raw_data_group_by_item_id = raw_data_dictionary[raw_data_group_by_item_id_dictionary_key]

        user_latent_factor_matrix = latent_factor_matrix_dictionary[user_latent_factor_matrix_dictionary_key]
        item_latent_factor_matrix = latent_factor_matrix_dictionary[item_latent_factor_matrix_dictionary_key]

        user_ids = raw_data_group_by_user_id[raw_data_user_column_name].tolist()
        item_ids = raw_data_group_by_item_id[raw_data_item_column_name].tolist()

        user_index = user_ids.index(user_id)

        scores = self._predict_user(user_latent_factor_matrix, item_latent_factor_matrix, user_index)
        user_latent_factor_dictionary = {item_ids[i]: scores[i] for i in range(len(item_ids))}

        positive_item_ids = raw_data_group_by_user_id.iloc[user_index][raw_data_item_column_name]

        user_latent_factor_dictionary = {key: user_latent_factor_dictionary[key] for key in item_ids if key not in positive_item_ids}
        user_latent_factor_dictionary = dict(sorted(user_latent_factor_dictionary.items(), key=lambda item: item[1], reverse=True))
        best_recommendition_items = dict(islice(user_latent_factor_dictionary.items(), n_best_recommendation))

        return list(best_recommendition_items.keys())[0:n_best_recommendation]

    def _predict(self, user_latent_factor_matrix, item_latent_factor_matrix):
        return user_latent_factor_matrix.dot(item_latent_factor_matrix.T)

    def _predict_user(self, user_latent_factor_matrix, item_latent_factor_matrix, user_index):
        return user_latent_factor_matrix[user_index].dot(item_latent_factor_matrix.T)

In [106]:
data_helper = DataHelper()
bayseian_personalized_ranking_parameters = {
    'random_seed' : 1234, 
    'n_latent_factor' : 15, 
    'n_batch' : 10, 
    'learning_rate' : 0.01, 
    'regularization_constant' : 0.01
}
bayseian_personalized_ranking_model = BayseianPersonalizedRankingModel(**bayseian_personalized_ranking_parameters)

In [107]:
test_raw_data_url = 'https://raw.githubusercontent.com/samohtwal/BPR/main/data/vali-20221026.csv'
raw_data_dictionary = data_helper.load_raw_data_dictionary(test_raw_data_url)

print('debug - raw_data_dictionary[raw_data_group_by_user_id_dictionary_key]:', end='\n')
print(raw_data_dictionary[raw_data_group_by_user_id_dictionary_key], end='\n')
print('', end='\n')

print('debug - raw_data_dictionary[raw_data_group_by_item_id_dictionary_key]:', end='\n')
print(raw_data_dictionary[raw_data_group_by_item_id_dictionary_key], end='\n')
print('', end='\n')

debug - raw_data_dictionary[raw_data_group_by_user_id_dictionary_key]:
      user_id                                            item_id
0           0                             [128, 319, 2352, 2688]
1           1  [109, 985, 1201, 1294, 1324, 1345, 1427, 1664,...
2           2                                             [2633]
3           3  [142, 150, 237, 252, 320, 333, 461, 477, 479, ...
4           4                                             [1779]
...       ...                                                ...
3859     3919                                       [1883, 2002]
3860     3920                                 [1805, 1845, 1907]
3861     3921                                       [1884, 2135]
3862     3922                                 [1803, 1885, 1926]
3863     3923               [1983, 2201, 2214, 2314, 2413, 2489]

[3864 rows x 2 columns]

debug - raw_data_dictionary[raw_data_group_by_item_id_dictionary_key]:
      item_id                                       

In [108]:
latent_factor_matrix_dictionary = bayseian_personalized_ranking_model.fit(raw_data_dictionary)

print('debug - latent_factor_matrix_dictionary[user_latent_factor_matrix_dictionary_key]:', end='\n')
print(latent_factor_matrix_dictionary[user_latent_factor_matrix_dictionary_key], end='\n')
print('', end='\n')

print('debug - latent_factor_matrix_dictionary[item_latent_factor_matrix_dictionary_key]:', end='\n')
print(latent_factor_matrix_dictionary[item_latent_factor_matrix_dictionary_key], end='\n')
print('', end='\n')

data_helper.save_latent_factor_matrix_dictionary(latent_factor_matrix_dictionary)

debug - iteration: 1
debug - sum_of_square_error: 174762318.62022904

debug - iteration: 2
debug - sum_of_square_error: 174758464.92789662

debug - latent_factor_matrix_dictionary[user_latent_factor_matrix_dictionary_key]:
[[ 0.47143516 -1.19097569  1.43270697 ...  0.95332413 -2.02125482
  -0.33407737]
 [ 0.00211836  0.40545341  0.28909194 ... -1.81702723 -0.18310854
   1.05896919]
 [-0.39784023  0.33743765  1.04757857 ...  0.03614194 -2.0749776
   0.2477922 ]
 ...
 [ 1.14072459  0.24480566 -1.82190179 ... -0.58508961  0.75557621
   0.46752227]
 [-0.8545777   1.35200613  0.78975155 ... -0.02283265 -1.69484832
  -0.36669186]
 [-0.26712078 -0.5349804  -1.22014469 ...  0.14257988 -0.05139749
   0.47218592]]

debug - latent_factor_matrix_dictionary[item_latent_factor_matrix_dictionary_key]:
[[-0.61184541 -0.34683366 -1.03568407 ... -0.99889766  0.94556836
   1.85714986]
 [ 2.02333715 -1.38537805 -0.90726791 ... -1.14099655  1.29412492
  -2.83481026]
 [ 1.23193918 -0.6037652  -0.360057   ..

In [109]:
latent_factor_matrix_dictionary = data_helper.load_latent_factor_matrix_dictionary()

print('debug - latent_factor_matrix_dictionary[user_latent_factor_matrix_dictionary_key]:', end='\n')
print(latent_factor_matrix_dictionary[user_latent_factor_matrix_dictionary_key], end='\n')
print('', end='\n')

print('debug - latent_factor_matrix_dictionary[item_latent_factor_matrix_dictionary_key]:', end='\n')
print(latent_factor_matrix_dictionary[item_latent_factor_matrix_dictionary_key], end='\n')
print('', end='\n')

debug - latent_factor_matrix_dictionary[user_latent_factor_matrix_dictionary_key]:
[[ 0.00000000e+00  4.71435164e-01 -1.19097569e+00 ...  9.53324128e-01
  -2.02125482e+00 -3.34077366e-01]
 [ 1.00000000e+00  2.11836468e-03  4.05453412e-01 ... -1.81702723e+00
  -1.83108540e-01  1.05896919e+00]
 [ 2.00000000e+00 -3.97840228e-01  3.37437654e-01 ...  3.61419367e-02
  -2.07497760e+00  2.47792200e-01]
 ...
 [ 3.86100000e+03  1.14072459e+00  2.44805661e-01 ... -5.85089606e-01
   7.55576209e-01  4.67522265e-01]
 [ 3.86200000e+03 -8.54577700e-01  1.35200613e+00 ... -2.28326467e-02
  -1.69484832e+00 -3.66691863e-01]
 [ 3.86300000e+03 -2.67120782e-01 -5.34980404e-01 ...  1.42579881e-01
  -5.13974858e-02  4.72185916e-01]]

debug - latent_factor_matrix_dictionary[item_latent_factor_matrix_dictionary_key]:
[[ 0.00000000e+00 -6.11845411e-01 -3.46833662e-01 ... -9.98897662e-01
   9.45568364e-01  1.85714986e+00]
 [ 1.00000000e+00  2.02333715e+00 -1.38537805e+00 ... -1.14099655e+00
   1.29412492e+00 -2.8

In [111]:
recommendation_item_ids = bayseian_personalized_ranking_model.recommend_user(raw_data_dictionary, latent_factor_matrix_dictionary, 0, 5)
print('debug - recommendation_item_ids:', end='\n')
print(recommendation_item_ids, end='\n')
print('', end='\n')

debug - recommendation_item_ids:
[3001, 2617, 2260, 2704, 1989]

