<a href="https://colab.research.google.com/github/samohtwal/BPR/blob/20221027/BPR_20221028.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [45]:
import numpy as np
import pandas as pd
import torch
import copy
from itertools import islice
from google.colab import files
import os

In [46]:
global raw_data_user_column_name
global raw_data_item_column_name

global raw_data_group_by_user_id_dictionary_key
global raw_data_group_by_item_id_dictionary_key

global user_latent_factor_matrix_dictionary_key
global item_latent_factor_matrix_dictionary_key

global working_file_path

global user_latent_factor_matrix_file_name
global item_latent_factor_matrix_file_name

global error_file_name

raw_data_user_column_name = 'user_id'
raw_data_item_column_name = 'item_id'

raw_data_group_by_user_id_dictionary_key = 'raw_data_group_by_user_id'
raw_data_group_by_item_id_dictionary_key = 'raw_data_group_by_item_id'

user_latent_factor_matrix_dictionary_key = 'user_latent_factor_matrix'
item_latent_factor_matrix_dictionary_key = 'item_latent_factor_matrix'

working_file_path = '/content/working/'

user_latent_factor_matrix_file_name = 'user_latent_factor_matrix.csv'
item_latent_factor_matrix_file_name = 'item_latent_factor_matrix.csv'

error_file_name = 'error.csv'

In [95]:
class DataHelper:

    def load_raw_data_dictionary(self, raw_data_url):

        column_names = [raw_data_user_column_name, raw_data_item_column_name]
        raw_data = pd.read_csv(raw_data_url, names = column_names, header=0)

        raw_data_group_by_user_id = raw_data.groupby(raw_data_user_column_name)[raw_data_item_column_name].apply(list).reset_index(name=raw_data_item_column_name).sort_values(by=[raw_data_user_column_name])
        raw_data_group_by_item_id = raw_data.groupby(raw_data_item_column_name)[raw_data_user_column_name].apply(list).reset_index(name=raw_data_user_column_name).sort_values(by=[raw_data_item_column_name])

        raw_data_dictionary = {
            raw_data_group_by_user_id_dictionary_key: raw_data_group_by_user_id, 
            raw_data_group_by_item_id_dictionary_key: raw_data_group_by_item_id
        }

        return raw_data_dictionary

    def save_latent_factor_matrix_dictionary(self, latent_factor_matrix_dictionary):

        user_latent_factor_matrix = latent_factor_matrix_dictionary[user_latent_factor_matrix_dictionary_key]
        item_latent_factor_matrix = latent_factor_matrix_dictionary[item_latent_factor_matrix_dictionary_key]
        
        if os.path.exists(working_file_path) == False:
            os.mkdir(working_file_path)

        pd.DataFrame(user_latent_factor_matrix).to_csv(working_file_path + user_latent_factor_matrix_file_name)
        pd.DataFrame(item_latent_factor_matrix).to_csv(working_file_path + item_latent_factor_matrix_file_name)

        return None

    def load_latent_factor_matrix_dictionary(self):

        user_latent_factor_matrix = pd.read_csv(working_file_path + user_latent_factor_matrix_file_name).values
        item_latent_factor_matrix = pd.read_csv(working_file_path + item_latent_factor_matrix_file_name).values

        latent_factor_matrix_dictionary = {
            user_latent_factor_matrix_dictionary_key: user_latent_factor_matrix, 
            item_latent_factor_matrix_dictionary_key: item_latent_factor_matrix
        }

        return latent_factor_matrix_dictionary
    
    def save_error(self, errors):
        if os.path.exists(working_file_path) == False:
            os.mkdir(working_file_path)
        pd.DataFrame(errors).to_csv(working_file_path + error_file_name)

        return None

In [48]:
errors = np.zeros(10, dtype = np.int64)
DataHelper().save_error(errors)

In [90]:
class BayseianPersonalizedRankingModel:

    def __init__(self, random_seed = 1234, n_latent_factor = 15, n_batch = 10, learning_rate = 0.01, regularization_constant = 0.01):
        self.random_seed = random_seed
        self.n_latent_factor = n_latent_factor
        self.n_batch = n_batch
        self.learning_rate = learning_rate
        self.regularization_constant = regularization_constant

    def fit(self, raw_data_dictionary):

        raw_data_group_by_user_id = raw_data_dictionary[raw_data_group_by_user_id_dictionary_key]
        raw_data_group_by_item_id = raw_data_dictionary[raw_data_group_by_item_id_dictionary_key]

        n_user = len(raw_data_group_by_user_id[raw_data_user_column_name])
        n_item = len(raw_data_group_by_item_id[raw_data_item_column_name])

        rstate = np.random.RandomState(self.random_seed)

        user_latent_factor_matrix = rstate.normal(size = (n_user, self.n_latent_factor))
        item_latent_factor_matrix = rstate.normal(size = (n_item, self.n_latent_factor))

        user_item_rating_matrix = self._create_user_item_rating_matrix(raw_data_group_by_user_id, raw_data_group_by_item_id)

        user_latent_factor_matrixes = []
        item_latent_factor_matrixes = []
        root_mean_square_errors = []

        is_converge = True
        index = 0
        while is_converge:

            # self.learning_rate = self.learning_rate * np.exp(-5*index)
            # self.learning_rate = self.learning_rate / (1 + 2 * index)
            
            sample_latent_factor_matrix_row_indexes = self._create_sample_latent_factor_matrix_row_indexes(raw_data_group_by_user_id, raw_data_group_by_item_id)
            sample_user_id_indexes, sample_positive_item_id_indexes, sample_negative_item_id_indexes = sample_latent_factor_matrix_row_indexes
            user_latent_factor_matrix, item_latent_factor_matrix = self._update_sample_latent_factor_matrix_rows(user_latent_factor_matrix, item_latent_factor_matrix, sample_user_id_indexes, sample_positive_item_id_indexes, sample_negative_item_id_indexes)
            sum_of_square_error = np.sum((user_item_rating_matrix - self._predict(user_latent_factor_matrix, item_latent_factor_matrix))**2)
            root_mean_square_error = np.sqrt((sum_of_square_error / n_user * n_item))

            user_latent_factor_matrixes.append(user_latent_factor_matrix)
            item_latent_factor_matrixes.append(item_latent_factor_matrix)
            root_mean_square_errors.append(root_mean_square_error)
            
            print('debug - iteration:', index+1, end='\n')
            print('debug - self.learning_rate:', self.learning_rate, end='\n')
            print('debug - root_mean_square_error:', root_mean_square_error, end='\n')
            print('', end='\n')
            
            # if (root_mean_square_errors[index] 
            #     > root_mean_square_errors[index-1] 
            #     > root_mean_square_errors[index-2]
            #     > root_mean_square_errors[index-3]):
            if index == 10:
                is_converge = False

            index = index + 1

        latent_factor_matrix_dictionary = {
            user_latent_factor_matrix_dictionary_key: user_latent_factor_matrixes[index-4], 
            item_latent_factor_matrix_dictionary_key: item_latent_factor_matrix[index-4]
        }

        return latent_factor_matrix_dictionary, root_mean_square_errors[0:index-4]

    def _create_user_item_rating_matrix(self, raw_data_group_by_user_id, raw_data_group_by_item_id):

        n_user = len(raw_data_group_by_user_id[raw_data_user_column_name])
        n_item = len(raw_data_group_by_item_id[raw_data_item_column_name])

        user_ids = raw_data_group_by_user_id[raw_data_user_column_name].tolist()
        item_ids = raw_data_group_by_item_id[raw_data_item_column_name].tolist()

        user_item_rating_matrix = np.zeros((n_user, n_item), dtype = np.int64)
        for user_id_index, user_id in enumerate(user_ids):
            item_id_indexes = np.where(np.isin(raw_data_group_by_user_id.iloc[user_id_index][raw_data_item_column_name],item_ids))[0]
            for item_id_index in item_id_indexes:
                user_item_rating_matrix[user_id_index, item_id_index] = 1

        return user_item_rating_matrix

    def _create_sample_latent_factor_matrix_row_indexes(self, raw_data_group_by_user_id, raw_data_group_by_item_id):

        n_user = len(raw_data_group_by_user_id[raw_data_user_column_name])
        n_item = len(raw_data_group_by_item_id[raw_data_item_column_name])

        item_ids = raw_data_group_by_item_id[raw_data_item_column_name].tolist()

        sample_user_id_indexes = np.random.choice(n_user, size = self.n_batch, replace = False)
        sample_positive_item_id_indexes = np.zeros(self.n_batch, dtype = np.int64)
        sample_negative_item_id_indexes = np.zeros(self.n_batch, dtype = np.int64)

        for index, sampled_user_id_index in enumerate(sample_user_id_indexes):
            possible_positive_item_ids = raw_data_group_by_user_id.iloc[sampled_user_id_index][raw_data_item_column_name]
            possible_negative_item_ids = list(set(item_ids).difference(possible_positive_item_ids))
            sample_positive_item_id = np.random.choice(possible_positive_item_ids)
            sample_negative_item_id = np.random.choice(possible_negative_item_ids)
            sample_positive_item_id_indexes[index] = item_ids.index(sample_positive_item_id)
            sample_negative_item_id_indexes[index] = item_ids.index(sample_negative_item_id)

        return sample_user_id_indexes, sample_positive_item_id_indexes, sample_negative_item_id_indexes

    def _update_sample_latent_factor_matrix_rows(self, user_latent_factor_matrix, item_latent_factor_matrix, sample_user_id_indexes, sample_positive_item_id_indexes, sample_negative_item_id_indexes):

        sample_user_latent_factor_matrix_tuple = user_latent_factor_matrix[sample_user_id_indexes]
        sample_item_latent_factor_matrix_positive_item_tuple = item_latent_factor_matrix[sample_positive_item_id_indexes]
        sample_item_latent_factor_matrix_negative_item_tuple = item_latent_factor_matrix[sample_negative_item_id_indexes]

        r_uij = np.sum(sample_user_latent_factor_matrix_tuple * (sample_item_latent_factor_matrix_positive_item_tuple - sample_item_latent_factor_matrix_negative_item_tuple), axis = 1)
        sigmoid = np.exp(-r_uij) / (1.0 + np.exp(-r_uij))
        sigmoid_tiled = np.tile(sigmoid, (self.n_latent_factor, 1)).T

        gradient_user = sigmoid_tiled * (sample_item_latent_factor_matrix_negative_item_tuple - sample_item_latent_factor_matrix_positive_item_tuple) + self.regularization_constant * sample_user_latent_factor_matrix_tuple
        gradient_positive_item = sigmoid_tiled * -sample_user_latent_factor_matrix_tuple + self.regularization_constant * sample_item_latent_factor_matrix_positive_item_tuple
        gradient_negative_item = sigmoid_tiled * sample_user_latent_factor_matrix_tuple + self.regularization_constant * sample_item_latent_factor_matrix_negative_item_tuple

        user_latent_factor_matrix[sample_user_id_indexes] -= self.learning_rate * gradient_user
        item_latent_factor_matrix[sample_positive_item_id_indexes] -= self.learning_rate * gradient_positive_item
        item_latent_factor_matrix[sample_negative_item_id_indexes] -= self.learning_rate * gradient_negative_item

        return user_latent_factor_matrix, item_latent_factor_matrix

    def recommend_user(self, raw_data_dictionary, latent_factor_matrix_dictionary, user_id, n_best_recommendation):

        raw_data_group_by_user_id = raw_data_dictionary[raw_data_group_by_user_id_dictionary_key]
        raw_data_group_by_item_id = raw_data_dictionary[raw_data_group_by_item_id_dictionary_key]

        user_latent_factor_matrix = latent_factor_matrix_dictionary[user_latent_factor_matrix_dictionary_key]
        item_latent_factor_matrix = latent_factor_matrix_dictionary[item_latent_factor_matrix_dictionary_key]

        user_ids = raw_data_group_by_user_id[raw_data_user_column_name].tolist()
        item_ids = raw_data_group_by_item_id[raw_data_item_column_name].tolist()

        user_index = user_ids.index(user_id)

        scores = self._predict_user(user_latent_factor_matrix, item_latent_factor_matrix, user_index)
        user_latent_factor_dictionary = {item_ids[i]: scores[i] for i in range(len(item_ids))}

        positive_item_ids = raw_data_group_by_user_id.iloc[user_index][raw_data_item_column_name]

        user_latent_factor_dictionary = {key: user_latent_factor_dictionary[key] for key in item_ids if key not in positive_item_ids}
        user_latent_factor_dictionary = dict(sorted(user_latent_factor_dictionary.items(), key=lambda item: item[1], reverse=True))
        best_recommendition_items = dict(islice(user_latent_factor_dictionary.items(), n_best_recommendation))

        return list(best_recommendition_items.keys())[0:n_best_recommendation]

    def _predict(self, user_latent_factor_matrix, item_latent_factor_matrix):
        return user_latent_factor_matrix.dot(item_latent_factor_matrix.T)

    def _predict_user(self, user_latent_factor_matrix, item_latent_factor_matrix, user_index):
        return user_latent_factor_matrix[user_index].dot(item_latent_factor_matrix.T)

In [96]:
data_helper = DataHelper()
bayseian_personalized_ranking_parameters = {
    'random_seed' : 1234, 
    'n_latent_factor' : 15, 
    'n_batch' : 1000, 
    'learning_rate' : 0.1, 
    'regularization_constant' : 0.1
}
bayseian_personalized_ranking_model = BayseianPersonalizedRankingModel(**bayseian_personalized_ranking_parameters)

In [92]:
train_raw_data_url = 'https://raw.githubusercontent.com/samohtwal/BPR/20221027/data/train-20221026.csv'
train_raw_data_dictionary = data_helper.load_raw_data_dictionary(train_raw_data_url)

In [93]:
latent_factor_matrix_dictionary, root_mean_square_errors = bayseian_personalized_ranking_model.fit(train_raw_data_dictionary)
data_helper.save_latent_factor_matrix_dictionary(latent_factor_matrix_dictionary)
data_helper.save_error(root_mean_square_errors)

debug - iteration: 1
debug - self.learning_rate: 0.1
debug - root_mean_square_error: 11726.031585409168

debug - iteration: 2
debug - self.learning_rate: 0.1
debug - root_mean_square_error: 11585.44411849621

debug - iteration: 3
debug - self.learning_rate: 0.1
debug - root_mean_square_error: 11445.280154237184

debug - iteration: 4
debug - self.learning_rate: 0.1
debug - root_mean_square_error: 11302.392858935575

debug - iteration: 5
debug - self.learning_rate: 0.1
debug - root_mean_square_error: 11168.770589457832

debug - iteration: 6
debug - self.learning_rate: 0.1
debug - root_mean_square_error: 11037.348792836037

debug - iteration: 7
debug - self.learning_rate: 0.1
debug - root_mean_square_error: 10907.976248254232

debug - iteration: 8
debug - self.learning_rate: 0.1
debug - root_mean_square_error: 10772.852717867458

debug - iteration: 9
debug - self.learning_rate: 0.1
debug - root_mean_square_error: 10643.500038144632

debug - iteration: 10
debug - self.learning_rate: 0.1
de

In [97]:
latent_factor_matrix_dictionary = data_helper.load_latent_factor_matrix_dictionary()

print('debug - latent_factor_matrix_dictionary[user_latent_factor_matrix_dictionary_key]:', end='\n')
print(latent_factor_matrix_dictionary[user_latent_factor_matrix_dictionary_key], end='\n')
print('', end='\n')

print('debug - latent_factor_matrix_dictionary[item_latent_factor_matrix_dictionary_key]:', end='\n')
print(latent_factor_matrix_dictionary[item_latent_factor_matrix_dictionary_key], end='\n')
print('', end='\n')

debug - latent_factor_matrix_dictionary[user_latent_factor_matrix_dictionary_key]:
[[ 0.00000000e+00  4.55123543e-01 -1.15544346e+00 ...  9.26987635e-01
  -1.96049969e+00 -3.25796551e-01]
 [ 1.00000000e+00 -3.74057897e-02  4.43792625e-01 ... -1.80297014e+00
  -1.94943276e-01  1.02303861e+00]
 [ 2.00000000e+00 -4.04080305e-01  4.29245739e-01 ...  3.16530820e-02
  -1.86935695e+00  2.29883567e-01]
 ...
 [ 3.92100000e+03 -8.85253218e-01  3.11179977e-01 ...  9.69071351e-01
  -1.09549331e+00  1.20046147e+00]
 [ 3.92200000e+03  6.38418786e-02  1.08987382e-01 ... -4.73397130e-01
   1.16381845e-02  1.83625582e-01]
 [ 3.92300000e+03  1.18017866e+00  8.55391205e-01 ...  5.59962799e-01
  -9.07106706e-01  1.85294749e+00]]

debug - latent_factor_matrix_dictionary[item_latent_factor_matrix_dictionary_key]:
[[ 0.00000000e+00 -1.03240406e+00]
 [ 1.00000000e+00 -1.55904903e+00]
 [ 2.00000000e+00 -9.71678842e-01]
 [ 3.00000000e+00  1.29189884e+00]
 [ 4.00000000e+00 -6.38108557e-02]
 [ 5.00000000e+00 -4.3

In [98]:
vali_raw_data_url = 'https://raw.githubusercontent.com/samohtwal/BPR/20221027/data/vali-20221026.csv'
vali_raw_data_dictionary = data_helper.load_raw_data_dictionary(vali_raw_data_url)

recommendation_item_ids = bayseian_personalized_ranking_model.recommend_user(vali_raw_data_dictionary, latent_factor_matrix_dictionary, 0, 5)
print('debug - recommendation_item_ids:', end='\n')
print(recommendation_item_ids, end='\n')
print('', end='\n')

ValueError: ignored