<a href="https://colab.research.google.com/github/samohtwal/BPR/blob/20221027/BPR_20221028.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import numpy as np
import pandas as pd
import torch
import copy
from itertools import islice
from google.colab import files

In [21]:
# Shared configuration constants read by DataHelper and
# BayseianPersonalizedRanking below. They live at module scope, so no
# `global` declarations are needed to define them.

# Column names given to the two columns of the raw interaction CSV.
raw_data_user_column_name = 'user_id'
raw_data_item_column_name = 'item_id'

# Keys of the dictionary returned by DataHelper.load_raw_data_dictionary.
raw_data_group_by_user_id_dictionary_key = 'raw_data_group_by_user_id'
raw_data_group_by_item_id_dictionary_key = 'raw_data_group_by_item_id'

# Keys of the latent factor matrix dictionary (returned by fit, and consumed /
# produced by the DataHelper save / load methods).
user_latent_factor_matrix_dictionary_key = 'user_latent_factor_matrix'
item_latent_factor_matrix_dictionary_key = 'item_latent_factor_matrix'

# Paths the latent factor matrices are written to / read from as CSV.
user_latent_factor_matrix_file_name = 'user_latent_factor_matrix'
item_latent_factor_matrix_file_name = 'item_latent_factor_matrix'

In [33]:
class DataHelper:
    """Load user/item interaction data and persist latent factor matrices as CSV."""

    @staticmethod
    def load_raw_data_dictionary(raw_data_url):
        """Read a two-column interaction CSV and group it by user and by item.

        Declared as a static method so that it can be called both as
        ``DataHelper.load_raw_data_dictionary(url)`` and on an instance.

        Args:
            raw_data_url: Path or URL handed to ``pandas.read_csv``.

        Returns:
            A dictionary with two DataFrames:
            * ``raw_data_group_by_user_id_dictionary_key``: one row per user id,
              with the list of that user's item ids, sorted by user id.
            * ``raw_data_group_by_item_id_dictionary_key``: one row per item id,
              with the list of that item's user ids, sorted by item id.
        """
        column_names = [raw_data_user_column_name, raw_data_item_column_name]
        # header=0 consumes the file's own header row; `names` replaces it.
        raw_data = pd.read_csv(raw_data_url, names=column_names, header=0)

        raw_data_group_by_user_id = DataHelper._group_values_by_key(
            raw_data, raw_data_user_column_name, raw_data_item_column_name)
        raw_data_group_by_item_id = DataHelper._group_values_by_key(
            raw_data, raw_data_item_column_name, raw_data_user_column_name)

        return {
            raw_data_group_by_user_id_dictionary_key: raw_data_group_by_user_id,
            raw_data_group_by_item_id_dictionary_key: raw_data_group_by_item_id,
        }

    @staticmethod
    def _group_values_by_key(raw_data, key_column_name, value_column_name):
        """Collect, per distinct key, the list of values; rows sorted by key."""
        return (
            raw_data.groupby(key_column_name)[value_column_name]
            .apply(list)
            .reset_index(name=value_column_name)
            .sort_values(by=[key_column_name])
        )

    def save_latent_factor_matrix_dictionary(self, latent_factor_matrix_dictionary):
        """Write the user and item latent factor matrices to their CSV files.

        Args:
            latent_factor_matrix_dictionary: Dictionary holding the matrices
                under ``user_latent_factor_matrix_dictionary_key`` and
                ``item_latent_factor_matrix_dictionary_key``.

        Returns:
            None. The files named by ``user_latent_factor_matrix_file_name``
            and ``item_latent_factor_matrix_file_name`` are (over)written.
        """
        user_latent_factor_matrix = latent_factor_matrix_dictionary.get(user_latent_factor_matrix_dictionary_key)
        item_latent_factor_matrix = latent_factor_matrix_dictionary.get(item_latent_factor_matrix_dictionary_key)

        # to_csv also writes the row index as the first column; the loader
        # below reads that column back as the index.
        pd.DataFrame(user_latent_factor_matrix).to_csv(user_latent_factor_matrix_file_name)
        pd.DataFrame(item_latent_factor_matrix).to_csv(item_latent_factor_matrix_file_name)

        return None

    def load_latent_factor_matrix_dictionary(self):
        """Read back the matrices written by save_latent_factor_matrix_dictionary.

        Returns:
            A dictionary with the user and item latent factor matrices as
            DataFrames, under ``user_latent_factor_matrix_dictionary_key`` and
            ``item_latent_factor_matrix_dictionary_key``.
        """
        # index_col=0: the first CSV column is the row index written by
        # to_csv, not a latent factor. Without it the index comes back as an
        # extra "Unnamed: 0" data column and the matrix gains a column on
        # every save/load round trip.
        user_latent_factor_matrix = pd.read_csv(user_latent_factor_matrix_file_name, index_col=0)
        item_latent_factor_matrix = pd.read_csv(item_latent_factor_matrix_file_name, index_col=0)

        return {
            user_latent_factor_matrix_dictionary_key: user_latent_factor_matrix,
            item_latent_factor_matrix_dictionary_key: item_latent_factor_matrix,
        }

In [36]:
class BayseianPersonalizedRanking:
    """Matrix factorization recommender trained with BPR-style pairwise updates.

    Users and items are each represented by a row of ``n_latent_factor``
    values; the predicted score of an item for a user is the dot product of
    the two rows. Training repeatedly samples (user, positive item, negative
    item) triples and takes a gradient step on them.
    """

    def __init__(self, random_seed = 1234, n_latent_factor = 15, n_batch = 10, learning_rate = 0.01, regularization_constant = 0.01):
        """Store the hyper-parameters.

        Args:
            random_seed: Seed for the initial latent factor matrices.
            n_latent_factor: Number of latent factors per user / item.
            n_batch: Number of users sampled per training iteration.
            learning_rate: Step size of each gradient update.
            regularization_constant: Weight of the L2 penalty on the factors.
        """
        self.random_seed = random_seed
        self.n_latent_factor = n_latent_factor
        self.n_batch = n_batch
        self.learning_rate = learning_rate
        self.regularization_constant = regularization_constant

    def fit(self, raw_data_group_by_user_id, raw_data_group_by_item_id):
        """Train the latent factor matrices.

        Iterates until the sum of squared errors between the binary
        user/item matrix and the predicted scores increases from one
        iteration to the next (or stops being finite).

        Args:
            raw_data_group_by_user_id: DataFrame with one row per user id and
                the list of that user's item ids.
            raw_data_group_by_item_id: DataFrame with one row per item id.

        Returns:
            A dictionary with the trained matrices under
            ``user_latent_factor_matrix_dictionary_key`` and
            ``item_latent_factor_matrix_dictionary_key``.
        """
        n_user = len(raw_data_group_by_user_id[raw_data_user_column_name])
        n_item = len(raw_data_group_by_item_id[raw_data_item_column_name])

        rstate = np.random.RandomState(self.random_seed)

        user_latent_factor_matrix = rstate.normal(size = (n_user, self.n_latent_factor))
        item_latent_factor_matrix = rstate.normal(size = (n_item, self.n_latent_factor))

        user_item_rating_matrix = self._create_user_item_rating_matrix(raw_data_group_by_user_id, raw_data_group_by_item_id)

        previous_sum_of_square_error = None
        counter = 1
        while True:

            sample_user_id_indexes, sample_positive_item_id_indexes, sample_negative_item_id_indexes = self._create_sample_latent_factor_matrix_row_indexes(raw_data_group_by_user_id, raw_data_group_by_item_id)
            user_latent_factor_matrix, item_latent_factor_matrix = self._update_sample_latent_factor_matrix_rows(user_latent_factor_matrix, item_latent_factor_matrix, sample_user_id_indexes, sample_positive_item_id_indexes, sample_negative_item_id_indexes)
            sum_of_square_error = np.sum((user_item_rating_matrix - self._predict(user_latent_factor_matrix, item_latent_factor_matrix))**2)

            print('debug - iteration:', counter, end='\n')
            print('debug - sum_of_square_error:', sum_of_square_error, end='\n')

            # A NaN error would compare False forever and never end the loop,
            # so a non-finite error also stops the training.
            if not np.isfinite(sum_of_square_error):
                break
            # Stop as soon as the error gets worse than in the previous
            # iteration; the first iteration has nothing to compare against.
            if previous_sum_of_square_error is not None and sum_of_square_error > previous_sum_of_square_error:
                break

            previous_sum_of_square_error = sum_of_square_error
            counter = counter + 1

        latent_factor_matrix_dictionary = {
            user_latent_factor_matrix_dictionary_key: user_latent_factor_matrix,
            item_latent_factor_matrix_dictionary_key: item_latent_factor_matrix
        }

        return latent_factor_matrix_dictionary

    @staticmethod
    def _create_user_item_rating_matrix(raw_data_group_by_user_id, raw_data_group_by_item_id):
        """Build the binary (n_user, n_item) interaction matrix.

        Entry ``[u, i]`` is 1 when the item in row ``i`` of
        ``raw_data_group_by_item_id`` appears in the item list of the user in
        row ``u`` of ``raw_data_group_by_user_id``, else 0.
        """
        n_user = len(raw_data_group_by_user_id[raw_data_user_column_name])
        item_ids = raw_data_group_by_item_id[raw_data_item_column_name].tolist()

        user_item_rating_matrix = np.zeros((n_user, len(item_ids)), dtype = np.int64)
        for user_id_index, positive_item_ids in enumerate(raw_data_group_by_user_id[raw_data_item_column_name]):
            # The mask runs over item_ids, so its True positions are item
            # column indexes (not positions inside the user's own list).
            user_item_rating_matrix[user_id_index, np.isin(item_ids, positive_item_ids)] = 1

        return user_item_rating_matrix

    def _create_sample_latent_factor_matrix_row_indexes(self, raw_data_group_by_user_id, raw_data_group_by_item_id):
        """Sample one (user, positive item, negative item) triple per batch slot.

        Users are drawn without replacement, so at most ``n_user`` triples are
        produced even when ``n_batch`` is larger. For every sampled user one
        item from the user's list and one item outside of it are drawn.

        Returns:
            Three equally long integer arrays: user row indexes, positive item
            row indexes and negative item row indexes.
        """
        n_user = len(raw_data_group_by_user_id[raw_data_user_column_name])
        item_ids = raw_data_group_by_item_id[raw_data_item_column_name].tolist()

        # Loop invariants: the full item set, and an id -> first row index
        # lookup that replaces a linear list.index() call per sample.
        all_item_ids = set(item_ids)
        item_id_to_index = {}
        for item_id_index, item_id in enumerate(item_ids):
            item_id_to_index.setdefault(item_id, item_id_index)

        # Sampling without replacement cannot draw more users than exist.
        n_sample = min(self.n_batch, n_user)

        sample_user_id_indexes = np.random.choice(n_user, size = n_sample, replace = False)
        sample_positive_item_id_indexes = np.zeros(n_sample, dtype = np.int64)
        sample_negative_item_id_indexes = np.zeros(n_sample, dtype = np.int64)

        for index, sampled_user_id_index in enumerate(sample_user_id_indexes):
            possible_positive_item_ids = raw_data_group_by_user_id.iloc[sampled_user_id_index][raw_data_item_column_name]
            possible_negative_item_ids = list(all_item_ids.difference(possible_positive_item_ids))
            sample_positive_item_id = np.random.choice(possible_positive_item_ids)
            sample_negative_item_id = np.random.choice(possible_negative_item_ids)
            sample_positive_item_id_indexes[index] = item_id_to_index[sample_positive_item_id]
            sample_negative_item_id_indexes[index] = item_id_to_index[sample_negative_item_id]

        return sample_user_id_indexes, sample_positive_item_id_indexes, sample_negative_item_id_indexes

    def _update_sample_latent_factor_matrix_rows(self, user_latent_factor_matrix, item_latent_factor_matrix, sample_user_id_indexes, sample_positive_item_id_indexes, sample_negative_item_id_indexes):
        """Apply one gradient step for the sampled triples.

        Both matrices are modified in place and also returned.

        Args:
            user_latent_factor_matrix: Array with one row per user.
            item_latent_factor_matrix: Array with one row per item.
            sample_user_id_indexes: User row index of every triple.
            sample_positive_item_id_indexes: Positive item row index of every triple.
            sample_negative_item_id_indexes: Negative item row index of every triple.

        Returns:
            The tuple ``(user_latent_factor_matrix, item_latent_factor_matrix)``.
        """
        # Fancy indexing copies, so all gradients are computed from the
        # factors as they were before this step.
        sample_user_rows = user_latent_factor_matrix[sample_user_id_indexes]
        sample_positive_item_rows = item_latent_factor_matrix[sample_positive_item_id_indexes]
        sample_negative_item_rows = item_latent_factor_matrix[sample_negative_item_id_indexes]

        # r_uij: score(user, positive item) - score(user, negative item).
        r_uij = np.sum(sample_user_rows * (sample_positive_item_rows - sample_negative_item_rows), axis = 1)
        # exp(-r) / (1 + exp(-r)) == 1 / (1 + exp(r)) == exp(-log(1 + exp(r))).
        # logaddexp evaluates log(1 + exp(r)) without overflowing, whereas the
        # direct quotient becomes inf / inf = NaN for very negative r.
        sigmoid = np.exp(-np.logaddexp(0.0, r_uij))
        sigmoid_column = sigmoid[:, np.newaxis]  # broadcast over the latent factors

        gradient_user = sigmoid_column * (sample_negative_item_rows - sample_positive_item_rows) + self.regularization_constant * sample_user_rows
        gradient_positive_item = sigmoid_column * -sample_user_rows + self.regularization_constant * sample_positive_item_rows
        gradient_negative_item = sigmoid_column * sample_user_rows + self.regularization_constant * sample_negative_item_rows

        # ufunc.at accumulates every contribution when a row index occurs more
        # than once in a batch; `matrix[indexes] -= ...` would keep only the
        # last one.
        np.subtract.at(user_latent_factor_matrix, sample_user_id_indexes, self.learning_rate * gradient_user)
        np.subtract.at(item_latent_factor_matrix, sample_positive_item_id_indexes, self.learning_rate * gradient_positive_item)
        np.subtract.at(item_latent_factor_matrix, sample_negative_item_id_indexes, self.learning_rate * gradient_negative_item)

        return user_latent_factor_matrix, item_latent_factor_matrix

    def recommend_user(self, user_latent_factor_matrix, item_latent_factor_matrix, raw_data_group_by_user_id, raw_data_group_by_item_id, user_id, n_best_recommendation):
        """Return the best scored items the user has not interacted with yet.

        Args:
            user_latent_factor_matrix: Matrix with one row per user.
            item_latent_factor_matrix: Matrix with one row per item.
            raw_data_group_by_user_id: DataFrame with one row per user id and
                the list of that user's item ids.
            raw_data_group_by_item_id: DataFrame with one row per item id.
            user_id: Id of the user to recommend for.
            n_best_recommendation: Maximum number of item ids to return.

        Returns:
            A list of item ids ordered by descending predicted score.

        Raises:
            ValueError: If ``user_id`` is not among the known user ids.
        """
        user_ids = raw_data_group_by_user_id[raw_data_user_column_name].tolist()
        item_ids = raw_data_group_by_item_id[raw_data_item_column_name].tolist()

        user_index = user_ids.index(user_id)

        scores = self._predict_user(user_latent_factor_matrix, item_latent_factor_matrix, user_index)

        # A set makes the "already interacted with" test O(1) per item.
        positive_item_ids = set(raw_data_group_by_user_id.iloc[user_index][raw_data_item_column_name])

        candidate_item_scores = [
            (item_id, score)
            for item_id, score in zip(item_ids, scores)
            if item_id not in positive_item_ids
        ]
        # The sort is stable, so equally scored items keep their item order.
        candidate_item_scores.sort(key=lambda item_score: item_score[1], reverse=True)

        return [item_id for item_id, _ in islice(candidate_item_scores, n_best_recommendation)]

    @staticmethod
    def _predict(user_latent_factor_matrix, item_latent_factor_matrix):
        """Return the (n_user, n_item) array of predicted scores."""
        return np.asarray(user_latent_factor_matrix).dot(np.asarray(item_latent_factor_matrix).T)

    @staticmethod
    def _predict_user(user_latent_factor_matrix, item_latent_factor_matrix, user_index):
        """Return the predicted scores of every item for the user in row ``user_index``."""
        # np.asarray makes row indexing positional even if a DataFrame is passed.
        return np.asarray(user_latent_factor_matrix)[user_index].dot(np.asarray(item_latent_factor_matrix).T)