<a href="https://colab.research.google.com/github/samohtwal/BPR/blob/20221027/BPR_20221027.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
import numpy as np
import pandas as pd
import torch
import copy
from itertools import islice

# Notebook-wide configuration. These names are assigned at module (cell) level,
# so no `global` declarations are needed for the functions below to read them.

# Column names applied to the interaction data, and the keys under which the
# two grouped frames are stored in raw_data_dictionary.
raw_data_user_column_name = 'user_id'
raw_data_item_column_name = 'item_id'
raw_data_group_by_user_id_dictionary_key = 'raw_data_group_by_user_id'
raw_data_group_by_item_id_dictionary_key = 'raw_data_group_by_item_id'

# Seed of the RandomState that fit() uses to initialise the latent factors.
random_seed = 1234

# Training hyperparameters.
n_latent_factor = 15            # number of columns in each latent factor matrix
n_batch = 1000                  # users sampled (without replacement) per iteration
regularization_constant = 0.1   # multiplies the latent vectors added to each gradient
learning_rate = 0.1             # step size applied to every gradient
n_iteration = 4000              # number of sample-and-update rounds run by fit()

In [2]:
# Load the (user, item) interaction pairs. header=0 consumes the first row of
# the CSV as a header, and `names` replaces it with the configured column names.
raw_data_url = 'https://raw.githubusercontent.com/samohtwal/BPR/main/data/vali-20221026.csv'
column_names = [raw_data_user_column_name, raw_data_item_column_name]
raw_data = pd.read_csv(raw_data_url, header=0, names=column_names)

# One row per user: the list of item ids that appear with that user.
raw_data_group_by_user_id = (
    raw_data
    .groupby(raw_data_user_column_name)[raw_data_item_column_name]
    .apply(list)
    .reset_index(name=raw_data_item_column_name)
    .sort_values(by=[raw_data_user_column_name])
)

# One row per item: the list of user ids that appear with that item.
raw_data_group_by_item_id = (
    raw_data
    .groupby(raw_data_item_column_name)[raw_data_user_column_name]
    .apply(list)
    .reset_index(name=raw_data_user_column_name)
    .sort_values(by=[raw_data_item_column_name])
)

# Both grouped frames, addressable through the configured dictionary keys.
raw_data_dictionary = {}
raw_data_dictionary[raw_data_group_by_user_id_dictionary_key] = raw_data_group_by_user_id
raw_data_dictionary[raw_data_group_by_item_id_dictionary_key] = raw_data_group_by_item_id

print('debug - raw_data_group_by_user_id:\n', raw_data_dictionary[raw_data_group_by_user_id_dictionary_key])
print('debug - raw_data_group_by_item_id:\n', raw_data_dictionary[raw_data_group_by_item_id_dictionary_key])

print('debug - raw_data_group_by_user_id.iloc[0]:\n', raw_data_dictionary[raw_data_group_by_user_id_dictionary_key].iloc[0])


debug - raw_data_group_by_user_id:
       user_id                                            item_id
0           0                             [128, 319, 2352, 2688]
1           1  [109, 985, 1201, 1294, 1324, 1345, 1427, 1664,...
2           2                                             [2633]
3           3  [142, 150, 237, 252, 320, 333, 461, 477, 479, ...
4           4                                             [1779]
...       ...                                                ...
3859     3919                                       [1883, 2002]
3860     3920                                 [1805, 1845, 1907]
3861     3921                                       [1884, 2135]
3862     3922                                 [1803, 1885, 1926]
3863     3923               [1983, 2201, 2214, 2314, 2413, 2489]

[3864 rows x 2 columns]
debug - raw_data_group_by_item_id:
       item_id                                            user_id
0           0                                          [6,

In [24]:
def fit(raw_data_group_by_user_id, raw_data_group_by_item_id, log_every=1):
    """Learn user and item latent factor matrices from grouped interactions.

    Parameters
    ----------
    raw_data_group_by_user_id : DataFrame
        One row per user; the item column holds that user's list of item ids.
    raw_data_group_by_item_id : DataFrame
        One row per item; its row order defines the item matrix row order.
    log_every : int, optional
        Print the debug sum of squares every `log_every` iterations. The
        default of 1 logs every iteration; 0 or None disables logging and
        skips building the dense rating matrix altogether.

    Returns
    -------
    (user_latent_factor_matrix, item_latent_factor_matrix)
        Arrays of shape (n_user, n_latent_factor) and (n_item, n_latent_factor).

    Notes
    -----
    Reseeds numpy's global random generator as a side effect (see below).
    """
    n_user = len(raw_data_group_by_user_id[raw_data_user_column_name])
    n_item = len(raw_data_group_by_item_id[raw_data_item_column_name])

    rstate = np.random.RandomState(random_seed)

    user_latent_factor_matrix = rstate.normal(size=(n_user, n_latent_factor))
    item_latent_factor_matrix = rstate.normal(size=(n_item, n_latent_factor))

    # The batch sampler draws from numpy's *global* generator, which was never
    # seeded, so `random_seed` alone did not make training reproducible. Seed
    # the global generator with a value derived from `rstate` so the whole run
    # is deterministic without replaying the initialisation stream.
    np.random.seed(rstate.randint(0, 2**31 - 1))

    # The dense (n_user, n_item) matrix is only needed for the debug metric.
    user_item_rating_matrix = None
    if log_every:
        user_item_rating_matrix = _create_user_item_rating_matrix(raw_data_group_by_user_id, raw_data_group_by_item_id)

    # A named loop variable: `_` is IPython's "last output" name in a notebook.
    for iteration in range(n_iteration):
        sample_user_id_indexes, sample_positive_item_id_indexes, sample_negative_item_id_indexes = (
            _create_sample_lantent_factor_matrix_row_indexes(raw_data_group_by_user_id, raw_data_group_by_item_id)
        )
        user_latent_factor_matrix, item_latent_factor_matrix = _update_sample_lantent_factor_matrix_rows(
            user_latent_factor_matrix,
            item_latent_factor_matrix,
            sample_user_id_indexes,
            sample_positive_item_id_indexes,
            sample_negative_item_id_indexes,
        )

        if log_every and iteration % log_every == 0:
            # Sum of squared differences between the rating matrix and the
            # full predicted score matrix; this is a dense computation.
            sum_of_square = np.sum((user_item_rating_matrix - predict(user_latent_factor_matrix, item_latent_factor_matrix))**2)
            print('debug - sum_of_square:')
            print(iteration)
            print(sum_of_square)
            print('')

    return user_latent_factor_matrix, item_latent_factor_matrix

def _create_sample_lantent_factor_matrix_row_indexes(raw_data_group_by_user_id, raw_data_group_by_item_id, rstate=None):
    """Draw one mini-batch of (user, positive item, negative item) row indexes.

    Users are drawn without replacement. For each drawn user, one item from
    the user's own list is the positive and one item outside that list is the
    negative. Item ids are translated to their position in the item frame.

    Parameters
    ----------
    raw_data_group_by_user_id : DataFrame
        One row per user; the item column holds that user's list of item ids.
    raw_data_group_by_item_id : DataFrame
        One row per item; the item column supplies the full list of item ids.
    rstate : object with a numpy-style ``choice`` method, optional
        Random source (e.g. ``np.random.RandomState``). Defaults to numpy's
        global generator, which is what this function always used.

    Returns
    -------
    (sample_user_id_indexes, sample_positive_item_id_indexes,
     sample_negative_item_id_indexes)
        Three aligned index arrays, one entry per sampled user.

    Raises
    ------
    ValueError
        If a sampled user has every item in its list, leaving no negative.
    """
    rng = np.random if rstate is None else rstate

    n_user = len(raw_data_group_by_user_id[raw_data_user_column_name])
    item_ids = raw_data_group_by_item_id[raw_data_item_column_name].tolist()

    # Loop-invariant lookups, built once instead of once per sampled user.
    # setdefault keeps the first position of an id, matching list.index().
    item_index_by_id = {}
    for position, item_id in enumerate(item_ids):
        item_index_by_id.setdefault(item_id, position)
    all_item_ids = set(item_ids)
    user_item_lists = raw_data_group_by_user_id[raw_data_item_column_name]

    # Sampling without replacement cannot return more users than exist; clamp
    # so that a data set smaller than n_batch no longer raises.
    batch_size = min(n_batch, n_user)

    sample_user_id_indexes = rng.choice(n_user, size=batch_size, replace=False)
    sample_positive_item_id_indexes = np.zeros(batch_size, dtype=np.int64)
    sample_negative_item_id_indexes = np.zeros(batch_size, dtype=np.int64)

    for index, sampled_user_id_index in enumerate(sample_user_id_indexes):
        possible_positive_item_ids = user_item_lists.iloc[sampled_user_id_index]
        possible_negative_item_ids = list(all_item_ids.difference(possible_positive_item_ids))
        if not possible_negative_item_ids:
            raise ValueError(
                'user at row %d has every item in its list; no negative item can be sampled'
                % sampled_user_id_index
            )
        sample_positive_item_id = rng.choice(possible_positive_item_ids)
        sample_negative_item_id = rng.choice(possible_negative_item_ids)
        sample_positive_item_id_indexes[index] = item_index_by_id[sample_positive_item_id]
        sample_negative_item_id_indexes[index] = item_index_by_id[sample_negative_item_id]

    return sample_user_id_indexes, sample_positive_item_id_indexes, sample_negative_item_id_indexes

def _update_sample_lantent_factor_matrix_rows(user_latent_factor_matrix, item_latent_factor_matrix, sample_user_id_indexes, sample_positive_item_id_indexes, sample_negative_item_id_indexes):
    
    sample_user_latent_factor_matrix_tuple = user_latent_factor_matrix[sample_user_id_indexes]
    sample_item_latent_factor_matrix_positive_item_tuple = item_latent_factor_matrix[sample_positive_item_id_indexes]
    sample_item_latent_factor_matrix_negative_item_tuple = item_latent_factor_matrix[sample_negative_item_id_indexes]

    r_uij = np.sum(sample_user_latent_factor_matrix_tuple * (sample_item_latent_factor_matrix_positive_item_tuple - sample_item_latent_factor_matrix_negative_item_tuple), axis = 1)
    sigmoid = np.exp(-r_uij) / (1.0 + np.exp(-r_uij))
    sigmoid_tiled = np.tile(sigmoid, (n_latent_factor, 1)).T
        
    gradient_user = sigmoid_tiled * (sample_item_latent_factor_matrix_negative_item_tuple - sample_item_latent_factor_matrix_positive_item_tuple) + regularization_constant * sample_user_latent_factor_matrix_tuple
    gradient_positive_item = sigmoid_tiled * -sample_user_latent_factor_matrix_tuple + regularization_constant * sample_item_latent_factor_matrix_positive_item_tuple
    gradient_negative_item = sigmoid_tiled * sample_user_latent_factor_matrix_tuple + regularization_constant * sample_item_latent_factor_matrix_negative_item_tuple

    user_latent_factor_matrix[sample_user_id_indexes] -= learning_rate * gradient_user
    item_latent_factor_matrix[sample_positive_item_id_indexes] -= learning_rate * gradient_positive_item
    item_latent_factor_matrix[sample_negative_item_id_indexes] -= learning_rate * gradient_negative_item

    return user_latent_factor_matrix, item_latent_factor_matrix

def predict(user_latent_factor_matrix, item_latent_factor_matrix):
    """Return the score of every user for every item.

    The result is the user factor matrix multiplied by the transposed item
    factor matrix, i.e. one row per user and one column per item.
    """
    transposed_item_factors = item_latent_factor_matrix.T
    return user_latent_factor_matrix.dot(transposed_item_factors)

def _predict_user(user_latent_factor_matrix, item_latent_factor_matrix, user_index):
    return user_latent_factor_matrix[user_index].dot(item_latent_factor_matrix.T)

def recommend_user(raw_data_group_by_user_id, user_latent_factor_matrix, item_latent_factor_matrix, user_index, n_best_recommendation, item_ids=None):
    """Return the highest-scoring items the user has not interacted with.

    Parameters
    ----------
    raw_data_group_by_user_id : DataFrame
        One row per user; the item column holds that user's list of item ids.
    user_latent_factor_matrix, item_latent_factor_matrix : ndarray
        Latent factor matrices, one row per user / item.
    user_index : int
        Row position of the user, both in `raw_data_group_by_user_id` and in
        `user_latent_factor_matrix`.
    n_best_recommendation : int
        Maximum number of item ids to return.
    item_ids : list, optional
        Item ids in the same order as the rows of `item_latent_factor_matrix`.
        When omitted they are read from the notebook-level
        `raw_data_group_by_item_id` frame, which this function previously
        always relied on without taking it as an argument.

    Returns
    -------
    list
        Up to `n_best_recommendation` item ids, best score first. Items already
        in the user's list are excluded.
    """
    if item_ids is None:
        # Backward-compatible fallback to the notebook-level grouped item frame.
        item_ids = raw_data_group_by_item_id[raw_data_item_column_name].tolist()

    scores = _predict_user(user_latent_factor_matrix, item_latent_factor_matrix, user_index)

    # A set makes the "already seen" test constant time per candidate item.
    positive_item_ids = set(raw_data_group_by_user_id.iloc[user_index][raw_data_item_column_name])

    candidates = [
        (item_id, score)
        for item_id, score in zip(item_ids, scores)
        if item_id not in positive_item_ids
    ]
    # sorted() is stable, so equally scored items keep their catalogue order.
    ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)

    return [item_id for item_id, _score in islice(ranked, n_best_recommendation)]

def _create_user_item_rating_matrix(raw_data_group_by_user_id, raw_data_group_by_item_id):

    n_user = len(raw_data_group_by_user_id[raw_data_user_column_name])
    n_item = len(raw_data_group_by_item_id[raw_data_item_column_name])

    user_ids = raw_data_group_by_user_id[raw_data_user_column_name].tolist()
    item_ids = raw_data_group_by_item_id[raw_data_item_column_name].tolist()

    user_item_rating_matrix = np.zeros((n_user, n_item), dtype = np.int64)
    for user_id_index, user_id in enumerate(user_ids):
        item_id_indexes = np.where(np.isin(raw_data_group_by_user_id.iloc[user_id_index][raw_data_item_column_name],item_ids))[0]
        for item_id_index in item_id_indexes:
            user_item_rating_matrix[user_id_index, item_id_index] = 1

    return user_item_rating_matrix

In [10]:
# Sanity check: display the dense user-by-item matrix built from the grouped frames.
_create_user_item_rating_matrix(
    raw_data_group_by_user_id=raw_data_group_by_user_id,
    raw_data_group_by_item_id=raw_data_group_by_item_id,
)

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 1, 0, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])

In [30]:
# Train the model on the two grouped interaction frames.
user_latent_factor_matrix, item_latent_factor_matrix = fit(raw_data_dictionary.get(raw_data_group_by_user_id_dictionary_key), raw_data_dictionary.get(raw_data_group_by_item_id_dictionary_key))

# Post-training inspection, currently disabled. It is kept as real comments
# because a bare triple-quoted string is an expression: as the last statement
# of a cell it would be evaluated and echoed as the cell's output.
#
# print('debug - user_latent_factor_matrix:', end='\n')
# print(user_latent_factor_matrix, end='\n')
# print('', end='\n')
#
# print('debug - item_latent_factor_matrix:', end='\n')
# print(item_latent_factor_matrix, end='\n')
# print('', end='\n')
#
# best_recommendation = recommend_user(raw_data_group_by_user_id, user_latent_factor_matrix, item_latent_factor_matrix, 0, 5)
# print('debug - best_recommendation:', end='\n')
# print(best_recommendation, end='\n')
# print('', end='\n')

debug - sum_of_square:
0
170254705.0790106

debug - sum_of_square:
1
166024437.78492188

debug - sum_of_square:
2
162124294.96521732

debug - sum_of_square:
3
157970510.90761432

debug - sum_of_square:
4
154162436.89773452

debug - sum_of_square:
5
150586760.69310933

debug - sum_of_square:
6
147142248.83231872

debug - sum_of_square:
7
143698693.9721568

debug - sum_of_square:
8
140277639.5810112

debug - sum_of_square:
9
137005287.36246628

debug - sum_of_square:
10
133863935.03688052

debug - sum_of_square:
11
131010275.26901917

debug - sum_of_square:
12
128042849.8963918

debug - sum_of_square:
13
125221466.11133318

debug - sum_of_square:
14
122523897.7320972

debug - sum_of_square:
15
119973106.63132302

debug - sum_of_square:
16
117388934.95412365

debug - sum_of_square:
17
114671025.90297455

debug - sum_of_square:
18
112174359.82836886

debug - sum_of_square:
19
109811621.61930819

debug - sum_of_square:
20
107519401.68247268

debug - sum_of_square:
21
105349360.34779246

deb

KeyboardInterrupt: ignored