<a href="https://colab.research.google.com/github/samohtwal/BPR/blob/20221027/BPR_20221027.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [36]:
import numpy as np
import pandas as pd
import torch
import copy
from itertools import islice

# Notebook-wide configuration.
# NOTE: the former `global ...` statements were removed. At module (cell) level
# every assignment already creates a global name, so they had no effect.

# Column names given to the raw interaction CSV when it is loaded.
raw_data_user_column_name = 'user_id'
raw_data_item_column_name = 'item_id'
# Keys under which the two grouped views are stored in `raw_data_dictionary`.
raw_data_group_by_user_id_dictionary_key = 'raw_data_group_by_user_id'
raw_data_group_by_item_id_dictionary_key = 'raw_data_group_by_item_id'

# Seed used when the latent factor matrices are initialised in `train`.
random_seed = 1234

# Number of columns of the user / item latent factor matrices.
n_latent_factor = 15
# Number of users sampled (without replacement) per training iteration.
n_batch = 10
# Multiplier of the parameter term added to every gradient.
regularization_constant = 0.01
# Step size applied to the gradients in the update step.
learning_rate = 0.01
# Number of sample-and-update rounds performed by `train`.
n_iteration = 10

In [37]:
# Load the (user, item) interaction pairs. `header=0` combined with `names=`
# replaces the header row of the CSV with the configured column names.
raw_data_url = 'https://raw.githubusercontent.com/samohtwal/BPR/main/data/vali-20221026.csv'
column_names = [raw_data_user_column_name, raw_data_item_column_name]
raw_data = pd.read_csv(raw_data_url, header=0, names=column_names)

# One row per user: the list of item ids that appear together with that user.
raw_data_group_by_user_id = (
    raw_data
    .groupby(raw_data_user_column_name)[raw_data_item_column_name]
    .apply(list)
    .reset_index(name=raw_data_item_column_name)
    .sort_values(by=[raw_data_user_column_name])
)

# One row per item: the list of user ids that appear together with that item.
raw_data_group_by_item_id = (
    raw_data
    .groupby(raw_data_item_column_name)[raw_data_user_column_name]
    .apply(list)
    .reset_index(name=raw_data_user_column_name)
    .sort_values(by=[raw_data_item_column_name])
)

# Keep both grouped views reachable through their configured keys.
raw_data_dictionary = {}
raw_data_dictionary[raw_data_group_by_user_id_dictionary_key] = raw_data_group_by_user_id
raw_data_dictionary[raw_data_group_by_item_id_dictionary_key] = raw_data_group_by_item_id

print('debug - raw_data_group_by_user_id:\n', raw_data_dictionary[raw_data_group_by_user_id_dictionary_key])
print('debug - raw_data_group_by_item_id:\n', raw_data_dictionary[raw_data_group_by_item_id_dictionary_key])

print('debug - raw_data_group_by_user_id.iloc[0]:\n', raw_data_dictionary[raw_data_group_by_user_id_dictionary_key].iloc[0])


debug - raw_data_group_by_user_id:
       user_id                                            item_id
0           0                             [128, 319, 2352, 2688]
1           1  [109, 985, 1201, 1294, 1324, 1345, 1427, 1664,...
2           2                                             [2633]
3           3  [142, 150, 237, 252, 320, 333, 461, 477, 479, ...
4           4                                             [1779]
...       ...                                                ...
3859     3919                                       [1883, 2002]
3860     3920                                 [1805, 1845, 1907]
3861     3921                                       [1884, 2135]
3862     3922                                 [1803, 1885, 1926]
3863     3923               [1983, 2201, 2214, 2314, 2413, 2489]

[3864 rows x 2 columns]
debug - raw_data_group_by_item_id:
       item_id                                            user_id
0           0                                          [6,

In [42]:
def train(raw_data_group_by_user_id, raw_data_group_by_item_id):
    """Fit user and item latent factor matrices by repeated sample-and-update rounds.

    Parameters
    ----------
    raw_data_group_by_user_id : pandas.DataFrame
        One row per user; its length determines the number of user rows.
    raw_data_group_by_item_id : pandas.DataFrame
        One row per item; its length determines the number of item rows.

    Returns
    -------
    tuple of numpy.ndarray
        ``(user_latent_factor_matrix, item_latent_factor_matrix)`` with shapes
        ``(n_user, n_latent_factor)`` and ``(n_item, n_latent_factor)``.

    Notes
    -----
    Reads the notebook-level settings ``random_seed``, ``n_latent_factor`` and
    ``n_iteration``. The call re-seeds NumPy's *global* random generator so that
    both the initial matrices and the draws made by the sampling helper (which
    uses ``np.random``) are reproducible from ``random_seed``.
    """
    n_user = len(raw_data_group_by_user_id[raw_data_user_column_name])
    n_item = len(raw_data_group_by_item_id[raw_data_item_column_name])

    # Previously a private RandomState seeded only the initial matrices while the
    # per-iteration sampling drew from the unseeded global generator, so two runs
    # never matched. Seeding the global generator yields the same initial values
    # as RandomState(random_seed) and makes the sampling deterministic as well.
    np.random.seed(random_seed)

    user_latent_factor_matrix = np.random.normal(size=(n_user, n_latent_factor))
    item_latent_factor_matrix = np.random.normal(size=(n_item, n_latent_factor))

    for _ in range(n_iteration):
        sample_user_id_indexes, sample_positive_item_id_indexes, sample_negative_item_id_indexes = (
            _create_sample_lantent_factor_matrix_row_indexes(raw_data_group_by_user_id, raw_data_group_by_item_id)
        )
        user_latent_factor_matrix, item_latent_factor_matrix = _update_sample_lantent_factor_matrix_rows(
            user_latent_factor_matrix,
            item_latent_factor_matrix,
            sample_user_id_indexes,
            sample_positive_item_id_indexes,
            sample_negative_item_id_indexes,
        )

    return user_latent_factor_matrix, item_latent_factor_matrix

def _create_sample_lantent_factor_matrix_row_indexes(raw_data_group_by_user_id, raw_data_group_by_item_id):

    n_user = len(raw_data_group_by_user_id[raw_data_user_column_name])
    n_item = len(raw_data_group_by_item_id[raw_data_item_column_name])

    item_ids = raw_data_group_by_item_id[raw_data_item_column_name].tolist()

    '''
    print('debug - item_ids:', end='\n')
    print(item_ids, end='\n')
    print('', end='\n')
    '''

    sample_user_id_indexes = np.random.choice(n_user, size = n_batch, replace = False)
    sample_positive_item_id_indexes = np.zeros(n_batch, dtype = np.int64)
    sample_negative_item_id_indexes = np.zeros(n_batch, dtype = np.int64)

    '''
    print('debug - raw_data_group_by_user_id.iloc[0]:', end='\n')
    print(raw_data_group_by_user_id.iloc[0], end='\n')
    print('', end='\n')

    print('debug - raw_data_group_by_user_id.iloc[0][raw_data_item_column_name]:', end='\n')
    print(raw_data_group_by_user_id.iloc[0][raw_data_item_column_name], end='\n')
    print('', end='\n')
    '''

    for index, sampled_user_index in enumerate(sample_user_id_indexes):
        possible_positive_item_ids = raw_data_group_by_user_id.iloc[sampled_user_index][raw_data_item_column_name]
        possible_negative_item_ids = list(set(item_ids).difference(possible_positive_item_ids))
        sample_positive_item_id = np.random.choice(possible_positive_item_ids)
        sample_negative_item_id = np.random.choice(possible_negative_item_ids)
        sample_positive_item_id_indexes[index] = item_ids.index(sample_positive_item_id)
        sample_negative_item_id_indexes[index] = item_ids.index(sample_negative_item_id)
        '''
        print('debug - sample_positive_item_id:', end='\n')
        print(sample_positive_item_id, end='\n')
        print('', end='\n')
        print('debug - item_ids.index(sample_positive_item_id):', end='\n')
        print(item_ids.index(sample_positive_item_id), end='\n')
        print('', end='\n')
        '''

    return sample_user_id_indexes, sample_positive_item_id_indexes, sample_negative_item_id_indexes

def _update_sample_lantent_factor_matrix_rows(user_latent_factor_matrix, item_latent_factor_matrix, sample_user_id_indexes, sample_positive_item_id_indexes, sample_negative_item_id_indexes):
    
    sample_user_latent_factor_matrix_tuple = user_latent_factor_matrix[sample_user_id_indexes]
    sample_item_latent_factor_matrix_positive_item_tuple = item_latent_factor_matrix[sample_positive_item_id_indexes]
    sample_item_latent_factor_matrix_negative_item_tuple = item_latent_factor_matrix[sample_negative_item_id_indexes]

    r_uij = np.sum(sample_user_latent_factor_matrix_tuple * (sample_item_latent_factor_matrix_positive_item_tuple - sample_item_latent_factor_matrix_negative_item_tuple), axis = 1)
    sigmoid = np.exp(-r_uij) / (1.0 + np.exp(-r_uij))
    sigmoid_tiled = np.tile(sigmoid, (n_latent_factor, 1)).T
        
    gradient_user = sigmoid_tiled * (sample_item_latent_factor_matrix_negative_item_tuple - sample_item_latent_factor_matrix_positive_item_tuple) + regularization_constant * sample_user_latent_factor_matrix_tuple
    gradient_positive_item = sigmoid_tiled * -sample_user_latent_factor_matrix_tuple + regularization_constant * sample_item_latent_factor_matrix_positive_item_tuple
    gradient_negative_item = sigmoid_tiled * sample_user_latent_factor_matrix_tuple + regularization_constant * sample_item_latent_factor_matrix_negative_item_tuple

    user_latent_factor_matrix[sample_user_id_indexes] -= learning_rate * gradient_user
    item_latent_factor_matrix[sample_positive_item_id_indexes] -= learning_rate * gradient_positive_item
    item_latent_factor_matrix[sample_negative_item_id_indexes] -= learning_rate * gradient_negative_item

    return user_latent_factor_matrix, item_latent_factor_matrix

def predict(user_latent_factor_matrix, item_latent_factor_matrix):
    """Score every user against every item.

    Returns the product of the user matrix with the transposed item matrix,
    i.e. one row per user row and one column per item row.
    """
    item_factors_by_column = item_latent_factor_matrix.T
    all_scores = user_latent_factor_matrix.dot(item_factors_by_column)
    return all_scores

def _predict_user(user_latent_factor_matrix, item_latent_factor_matrix, user_index):
    return user_latent_factor_matrix[user_index].dot(item_latent_factor_matrix.T)

def recommend_user(user_item_rating_matrix, item_latent_factor_matrix, user_index, n_best_recommendation):
    """Return the highest-scoring item ids the user has not interacted with yet.

    Parameters
    ----------
    user_item_rating_matrix : numpy.ndarray
        Despite its name this is used as the user latent factor matrix: it is
        passed to ``_predict_user`` together with ``item_latent_factor_matrix``.
    item_latent_factor_matrix : numpy.ndarray
        One row of latent factors per item, in the row order of the
        notebook-level ``raw_data_group_by_item_id``.
    user_index : int
        Row position of the user, both in the user matrix and in the
        notebook-level ``raw_data_group_by_user_id``.
    n_best_recommendation : int
        Maximum number of item ids to return.

    Returns
    -------
    list
        Item ids ordered by descending score, excluding the items listed for the
        user in ``raw_data_group_by_user_id``.

    Notes
    -----
    Reads the notebook-level ``raw_data_group_by_user_id`` and
    ``raw_data_group_by_item_id`` frames rather than taking them as arguments.
    """
    item_ids = raw_data_group_by_item_id[raw_data_item_column_name].tolist()

    # `_predict_user` needs both matrices as well as the index; the previous
    # one-argument call raised TypeError on every invocation.
    scores = _predict_user(user_item_rating_matrix, item_latent_factor_matrix, user_index)

    # A set makes the per-item exclusion check constant time.
    positive_item_ids = set(raw_data_group_by_user_id.iloc[user_index][raw_data_item_column_name])

    candidate_scores = [
        (item_id, score)
        for item_id, score in zip(item_ids, scores)
        if item_id not in positive_item_ids
    ]
    # Stable sort: items with equal scores keep their original relative order.
    candidate_scores.sort(key=lambda pair: pair[1], reverse=True)

    return [item_id for item_id, _ in candidate_scores[:n_best_recommendation]]

In [41]:
# Fit both latent factor matrices on the grouped views stored in
# `raw_data_dictionary`, then print them for inspection.
user_latent_factor_matrix, item_latent_factor_matrix = train(
    raw_data_dictionary[raw_data_group_by_user_id_dictionary_key],
    raw_data_dictionary[raw_data_group_by_item_id_dictionary_key],
)

print('debug - user_latent_factor_matrix:')
print(user_latent_factor_matrix)
print()

print('debug - item_latent_factor_matrix:')
print(item_latent_factor_matrix)
print()

debug - user_latent_factor_matrix:
[[ 0.47143516 -1.19097569  1.43270697 ...  0.95332413 -2.02125482
  -0.33407737]
 [ 0.00211836  0.40545341  0.28909194 ... -1.81702723 -0.18310854
   1.05896919]
 [-0.39784023  0.33743765  1.04757857 ...  0.03614194 -2.0749776
   0.2477922 ]
 ...
 [ 1.14072459  0.24480566 -1.82190179 ... -0.58508961  0.75557621
   0.46752227]
 [-0.8545777   1.35200613  0.78975155 ... -0.02283265 -1.69484832
  -0.36669186]
 [-0.26712078 -0.5349804  -1.22014469 ...  0.14257988 -0.05139749
   0.47218592]]

debug - item_latent_factor_matrix:
[[-0.61184541 -0.34683366 -1.03568407 ... -0.99889766  0.94556836
   1.85714986]
 [ 2.02333715 -1.38537805 -0.90726791 ... -1.14099655  1.29412492
  -2.83481026]
 [ 1.23193918 -0.6037652  -0.360057   ... -1.52522888 -2.87911973
  -0.17196555]
 ...
 [-0.10523079 -0.61247412 -0.80403136 ...  1.06027036  1.86682769
   0.78543836]
 [-0.55436149 -0.16218603  0.39552328 ...  0.19400603  0.27679674
   0.23326004]
 [ 1.61294651  1.54093746 -1