In [33]:
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive/My Drive/RecSys/libs

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive/My Drive/RecSys/libs


In [34]:
%load_ext Cython

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [35]:
%matplotlib inline
import numpy as np
import scipy as sp
import scipy.sparse as sps
import matplotlib.pyplot as plt
import random
from scipy import stats
from scipy.optimize import fmin
import pandas as pd

In [57]:
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

data = pd.read_csv(filepath_or_buffer="/gdrive/My Drive/RecSys/libs/data_train.csv",
                   sep=",")
data.columns = ["UserID", "ItemID", "Interaction"]

mapped_id, original_id = pd.factorize(data["UserID"].unique())
user_original_ID_to_index = pd.Series(mapped_id, index=original_id)

mapped_id, original_id = pd.factorize(data["ItemID"].unique())
item_original_ID_to_index = pd.Series(mapped_id, index=original_id)

data["UserID"] = data["UserID"].map(user_original_ID_to_index)
data["ItemID"] = data["ItemID"].map(item_original_ID_to_index)


URM_all = sps.coo_matrix((data["Interaction"].values,
                          (data["UserID"].values, data["ItemID"].values)))

URM_all.tocsr()

#URM_train, URM_test = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.8)




In [60]:
train_test_split = 0.80

n_interactions = URM_all.nnz


train_mask = np.random.choice([True,False], n_interactions, p=[train_test_split, 1-train_test_split])

URM_train = sps.csr_matrix((URM_all.data[train_mask],
                            (URM_all.row[train_mask], URM_all.col[train_mask])))


test_mask = np.logical_not(train_mask)

URM_test = sps.csr_matrix((URM_all.data[test_mask],
                            (URM_all.row[test_mask], URM_all.col[test_mask])))

URM_train.shape, URM_test.shape

((12638, 22222), (12638, 22215))

In [58]:
n_users, n_items = URM_train.shape
n_users, n_items

(12638, 22222)

In [38]:
item_item_S = np.zeros((n_items, n_items), dtype = float)
item_item_S

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

##**Gradient descent with scipy**

In [39]:
%%cython

import numpy as np
import time

from libc.stdlib cimport rand, srand, RAND_MAX

def train_multiple_epochs(URM_train, learning_rate_input, regularization_2_input, n_epochs):

    URM_train_coo = URM_train.tocoo()
    cdef int n_items = URM_train.shape[1]
    cdef int n_interactions = URM_train.nnz
    cdef int[:] URM_train_coo_row = URM_train_coo.row
    cdef int[:] URM_train_coo_col = URM_train_coo.col
    cdef double[:] URM_train_coo_data = URM_train_coo.data
    cdef int[:] URM_train_indices = URM_train.indices
    cdef int[:] URM_train_indptr = URM_train.indptr
    cdef double[:] URM_train_data = URM_train.data

    cdef double[:,:] item_item_S = np.zeros((n_items, n_items), dtype = float)
    cdef double learning_rate = learning_rate_input
    cdef double regularization_2 = regularization_2_input
    cdef double loss = 0.0
    cdef long start_time
    cdef double true_rating, predicted_rating, prediction_error, profile_rating
    cdef int start_profile, end_profile
    cdef int index, sample_num, user_id, item_id, profile_item_id

    for n_epoch in range(n_epochs):

        loss = 0.0
        start_time = time.time()

        for sample_num in range(n_interactions):

            # Randomly pick sample
            index = rand() % n_interactions

            user_id = URM_train_coo_row[index]
            item_id = URM_train_coo_col[index]
            true_rating = URM_train_coo_data[index]

            # Compute prediction
            start_profile = URM_train_indptr[user_id]
            end_profile = URM_train_indptr[user_id+1]
            predicted_rating = 0.0

            for index in range(start_profile, end_profile):
                profile_item_id = URM_train_indices[index]
                profile_rating = URM_train_data[index]
                predicted_rating += item_item_S[profile_item_id,item_id] * profile_rating

            # Compute prediction error, or gradient
            prediction_error = true_rating - predicted_rating
            loss += prediction_error**2

            # Update model, in this case the similarity
            for index in range(start_profile, end_profile):
                profile_item_id = URM_train_indices[index]
                profile_rating = URM_train_data[index]
                item_item_S[profile_item_id,item_id] += learning_rate * (prediction_error * profile_rating -
                                                                         regularization_2 * item_item_S[profile_item_id,item_id])

            # Ensure diagonal is always zero
            item_item_S[item_id,item_id] = 0.0

#             if sample_num % 1000000 == 0:
#                 print("Epoch {}: {:.2f}%".format(n_epoch+1, sample_num/n_interactions*100))


        elapsed_time = time.time() - start_time
        samples_per_second = (sample_num+1)/elapsed_time

        print("Epoch {} complete in in {:.2f} seconds, loss is {:.3E}. Samples per second {:.2f}".format(n_epoch+1, time.time() - start_time, loss/(sample_num+1), samples_per_second))

    return np.array(item_item_S), loss/(sample_num+1), samples_per_second

In [40]:
n_items = URM_train.shape[1]
learning_rate = 1e-4
regularization_2 = 1e-3

item_item_S, loss, samples_per_second = train_multiple_epochs(URM_train, learning_rate, regularization_2, 10)

Epoch 1 complete in in 7.04 seconds, loss is 9.634E-01. Samples per second 54362.92
Epoch 2 complete in in 3.69 seconds, loss is 8.993E-01. Samples per second 103823.91
Epoch 3 complete in in 3.46 seconds, loss is 8.464E-01. Samples per second 110689.96
Epoch 4 complete in in 3.00 seconds, loss is 8.018E-01. Samples per second 127473.21
Epoch 5 complete in in 2.82 seconds, loss is 7.644E-01. Samples per second 135965.68
Epoch 6 complete in in 4.32 seconds, loss is 7.306E-01. Samples per second 88631.15
Epoch 7 complete in in 3.52 seconds, loss is 7.019E-01. Samples per second 108867.27
Epoch 8 complete in in 3.05 seconds, loss is 6.747E-01. Samples per second 125465.38
Epoch 9 complete in in 2.92 seconds, loss is 6.509E-01. Samples per second 131297.13
Epoch 10 complete in in 3.82 seconds, loss is 6.293E-01. Samples per second 100296.65


##**Prediction**

In [41]:
URM_test_coo = URM_test.tocoo()

sample_index = np.random.randint(URM_test_coo.nnz)
sample_index

48689

In [42]:
user_id = URM_test_coo.row[sample_index]
item_id = URM_test_coo.col[sample_index]
true_rating = URM_test_coo.data[sample_index]

(user_id, item_id, true_rating)

(6401, 12767, 1.0)

In [43]:
predicted_rating = URM_test[user_id].dot(item_item_S[:,item_id])[0]
predicted_rating

0.016113650657816152

In [44]:
prediction_error = true_rating - predicted_rating
prediction_error

0.9838863493421839

**Let's try with all the training set**

In [62]:
prediction_error = 0
for i in range(URM_train.shape[0]):
  user_id = URM_train_coo.row[i]
  item_id = URM_train_coo.col[i]
  true_rating = URM_train_coo.data[i]

  predicted_rating = URM_train[user_id].dot(item_item_S[:,item_id])[0]

  prediction_error = prediction_error + (true_rating - predicted_rating)**2

prediction_error

8542.01611838526