In [21]:
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive/My Drive/RecSys/libs

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive/My Drive/RecSys/libs


In [22]:
%load_ext Cython

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [23]:
%matplotlib inline
import numpy as np
import scipy as sp
import scipy.sparse as sps
import matplotlib.pyplot as plt
import random
from scipy import stats
from scipy.optimize import fmin
import pandas as pd

In [24]:
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

data = pd.read_csv(filepath_or_buffer="/gdrive/My Drive/RecSys/libs/data_train.csv",
                   sep=",")
data.columns = ["UserID", "ItemID", "Interaction"]

mapped_id, original_id = pd.factorize(data["UserID"].unique())
user_original_ID_to_index = pd.Series(mapped_id, index=original_id)

mapped_id, original_id = pd.factorize(data["ItemID"].unique())
item_original_ID_to_index = pd.Series(mapped_id, index=original_id)

data["UserID"] = data["UserID"].map(user_original_ID_to_index)
data["ItemID"] = data["ItemID"].map(item_original_ID_to_index)


URM_all = sps.coo_matrix((data["Interaction"].values,
                          (data["UserID"].values, data["ItemID"].values)))

URM_all.tocsr()

#URM_train, URM_test = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.8)


<12638x22222 sparse matrix of type '<class 'numpy.float64'>'
	with 478730 stored elements in Compressed Sparse Row format>

In [25]:
#from sklearn.model_selection import train_test_split

#URM_train, URM_test = train_test_split(URM_all, random_state=42, test_size=0.1)

In [26]:
#URM_train.shape, URM_test.shape
URM_train = URM_all.tocsr()

In [27]:
n_users, n_items = URM_train.shape
n_users, n_items

(12638, 22222)

In [28]:
%%cython

import numpy as np
import time

from libc.stdlib cimport rand, srand, RAND_MAX

def train_multiple_epochs(URM_train, learning_rate_input, regularization_2_input, n_epochs):

    URM_train_coo = URM_train.tocoo()
    cdef int n_items = URM_train.shape[1]
    cdef int n_interactions = URM_train.nnz
    cdef int[:] URM_train_coo_row = URM_train_coo.row
    cdef int[:] URM_train_coo_col = URM_train_coo.col
    cdef double[:] URM_train_coo_data = URM_train_coo.data
    cdef int[:] URM_train_indices = URM_train.indices
    cdef int[:] URM_train_indptr = URM_train.indptr
    cdef double[:] URM_train_data = URM_train.data

    cdef double[:,:] item_item_S = np.zeros((n_items, n_items), dtype = float)
    cdef double learning_rate = learning_rate_input
    cdef double regularization_2 = regularization_2_input
    cdef double loss = 0.0
    cdef long start_time
    cdef double true_rating, predicted_rating, prediction_error, profile_rating
    cdef int start_profile, end_profile
    cdef int index, sample_num, user_id, item_id, profile_item_id

    for n_epoch in range(n_epochs):

        loss = 0.0
        start_time = time.time()

        for sample_num in range(n_interactions):

            # Randomly pick sample
            index = rand() % n_interactions

            user_id = URM_train_coo_row[index]
            item_id = URM_train_coo_col[index]
            true_rating = URM_train_coo_data[index]

            # Compute prediction
            start_profile = URM_train_indptr[user_id]
            end_profile = URM_train_indptr[user_id+1]
            predicted_rating = 0.0

            for index in range(start_profile, end_profile):
                profile_item_id = URM_train_indices[index]
                profile_rating = URM_train_data[index]
                predicted_rating += item_item_S[profile_item_id,item_id] * profile_rating

            # Compute prediction error, or gradient
            prediction_error = true_rating - predicted_rating
            loss += prediction_error**2

            # Update model, in this case the similarity
            for index in range(start_profile, end_profile):
                profile_item_id = URM_train_indices[index]
                profile_rating = URM_train_data[index]
                item_item_S[profile_item_id,item_id] += learning_rate * (prediction_error * profile_rating -
                                                                         regularization_2 * item_item_S[profile_item_id,item_id])

            # Ensure diagonal is always zero
            item_item_S[item_id,item_id] = 0.0

#             if sample_num % 1000000 == 0:
#                 print("Epoch {}: {:.2f}%".format(n_epoch+1, sample_num/n_interactions*100))


        elapsed_time = time.time() - start_time
        samples_per_second = (sample_num+1)/elapsed_time

        print("Epoch {} complete in in {:.2f} seconds, loss is {:.3E}. Samples per second {:.2f}".format(n_epoch+1, time.time() - start_time, loss/(sample_num+1), samples_per_second))

    return np.array(item_item_S), loss/(sample_num+1), samples_per_second


In [29]:
n_items = URM_train.shape[1]
learning_rate = 1e-5
regularization_2 = 1e-3

item_item_S, loss, samples_per_second = train_multiple_epochs(URM_train, learning_rate, regularization_2, 50)

Epoch 1 complete in in 14.82 seconds, loss is 9.931E-01. Samples per second 32307.11
Epoch 2 complete in in 4.84 seconds, loss is 9.795E-01. Samples per second 98977.20
Epoch 3 complete in in 6.41 seconds, loss is 9.665E-01. Samples per second 74685.45
Epoch 4 complete in in 4.58 seconds, loss is 9.540E-01. Samples per second 104576.93
Epoch 5 complete in in 4.35 seconds, loss is 9.422E-01. Samples per second 110125.46
Epoch 6 complete in in 5.08 seconds, loss is 9.308E-01. Samples per second 94246.59
Epoch 7 complete in in 4.47 seconds, loss is 9.203E-01. Samples per second 107214.53
Epoch 8 complete in in 4.25 seconds, loss is 9.094E-01. Samples per second 112626.01
Epoch 9 complete in in 4.36 seconds, loss is 8.991E-01. Samples per second 109718.92
Epoch 10 complete in in 4.74 seconds, loss is 8.894E-01. Samples per second 100936.64
Epoch 11 complete in in 4.47 seconds, loss is 8.801E-01. Samples per second 106979.93
Epoch 12 complete in in 4.24 seconds, loss is 8.703E-01. Samples p

In [54]:
item_item_S

array([[0.        , 0.05922065, 0.03894701, ..., 0.        , 0.        ,
        0.        ],
       [0.05000935, 0.        , 0.03017013, ..., 0.        , 0.00051859,
        0.        ],
       [0.02945732, 0.02668081, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.00032776, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [30]:
URM_train_coo = URM_train.tocoo()

sample_index = np.random.randint(URM_train_coo.nnz)

user_id = URM_train_coo.row[sample_index]
item_id = URM_train_coo.col[sample_index]
true_rating = URM_train_coo.data[sample_index]

(user_id, item_id, true_rating)

predicted_rating = URM_train[user_id].dot(item_item_S[:,item_id])[0]

result = true_rating - predicted_rating
result

0.9303716355274384

In [44]:
def recommend(URM, S,  user_id, at=10, exclude_seen=True):
        # compute the scores using the dot product
        user_profile = URM[user_id]
        scores = user_profile.dot(S)

        #if exclude_seen:
        #    scores = self.filter_seen(user_id, scores)

        # rank items
        ranking = scores.argsort()[::-1]

        return ranking[:at]

In [61]:
recommender = recommend(URM_train, item_item_S,9100,10)
recommender

array([[15548, 21079,  8347, ...,   294,  1166,  1169]])

##****SUBMIT****

In [33]:
users_to_recommend_raw = pd.read_csv(filepath_or_buffer="/gdrive/My Drive/RecSys/libs/data_target_users_test.csv",
                   sep=",",
                  dtype={"user_id": np.int32}).to_numpy()
users_to_recommend = []
for elem in users_to_recommend_raw:
  users_to_recommend.append(elem[0])

len(users_to_recommend)

10882

In [34]:
ratings = pd.read_csv("/gdrive/My Drive/RecSys/libs/data_train2.csv",
                       sep=",",
                       names=["user_id", "item_id", "ratings"],
                       header=None,
                       dtype={"user_id": np.int64,
                               "item_id": np.int64,
                               "ratings": np.int64})
ratings.shape

(478730, 3)

In [35]:
def preprocess_data(ratings: pd.DataFrame):
    unique_users = ratings.user_id.unique()
    unique_items = ratings.item_id.unique()

    num_users, min_user_id, max_user_id = unique_users.size, unique_users.min(), unique_users.max()
    num_items, min_item_id, max_item_id = unique_items.size, unique_items.min(), unique_items.max()

    print(num_users, min_user_id, max_user_id)
    print(num_items, min_item_id, max_item_id)

    mapping_user_id = pd.DataFrame({"mapped_user_id": np.arange(num_users), "user_id": unique_users})
    mapping_item_id = pd.DataFrame({"mapped_item_id": np.arange(num_items), "item_id": unique_items})

    ratings = pd.merge(left=ratings,
                       right=mapping_user_id,
                       how="inner",
                       on="user_id")

    ratings = pd.merge(left=ratings,
                       right=mapping_item_id,
                       how="inner",
                       on="item_id")

    return ratings


In [36]:
ratings = preprocess_data(ratings)

12638 1 13024
22222 1 22347


In [None]:
users_ids_and_mappings = ratings[ratings.user_id.isin(users_to_recommend)][["user_id", "mapped_user_id"]].drop_duplicates()
items_ids_and_mappings = ratings[["item_id", "mapped_item_id"]].drop_duplicates()
users_ids_and_mappings

In [58]:
def prepare_submission(ratings: pd.DataFrame, users_to_recommend: np.array, urm_train: sps.csr_matrix, recommender: object):
    users_ids_and_mappings = ratings[ratings.user_id.isin(users_to_recommend)][["user_id", "mapped_user_id"]].drop_duplicates()
    items_ids_and_mappings = ratings[["item_id", "mapped_item_id"]].drop_duplicates()

    mapping_to_item_id = dict(zip(ratings.mapped_item_id, ratings.item_id))


    recommendation_length = 10
    submission = []
    for idx, row in users_ids_and_mappings.iterrows():
        user_id = row.user_id
        mapped_user_id = row.mapped_user_id
        recommendations = []
        if (mapped_user_id in ratings):
          recommendations = recommend(ratings, mapped_user_id,recommendation_length)
        else:
          recommendations = [ 517,  189,   44,    0,  284,  808,  285,  557,    1, 1266]
        submission.append((user_id, [mapping_to_item_id[item_id] for item_id in recommendations]))


    return submission

In [59]:
recommender = recommend(URM_train, item_item_S,user_id,10)
submission = prepare_submission(ratings, users_to_recommend, URM_train, recommender)
len(submission)

10661

In [60]:
submission

[(1, [2, 4, 1, 7, 3, 6, 8, 9, 15, 20]),
 (2, [2, 4, 1, 7, 3, 6, 8, 9, 15, 20]),
 (26, [2, 4, 1, 7, 3, 6, 8, 9, 15, 20]),
 (36, [2, 4, 1, 7, 3, 6, 8, 9, 15, 20]),
 (41, [2, 4, 1, 7, 3, 6, 8, 9, 15, 20]),
 (47, [2, 4, 1, 7, 3, 6, 8, 9, 15, 20]),
 (54, [2, 4, 1, 7, 3, 6, 8, 9, 15, 20]),
 (73, [2, 4, 1, 7, 3, 6, 8, 9, 15, 20]),
 (88, [2, 4, 1, 7, 3, 6, 8, 9, 15, 20]),
 (89, [2, 4, 1, 7, 3, 6, 8, 9, 15, 20]),
 (95, [2, 4, 1, 7, 3, 6, 8, 9, 15, 20]),
 (100, [2, 4, 1, 7, 3, 6, 8, 9, 15, 20]),
 (101, [2, 4, 1, 7, 3, 6, 8, 9, 15, 20]),
 (102, [2, 4, 1, 7, 3, 6, 8, 9, 15, 20]),
 (104, [2, 4, 1, 7, 3, 6, 8, 9, 15, 20]),
 (114, [2, 4, 1, 7, 3, 6, 8, 9, 15, 20]),
 (124, [2, 4, 1, 7, 3, 6, 8, 9, 15, 20]),
 (127, [2, 4, 1, 7, 3, 6, 8, 9, 15, 20]),
 (138, [2, 4, 1, 7, 3, 6, 8, 9, 15, 20]),
 (145, [2, 4, 1, 7, 3, 6, 8, 9, 15, 20]),
 (152, [2, 4, 1, 7, 3, 6, 8, 9, 15, 20]),
 (178, [2, 4, 1, 7, 3, 6, 8, 9, 15, 20]),
 (189, [2, 4, 1, 7, 3, 6, 8, 9, 15, 20]),
 (202, [2, 4, 1, 7, 3, 6, 8, 9, 15, 20]),
 (208

In [None]:
missing_users = []
for elem in users_to_recommend:
  miss = True
  for sub in submission:
    if elem == sub[0]:
      miss = False
  if miss:
    missing_users.append(elem)
len(missing_users)
missing_users