In [90]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
from time import time
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_distances

from data_loader import load_ratings, load_user_item_matrix

### User/item matrix loading

In [5]:
user_item = load_user_item_matrix()
print('User item matrix shape: ', user_item.shape)

User item matrix shape:  (138493, 26744)


### User-user collaborative filtering

In [111]:
def calculate_distances(U, UI):
    return np.array(U.dot(UI.T) / np.sqrt(np.sum(U.power(2), axis=1).reshape(-1, 1) * np.sum(UI.power(2), axis=1).reshape(1, -1)))

def predict_user_item_values(U, UI, k=5, report_progress=True):
    start_time = time()
    D = calculate_distances(U, UI)
    res = np.zeros(shape=(U.shape[0], UI.shape[1]))
    for movie_id in range(UI.shape[1]):
        movie_mask = (UI[:, movie_id] != 0).T.toarray()
        masked_distances = np.where(movie_mask, D, 0)
        topk = np.argpartition(masked_distances, -k, axis=1)[:, -k:]
        predicted_ratings = np.mean(UI[topk, movie_id].toarray(), axis=1)
        res[:, movie_id] = predicted_ratings
        if report_progress and movie_id % 1000 == 0:
            percentage = float(100) * movie_id // UI.shape[1]
            time_elapsed = time() - start_time
            total_time = float(100) * time_elapsed / percentage if percentage > 0 else float('nan')
            total_min = total_time / 60
            total_sec = total_time % 60
            time_left = total_time - time_elapsed
            min_left = time_left / 60
            sec_left = time_left % 60
            print(f'{percentage}% after {time_elapsed:.2f} s, estimated total time: {total_min:.2f} min {total_sec:.2f} sec, etl: {min_left:.2f} min {sec_left:.2f}')
    return res

In [113]:
predicted = predict_user_item_values(user_item[:10], user_item[10:])

0.0% after 0.93 s, estimated total time: nan min nan sec, etl: nan min nan
3.0% after 43.99 s, estimated total time: 24.44 min 26.39 sec, etl: 23.71 min 42.39
7.0% after 87.18 s, estimated total time: 20.76 min 45.43 sec, etl: 19.30 min 18.25
11.0% after 129.13 s, estimated total time: 19.56 min 33.87 sec, etl: 17.41 min 24.74
14.0% after 169.58 s, estimated total time: 20.19 min 11.30 sec, etl: 17.36 min 21.72
18.0% after 208.49 s, estimated total time: 19.30 min 18.26 sec, etl: 15.83 min 49.77
22.0% after 246.49 s, estimated total time: 18.67 min 40.40 sec, etl: 14.57 min 33.91
26.0% after 284.19 s, estimated total time: 18.22 min 13.03 sec, etl: 13.48 min 28.85
29.0% after 321.04 s, estimated total time: 18.45 min 27.05 sec, etl: 13.10 min 6.01
33.0% after 357.17 s, estimated total time: 18.04 min 2.34 sec, etl: 12.09 min 5.17
37.0% after 393.21 s, estimated total time: 17.71 min 42.74 sec, etl: 11.16 min 9.53
41.0% after 428.93 s, estimated total time: 17.44 min 26.18 sec, etl: 10.

4.1