In [4]:
import dataclasses
import numpy as np
import os
import sys
import shelve
import psutil
import time

from scipy.sparse import csc_array, vstack, coo_array

# Append parent dir to system path.
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)


import rec_sys.data_util as cfd
import rec_sys.cf_algorithms_to_complete as cfa

# MMD 2024, Problem Sheet 1: Collaborative Filtering

Group: Daniela Fichiu, Aaron Maekel, Manuel Senger


## Exercise 1

Please see `cf_algorithms_to_complete.py`.

## Exercise 2

Please see `cf_algorithms_to_complete.py`, `centered_cosine_sim` and `centered_fast_cosine_sim`.

Below are the tests for the `centered_cosine_sim` function. For the fast variant, we did not implement any unit tests since we test the function in Exercise 3 by comparing the results of the sparse implementation with the results of the dense implementation.

In [5]:
def test_centered_cosine_sim_neg_corr(k: int = 100, tol: float = 1e-6):
    x = np.array([i + 1 for i in range(k)])
    y = x[::-1]

    res = cfa.centered_cosine_sim(
        coo_array(x),
        coo_array(y),
    )

    expected_res = -1
    assert np.abs(res - expected_res) <= tol


def test_centered_cosine_sim_nan(k: int = 100, tol: float = 1e-6):
    x = np.array([i + 1 for i in range(k)], dtype=float)
    y = x[::-1].copy()
    
    for c in [2, 3, 4, 5, 6]:
        for shift in range(0, 100, 10):
            # In scipy.sparse, the missing values have to be 0.
            # This does not affect the functionality of the code,
            # since the algorithm presented in the lecture
            # replaces the nan's with 0's after centering and
            # the sparse vectors only use the non-zero entries
            # for opperations.
            x[c + shift] = 0

    res = cfa.centered_cosine_sim(
        coo_array(x),
        coo_array(y),
    )

    expected_res = -0.9998540
    assert np.abs(res - expected_res) <= tol

test_centered_cosine_sim_neg_corr()
test_centered_cosine_sim_nan()

## Exercise 3

In [6]:

@dataclasses.dataclass
class config:
    max_rows: int = int(1e5)
    dowload_url: str = "https://files.grouplens.org/datasets/movielens/ml-25m.zip"
    download_dir: str = "data/"
    unzipped_dir: str = download_dir + "ml-25m/"
    file_path: str = download_dir + "ml-25m/ratings.csv"


In [7]:
# Load the MovieLens and Lecture datasets
um_movielens = cfd.get_um_by_name(config, "movielens")
um_lecture = cfd.get_um_by_name(config, "lecture_1")

Dir 'data/ml-25m/' already exists, skipping download

### Start reading data from 'data/ml-25m/ratings.csv'
Loaded data from 'data/ml-25m/ratings.csv', df shape: (100000, 3), size in MB: 1.1444091796875 
Pivoting the data
Utility matrix, df shape: (9786, 757), size in MB: 29.142929077148438 
Final utility matrix (numpy array as np.float32), df shape: (9786, 757), size in MB: 28.25928497314453 


In [8]:
### Dense implementation ###

# Rate all items for the lecture toy dataset
all_ratings = cfa.rate_all_items_old(um_lecture, 4, 2)
print ("all_ratings lecture toy dataset:", all_ratings)

# Rate all items the MovieLens data
all_ratings_movielens = cfa.rate_all_items_old(um_movielens, 0, 2)
print("all_ratings_movielens:", all_ratings_movielens)


>>> CF computation for UM w/ shape: (6, 6), user_index: 4, neighborhood_size: 2

item_idx: 0, neighbors: [5 2], rating: 2.4
item_idx: 1, neighbors: [2 3], rating: 4.8
all_ratings lecture toy dataset: [2.4, 4.8, 2.0, 5.0, 4.0, 3.0]

>>> CF computation for UM w/ shape: (9786, 757), user_index: 0, neighborhood_size: 2

item_idx: 70, neighbors: [645 420], rating: 3.4000000953674316
item_idx: 71, neighbors: [439 755], rating: 2.799999952316284
item_idx: 72, neighbors: [ 37 678], rating: 3.4000000953674316
item_idx: 73, neighbors: [201 355], rating: 4.099999904632568
item_idx: 74, neighbors: [612 216], rating: 3.299999952316284
item_idx: 75, neighbors: [105  11], rating: 3.9000000953674316
item_idx: 76, neighbors: [ 37 420], rating: 5.099999904632568
item_idx: 77, neighbors: [ 11 565], rating: 4.699999809265137
item_idx: 78, neighbors: [723 565], rating: 3.799999952316284
item_idx: 79, neighbors: [551], rating: 4.599999904632568
item_idx: 80, neighbors: [565 420], rating: 2.299999952316284


  um_normalized = utility_matrix / norms


item_idx: 511, neighbors: [ 37 678], rating: 3.9000000953674316
item_idx: 512, neighbors: [755  37], rating: 3.5999999046325684
item_idx: 513, neighbors: [476 713], rating: 4.099999904632568
item_idx: 514, neighbors: [283   2], rating: 2.0999999046325684
item_idx: 515, neighbors: [113 755], rating: 3.799999952316284
item_idx: 516, neighbors: [755 545], rating: 4.5
item_idx: 517, neighbors: [755  37], rating: 3.799999952316284
item_idx: 518, neighbors: [755  37], rating: 4.099999904632568
item_idx: 519, neighbors: [439 755], rating: 3.799999952316284
item_idx: 520, neighbors: [  2 317], rating: 3.0999999046325684
item_idx: 521, neighbors: [439  37], rating: 3.4000000953674316
item_idx: 522, neighbors: [  2 317], rating: 3.4000000953674316
item_idx: 523, neighbors: [713 317], rating: 3.200000047683716
item_idx: 524, neighbors: [755  37], rating: 4.300000190734863
item_idx: 525, neighbors: [  2 113], rating: 3.5
item_idx: 526, neighbors: [300 545], rating: 4.0
item_idx: 527, neighbors: [5

  np.sum(similarities[best_among_who_rated] * clean_utility_matrix[item_index, best_among_who_rated])


item_idx: 3927, neighbors: [540 108], rating: 3.700000047683716
item_idx: 3928, neighbors: [540 108], rating: 3.4000000953674316
item_idx: 3929, neighbors: [607 755], rating: 1.899999976158142
item_idx: 3930, neighbors: [540 108], rating: 3.700000047683716
item_idx: 3931, neighbors: [439 755], rating: 3.299999952316284
item_idx: 3932, neighbors: [108 755], rating: 1.2999999523162842
item_idx: 3933, neighbors: [378 108], rating: 5.099999904632568
item_idx: 3934, neighbors: [108 439], rating: 3.299999952316284
item_idx: 3935, neighbors: [540 108], rating: 3.4000000953674316
item_idx: 3936, neighbors: [540 108], rating: 4.199999809265137
item_idx: 3937, neighbors: [184 607], rating: 4.5
item_idx: 3938, neighbors: [540 108], rating: 4.199999809265137
item_idx: 3939, neighbors: [108], rating: 2.799999952316284
item_idx: 3940, neighbors: [315 108], rating: 2.9000000953674316
item_idx: 3941, neighbors: [540 108], rating: 3.700000047683716
item_idx: 3942, neighbors: [540 108], rating: 4.199999

In [17]:
### Sparse implementation ### 

# We start by operating on the columns.
um_lecture = csc_array(np.nan_to_num(um_lecture))
print(f'Utility matrix size: {um_lecture.data.nbytes} Bytes')
# Rate all items for the lecture toy dataset
all_ratings = cfa.rate_all_items(um_lecture, 4, 2)
print ("all_ratings lecture toy dataset:", all_ratings)

um_movielens = csc_array(np.nan_to_num(um_movielens))
print(f'Utility matrix size: {um_movielens.data.nbytes / 10 ** 6} MB')
# Rate all items the MovieLens data
all_ratings_movielens = cfa.rate_all_items(um_movielens, 0, 2)
print("all_ratings_movielens:", all_ratings_movielens)

Utility matrix size: 152 Bytes
item_idx: 0, neighbors: [5 2], rating: 2.4
item_idx: 1, neighbors: [2 3], rating: 4.8
all_ratings lecture toy dataset: [2.4, 4.8, 2.0, 5.0, 4.0, 3.0]
Utility matrix size: 0.4 MB
item_idx: 70, neighbors: [645 420], rating: 3.4
item_idx: 71, neighbors: [439 755], rating: 2.8
item_idx: 72, neighbors: [ 37 678], rating: 3.4
item_idx: 73, neighbors: [201 355], rating: 4.1
item_idx: 74, neighbors: [612 216], rating: 3.3
item_idx: 75, neighbors: [105  11], rating: 3.9
item_idx: 76, neighbors: [ 37 420], rating: 5.1
item_idx: 77, neighbors: [ 11 565], rating: 4.7
item_idx: 78, neighbors: [723 565], rating: 3.8
item_idx: 79, neighbors: [551], rating: 4.6
item_idx: 80, neighbors: [565 420], rating: 2.3
item_idx: 81, neighbors: [456 355], rating: 2.9
item_idx: 82, neighbors: [645 420], rating: 4.9
item_idx: 83, neighbors: [755 645], rating: 3.9
item_idx: 84, neighbors: [355 420], rating: 2.9
item_idx: 85, neighbors: [355], rating: 4.5
item_idx: 86, neighbors: [645 4

**Results:** As we can see, the sparse implementation produces the same results as the dense one. Moreover, the size of the sparse matrices (only the memeory allocated for the elements is considered!) has been drastically reduced.

## Exercise 4

Please see file `generate_data_structs.py`. We did not upload the generated data files due to memory constrains.

## Exercise 5

In [10]:
def estimate_rating(user_id, item_id, user_col, rated_by):
    if str(item_id) not in rated_by:
        print("{} not yet rated by anyone".format(item_id))
        return 0
    
    if str(user_id) not in user_col:
        print("{} not yet rated any item".format(user_id))
        return 0 
    
    # find all users who rated the item
    users = rated_by[str(item_id)]

    # Load all vectors user_col[user] for all users who rated the item
    vectors = [user_col[str(user)] for user in users]
    U = csc_array(vstack(vectors).T)
    rating = cfa.rate_all_items(U, 0, 2)
    print(rating[item_id])
    return rating[item_id]

In [11]:
rated_by = shelve.open("../rated_by", flag="r")
user_col = shelve.open("../user_col", flag="r")

user_item_pairs = [
    (828, 11), (2400, 4725), (3765, 1270), (4299, 4020), (5526, 2432),
    (6063, 4525), (7045, 4100), (8160, 6300), (9682, 1212), (10277, 7355)
]

# run the algorithm for all user-item pairs and report the results
results = [estimate_rating(user_id, item_id, user_col, rated_by) for user_id, item_id in user_item_pairs]
print(results)


item_idx: 0, neighbors: [], rating: nan
item_idx: 3, neighbors: [335 218], rating: 5.4
item_idx: 6, neighbors: [59 50], rating: 4.0
item_idx: 25, neighbors: [335  50], rating: 4.1
item_idx: 27, neighbors: [218  50], rating: 4.9
item_idx: 29, neighbors: [335 218], rating: 5.4
item_idx: 30, neighbors: [348  59], rating: 5.2
item_idx: 33, neighbors: [335  50], rating: 5.1
item_idx: 35, neighbors: [335  50], rating: 5.1
item_idx: 41, neighbors: [218  50], rating: 3.9
item_idx: 42, neighbors: [335  50], rating: 4.1
item_idx: 44, neighbors: [335  50], rating: 5.1
item_idx: 45, neighbors: [ 56 335], rating: 4.6
item_idx: 46, neighbors: [335  50], rating: 5.1
item_idx: 49, neighbors: [335  50], rating: 4.7
item_idx: 50, neighbors: [218  50], rating: 4.4
item_idx: 51, neighbors: [335  50], rating: 5.1
item_idx: 53, neighbors: [335 218], rating: 2.3
item_idx: 54, neighbors: [335 218], rating: 3.3
item_idx: 57, neighbors: [335  50], rating: 5.1
item_idx: 59, neighbors: [265  56], rating: 3.4
item

In [12]:
def measure_memory_usage(user_id, item_id, user_col, rated_by):
    # Start tracking memory usage
    process = psutil.Process(os.getpid())
    start_mem = process.memory_info().rss / (1024 * 1024)  # Convert to MB
    
    # Estimate rating and track peak memory usage
    rating = estimate_rating(user_id, item_id, user_col, rated_by)
    max_mem = start_mem
    for _ in range(10):  # Monitor memory usage over a short period
        current_mem = process.memory_info().rss / (1024 * 1024)  # In MB
        max_mem = max(max_mem, current_mem)
        time.sleep(0.1)  # Sleep for a short time to allow peak tracking
    
    return rating, max_mem

# Load your data structures (shelve or otherwise)
rated_by = shelve.open("../rated_by", flag="r")
user_col = shelve.open("../user_col", flag="r")

# User-item pairs for testing
user_item_pairs = [
    (828, 11), (2400, 4725), (3765, 1270), (4299, 4020), (5526, 2432), (6063, 4525)
]

# Evaluate memory usage for each user-item pair
results = []
for user_id, item_id in user_item_pairs:
    rating, max_memory = measure_memory_usage(user_id, item_id, user_col, rated_by)
    results.append((user_id, item_id, rating, max_memory))
    print(f"User {user_id}, Item {item_id} - Rating: {rating}, Max Memory Usage: {max_memory:.2f} MB")

# Close the shelve files
rated_by.close()
user_col.close()

# Display all results
for user_id, item_id, rating, max_memory in results:
    print(f"User {user_id}, Item {item_id}: Estimated rating = {rating}, Max memory usage = {max_memory:.2f} MB")


item_idx: 0, neighbors: [], rating: nan
item_idx: 3, neighbors: [335 218], rating: 5.4
item_idx: 6, neighbors: [59 50], rating: 4.0
item_idx: 25, neighbors: [335  50], rating: 4.1
item_idx: 27, neighbors: [218  50], rating: 4.9
item_idx: 29, neighbors: [335 218], rating: 5.4
item_idx: 30, neighbors: [348  59], rating: 5.2
item_idx: 33, neighbors: [335  50], rating: 5.1
item_idx: 35, neighbors: [335  50], rating: 5.1
item_idx: 41, neighbors: [218  50], rating: 3.9
item_idx: 42, neighbors: [335  50], rating: 4.1
item_idx: 44, neighbors: [335  50], rating: 5.1
item_idx: 45, neighbors: [ 56 335], rating: 4.6
item_idx: 46, neighbors: [335  50], rating: 5.1
item_idx: 49, neighbors: [335  50], rating: 4.7
item_idx: 50, neighbors: [218  50], rating: 4.4
item_idx: 51, neighbors: [335  50], rating: 5.1
item_idx: 53, neighbors: [335 218], rating: 2.3
item_idx: 54, neighbors: [335 218], rating: 3.3
item_idx: 57, neighbors: [335  50], rating: 5.1
item_idx: 59, neighbors: [265  56], rating: 3.4
item