# Laboratorium 5 - rekomendacje grupowe

## Przygotowanie

 * pobierz i wypakuj dataset: https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
   * więcej możesz poczytać tutaj: https://grouplens.org/datasets/movielens/
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab5`
 * zainstaluj potrzebne biblioteki:
 `pip install numpy pandas scipy matplotlib`

## Część 1. - przygotowanie danych

In [2]:
# importujemy wszystkie potrzebne pakiety

import math
import numpy as np
import pandas as pd
from pandas.core.interchange.dataframe_protocol import DataFrame
from scipy.sparse.linalg import svds


from random import choice, sample
from statistics import mean, stdev

In [3]:
PATH = 'ml-latest-small'

In [4]:
# wczytujemy oceny uzytkownikow i obliczamy (za pomoc dekompozycji macierzy) wszystkie przewidywane oceny filmow

def read_ratings(path, k=600, scale_factor=2.0, print_stats=True):
    # idea: https://www.kaggle.com/code/indralin/movielens-project-1-2-collaborative-filtering
    reviews = pd.read_csv(f'{path}/ratings.csv', names=['userId', 'movieId', 'rating', 'time'], delimiter=',', engine='python', skiprows=1)
    
    reviews.drop(['time'], axis=1, inplace=True)
    reviews_no, _ = reviews.shape
    reviews_matrix = reviews.pivot(index='userId', columns='movieId', values='rating')
    movies = reviews_matrix.columns
    users = reviews_matrix.index
    users_no, movies_no = reviews_matrix.shape
    print(f'Got {reviews_no} reviews for {movies_no} movies and {users_no} users.')

    user_ratings_mean = np.nanmean(reviews_matrix.values, axis=1)
    normalized_reviews_matrix = np.nan_to_num(reviews_matrix.values - user_ratings_mean.reshape(-1, 1), 0.0)

    U, sigma, Vt = svds(normalized_reviews_matrix, k=k)
    sigma = np.diag(sigma)
    predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1).clip(0.5, 5.0)
    mean_square_error = np.nanmean(np.square(predicted_ratings - reviews_matrix.values))
    std_square_error = np.nanstd(np.square(predicted_ratings - reviews_matrix.values))
    print(f'Reviews prediction mean square error = {mean_square_error}')
    print(f'Reviews prediction standatd deviation of square error = {std_square_error}')

    if print_stats:
        stats = [
            ('metric', 'dataset', 'prediction'),
            ('avg', np.nanmean(reviews_matrix), np.mean(predicted_ratings)),
            ('st_dev', np.nanstd(reviews_matrix), np.std(predicted_ratings)),
            ('median', np.nanmedian(reviews_matrix), np.median(predicted_ratings)),
            ('p25', np.nanquantile(reviews_matrix, 0.25), np.quantile(predicted_ratings, 0.25)),
            ('p75', np.nanquantile(reviews_matrix, 0.75), np.quantile(predicted_ratings, 0.75))
        ]
        print('Stats (for raings in original range [0.5, 5.0]):')
        print('\n'.join([str(s) for s in stats]))

    rounded_predictions = np.rint(scale_factor * predicted_ratings) # cast values to {1, 2, ..., 10}
    return pd.DataFrame(data=rounded_predictions, index=list(users), columns=list(movies))
    
ratings = read_ratings(PATH)
# dostep do danych:
# ratings[movieId][userId] pobiera 1 wartosc
# ratings.loc[:, movieId] pobiera wektor dla danego filmu
# ratings.loc[userId, :] pobiera wektor dla danego uzytkownika
ratings

Got 100836 reviews for 9724 movies and 610 users.
Reviews prediction mean square error = 1.6577787842925017e-05
Reviews prediction standatd deviation of square error = 0.0007928950536518832
Stats (for raings in original range [0.5, 5.0]):
('metric', 'dataset', 'prediction')
('avg', 3.501556983616962, 3.657222337747399)
('st_dev', 1.042524069618056, 0.49546237560971024)
('median', 3.5, 3.7052240088117694)
('p25', 3.0, 3.3574517183164025)
('p75', 4.0, 3.9999816268830006)


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
1,8.0,9.0,8.0,9.0,9.0,8.0,9.0,9.0,9.0,9.0,...,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0
2,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,...,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
3,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
4,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,...,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0
5,8.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,...,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,5.0,7.0,7.0,7.0,7.0,7.0,5.0,7.0,7.0,7.0,...,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0
607,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,...,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
608,5.0,4.0,4.0,6.0,6.0,6.0,6.0,6.0,6.0,8.0,...,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
609,6.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,8.0,...,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0


In [5]:
# wczytujemy nazwy filmow i kategorie

movies_metadata = pd.read_csv('ml-latest-small/movies.csv').set_index('movieId')
movies_metadata

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
...,...,...
193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
193585,Flint (2017),Drama
193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [6]:
# wczytujemy przykladowe grupy uzytkownikow
groups = pd.read_csv('groups.csv').values.tolist()
groups

[[111, 307, 474, 599, 414],
 [469, 182, 232, 448, 600],
 [508, 581, 497, 402, 566],
 [300, 515, 245, 568, 507],
 [2, 371, 252, 518, 37],
 [269, 360, 469, 287, 308],
 [243, 527, 418, 118, 370],
 [186, 559, 327, 553, 314]]

In [7]:
# przygotowujemy funkcje pomocnicza

def describe_group(group, N=10):
    print(f'\n\nUser ids: {group}')
    group_size = len(group)
    
    mean_stdev = ratings.loc[group].std(axis=0).mean()
    median_stdev = ratings.loc[group].std(axis=0).median()
    std_stdev = ratings.loc[group].std(axis=0).std()
    print(f'\nMean ratings deviation: {mean_stdev}')
    print(f'Median ratings deviation: {median_stdev}')
    print(f'Standard deviation of ratings deviation: {std_stdev}')
    
    average_scores = ratings.iloc[group].mean(axis=0)
    average_scores = average_scores.sort_values()
    best_movies = [(movies_metadata['title'][movie_id], average_scores[movie_id]) for movie_id in list(average_scores[-N:].index)]
    worst_movies = [(movies_metadata['title'][movie_id], average_scores[movie_id]) for movie_id in list(average_scores[:N].index)]
    
    print('\nBest movies:')
    for movie, score in best_movies[::-1]:
        print(f'{movie}, {score}*')
    print('\nWorst movies:')
    for movie, score in worst_movies:
        print(f'{movie}, {score}*')

describe_group(groups[5])



User ids: [269, 360, 469, 287, 308]

Mean ratings deviation: 1.1259149574579788
Median ratings deviation: 1.0954451150103321
Standard deviation of ratings deviation: 0.17836724055768716

Best movies:
Toy Story (1995), 8.2*
Forrest Gump (1994), 8.2*
Willy Wonka & the Chocolate Factory (1971), 8.0*
Braveheart (1995), 8.0*
Terminator 2: Judgment Day (1991), 7.8*
Schindler's List (1993), 7.8*
Dances with Wolves (1990), 7.6*
James and the Giant Peach (1996), 7.6*
Dead Man Walking (1995), 7.6*
Nixon (1995), 7.6*

Worst movies:
Broken Arrow (1996), 5.2*
Sleepy Hollow (1999), 5.4*
The Devil's Advocate (1997), 5.4*
Cable Guy, The (1996), 5.4*
Mission: Impossible (1996), 5.6*
Nutty Professor, The (1996), 5.6*
Down Periscope (1996), 5.8*
Cheech and Chong's Up in Smoke (1978), 5.8*
Wrong Man, The (1956), 5.8*
Fog, The (2005), 5.8*


## Część 2. - algorytmy proste

In [10]:
# zdefiniujmy interfejs dla wszystkich algorytmow rekomendacyjnych

class Recommender:
    def recommend(self, movies: list[int], ratings: pd.DataFrame, group: list[int], size: int) -> list[int]:
        pass


# jako pierwszy zaimplementujemy algorytm losowy - dla porownania
    
class RandomRecommender(Recommender):
    def __init__(self):
        self.name = 'random'
        
    def recommend(self, movies, ratings, group, size):
        return sample(movies, size)

RandomRecommender().recommend(list(ratings.columns), ratings, groups[0], 10)

[105246, 2617, 4078, 7669, 138632, 135436, 2417, 26375, 70637, 72733]

In [59]:
# algorytm rekomendujacy filmy o najwyzszej sredniej ocen

class AverageRecommender(Recommender):
    def __init__(self):
        self.name = 'average'
    
    def recommend(self, movies: list[int], ratings: pd.DataFrame, group: list[int], size: int) -> list[int]:
        group_ratings = ratings.loc[group]
        average_ratings = group_ratings.mean(axis=0)
        sorted_movies = average_ratings.sort_values(ascending=False)
        top_movies = sorted_movies.head(size)
        return top_movies.index.tolist()


AverageRecommender().recommend(list(ratings.columns), ratings, groups[0], 10)

[2959, 318, 3897, 4878, 3160, 4226, 1221, 858, 296, 1247]

In [82]:
# algorytm rekomendujacy filmy o najwyzszej sredniej ocen,
#   ale rownoczesnie wykluczajacy te filmy, ktore otrzymaly choc jedna ocene ponizej thresholdu

class AverageWithoutMiseryRecommender(Recommender):
    def __init__(self, score_threshold):
        self.name = 'average_without_misery'
        self.score_threshold = score_threshold
        
    def recommend(self, movies: list[int], ratings: pd.DataFrame, group: list[int], size: int) -> list[int]:
        group_ratings = ratings.loc[group]
        above_threshold_mask = group_ratings >= self.score_threshold
        group_ratings_excluded_misery = group_ratings.loc[:, above_threshold_mask.all(axis=0)]
        average_ratings = group_ratings_excluded_misery.mean(axis=0)
        sorted_movies = average_ratings.sort_values(ascending=False)
        top_movies = sorted_movies.head(size)

        return top_movies.index.tolist()
    
AverageWithoutMiseryRecommender(4).recommend(list(ratings.columns), ratings, groups[0], 10)

[2959, 318, 4878, 1221, 296, 858, 4226, 3160, 3897, 1089]

In [85]:
from itertools import cycle


# algorytm uwzgledniajacy preferencje tylko jednego uzytkownika w kazdej iteracji

class FairnessRecommender(Recommender):
    def __init__(self):
        self.name = 'fairness'
        
    def recommend(self, movies: list[int], ratings: pd.DataFrame, group: list[int], size: int) -> list[int]:
        result = []
        for user_id in cycle(group):
            user_ratings = ratings.loc[user_id]
            sorted_user_ratings = user_ratings.sort_values(ascending=False)
            for move_id in sorted_user_ratings.index:
                if move_id not in result:
                    result.append(move_id)
                    break

            if len(result) == size:
                break

        return result

FairnessRecommender().recommend(list(ratings.columns), ratings, groups[0], 10)

[60756, 59315, 5377, 3703, 750, 74532, 51931, 2762, 260, 1199]

In [89]:
from collections import defaultdict

# wybrany algorytm wyborczy (dyktatura, Borda, Copeland)

class VotingRecommender(Recommender):
    def __init__(self):
        self.name = "borda"
    
    def recommend(self, movies: list[int], ratings: pd.DataFrame, group: list[int], size: int) -> list[int]:
        borda_scores = defaultdict(int)

        for user_id in group:
            user_ratings = ratings.loc[user_id]
            sorted_user_ratings = user_ratings.sort_values(ascending=True)
            for i, movie_id in enumerate(sorted_user_ratings.index):
                borda_scores[movie_id] += i

        borda_scores = list(borda_scores.items())

        group_ratings = ratings.loc[group]
        average_ratings = group_ratings.mean(axis=0)
        borda_scores_with_avg = [(movie_id, borda_score, average_ratings[movie_id]) for movie_id, borda_score in borda_scores]
        sorted_borda_scores = sorted(borda_scores_with_avg, key=lambda x: (x[1], x[2]), reverse=True)
        return [movie_id for movie_id, *_ in sorted_borda_scores[:size]]

VotingRecommender().recommend(list(ratings.columns), ratings, groups[0], 10)

[318, 2959, 3897, 4226, 4878, 1682, 48516, 4963, 4993, 5989]

In [101]:
# algorytm zachlanny, aproksymujacy metode Proportional Approval Voting
#   w kazdej iteracji wybieramy ten film, ktory najbardziej zwieksza zadowolenie zgodnie z punktacja PAV

class ProportionalApprovalVotingRecommender(Recommender):
    def __init__(self, threshold):
        self.threshold = threshold
        self.name = 'PAV'
        
    def recommend(self, movies: list[int], ratings: pd.DataFrame, group: list[int], size: int) -> list[int]:
        result = set()
        user_satisfaction = defaultdict(lambda: 1)
        for _ in range(size):
            movie_ratings = defaultdict(int)
            for user_id in group:
                user_ratings = ratings.loc[user_id]
                sorted_user_ratings = user_ratings.sort_values(ascending=False)
                iteration_user_satisfaction = 0
                for movie_id in sorted_user_ratings.index:
                    if movie_id in result:
                        continue
                    if ratings.loc[user_id, movie_id] >= self.threshold:
                        movie_ratings[movie_id] += 1 / (user_satisfaction[user_id] + iteration_user_satisfaction)
                        iteration_user_satisfaction += 1

            sorted_movies = sorted(movie_ratings.items(), key=lambda x: x[1], reverse=True)
            best_movie = sorted_movies[0][0]
            result.add(best_movie)

            for user_id in group:
                if ratings.loc[user_id, best_movie] >= self.threshold:
                    user_satisfaction[user_id] += 1

        return list(result)

ProportionalApprovalVotingRecommender(5).recommend(list(ratings.columns), ratings, groups[0], 10)

[5377, 1221, 2762, 2858, 1196, 4973, 750, 2959, 1208, 1210]

## Część 3. - funkcje celu

In [103]:
# dwie funkcje pomocnicze:
#  - znajdujaca ulubione filmy danego uzytkownika
#  - obliczajaca sume ocen wystawionych przez uzytkownika wszystkim filmom w rekomendacji

def top_n_movies_for_user(ratings: pd.DataFrame, movies: list[int], user_id: int, n: int) -> list[int]:
    user_ratings = ratings.loc[user_id]
    sorted_user_ratings = user_ratings.sort_values(ascending=False)
    return sorted_user_ratings.head(n).index.tolist()

def total_score(recommendation: list[int], user_id: int, ratings: pd.DataFrame) -> float:
    return sum(ratings.loc[user_id, movie_id] for movie_id in recommendation)

In [104]:
# funkcja obliczajaca zadowolenie pojedynczego uzytkownika
#  - iloraz zadowolenia z wygenerowanej rekomendacji oraz zadowolenia z hipotetycznej rekomendacji idealnej
def overall_user_satisfaction(recommendation: list[int], user_id: int, movies: list[int], ratings: pd.DataFrame) -> float:
    n = len(recommendation)
    recommendation_score = total_score(recommendation, user_id, ratings)
    top_movies_score = total_score(top_n_movies_for_user(ratings, movies, user_id, n), user_id, ratings)
    return recommendation_score / top_movies_score

# funkcja celu - srednia z zadowolenia wszystkich uzytkownikow w grupie
def overall_group_satisfaction(recommendation: list[int], group: list[int], movies: list[int], ratings: pd.DateOffset) -> float:
    satisfaction_sum = sum(overall_user_satisfaction(recommendation, user_id, movies, ratings) for user_id in group)
    return satisfaction_sum / len(group)

# funkcja celu - roznica miedzy maksymalnym i minimalnym zadowolenie w grupie
def group_disagreement(recommendation: list[int], group: list[int], movies: list[int], ratings: pd.DateOffset) -> float:
    group_satisfaction = [overall_user_satisfaction(recommendation, user_id, movies, ratings) for user_id in group]
    return max(group_satisfaction) - min(group_satisfaction)

## Część 4. - Sequential Hybrid Aggregation

In [107]:
# algorytm balansujacy pomiedzy wyborem elementow o najwyzszej sredniej ocen
#   i o najwyzszej minimalnej ocenie
#   wyliczajacy w kazdej iteracji parametr alfa - jak na wykladzie
class SequentialHybridAggregationRecommender(Recommender):
    def __init__(self):
        self.name = 'sequential_hybrid_aggregation'
    
    def recommend(self, movies: list[int], ratings: pd.DataFrame, group: list[int], size: int) -> list[int]:
        avg_score = ratings.loc[group].mean(axis=0)
        lowest_score = ratings.loc[group].min(axis=0)
        alpha = 1

        score = defaultdict(int)
        result = []

        for _ in range(size):
            for movie_id in movies:
                if movie_id in result:
                    score[movie_id] = -1
                else:
                    score[movie_id] = (1 - alpha) * avg_score[movie_id] + alpha * lowest_score[movie_id]

            best_movie = max(score.items(), key=lambda x: x[1])[0]
            result.append(best_movie)

            alpha = group_disagreement(result, group, movies, ratings)

        return result

SequentialHybridAggregationRecommender().recommend(list(ratings.columns), ratings, groups[0], 10)

[318, 2959, 296, 858, 1221, 3160, 3897, 4226, 4878, 1089]

## Część 5. - porównanie algorytmów

In [118]:
recommenders = [
    RandomRecommender(),
    AverageRecommender(),
    AverageWithoutMiseryRecommender(5),
    FairnessRecommender(),
    VotingRecommender(),
    ProportionalApprovalVotingRecommender(5),
    SequentialHybridAggregationRecommender()
]

recommendation_size = 10

# dla kazdego algorytmu:
#  - wygenerujmy jedna rekomendacje dla kazdej grupy
#  - obliczmy wartosci obu funkcji celu dla kazdej rekomendacji
#  - wypiszmy wyniki na konsole

print(f"{'Recommender':<25}\t{'Group':<25}\t{'Overall satisfaction':<25}\t{'Disagreement':<25}")
for recommender in recommenders:
    for group in groups:
        recommendation = recommender.recommend(list(ratings.columns), ratings, group, recommendation_size)
        overall_satisfaction = overall_group_satisfaction(recommendation, group, list(ratings.columns), ratings)
        disagreement = group_disagreement(recommendation, group, list(ratings.columns), ratings)
        print(f'{recommender.name:<30}\t{str(group):<25}\t{overall_satisfaction:.3f}\t{disagreement:.3f}')

Recommender              	Group                    	Overall satisfaction     	Disagreement             
random                        	[111, 307, 474, 599, 414]	0.626	0.200
random                        	[469, 182, 232, 448, 600]	0.664	0.140
random                        	[508, 581, 497, 402, 566]	0.759	0.261
random                        	[300, 515, 245, 568, 507]	0.870	0.242
random                        	[2, 371, 252, 518, 37]   	0.829	0.122
random                        	[269, 360, 469, 287, 308]	0.638	0.278
random                        	[243, 527, 418, 118, 370]	0.800	0.209
random                        	[186, 559, 327, 553, 314]	0.765	0.275
average                       	[111, 307, 474, 599, 414]	0.886	0.230
average                       	[469, 182, 232, 448, 600]	0.894	0.250
average                       	[508, 581, 497, 402, 566]	0.859	0.207
average                       	[300, 515, 245, 568, 507]	0.932	0.106
average                       	[2, 371, 252, 518, 37]   	0.879	0.101