# Laboratorium 5 - rekomendacje grupowe

## Przygotowanie

 * pobierz i wypakuj dataset: https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
   * więcej możesz poczytać tutaj: https://grouplens.org/datasets/movielens/
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab5`
 * zainstaluj potrzebne biblioteki:
 `pip install numpy pandas matplotlib`

## Część 1. - przygotowanie danych

In [2]:
# importujemy wszystkie potrzebne pakiety

import math
import numpy as np
import pandas

from random import choice, sample
from statistics import mean, stdev

from reco_utils import *

In [3]:
# wczytujemy oceny uytkownikow i obliczamy (za pomocą collaborative filtering) wszystkie przewidywane oceny filmow

raw_ratings = pandas.read_csv("ml-latest-small/ratings.csv").drop(columns=["timestamp"])
movies = list(raw_ratings["movieId"].unique())
users = list(raw_ratings["userId"].unique())
ratings = get_predicted_ratings(raw_ratings)
ratings

Total error: 215078.69941173744
Total error: 208260.78333854632
Total error: 201919.17687232734
Total error: 196006.5088897945
Total error: 190481.24729053234
Total error: 185306.85564094983
Total error: 180451.0897719262
Total error: 175885.40790435046
Total error: 171584.47348527217
Total error: 167525.73420939335
Total error: 163689.0640123628
Total error: 160056.45739779476
Total error: 156611.767477869
Total error: 153340.48069983342
Total error: 150229.5224960085
Total error: 147267.0891067947
Total error: 144442.50164047375
Total error: 141746.07909274945
Total error: 139169.02758554678
Total error: 136703.34352371123
Total error: 134341.72872944147
Total error: 132077.51591278854
Total error: 129904.60308437726
Total error: 127817.39572310178
Total error: 125810.75568450704
Total error: 123879.95598085373
Total error: 122020.6406864148
Total error: 120228.7893251752
Total error: 118500.68518609543
Total error: 116832.88708595169
Total error: 115222.20416366955
Total error: 1136

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
1,6,6,5,8,7,6,5,8,4,5,...,7,7,2,5,5,8,3,6,7,6
2,6,4,8,7,3,8,7,9,2,4,...,10,7,4,3,6,7,8,10,8,4
3,10,10,2,10,5,10,7,10,4,4,...,10,1,2,9,0,8,1,5,8,0
4,6,5,6,5,8,3,6,7,3,7,...,6,8,2,3,8,7,7,7,6,8
5,5,8,10,10,2,6,4,3,2,5,...,8,3,6,2,10,10,0,5,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,6,6,7,6,6,6,6,6,6,6,...,5,6,6,6,6,6,5,6,6,7
607,6,7,8,7,3,5,7,10,0,4,...,10,4,2,2,8,4,3,8,10,3
608,6,6,5,7,7,6,5,6,7,6,...,6,6,6,6,5,7,5,6,6,6
609,5,6,7,0,0,2,10,2,6,7,...,10,2,10,6,0,5,6,2,10,7


In [4]:
# wczytujemy nazwy filmow i kategorie

movies_metadata = pandas.read_csv("ml-latest-small/movies.csv").set_index("movieId")
movies_metadata

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
...,...,...
193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
193585,Flint (2017),Drama
193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [5]:
# wczytujemy przykladowe grupy uzytkownikow
groups = pandas.read_csv("groups.csv", header=None).values.tolist()
groups

[[606, 274, 474, 599, 448],
 [111, 307, 474, 599, 414],
 [469, 182, 232, 448, 600],
 [508, 581, 497, 402, 566],
 [300, 515, 245, 568, 507],
 [2, 371, 252, 518, 37],
 [269, 360, 469, 287, 308],
 [243, 527, 418, 118, 370],
 [186, 559, 327, 553, 314]]

In [14]:
# przygotowujemy funkcje pomocnicza


def describe_group(group, N=10):
    print(f"\n\nUser ids: {group}")
    group_size = len(group)

    mean_stdev = ratings.loc[group].std(axis=0).mean()
    median_stdev = ratings.loc[group].std(axis=0).median()
    std_stdev = ratings.loc[group].std(axis=0).std()
    print(f"\nMean ratings deviation: {mean_stdev}")
    print(f"Median ratings deviation: {median_stdev}")
    print(f"Standard deviation of ratings deviation: {std_stdev}")

    average_scores = ratings.loc[group].mean(axis=0)
    average_scores = average_scores.sort_values()
    best_movies = [
        (movies_metadata["title"][movie_id], average_scores[movie_id])
        for movie_id in list(average_scores[-N:].index)
    ]
    worst_movies = [
        (movies_metadata["title"][movie_id], average_scores[movie_id])
        for movie_id in list(average_scores[:N].index)
    ]

    print("\nBest movies:")
    for movie, score in best_movies[::-1]:
        print(f"{movie}, {score}*")
    print("\nWorst movies:")
    for movie, score in worst_movies:
        print(f"{movie}, {score}*")


describe_group(groups[8])



User ids: [186, 559, 327, 553, 314]

Mean ratings deviation: 3.1572601928971697
Median ratings deviation: 3.2093613071762426
Standard deviation of ratings deviation: 0.9236084047575772

Best movies:
Lovesick (2014), 9.8*
Gia (1998), 9.8*
Jezebel (1938), 9.8*
James and the Giant Peach (1996), 9.6*
Babe (1995), 9.6*
Fast Food, Fast Women (2000), 9.6*
Song of the Sea (2014), 9.6*
Wallace & Gromit: A Close Shave (1995), 9.6*
High School (2010), 9.6*
Sweet November (2001), 9.4*

Worst movies:
Puppet Master (1989), 0.6*
Cold in July (2014), 0.8*
Stroszek (1977), 1.0*
School for Scoundrels (2006), 1.0*
Stray Dog (Nora inu) (1949), 1.0*
Phantom Tollbooth, The (1970), 1.0*
Okja (2017), 1.0*
Civil Action, A (1998), 1.0*
Ice Guardians (2016), 1.2*
Son of the Mask (2005), 1.2*


## Część 2. - algorytmy proste

In [7]:
# zdefiniujmy interfejs dla wszystkich algorytmow rekomendacyjnych


class Recommender:
    def recommend(self, movies, ratings, group, size):
        pass


# jako pierwszy zaimplementujemy algorytm losowy - dla porownania


class RandomRecommender(Recommender):
    def __init__(self):
        self.name = "random"

    def recommend(self, movies, ratings, group, size):
        return sample(movies, size)

In [8]:
# algorytm rekomendujacy filmy o najwyzszej sredniej ocen


class AverageRecommender(Recommender):
    def __init__(self):
        self.name = "average"

    def recommend(self, movies, ratings, group, size):
        avg = ratings.loc[group].mean(axis=0).nlargest(size)
        return list(avg.index)

In [16]:
# algorytm rekomendujacy filmy o najwyzszej sredniej ocen,
#   ale rownoczesnie wykluczajacy te filmy, ktore otrzymaly choc jedna ocene ponizej thresholdu


class AverageWithoutMiseryRecommender(Recommender):
    def __init__(self, score_threshold):
        self.name = "average_without_misery"
        self.score_threshold = score_threshold

    def recommend(self, movies, ratings, group, size):
        filtered_ratings = ratings.iloc[group]
        movie_filter = (filtered_ratings >= self.score_threshold).all(axis=0)
        filtered_ratings = filtered_ratings.loc[:, movie_filter]
        avg = filtered_ratings.mean(axis=0).nlargest(size)
        return list(avg.index)

In [17]:
# algorytm uwzgledniajacy preferencje tylko jednego uzytkownika w kazdej iteracji


class FairnessRecommender(Recommender):
    def __init__(self):
        self.name = "fairness"

    def recommend(self, movies, ratings, group, size):
        group_size = len(group)
        recommendations = []

        best_movies_by_person = dict()
        for i in range(group_size):
            best_movies_by_person[i] = list(
                ratings.loc[group[i % group_size]].nlargest(size).index
            )

        for i in range(size):
            for j in range(size):
                if best_movies_by_person[i % group_size][j] not in recommendations:
                    recommendations.append(best_movies_by_person[i % group_size][j])
                    break

        return recommendations

In [64]:
# wybrany algorytm wyborczy (dyktatura, glosowanie proste, Borda, Copeland)


class VotingRecommender(Recommender):
    def __init__(self, threshold=7):
        self.name = "simple_vote"
        self.threshold = threshold
    
    def recommend(self, movies, ratings, group, size):
        filtered_ratings = ratings.loc[group]
        movie_filter = (filtered_ratings >= self.threshold)

        rec = movie_filter.sum(axis=0).nlargest(size)

        return list(rec.index)

In [52]:
# algorytm zachlanny, aproksymujacy metode Proportional Approval Voting
#   w kazdej iteracji wybieramy ten film, ktory najbardziej zwieksza zadowolenie zgodnie z punktacja PAV


class ProportionalApprovalVotingRecommender(Recommender):
    def __init__(self, threshold):
        self.threshold = threshold
        self.name = "PAV"

    def recommend(self, movies, ratings, group, size):
        weights = [1.0] * len(group)

        filtered_ratings = ratings.loc[group]
        movie_filter = (filtered_ratings >= self.threshold).astype(int)

        recommendations = []
        for i in range(size):
            weights_pd = pandas.concat(
                [pandas.DataFrame(weights)] * len(movies), axis=1, ignore_index=True
            )
            weights_pd.index = movie_filter.index
            weights_pd.columns = movie_filter.columns

            best_movie = movie_filter.mul(weights_pd).sum(axis=0).nlargest(1).index
            recommendations.append(best_movie[0])
            for i in range(len(group)):
                if movie_filter[best_movie].T[group[i]].values[0] == 1:
                    weights[i] = weights[i] / (weights[i] + 1)

        return recommendations

In [56]:
rec = ProportionalApprovalVotingRecommender(7)
reco = rec.recommend(movies, ratings, groups[0], 5)

## Część 3. - funkcje celu

In [62]:
# dwie funkcje pomocnicze:
#  - znajdujaca ulubione filmy danego uzytkownika
#  - obliczajaca sume ocen wystawionych przez uzytkownika wszystkim filmom w rekomendacji


def top_n_movies_for_user(ratings, movies, user_id, n):
    return list(ratings.loc[user_id].nlargest(n).index)


def total_score(recommendation, user_id, ratings):
    return np.sum(ratings.loc[user_id, recommendation])

In [68]:
# funkcja obliczajaca zadowolenie pojedynczego uzytkownika
#  - iloraz zadowolenia z wygenerowanej rekomendacji oraz zadowolenia z hipotetycznej rekomendacji idealnej
def overall_user_satisfaction(recommendation, user_id, movies, ratings):
    return total_score(recommendation, user_id, ratings) / total_score(
        top_n_movies_for_user(ratings, movies, user_id, len(recommendation)),
        user_id,
        ratings,
    )

# funkcja celu - srednia z zadowolenia wszystkich uzytkownikow w grupie
def overall_group_satisfaction(recommendation, group, movies, ratings):
    sat = 0
    for user_id in group:
        sat += overall_user_satisfaction(recommendation, user_id, movies, ratings)
    
    return sat / len(group)


# funkcja celu - roznica miedzy maksymalnym i minimalnym zadowolenie w grupie
def group_disagreement(recommendation, group, movies, ratings):
    sats = [overall_user_satisfaction(recommendation, user_id, movies, ratings) for user_id in group]
    return max(sats) - min(sats)

## Część 4. - Sequential Hybrid Aggregation

In [None]:
# algorytm balansujacy pomiedzy wyborem elementow o najwyzszej sredniej ocen
#   i o najwyzszej minimalnej ocenie
#   wyliczajacy w kazdej iteracji parametr alfa - jak na wykladzie

# TODO

class SequentialHybridAggregationRecommender(Recommender):
     def __init__(self):
        self.name = 'sequential_hybrid_aggregation'
    
    def recommend(self, movies, ratings, group, size):
        raise NotImplementedError()

## Część 5. - porównanie algorytmów

In [70]:
recommenders = [
    RandomRecommender(),
    AverageRecommender(),
    AverageWithoutMiseryRecommender(7),
    FairnessRecommender(),
    VotingRecommender(),
    ProportionalApprovalVotingRecommender(7),
    # SequentialHybridAggregationRecommender(),
]

recommendation_size = 10

# dla kazdego algorytmu:
#  - wygenerujmy jedna rekomendacje dla kazdej grupy
#  - obliczmy wartosci obu funkcji celu dla kazdej rekomendacji
#  - obliczmy srednia i odchylenie standardowe dla obu funkcji celu
#  - wypiszmy wyniki na konsole

for group in groups:
    recommendations = []
    for recommender in recommenders:
        recommendations.append(recommender.recommend(movies, ratings, group, recommendation_size))

    sats = [overall_group_satisfaction(recommendation, group, movies, ratings) for recommendation in recommendations ]
    dises = [group_disagreement(recommendation, group, movies, ratings) for recommendation in recommendations]
    avg_sat = np.average(sats)
    avg_dis = np.average(dises)
    std_sat = np.std(sats)
    std_dis = np.std(dises)

    print(f'Group: {group}, avg_sat: {avg_sat:.2f}, std_sat: {std_sat:.2f}, avg_dis: {avg_dis:.2f}, std_dis: {std_dis:.2f}')


Group: [606, 274, 474, 599, 448], avg_sat: 0.90, std_sat: 0.04, avg_dis: 0.07, std_dis: 0.03
Group: [111, 307, 474, 599, 414], avg_sat: 0.87, std_sat: 0.04, avg_dis: 0.16, std_dis: 0.08
Group: [469, 182, 232, 448, 600], avg_sat: 0.84, std_sat: 0.04, avg_dis: 0.22, std_dis: 0.05
Group: [508, 581, 497, 402, 566], avg_sat: 0.71, std_sat: 0.10, avg_dis: 0.37, std_dis: 0.20
Group: [300, 515, 245, 568, 507], avg_sat: 0.72, std_sat: 0.13, avg_dis: 0.32, std_dis: 0.10
Group: [2, 371, 252, 518, 37], avg_sat: 0.71, std_sat: 0.16, avg_dis: 0.22, std_dis: 0.10
Group: [269, 360, 469, 287, 308], avg_sat: 0.71, std_sat: 0.13, avg_dis: 0.32, std_dis: 0.20
Group: [243, 527, 418, 118, 370], avg_sat: 0.60, std_sat: 0.26, avg_dis: 0.22, std_dis: 0.06
Group: [186, 559, 327, 553, 314], avg_sat: 0.67, std_sat: 0.19, avg_dis: 0.38, std_dis: 0.22
