# Laboratorium 2 - collaborative filtering

## Przygotowanie

 * dataset i potrzebne biblioteki są dokładnie takie same jak na poprzednim laboratorium
 * pobierz i wypakuj dataset: https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
   * więcej możesz poczytać tutaj: https://grouplens.org/datasets/movielens/
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab1`
 * zainstaluj potrzebne biblioteki:
 `pip install numpy pandas sklearn`

## Część 1. - przygotowanie danych

In [1]:
# importujemy wszystkie potrzebne pakiety

import math
import numpy as np
import pandas

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import confusion_matrix

In [2]:
# liczba parametrow opisujacych filmy i uzytkownikow zalezy tylko od nas
K = 20

In [3]:
# wczytujemy oceny uytkownikow i od razu dzielimy je na dwa zbiory - treningowy i testowy

all_ratings = pandas.read_csv('ml-latest-small/ratings.csv').drop(columns=['timestamp'])
train_ratings_set, test_ratings_set = train_test_split(all_ratings, test_size=0.05)
train_ratings_set

Unnamed: 0,userId,movieId,rating
44349,294,3361,2.0
51455,332,2085,3.5
8774,61,2012,2.0
3860,23,3108,3.5
35421,239,2355,3.5
...,...,...,...
42261,288,1042,3.0
3887,23,6920,3.0
32369,221,1284,4.5
35473,239,4571,4.0


In [4]:
# inicjalizujemy macierz preferencji uzytkownikow liczbami losowymi z przedzialu [0.0, 5.0]

def initialize_users(raw_ratings, k):
    users_no = raw_ratings['userId'].unique().size
    users = pandas.DataFrame(5.0 * np.random.uniform(size=(users_no, k)), index=raw_ratings['userId'].unique(), columns=['x%s' % i for i in range(k)])
    users.sort_index(inplace=True) 
    return users_no, users

users_no, users = initialize_users(train_ratings_set, K)
users

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19
1,3.378008,3.784948,2.368150,1.815751,0.482617,1.249901,4.556021,0.243010,3.759423,2.916803,0.914162,0.394620,2.871835,4.431172,1.139786,3.479087,2.045746,4.104480,1.237092,2.803078
2,3.679483,1.149956,4.530480,3.510997,2.165041,3.060001,3.014411,3.798552,0.386757,4.728726,0.689278,1.682396,4.665060,4.105266,1.936721,0.107967,1.240119,1.133437,1.378281,1.001038
3,3.216410,2.910393,2.645533,2.629411,2.245503,3.079439,1.808537,0.401297,2.597564,3.837402,2.978688,4.690448,1.830417,4.898391,4.456673,2.078588,2.222725,0.252854,0.337342,4.625009
4,0.293856,0.298275,2.979469,3.638218,3.942188,0.646745,0.487836,2.235565,0.770722,0.040753,3.866083,1.970421,2.751928,3.482600,4.129953,4.164641,0.470554,2.014381,1.955806,2.994654
5,3.873521,2.078634,2.003401,1.758353,2.335294,4.781166,1.698245,2.152524,2.757395,1.210498,0.364588,1.445837,4.958439,4.588833,3.894845,4.188241,3.049933,3.347576,4.200189,3.849577
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,4.890324,0.080556,1.131153,1.048532,4.263418,1.067323,2.494523,1.996952,3.744176,0.792737,3.328668,0.344068,4.996260,0.400690,4.187235,4.383672,4.246427,1.525961,3.060617,2.005463
607,4.480096,3.239673,3.809891,3.306107,4.775946,2.284876,4.571475,0.891716,2.429844,3.898924,2.302277,3.524431,4.057839,4.763471,1.068441,3.389565,3.391172,0.084577,4.018015,0.515297
608,1.628890,0.277483,0.596341,1.210849,4.569455,4.883977,2.948201,3.769402,2.990103,2.841632,4.368665,0.264091,2.343737,4.529849,1.612087,3.984105,3.325399,2.523548,4.850144,3.845180
609,3.565040,2.069075,4.054384,4.957494,3.571533,4.804736,3.914162,0.132803,2.098574,0.614224,1.476545,3.547873,2.799433,2.152907,2.910190,2.465913,0.553056,0.655519,1.193982,1.501272


In [5]:
# inicjalizujemy macierz cech filmow liczbami losowymi z przedzialu [0.0, 1.0]

def initialize_movies(raw_ratings, k):
    movies_no = raw_ratings['movieId'].unique().size
    movies = pandas.DataFrame((1/K)* np.random.uniform(size=(movies_no, k)), index=raw_ratings['movieId'].unique(), columns=['x%s' % i for i in range(k)])
    movies.sort_index(inplace=True) 
    return movies_no, movies

movies_no, movies = initialize_movies(train_ratings_set, K)
movies

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19
1,0.028112,0.008667,0.039117,0.044315,0.038563,0.012159,0.045632,0.006306,0.014648,0.021766,0.022783,0.043810,0.016407,0.020776,0.001434,0.019435,0.026297,0.012995,0.039205,0.002649
2,0.018057,0.039463,0.015741,0.046057,0.005446,0.028129,0.035433,0.018789,0.029395,0.041575,0.039086,0.017032,0.012947,0.007890,0.019506,0.040615,0.006048,0.002839,0.003432,0.041052
3,0.037982,0.022294,0.044521,0.018125,0.031889,0.012778,0.010551,0.038649,0.045176,0.001516,0.010578,0.033036,0.040999,0.009871,0.016540,0.025672,0.036153,0.028042,0.046073,0.023453
4,0.037993,0.009227,0.010896,0.048846,0.039317,0.017726,0.016431,0.029439,0.044734,0.045834,0.001021,0.003454,0.004809,0.030744,0.046028,0.013785,0.004304,0.035564,0.025098,0.015806
5,0.039977,0.040256,0.046082,0.008645,0.025541,0.032204,0.030422,0.020711,0.030145,0.022723,0.047746,0.007380,0.001978,0.029658,0.029336,0.011890,0.042481,0.028544,0.031313,0.026771
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.036470,0.009972,0.022552,0.033645,0.030364,0.038376,0.002280,0.003833,0.019616,0.025521,0.015141,0.044827,0.022592,0.009311,0.011149,0.041388,0.049539,0.000490,0.006924,0.003908
193583,0.036350,0.002686,0.048922,0.009608,0.003104,0.031388,0.044443,0.029869,0.034760,0.029555,0.004353,0.002388,0.045535,0.038409,0.049561,0.026187,0.032425,0.027129,0.018613,0.019583
193585,0.034817,0.048877,0.017854,0.034682,0.008400,0.028226,0.017920,0.047484,0.010853,0.011052,0.016793,0.014873,0.029714,0.027277,0.003996,0.034566,0.032754,0.021276,0.016103,0.048265
193587,0.010241,0.029686,0.046054,0.008061,0.024984,0.009234,0.020203,0.008815,0.043145,0.048981,0.042750,0.019020,0.029716,0.004263,0.017435,0.042276,0.020082,0.007824,0.004365,0.012934


In [6]:
# za pomoca sprytnej sztuczki przeksztalcamy oceny z formatu dostarczonego przez MovieLens do uzytecznej macierzy
# zwroc uwage na to, ze czesci filmow i uzytkownikow moze brakowac po podziale datasetu na dwie czesci
#   - byc moze warto uzupelnic brakujace kolumny i wiersze

def get_ratings(raw_ratings, movies, nan=False):
    ratings = raw_ratings.pivot(*raw_ratings.columns)
    if not nan:
        ratings = ratings.fillna(0.0)
    
    missing_movies = set(movies.index).difference(set(raw_ratings['movieId']))
    for movie in missing_movies:
        ratings[movie] = 0.0
    
    ratings = ratings.reindex(sorted(ratings.columns), axis=1)
    return ratings

ratings = get_ratings(train_ratings_set, movies)
ratings

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Część 2. - trening modelu

In [7]:
# trenujemy model iteracyjnie, wykorzystujac gradient descent

alpha = 0.00003 # learning speed
delta = 100 # minimal upgrade for each step
lambd = 0.01 # regularization weight

def calculate_user_preferences(users, movies, ratings, raw_ratings, users_no, movies_no, alpha, delta, lambd):
    total_error = 0.0
    users_model = users.copy()
    movies_model = movies.copy()
    
    while(True):
        previous_total_error = total_error

        predicted_ratings = np.dot(users_model, movies_model.T)
        errors = np.where(ratings==0.0, pandas.DataFrame(np.zeros((users_no, movies_no))), predicted_ratings - ratings)
        users_gradient = errors.dot(movies)
        movies_gradient = errors.T.dot(users)
        
        # zauwaz, ze nie uzywamy biasow i nie potrzebujemy dodatkowej macierzy do regularyzacji
        #  - wystarczy, ze uzyjemy odpowiednio macierzy users_model i movies_model
        
        # musimy zaktualizowac dwa modele
        
        users_model = users_model - alpha*(users_gradient - lambd * users_model)
        movies_model = movies_model - alpha*(movies_gradient - lambd * movies_model)

        total_error = np.sum(errors ** 2)
        print(total_error)
        progress = abs(previous_total_error - total_error)
        if progress < delta:
            break
            
    return users_model, movies_model

users_model, movies_model = calculate_user_preferences(users, movies, ratings, train_ratings_set, users_no, movies_no, alpha, delta, lambd)

print(users_model)
print(movies_model)

587011.2644995516
395958.4827362339
321795.12294225325
278953.5791747354
250422.72899262313
229854.5317080323
214226.12332906362
201891.50709729994
191870.48358187664
183540.12032956787
176484.48888486557
170414.88616249996
165124.51963026368
160461.34301256933
156311.04589989266
152586.012628097
149217.9376113608
146152.75875892723
143347.10385585474
140765.74935018236
138379.7714413501
136165.17971458947
134101.89291658203
132172.96110180233
130363.96771276128
128662.56479698294
127058.10793349672
125541.3666817663
124104.29284073673
122739.83340082648
121441.77837176286
120204.63606555463
119023.53017234271
117894.11426998017
116812.50038294995
115775.19894188309
114779.0680548637
113821.27043134224
112899.23663169239
112010.63357421786
111153.3374344243
110325.41023168188
109525.07952582811
108750.7207481606
108000.8417732172
107274.06940404403
106569.13749755225
105884.87650060031
105220.20420363054
104574.11754850726
103945.68535193263
103334.04182638417
102738.38079768827
102157

## Część 3. - podobieństwo elementów

In [8]:
# przygotujmy funkcje obliczajaca odleglosc cosinusowa miedzy kazda para elementow (filmow lub uzytkownikow)

def cosine_similarity(vectors):
    # przydadza nam sie dlugosci wektorow
    # poniewaz w kolejnej czesci bedziemy korzystac z masked arrays, nie mozemy uzyc najprostszej metody
    # lengths = np.linalg.norm(vectors, axis=1)
    # musimy zaimplementowac to sami
    lengths = np.sqrt(np.sum(vectors ** 2, axis=1))
    # podobienstwo liczymy w dwoch krokach - najpierw liczymy iloczyn skalarny kazdej pary wektorow
    dot_products = vectors.dot(vectors.T)
    # nastepnie dzielimy zarowno wiersze jak i kolumny przez dlugosci wektorow - przyda sie zmienna lengths oraz funkcja divide()
    similarity = ((dot_products/lengths).T/lengths).T
    return similarity

cosine_similarity(movies_model)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
1,1.000000,0.757032,0.664651,0.805832,0.640653,0.843667,0.759303,0.821825,0.809194,0.640538,...,0.870686,0.834051,0.763675,0.826726,0.825558,0.848838,0.767579,0.822671,0.796575,0.766113
2,0.757032,1.000000,0.704716,0.668086,0.747827,0.723339,0.820912,0.788389,0.726835,0.784605,...,0.802787,0.827148,0.837594,0.805480,0.771604,0.797215,0.780811,0.793590,0.846975,0.754939
3,0.664651,0.704716,1.000000,0.634001,0.787279,0.795704,0.701174,0.686507,0.755551,0.730067,...,0.702448,0.640195,0.682501,0.653366,0.698276,0.692913,0.742448,0.699455,0.643604,0.686412
4,0.805832,0.668086,0.634001,1.000000,0.602846,0.800679,0.722299,0.779359,0.779029,0.624280,...,0.864074,0.827069,0.676280,0.743705,0.774994,0.726155,0.737134,0.750074,0.691895,0.770932
5,0.640653,0.747827,0.787279,0.602846,1.000000,0.722542,0.803341,0.805831,0.733679,0.687968,...,0.727705,0.720271,0.762460,0.772492,0.805174,0.781589,0.815199,0.786491,0.768606,0.829682
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.848838,0.797215,0.692913,0.726155,0.781589,0.854325,0.766891,0.858218,0.762457,0.764797,...,0.908872,0.884398,0.943511,0.936070,0.917627,1.000000,0.908398,0.925081,0.936869,0.800908
193583,0.767579,0.780811,0.742448,0.737134,0.815199,0.886661,0.762449,0.855948,0.760967,0.778776,...,0.899566,0.882687,0.912859,0.905720,0.929316,0.908398,1.000000,0.910784,0.913765,0.806353
193585,0.822671,0.793590,0.699455,0.750074,0.786491,0.803983,0.818232,0.903903,0.837542,0.785437,...,0.881279,0.903323,0.931957,0.925396,0.920362,0.925081,0.910784,1.000000,0.897707,0.833649
193587,0.796575,0.846975,0.643604,0.691895,0.768606,0.828264,0.739599,0.862971,0.680944,0.734740,...,0.898919,0.911817,0.957198,0.942702,0.908365,0.936869,0.913765,0.897707,1.000000,0.748493


In [9]:
# teraz mozemy znalexc k elementow najbardziej podobnych do danego

def k_most_similar(vectors, i, k):
    sim_matrix = cosine_similarity(vectors)
    # przyda sie funkcja np.argsort()
    # usuwam porównanie z samym sobą
    vector_similarities = sim_matrix[i].drop([i])
    most_similar_vectors = vector_similarities.iloc[vector_similarities.argsort()[:-k:-1]]
    print(most_similar_vectors)
    return most_similar_vectors.index

k_most_similar(movies, 193581, 8)

639      0.916017
2540     0.907170
86377    0.906321
71500    0.905540
2575     0.904816
53129    0.903558
8591     0.902016
Name: 193581, dtype: float64


Int64Index([639, 2540, 86377, 71500, 2575, 53129, 8591], dtype='int64')

## Część 4. - Item2Item collaborative filtering

In [19]:
# sprobujmy innego podejscia - Item2Item CF przewiduje rating tylko na podstawie macierzy ratingow, bez koniecznosci trenowania
#   dodatkowych macierzy

# zauwaz, ze nie chcemy przeprowadzac obliczen tam, gdzie brakuje nam elementow
#   - oblicz macierz ratings z parametrem nan=True oraz wykorzystaj tzw. masked arrays: np.ma.array(x, mask=np.isnan(x))
#   w ten sposob unikniesz przeprowadzania niepotrzebnych obliczen

def item_to_item(ratings):
    # ustawiłem fillvalue na 3 (klasifikacja neutralna, nie zepsuje macierzy konfuzji)
    ratings_masked = np.ma.array(ratings, mask=np.isnan(ratings), fill_value=3)
    similarity = cosine_similarity(ratings_masked.T) # prawdopodobnie bedziesz musial zmodyfikowac te funkcje, by obslugiwala NaN
    sums = similarity.sum(axis=1)
    model = ratings_masked.dot(similarity)/sums# srednia ocen wystawionych przez uzytkownika wazona podobienstwem elementow
    model = pandas.DataFrame(model.filled(), index=ratings.index, columns=ratings.columns)
    return model

item_to_item(ratings)

movieId,1,2,3,4,5,6,7,8,9,10,...,190219,190221,191005,193565,193567,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.252533,0.209637,0.306170,0.241414,0.221439,0.245088,0.216600,0.190561,0.322374,0.244980,...,0.162390,0.162390,0.007931,0.007931,0.007931,0.007931,0.007931,0.007931,0.007931,0.151261
2,0.019309,0.018898,0.010383,0.005086,0.018364,0.017048,0.007814,0.013816,0.008338,0.018006,...,0.000000,0.000000,0.032175,0.032175,0.032175,0.032175,0.032175,0.032175,0.032175,0.163365
3,0.008448,0.008822,0.012377,0.004406,0.008036,0.010748,0.008250,0.007552,0.022017,0.009952,...,0.004249,0.004249,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.153603,0.118994,0.133638,0.142258,0.142317,0.140145,0.167063,0.079774,0.121670,0.130405,...,0.108286,0.108286,0.010699,0.010699,0.010699,0.010699,0.010699,0.010699,0.010699,0.156548
5,0.050921,0.042802,0.048308,0.117055,0.066062,0.041017,0.073690,0.061563,0.073908,0.047868,...,0.105590,0.105590,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.059135
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.628809,0.489923,0.489924,0.601525,0.567039,0.527580,1.034661,0.325741,0.393005,0.507968,...,0.349902,0.349902,0.058701,0.058701,0.058701,0.058701,0.058701,0.058701,0.058701,0.864417
607,0.167536,0.139613,0.172496,0.190714,0.148547,0.145519,0.155680,0.147561,0.264970,0.160985,...,0.032754,0.032754,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.044740
608,0.582221,0.559832,0.647520,0.570366,0.584016,0.558474,0.554552,0.587535,0.790751,0.677489,...,0.355989,0.355989,0.047072,0.047072,0.047072,0.047072,0.047072,0.047072,0.047072,1.199932
609,0.037437,0.029785,0.038057,0.063438,0.047931,0.030777,0.051829,0.049059,0.082953,0.043596,...,0.025542,0.025542,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.027799


## Część 5. - ocena jakości algorytmu

In [11]:
# na podstawie zbioru testowego i wytrenowanego modelu obliczamy metryki opisujace jakosc modelu

positive_threshold = 4.0
negative_threshold = 2.0

# Funkcja pomocnicza zamieniająca ratingi trójwartościową informację: -1 jeśli poniżej negatywnego tresholda(nie polecać), 
# 1 jeśli powyżej pozytywnego(polecać) i 0 jeśli pomiędzy tresholdami(nie wiadomo)
def classify_ratings(ratings, p_treshold=4.0, n_treshold=2.0):
    #zrobione na podstawie https://numpy.org/doc/stable/reference/generated/numpy.where.html
    return np.where(ratings <= n_treshold, -1, np.where(ratings < p_treshold, 0, 1))

def calculate_stats(test_ratings_set, predicted_ratings, positive_threshold, negative_threshold):
    # zamieniamy ratingi na informacje bardziej przystępną dla sklearnowego confusion matrix
    
    #tu normalizujemy ratingi (ucinamy id userów i filmów, bo sklearnowe confusion matrix ich nie potrzebuje)
    test_set_classified = classify_ratings(test_ratings_set['rating'], positive_threshold, negative_threshold)
    
    # to pewnie można zrobić dużo lepiej, ale zmęczenie wzięło górę
    #inicjalizuje tablicę długości o tej samej długości co testowa
    predicted_set_classified = np.zeros(test_ratings_set['rating'].size)
    # mapuję userId i movieId dla wszystkich wpisów w zbiorze testowym na na predykcje modelu
    # (dla każdego rzędu biorę movieId i userId i z macierzy predykcji pobieram predykcję dla tego movieId i userId)
    iterator = 0
    for index, row in test_ratings_set.iterrows():
        try:
            predicted_set_classified[iterator] = predicted_ratings[row['movieId']][ row['userId']]
        except KeyError:
            predicted_set_classified[iterator] = 3
        iterator+=1
    # tu znowu funkcja która zamienia nam ratingi na informację czy polecać, czy nie polecać, czy nie wiadomo
    predicted_set_classified = classify_ratings(predicted_set_classified, positive_threshold, negative_threshold)
    
    # nastepnie wyliczam wszystkie metryki
    # ustawiam sobię labelki(labels=[1, -1, 0]) tak żeby oceny neutralne były na końcu i je odcinam odcinam([0:2, 0:2]) 
    # w efekcie dostaję macierz konfuzji 2 na 2
    conf_matrix = confusion_matrix(test_set_classified, predicted_set_classified, labels=[1, -1, 0])[0:2, 0:2]
    
    true_positives = conf_matrix[0, 0]
    false_positives = conf_matrix[0, 1]
    true_negatives = conf_matrix[1, 0]
    false_negatives = conf_matrix[1, 1]
    
    accuracy = (true_positives + true_negatives) / (true_positives + true_negatives + false_positives + false_negatives)
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    f1 = 2*precision*recall/( precision + recall )
        
    return {
        'true_positives': true_positives,
        'true_negatives': true_negatives,
        'false_positives': false_positives,
        'false_negatives': false_negatives,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

## Część 6. - porównanie algorytmów

In [20]:
# korzystając z funkcji z poprzedniego laboratorium, porownaj dwa zaimplementowane algorytmy Collaborative Filtering

# obliczamy, w ilu probach wytrenowany model okazal sie lepszy od losowego
# przeprowadzamy test statystyczny - jak prawdopodobne jest to, by k pozytywnych prob bylo dzielem przypadku

# wielokrotnie uruchamiamy trening modelu
# za każdym razem dzielimy dataset na zbior treningowy i testowy w inny sposob - klasa KFold robi to za nas
# zwroc uwage na bardzo istotny szczegol - oba modele, wytrenowany i losowy, musza byc porownywane na tym samym zbiorze testowym

n_tests = 5
collaborative_results = []
item_to_item_results = []

for train, test in KFold(n_splits=n_tests, shuffle=True).split(all_ratings):
    training_set = all_ratings.iloc[train]
    test_set = all_ratings.iloc[test]
    print(list(filter((lambda x: x not in test_set['userId'].unique()), training_set['userId'].unique())))
    print(list(filter((lambda x: x not in training_set['userId'].unique()), test_set['userId'].unique())))
    # wygeneruj macierz użytkowników i ocen
    users_no, users = initialize_users(training_set, K)
    movies_no, movies_train = initialize_movies(training_set, K)
    ratings = get_ratings(training_set, movies_train)
    # wytrenuj model
    users_model, movies_model = calculate_user_preferences(users, movies_train, ratings, training_set, users_no, movies_no, alpha, delta, lambd)
    collaborative_preferences = users_model.dot(movies.T)
    
    item2item_model = item_to_item(training_set)
    # oblicz metryki dla wytrenowanego modelu
    collaborative_results.append(calculate_stats(test_set, collaborative_preferences, positive_threshold, negative_threshold))
    # oblicz metryki dla modelu losowego
    item_to_item_results.append(calculate_stats(test_set, item2item_model, positive_threshold, negative_threshold))
    

[]
[]
494832.2067889451
354061.6930428137
291295.2305526297
253390.88472863374
227500.77380918985
208507.5862154033
193894.61586923405
182256.51840560368
172738.88131702726
164789.28281776694
158033.43079919615
152208.19926051653
147123.0130323213
142636.39340718184
138641.0931652308
135054.33715448264
131811.22259318357
128860.14276926429
126159.54467968002
123675.58829081297
121380.42852357574
119250.93554892053
117267.72879648296
115414.43888007416
113677.13735685352
112043.8916034085
110504.41402146136
109049.78310642911
107672.21979593868
106364.9067288234
105121.84109731512
103937.71400979398
102807.81093377723
101727.92902188016
100694.3080516374
99703.57241443993
98752.68212748737
97838.89125761138
96959.71246769869
96112.88664779002
95296.35679045304
94508.24542617539
93746.8350587211
93010.55113969963
92297.94720144992
91607.69183187425
90938.55722728434
90289.40910209977
89659.19776932726
89046.9502346445
88451.76317081535
87872.79665902207
87309.26860026093
86760.4497138082

  f1 = 2*precision*recall/( precision + recall )
  accuracy = (true_positives + true_negatives) / (true_positives + true_negatives + false_positives + false_negatives)
  precision = true_positives / (true_positives + false_positives)
  recall = true_positives / (true_positives + false_negatives)


[214]
[]
507953.69271805364
362376.39639342495
298332.8014032965
259767.12757628702
233478.68257489178
214210.32836214182
199382.93239007352
187562.3154931859
177880.59466177618
169779.01193039413
162880.27156393157
156919.7940533136
151706.25792449425
147097.7472314125
142986.70146558745
139290.07341053346
135942.695577132
132892.6928011145
130098.2388271945
127525.21842900354
125145.51329600651
122935.72601104058
120876.21704486091
118950.36886650481
117144.01714519883
115445.00644692937
113842.83977011891
112328.39957522864
110893.72382862937
109531.82477463399
108236.54118335682
107002.41704095328
105824.60128822143
104698.76443639606
103621.0288092196
102587.90985875421
101596.26653658629
100643.25911375877
99726.31316229717
98843.08866089494
97991.45338370097
97169.45988657324
96375.32552887856
95607.41506796701
94864.22544317483
94144.37243072719
93446.57890336841
92769.6644714252
92112.53631719643
91474.18106359545
90853.6575419978
90250.09034424629
89662.66406045217
89090.6181

  f1 = 2*precision*recall/( precision + recall )
  accuracy = (true_positives + true_negatives) / (true_positives + true_negatives + false_positives + false_negatives)
  precision = true_positives / (true_positives + false_positives)
  recall = true_positives / (true_positives + false_negatives)


[]
[]
503234.48784222343
357372.2993106928
293253.17957171414
254661.78428056417
228399.10116658194
209185.27389171507
194427.0605549301
182682.6573033282
173080.0680671403
165058.26779448515
158238.6614257436
152355.88316136805
147218.10847830217
142683.08758555638
138643.03829287033
135014.77360507214
131733.0499804479
128745.96713342104
126011.71438908714
123496.22407840478
121171.44986334738
119014.08421970467
117004.58995961634
115126.45983413319
113365.64410691346
111710.1033986538
110149.45603542778
108674.69744619187
107277.97502833312
105952.40610518832
104691.92964603615
103491.18465058038
102345.40975102174
101250.35981790062
100202.2362849172
99197.62861357532
98233.46485865599
97306.9697119959
96415.62872540343
95557.1576662542
94729.47615803678
93930.68491537015
93159.04600813237
92412.96568945987
91690.9794029286
90991.7386493388
90313.99944646352
89656.61215830977
89018.51250589374
88398.71360072576
87796.29886635211
87210.41573338117
86640.27001015165
86085.12084522462

  f1 = 2*precision*recall/( precision + recall )
  accuracy = (true_positives + true_negatives) / (true_positives + true_negatives + false_positives + false_negatives)
  precision = true_positives / (true_positives + false_positives)
  recall = true_positives / (true_positives + false_negatives)


[478]
[]
489438.63914314087
344931.6226879355
282425.0307044848
245066.866762269
219738.59267560544
201260.70345774983
187103.7128710063
175864.7990550605
166696.67524176728
159054.6627088993
152571.43466102422
146989.69774446407
142123.6147142598
137835.51225505123
134021.1960095005
130600.3512413925
127510.07375640777
124700.39594404103
122131.12304531301
119769.55232039327
117588.80060439513
115566.55936200698
113684.15535704714
111925.8331820496
110278.20108249811
108729.79848144496
107270.75524644724
105892.52084556397
104587.64726505759
103349.61365923056
102172.68367027551
101051.78852836185
99982.43064850655
98960.60363972002
97982.72554460485
97045.58281283632
96146.2830361421
95282.21487635345
94451.01393151014
93650.53352984144
92878.8196338466
92134.08918887361
91414.71137160242
90719.19129062633
90046.1557691341
89394.34090258158
88762.58113531106
88149.79964174694
87554.99983195792
86977.25782950045
86415.71579271147
85869.57596992946
85338.09539520803
84820.58114456019
8

  f1 = 2*precision*recall/( precision + recall )
  accuracy = (true_positives + true_negatives) / (true_positives + true_negatives + false_positives + false_negatives)
  precision = true_positives / (true_positives + false_positives)
  recall = true_positives / (true_positives + false_negatives)


[]
[]
502203.86638717673
358442.4360526867
295146.9668450014
257023.366491283
231018.50248716376
211953.85729935445
197289.3766537176
185609.54304547032
176055.2601545236
168071.90925141002
161284.17853081098
155428.48590770774
150314.06401647875
145799.3579551065
141777.0888180226
138164.4615435999
134896.55179880068
131921.72547711292
129198.39619571707
126692.68593163672
124376.70871954574
122227.29250238958
120225.014374006
118353.46340259035
116598.67099550249
114948.66614850904
113393.12485002073
111923.09122267833
110530.75385845454
109209.26500759546
107952.59332617364
106755.40311551357
105612.95463433312
104521.02129392426
103475.82047221178
102473.95538495862
101512.36598966604
100588.28731169426
99699.2139033572
98842.86939765564
98017.18031556506
97220.25344181297
96450.35620820096
95705.89962283148
94985.4233634916
94287.58271802565
93611.13710703443
92954.93996708315
92317.92980777657
91699.12228501485
91097.60315673023
90512.522007317
89943.08664358935
89388.55807901555

  f1 = 2*precision*recall/( precision + recall )


[{'true_positives': 0, 'true_negatives': 0, 'false_positives': 9801, 'false_negatives': 2696, 'accuracy': 0.0, 'precision': 0.0, 'recall': 0.0, 'f1': nan}, {'true_positives': 0, 'true_negatives': 0, 'false_positives': 9550, 'false_negatives': 2736, 'accuracy': 0.0, 'precision': 0.0, 'recall': 0.0, 'f1': nan}, {'true_positives': 0, 'true_negatives': 0, 'false_positives': 9690, 'false_negatives': 2682, 'accuracy': 0.0, 'precision': 0.0, 'recall': 0.0, 'f1': nan}, {'true_positives': 0, 'true_negatives': 0, 'false_positives': 9621, 'false_negatives': 2696, 'accuracy': 0.0, 'precision': 0.0, 'recall': 0.0, 'f1': nan}, {'true_positives': 0, 'true_negatives': 0, 'false_positives': 9712, 'false_negatives': 2660, 'accuracy': 0.0, 'precision': 0.0, 'recall': 0.0, 'f1': nan}]


  accuracy = (true_positives + true_negatives) / (true_positives + true_negatives + false_positives + false_negatives)
  precision = true_positives / (true_positives + false_positives)
  recall = true_positives / (true_positives + false_negatives)


In [25]:
print(collaborative_results)
print(item_to_item_results)

[{'true_positives': 0, 'true_negatives': 0, 'false_positives': 9801, 'false_negatives': 2696, 'accuracy': 0.0, 'precision': 0.0, 'recall': 0.0, 'f1': nan}, {'true_positives': 0, 'true_negatives': 0, 'false_positives': 9550, 'false_negatives': 2736, 'accuracy': 0.0, 'precision': 0.0, 'recall': 0.0, 'f1': nan}, {'true_positives': 0, 'true_negatives': 0, 'false_positives': 9690, 'false_negatives': 2682, 'accuracy': 0.0, 'precision': 0.0, 'recall': 0.0, 'f1': nan}, {'true_positives': 0, 'true_negatives': 0, 'false_positives': 9621, 'false_negatives': 2696, 'accuracy': 0.0, 'precision': 0.0, 'recall': 0.0, 'f1': nan}, {'true_positives': 0, 'true_negatives': 0, 'false_positives': 9712, 'false_negatives': 2660, 'accuracy': 0.0, 'precision': 0.0, 'recall': 0.0, 'f1': nan}]
[{'true_positives': 0, 'true_negatives': 0, 'false_positives': 0, 'false_negatives': 0, 'accuracy': nan, 'precision': nan, 'recall': nan, 'f1': nan}, {'true_positives': 0, 'true_negatives': 0, 'false_positives': 0, 'false_ne

## Część 7. - istotność statystyczna

In [24]:
def possibility_of_at_least_k_successes_in_n(k, n):
    p = 0.0
    # obliczamy kolejno prawdopodobienstwo k sukcesow, k+1 sukcesow, ...
    # przydadza Ci sie funkcje marh.comb() i math.pow()
    for i in range(k, n+1):
        p += math.comb(n, i) * math.pow(0.5, i) * math.pow(0.5, n-i)
    
    return p

p = 0.05
metric = 'recall'

positive_tests_count = np.sum(list(map(lambda x, y: x[metric]>y[metric], collaborative_results, item_to_item_results)))# w ilu przypadkach okazalismy sie lepsi niz random?


if possibility_of_at_least_k_successes_in_n(positive_tests_count, n_tests) <= p:
    print('We are better than random!')
else:
    print('There is no evidence we are better')

There is no evidence we are better


[{'true_positives': 0, 'true_negatives': 0, 'false_positives': 0, 'false_negatives': 0, 'accuracy': nan, 'precision': nan, 'recall': nan, 'f1': nan}, {'true_positives': 0, 'true_negatives': 0, 'false_positives': 0, 'false_negatives': 0, 'accuracy': nan, 'precision': nan, 'recall': nan, 'f1': nan}, {'true_positives': 0, 'true_negatives': 0, 'false_positives': 0, 'false_negatives': 0, 'accuracy': nan, 'precision': nan, 'recall': nan, 'f1': nan}, {'true_positives': 0, 'true_negatives': 0, 'false_positives': 0, 'false_negatives': 0, 'accuracy': nan, 'precision': nan, 'recall': nan, 'f1': nan}, {'true_positives': 0, 'true_negatives': 0, 'false_positives': 0, 'false_negatives': 0, 'accuracy': nan, 'precision': nan, 'recall': nan, 'f1': nan}]
