In [1]:
import pandas as pd
import numpy as np
import numpy.ma as ma
import random

In [17]:
np.set_printoptions(precision=2, suppress=True)

In [2]:
random.seed()

In [3]:
links = pd.read_csv("links_small.csv")
ratings = pd.read_csv("ratings_small.csv")

ratings = ratings.drop("timestamp",axis=1)
ratings_wide = ratings.pivot_table(index = 'userId',
                                   columns = 'movieId',
                                   values = 'rating')

In [4]:
#### Matrices 

links_values = np.array(links)
links_features = np.array(links.columns)

ratings_values = np.array(ratings_wide)
ratings_features = np.array(ratings_wide.columns)

In [5]:
#### Collaborative Filtering 

def dCostFunction_dU(Y,U,V,k,i,lambd=.02):
    return Y[i,:][np.newaxis,:]@V.T@np.linalg.inv(V@V.T+lambd*np.eye(k))

def dCostFunction_dV(Y,U,V,k,i,lambd=.02):
    return Y[:,i][:,np.newaxis].T@U@np.linalg.inv(U.T@U+lambd*np.eye(k)) 

In [6]:
#http://ethen8181.github.io/machine-learning/recsys/1_ALSWR.html

def CollaborativeFiltering(Y,max_iter=10,k=2,eta=.02,lambd=10):
    
    Y = np.nan_to_num(Y)
    m,n = Y.shape
    
    U=np.random.uniform(low = 0,high = (1/np.sqrt(k)),size = [m,k] )
    Uold=np.random.uniform(low = 0,high = (1/np.sqrt(k)),size = [m,k] )
    V=np.random.uniform(low = 0,high = (1/np.sqrt(k)),size = [k,n] )
    
    j= 0
    
    while ((j <= max_iter) and (np.linalg.norm(U-Uold))>=.1):
        
        Uold = U.copy()    
        
        
        for i in range(U.shape[0]):
            U[i,:] = U[i,:] - eta*dCostFunction_dU(Y,U,V,k,i,lambd)

        for i in range(V.shape[1]):
            V[:,i] = V[:,i]-eta*dCostFunction_dV(Y,U,V,k,i,lambd)
            

        #print(U)
        #print(Uold)
        
        
        print(np.linalg.norm(U-Uold))
        print(j)

        #Vold = V
        #V = V-eta*dCostFunction_dV(Y,U,V,lambd)
            #print(np.linalg.norm(V-Vold))
            
        j+=1
        
        #Revisar tamaño de V, creo que está al revés
        
    return U,V

Y=ratings_values


In [7]:
U,V = CollaborativeFiltering(Y,max_iter=50,k=10,eta=.5,lambd=1) 

3.2654656988256368
0
12.39564058888798
1
9.51092239586364
2
7.60334361830856
3
6.464995361930703
4
5.710089484600897
5
5.166881261174067
6
4.752681071016543
7
4.423427566076934
8
4.153522302245535
9
3.9270156390395914
10
3.733395738779496
11
3.5654140078001664
12
3.417884916227954
13
3.2869850123557742
14
3.169823310289831
15
3.064166978590486
16
2.9682599471216595
17
2.8806992935955282
18
2.8003487841840133
19
2.7262770144164388
20
2.657712261525403
21
2.5940089489400955
22
2.5346223431754535
23
2.479089192504846
24
2.427012723628727
25
2.378050881402413
26
2.331907013881933
27
2.2883224234141366
28
2.2470703574338935
29
2.207951121296734
30
2.170788073744239
31
2.1354243226862053
32
2.101719981105513
33
2.0695498743069964
34
2.0388016133964046
35
2.0093739678684055
36
1.9811754839820754
37
1.9541233062721786
38
1.928142167857694
39
1.9031635217326588
40
1.879124790378534
41
1.8559687151354187
42
1.8336427900477896
43
1.8120987675379314
44
1.7912922253938544
45
1.7711821862932777
46
1

In [16]:
U.shape

(671, 2)

In [17]:
V.shape

(2, 9066)

In [18]:
U

array([[ 0.53978821,  0.30368339],
       [ 2.86735277,  0.20689059],
       [ 1.98870416,  0.60563549],
       ...,
       [ 0.93755113, -0.02484656],
       [ 2.17642403,  0.48260553],
       [ 4.34549565, -0.42027992]])

In [20]:
V

array([[-25.62695811,  -9.87173671,  -4.08813719, ...,   0.43304182,
          0.52351207,   0.48691622],
       [  0.29022773,  -0.20912787,   0.48751525, ...,  -0.07143005,
          0.26329494,   2.44946945]])

In [None]:
## notas de santi (no estoy seguro de lo siguiente): 
## Multiplicar U y V para obtener la matriz original pero sin nans _ ? 
# Eso nos daria nuestras predicciones, que habria que comparar con la matriz original y calcular el ndcg_score


In [11]:
Matriz_predicciones = U @ V

In [18]:
Matriz_predicciones

array([[ -5.49,   1.21,  -1.93, ...,   0.08,   0.23,   0.56],
       [-81.41, -75.8 , -32.42, ...,   1.15,   0.5 ,   3.91],
       [-38.34, -10.7 ,  -2.07, ...,   0.6 ,   0.7 ,   3.09],
       ...,
       [-20.45,   0.42,  -3.1 , ...,   0.33,   0.6 ,   1.85],
       [-43.08, -15.49,  -8.42, ...,   0.82,   0.65,   2.23],
       [-91.88, -16.94,  -1.34, ...,   1.23,   1.81,   2.96]])

In [19]:
Matriz_predicciones.shape

(671, 9066)

In [20]:
# Por que hay valores negativos como - 91 ... _??? 

In [8]:
from sklearn.metrics import ndcg_score

#documentacion sobre la metrica:
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.ndcg_score.html#sklearn.metrics.ndcg_score

