In [2]:
import csv
import pickle
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Load the raw ratings (dfr) and movies (dfm) tab-separated exports.
# Malformed rows are skipped with a warning. `on_bad_lines="warn"` is the
# replacement for the removed `error_bad_lines=False` (requires pandas >= 1.3).
dfr = pd.read_csv("./data/vod_ratings.csv", sep="\t", encoding="utf-8-sig",
                  on_bad_lines="warn", low_memory=False)
dfm = pd.read_csv("./data/vod_films.csv", sep="\t", encoding="utf-8-sig",
                  on_bad_lines="warn", low_memory=False)

# Ratings: keep only user id, rating and movie id.
# Columns are selected with a list (not a set) so the column order is
# deterministic and the indexer is accepted by current pandas.
dfr = dfr[['user_id', 'object_id', 'value']].rename(
    columns={'value': 'rating', 'object_id': 'movie_id'})

# Movies: keep only id, year, number of ratings, average rating and title.
dfm = dfm[['id', 'year', 'nb_rating', 'avg_rating', 'title']].rename(
    columns={'id': 'movie_id'})

b'Skipping line 22345: expected 28 fields, saw 29\n'


In [6]:
# Parameters
# A user is kept only if they have strictly more than this many ratings
# (the filtering cell compares with `>`).
minimum_ratings_for_user = 100
# A movie is kept only if it has strictly more than this many ratings
# (counted after the user filter has been applied).
minimum_ratings_for_movie = 100
# True: subtract each movie's mean rating before the per-user centering.
# False: only the per-user mean rating is subtracted.
include_movie_bias = False

In [7]:
def _keep_frequent(ratings, key, threshold):
    """Return the rows of `ratings` whose `key` value has more than `threshold` ratings.

    The count is the number of non-null 'rating' values per `key`, attached to
    every row with a group-wise transform (no merge / rename round trip).
    The result always has the columns user_id, movie_id, rating in that order.
    """
    n_ratings = ratings.groupby(key)['rating'].transform('count')
    return ratings.loc[n_ratings > threshold, ['user_id', 'movie_id', 'rating']]


# Keep only ratings from users who rated strictly more than
# {minimum_ratings_for_user} movies.
dfr = _keep_frequent(dfr, 'user_id', minimum_ratings_for_user)

# Then keep only movies rated by strictly more than
# {minimum_ratings_for_movie} of the remaining users.
dfr = _keep_frequent(dfr, 'movie_id', minimum_ratings_for_movie)

#### Centering of data by removing bias of users who rate consistently higher or lower (+same for movies)
1. Compute the mean rating of each movie: movie_mean(movie) = mean(ratings(movie))
2. Movie-bias correction: rating'(user, movie) = rating(user, movie) - movie_mean(movie)
3. User-bias correction: rating''(user, movie) = rating'(user, movie) - mean over the user's movies of rating'(user, movie)
4. Final formula: rating(u, m) <- rating(u, m) - movie_mean(m) - mean over the movies m' rated by u of (rating(u, m') - movie_mean(m'))

In [8]:
# Center the ratings so that systematic offsets are removed before factorization.
centered = dfr['rating']

if include_movie_bias:
    # Movie-bias correction: subtract each movie's mean rating.
    centered = centered - dfr.groupby('movie_id')['rating'].transform('mean')

# User-bias correction: subtract each user's mean of the (possibly
# movie-centered) ratings.
centered = centered - centered.groupby(dfr['user_id']).transform('mean')

# Keep exactly one 'rating' column holding the centered value. Renaming the
# centered column to 'rating' while the raw column was still present left two
# 'rating' columns when include_movie_bias was True.
temp = dfr[['user_id', 'movie_id']].assign(rating=centered)

temp.head(3)

Unnamed: 0,rating,movie_id,user_id
3,-0.199712,6909,6842
4,0.800288,9371,6842
5,-1.199712,9415,6842


In [9]:
# Build the USERS x MOVIES matrix of centered ratings.
# - Duplicate (user, movie) pairs are collapsed by keeping the highest rating;
#   the string "max" avoids the pandas deprecation warning raised for np.max.
# - NOTE(review): missing pairs are filled with 0, which is also the value of
#   a rating exactly equal to the user's mean, so the two cases cannot be told
#   apart downstream.
table = temp.pivot_table(values='rating', index='user_id', columns='movie_id',
                         aggfunc='max', fill_value=0)
table.shape

(6499, 6068)

In [7]:
"""
@INPUT:
    R     : a matrix to be factorized, dimension N x M
    P     : an initial matrix of dimension N x K
    Q     : an initial matrix of dimension M x K
    K     : the number of latent features
    steps : the maximum number of steps to perform the optimisation
    alpha : the learning rate
    beta  : the regularization parameter
@OUTPUT:
    the final matrices P and Q
@SOURCE : http://www.quuxlabs.com/blog/2010/09/matrix-factorization-a-simple-tutorial-and-implementation-in-python/
"""

'\n@INPUT:\n    R     : a matrix to be factorized, dimension N x M\n    P     : an initial matrix of dimension N x K\n    Q     : an initial matrix of dimension M x K\n    K     : the number of latent features\n    steps : the maximum number of steps to perform the optimisation\n    alpha : the learning rate\n    beta  : the regularization parameter\n@OUTPUT:\n    the final matrices P and Q\n@SOURCE : http://www.quuxlabs.com/blog/2010/09/matrix-factorization-a-simple-tutorial-and-implementation-in-python/\n'

In [39]:
def matrix_factorization(R, P, Q, K, steps=5000, alpha=0.0002, beta=0.02,
                         mask=None, verbose=True):
    """Factorize R ~= P . Q^T by regularized stochastic gradient descent.

    Parameters
    ----------
    R : array-like, shape (N, M)
        Matrix to factorize.
    P : array-like, shape (N, K)
        Initial user-factor matrix.
    Q : array-like, shape (M, K)
        Initial item-factor matrix.
    K : int
        Number of latent features that are updated and regularized
        (normally the second dimension of P and Q).
    steps : int
        Maximum number of passes over the observed entries.
    alpha : float
        Learning rate.
    beta : float
        Regularization parameter.
    mask : array-like of bool, shape (N, M), optional
        True where R holds an observed value. Defaults to ``R > 0``, i.e.
        non-positive entries are treated as missing. Pass an explicit mask
        when legitimate observations can be zero or negative (for example
        mean-centered ratings).
    verbose : bool
        Print one progress line (timestamp, step, error) per pass.

    Returns
    -------
    (P, Q) : tuple of float ndarrays with shapes (N, K) and (M, K).
        The arrays passed in are copied and left unmodified.

    Raises
    ------
    ValueError
        If `mask` is given and its shape differs from R's.
    """
    R = np.asarray(R)
    # Work on float copies: the caller's arrays are not mutated and integer
    # inputs are not truncated by the in-place updates below.
    P = np.array(P, dtype=float)
    Q = np.array(Q, dtype=float).T  # K x M, so Q[:, j] is item j's factors

    if mask is None:
        mask = R > 0
    else:
        mask = np.asarray(mask, dtype=bool)
        if mask.shape != R.shape:
            raise ValueError("mask shape {} does not match R shape {}".format(mask.shape, R.shape))

    # Visit only the observed cells, in the same row-major order as a nested
    # `for i / for j` scan, instead of testing all N*M cells on every pass.
    rows, cols = np.nonzero(mask)
    observed = R[rows, cols]

    for step in range(steps):
        for i, j, r_ij in zip(rows, cols, observed):
            p_row = P[i, :K]  # views: assigning into them updates P / Q
            q_col = Q[:K, j]
            eij = r_ij - np.dot(P[i, :], Q[:, j])
            p_row[:] = p_row + alpha * (2 * eij * q_col - beta * p_row)
            # The item update deliberately uses the freshly updated user row,
            # matching the element-by-element update order of the reference
            # implementation.
            q_col[:] = q_col + alpha * (2 * eij * p_row - beta * q_col)

        # Regularized squared error over the observed entries only.
        predictions = np.sum(P[rows] * Q[:, cols].T, axis=1)
        squared_error = np.sum((observed - predictions) ** 2)
        penalty = (beta / 2) * (np.sum(P[rows, :K] ** 2) + np.sum(Q[:K, cols] ** 2))
        e = squared_error + penalty

        if verbose:
            print("time {} : step {}/{}, alpha = {}, error {}".format(str(datetime.now()),step,steps, alpha, e))
        if e < 0.001:
            break
    return P, Q.T

In [40]:
# Factorize the user x movie matrix using K hidden factors.
# NOTE(review): matrix_factorization only trains on entries where R > 0 (its
# `if R[i][j] > 0` test). `table` holds mean-centered ratings with missing
# pairs filled with 0, so below-average (negative) ratings are skipped as if
# they were missing - confirm this is intended.
R = table.values  # DataFrame.values is already an ndarray; no extra copy needed

N, M = R.shape
K = 7

# Fixed seed so the random initialisation, and therefore the result, is
# reproducible across kernel restarts.
rng = np.random.RandomState(42)
P = rng.rand(N, K)
Q = rng.rand(M, K)

nP, nQ = matrix_factorization(R, P, Q, K, steps=300, alpha=0.0002)
# The dense matrix of predicted ratings is np.dot(nP, nQ.T).

time 2017-11-26 11:53:14.245471 : step 0/300, alpha = 0.0002, error 557876.4031024616
time 2017-11-26 11:54:09.909655 : step 1/300, alpha = 0.0002, error 394500.8176216584
time 2017-11-26 11:55:03.881742 : step 2/300, alpha = 0.0002, error 329492.3234813779
time 2017-11-26 11:55:58.166847 : step 3/300, alpha = 0.0002, error 295337.98191056104
time 2017-11-26 11:56:52.221939 : step 4/300, alpha = 0.0002, error 274171.16091733123
time 2017-11-26 11:57:46.398037 : step 5/300, alpha = 0.0002, error 259592.7033195065
time 2017-11-26 11:58:40.391126 : step 6/300, alpha = 0.0002, error 248820.089059717
time 2017-11-26 11:59:34.620227 : step 7/300, alpha = 0.0002, error 240465.44474786756
time 2017-11-26 12:00:28.484308 : step 8/300, alpha = 0.0002, error 233760.87564938518
time 2017-11-26 12:01:22.733411 : step 9/300, alpha = 0.0002, error 228245.25412304065
time 2017-11-26 12:02:16.753501 : step 10/300, alpha = 0.0002, error 223622.6884034637
time 2017-11-26 12:03:11.033605 : step 11/300, al

time 2017-11-26 13:18:44.336895 : step 94/300, alpha = 0.0002, error 182523.38278795365
time 2017-11-26 13:19:40.077084 : step 95/300, alpha = 0.0002, error 182476.19888459408
time 2017-11-26 13:20:35.654262 : step 96/300, alpha = 0.0002, error 182429.8203368423
time 2017-11-26 13:21:31.268443 : step 97/300, alpha = 0.0002, error 182384.21378453422
time 2017-11-26 13:22:26.890625 : step 98/300, alpha = 0.0002, error 182339.34715801355
time 2017-11-26 13:23:22.611812 : step 99/300, alpha = 0.0002, error 182295.18960633237
time 2017-11-26 13:24:18.147988 : step 100/300, alpha = 0.0002, error 182251.71142974374
time 2017-11-26 13:25:13.644163 : step 101/300, alpha = 0.0002, error 182208.88401664267
time 2017-11-26 13:26:09.145337 : step 102/300, alpha = 0.0002, error 182166.67978392815
time 2017-11-26 13:27:04.746517 : step 103/300, alpha = 0.0002, error 182125.0721212109
time 2017-11-26 13:28:00.383700 : step 104/300, alpha = 0.0002, error 182084.03533844868
time 2017-11-26 13:28:56.0498

KeyboardInterrupt: 

In [None]:
# Persist the learned factor matrices so the long-running factorization does
# not have to be repeated. `with` closes each file even if dumping fails.
with open("nP.p", "wb") as fh:
    pickle.dump(nP, fh)
with open("nQ.p", "wb") as fh:
    pickle.dump(nQ, fh)

In [None]:
# Reload the saved factor matrices.
# NOTE: pickle.load can execute arbitrary code - only load files written by
# this notebook, never files from an untrusted source.
with open("nP.p", "rb") as fh:
    nPP = pickle.load(fh)
with open("nQ.p", "rb") as fh:
    nQQ = pickle.load(fh)

# Rebuild the dense predicted-rating matrix from the reloaded factors, so this
# cell works on a fresh kernel without re-running the factorization.
test = pd.DataFrame(np.dot(nPP, nQQ.T))