# Matrix Factorization

In [43]:
# Import libraries
import numpy as np
import pandas as pd

# Load the (userId, movieId, rating) triples
ratings = pd.read_csv(
    './ml-latest-small/ratings.csv',
    sep=',',
    encoding='latin-1',
    usecols=['userId', 'movieId', 'rating'],
)

# Load the userId column of the tags file (kept under the name `users`)
users = pd.read_csv(
    './ml-latest-small/tags.csv',
    sep=',',
    encoding='latin-1',
    usecols=['userId'],
)

# Load the movie table: id, title and genres
movies = pd.read_csv(
    './ml-latest-small/movies.csv',
    sep=',',
    encoding='latin-1',
    usecols=['movieId', 'title', 'genres'],
)

In [44]:
# Preview the first rows of the movies table
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [45]:
# Preview the first rows of the ratings table
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [46]:
# Count the distinct users and movies that appear in the ratings table
n_users = len(ratings['userId'].unique())
n_movies = len(ratings['movieId'].unique())
print('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_movies))

Number of users = 610 | Number of movies = 9724


In [47]:
# Reshape to one row per user and one column per movie;
# user/movie pairs without a rating are filled with 0.
Ratings = (
    ratings
    .pivot(index='userId', columns='movieId', values='rating')
    .fillna(0)
)
Ratings.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
# Centre every user's row on that user's mean.
# The mean is taken over the whole matrix row, so the zeros that stand in
# for unrated movies are part of it.
R = Ratings.to_numpy()
user_ratings_mean = R.mean(axis=1)
Ratings_demeaned = R - user_ratings_mean[:, np.newaxis]

In [49]:
from scipy.sparse.linalg import svds

# Truncated SVD of the centred matrix, keeping 5 singular values
U, sigma, Vt = svds(Ratings_demeaned, k=5)

# svds hands back the singular values as a 1-D array; turn them into a diagonal matrix
sigma = np.diag(sigma)

# Rebuild the low-rank matrix and add each user's mean back to their row
all_user_predicted_ratings = U.dot(sigma).dot(Vt) + user_ratings_mean.reshape(-1, 1)

In [50]:
# Wrap the predicted matrix in a frame whose columns are the movieIds.
# Rows get the default 0-based index (position), not the userId.
preds = pd.DataFrame(data=all_user_predicted_ratings, columns=Ratings.columns)
preds.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
0,2.372491,1.10549,0.938741,-0.026981,0.259134,2.142086,0.364751,0.083575,0.142991,2.12629,...,-0.05597,-0.055302,-0.056638,-0.056638,-0.05597,-0.056638,-0.05597,-0.05597,-0.05597,-0.071995
1,0.167941,0.043916,-0.128525,0.002104,-0.039593,0.116747,-0.078574,-0.017696,-0.00658,-0.010213,...,0.010264,0.00972,0.010808,0.010808,0.010264,0.010808,0.010264,0.010264,0.010264,0.015142
2,0.016219,0.002541,0.031266,0.004392,-0.007318,0.08207,-0.009471,0.009514,0.015639,0.065282,...,0.007666,0.007663,0.00767,0.00767,0.007666,0.00767,0.007666,0.007666,0.007666,0.006166
3,1.096902,0.051728,0.236111,0.012459,0.061574,0.790497,0.287764,-0.011903,-0.008722,0.252987,...,-0.019262,-0.018326,-0.020198,-0.020198,-0.019262,-0.020198,-0.019262,-0.019262,-0.019262,-0.017496
4,1.351888,0.844861,0.384379,0.112229,0.457877,0.697474,0.537043,0.079407,0.123215,1.1538,...,0.012675,0.012995,0.012355,0.012355,0.012675,0.012355,0.012675,0.012675,0.012675,0.013733


In [51]:
def recommend_movies(predictions, userID, movies, original_ratings, num_recommendations):
    """Return a user's rated movies and the best-predicted movies they have not rated.

    Parameters
    ----------
    predictions : pandas.DataFrame
        Predicted ratings, one row per user and one column per movieId.
        Rows are addressed by position: the row used for ``userID`` is
        ``userID - 1``, whatever the frame's index labels are.
    userID : int
        1-based user id.
    movies : pandas.DataFrame
        Movie table; must contain a ``movieId`` column.
    original_ratings : pandas.DataFrame
        Ratings table with ``userId``, ``movieId`` and ``rating`` columns.
    num_recommendations : int
        Maximum number of recommended movies to return.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``user_full``: the user's ratings joined with ``movies``, highest
        rating first.
        ``recommendations``: rows of ``movies`` the user has not rated,
        ordered by predicted rating (highest first). The prediction column
        itself is not included.

    Raises
    ------
    IndexError
        If ``userID - 1`` is not a valid row position of ``predictions``.
    """
    user_row_number = userID - 1  # User ID starts at 1, row positions at 0

    # A negative position would wrap around in iloc and silently return
    # another user's predictions, so reject anything outside the table.
    if not 0 <= user_row_number < len(predictions):
        raise IndexError('userID %r has no row in predictions' % (userID,))

    # Name the value column and the movie axis explicitly instead of relying
    # on the row label happening to equal the row position.
    user_predictions = (
        predictions.iloc[user_row_number]
        .rename('Predictions')
        .rename_axis('movieId')
        .reset_index()
    )

    # Get the user's data and merge in the movie information.
    user_data = original_ratings[original_ratings.userId == userID]
    user_full = (
        user_data
        .merge(movies, how='left', on='movieId')
        .sort_values(['rating'], ascending=False)
    )

    # Recommend the highest predicted rating movies that the user hasn't seen yet.
    unseen_movies = movies[~movies['movieId'].isin(user_full['movieId'])]
    recommendations = (
        unseen_movies
        .merge(user_predictions, how='left', on='movieId')
        .sort_values('Predictions', ascending=False)
        .iloc[:num_recommendations]
        .drop(columns='Predictions')
    )

    return user_full, recommendations

### Recomendações para um usuário

In [52]:
user = 2  # id of the user the recommendations are generated for
already_rated, predictions = recommend_movies(
    preds, user, movies, ratings, num_recommendations=5
)

#### Filmes já avaliados pelo usuário

In [53]:
# First five of the movies `user` has rated (the frame is sorted by rating, highest first)
already_rated.head(5)

Unnamed: 0,userId,movieId,rating,title,genres
28,2,131724,5.0,The Jinx: The Life and Deaths of Robert Durst ...,Documentary
27,2,122882,5.0,Mad Max: Fury Road (2015),Action|Adventure|Sci-Fi|Thriller
22,2,106782,5.0,"Wolf of Wall Street, The (2013)",Comedy|Crime|Drama
18,2,89774,5.0,Warrior (2011),Drama
9,2,60756,5.0,Step Brothers (2008),Comedy


#### Filmes recomendados pelo sistema para este usuário

In [54]:
# Movies recommended to `user`: titles the user has not rated, ranked by predicted rating
predictions

Unnamed: 0,movieId,title,genres
2223,2959,Fight Club (1999),Action|Crime|Drama|Thriller
1936,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller
4795,7153,"Lord of the Rings: The Return of the King, The...",Action|Adventure|Drama|Fantasy
257,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
3634,4993,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy


### Avaliação

In [55]:
# Import libraries from Surprise package
from surprise import Reader, Dataset, SVD, model_selection

# Take the rating scale from the data itself: Reader() defaults to (1, 5),
# while this table holds half-star ratings (e.g. 3.5, 4.5).
reader = Reader(rating_scale=(ratings.rating.min(), ratings.rating.max()))

# Only ratings of movies with movieId > 3000 are handed to Surprise.
data = Dataset.load_from_df(ratings[ratings.movieId > 3000][['userId', 'movieId', 'rating']], reader)

# No explicit fold split here: model_selection.cross_validate builds its own
# folds, and the old data.split(n_folds=...) call belongs to the deprecated
# evaluate() API.

In [56]:
# 5-fold cross-validation of an SVD model with default hyper-parameters, scored by RMSE
svd = SVD()
result_rmse = model_selection.cross_validate(algo=svd, data=data, measures=['RMSE'], cv=5)

In [57]:
# Dump the raw result dictionary, then one readable section per fold.
print(result_rmse,"\n")

# Walk the per-fold arrays together so the report covers however many folds
# were actually run instead of assuming there are exactly five.
fold_results = zip(result_rmse["test_rmse"], result_rmse["fit_time"], result_rmse["test_time"])
for fold_number, (fold_rmse, fold_fit_time, fold_test_time) in enumerate(fold_results, 1):
    print("Result ", fold_number, " fold avaliation:")
    print("rmse: ", fold_rmse)
    print("fit time: ", fold_fit_time)
    print("test time: ", fold_test_time, "\n")

({u'test_rmse': array([0.86745424, 0.86387862, 0.8738634 , 0.85563841, 0.87700068]), u'fit_time': (1.958899974822998, 1.898643970489502, 2.0645267963409424, 1.8861069679260254, 1.8072049617767334), u'test_time': (0.06477999687194824, 0.07619905471801758, 0.07004904747009277, 0.0637979507446289, 0.11854410171508789)}, '\n')
('Result ', 1, ' fold avaliation:')
('rmse: ', 0.8674542392291331)
('fit time: ', 1.958899974822998)
('test time: ', 0.06477999687194824, '\n')
('Result ', 2, ' fold avaliation:')
('rmse: ', 0.863878619643839)
('fit time: ', 1.898643970489502)
('test time: ', 0.07619905471801758, '\n')
('Result ', 3, ' fold avaliation:')
('rmse: ', 0.873863396863185)
('fit time: ', 2.0645267963409424)
('test time: ', 0.07004904747009277, '\n')
('Result ', 4, ' fold avaliation:')
('rmse: ', 0.8556384064400845)
('fit time: ', 1.8861069679260254)
('test time: ', 0.0637979507446289, '\n')
('Result ', 5, ' fold avaliation:')
('rmse: ', 0.8770006762719014)
('fit time: ', 1.8072049617767334

In [58]:
# Build a single training set from everything in `data` and refit the model on it
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fd86835add0>

In [59]:
# Show up to 100 of the ratings given by `user`
ratings[ratings['userId'] == user].head(100)

Unnamed: 0,userId,movieId,rating
232,2,318,3.0
233,2,333,4.0
234,2,1704,4.5
235,2,3578,4.0
236,2,6874,4.0
237,2,8798,3.5
238,2,46970,4.0
239,2,48516,4.0
240,2,58559,4.5
241,2,60756,5.0


In [67]:
user = 2

# Ratings this user gave to movies with movieId <= 3000, i.e. the movies that
# were left out of the Surprise dataset. Both conditions go into one mask:
# chaining two boolean selections makes pandas re-align the second mask
# against the already-filtered frame and emit a warning.
held_out = ratings.loc[
    (ratings.movieId <= 3000) & (ratings.userId == user),
    ['movieId', 'rating'],
]
moviesToShow = held_out['movieId']

print("\nSome movies predicted to user " + str(user))

total = 0
corrects = 0
# Iterate over (movie, real rating) pairs directly instead of re-filtering the
# whole ratings table once per movie.
for m, userRealEvaluation in zip(held_out['movieId'], held_out['rating']):
    evaluation = svd.predict(user, m).est
    print("\nprediction (movieId",m,"): ",evaluation, " - nota real: ", userRealEvaluation)

    # A prediction counts as correct when its rounded value equals the integer
    # part of the real rating (int() truncates, so 4.5 is compared as 4).
    if int(round(evaluation)) == int(userRealEvaluation):
        corrects += 1
    total += 1

# Floats keep the division below a true division on Python 2 as well.
corrects = float(corrects)
total = float(total)
if total > 0:
    print("\nAcuracy to user " + str(user) + " predictions: ", corrects/total)
else:
    print("No movies to test")


Some movies predicted to user 2
('\nprediction (movieId', 318, '): ', 3.5912903661452327, ' - nota real: ', u' 3.0')
('\nprediction (movieId', 333, '): ', 3.5912903661452327, ' - nota real: ', u' 4.0')
('\nprediction (movieId', 1704, '): ', 3.5912903661452327, ' - nota real: ', u' 4.5')
('\nAcuracy to user 2 predictions: ', 0.6666666666666666)


  
  # Remove the CWD from sys.path while we load stuff.


Ao carregar o dataset, excluímos os filmes com id <= 3000 (apenas os filmes com id > 3000 foram carregados); portanto, utilizaremos esses dados, que não foram usados na avaliação com o RMSE, para ver mais explicitamente se as predições estão de acordo com as avaliações reais.

A partir da comparação entre a predição arredondada e a avaliação real do usuário, foi possível calcular uma acurácia.