# Matrix Factorization

In [57]:
# Import libraries
import numpy as np
import pandas as pd

# The MovieLens CSV files are UTF-8 encoded; decoding them as latin-1 garbles
# accented characters in movie titles (e.g. "Misérables, Les (1995)").

# Ratings: the userId, movieId and rating columns of ratings.csv.
ratings = pd.read_csv('./ml-latest-small/ratings.csv', sep=',', encoding='utf-8', usecols=['userId', 'movieId', 'rating'])

# NOTE(review): despite its name, `users` holds only the userId column of
# tags.csv, not a dedicated users table, and no visible cell reads it.
users = pd.read_csv('./ml-latest-small/tags.csv', sep=',', encoding='utf-8', usecols=['userId'])

# Movies: id, title and pipe-separated genres of every movie.
movies = pd.read_csv('./ml-latest-small/movies.csv', sep=',', encoding='utf-8', usecols=['movieId', 'title', 'genres'])

In [58]:
# Preview the first rows of the movies table (movieId, title, genres).
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [59]:
# Preview the first rows of the ratings table (userId, movieId, rating).
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [60]:
# Count the distinct users and movies that appear in the ratings table.
n_users = len(ratings['userId'].unique())
n_movies = len(ratings['movieId'].unique())

summary_parts = ['Number of users = ', str(n_users), ' | Number of movies = ', str(n_movies)]
print(''.join(summary_parts))

Number of users = 610 | Number of movies = 9724


In [61]:
# Reshape the long ratings table into a wide matrix: one row per userId,
# one column per movieId, cell value = rating.
ratings_wide = ratings.pivot(
    index='userId',
    columns='movieId',
    values='rating',
)

# Movies a user has not rated become 0 instead of NaN.
Ratings = ratings_wide.fillna(0)
Ratings.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [62]:
# Work on the raw NumPy array of the user x movie matrix.
R = Ratings.to_numpy()

# Per-user mean over every movie column (the zero-filled, unrated cells are
# part of the average).
user_ratings_mean = R.mean(axis=1)

# Subtract each user's mean from that user's row; the mean is turned into a
# column vector so it broadcasts across the movie axis.
Ratings_demeaned = R - user_ratings_mean[:, np.newaxis]

In [63]:
from scipy.sparse.linalg import svds 
# Truncated SVD of the de-meaned matrix, keeping 5 singular values/vectors.
U, sigma, Vt = svds(Ratings_demeaned, k=5)

# svds returns the singular values as a 1-D array; turn them into a diagonal
# matrix so the factors can be multiplied back together.
sigma = np.diag(sigma)

# Low-rank reconstruction, with each user's mean added back to their row.
low_rank_reconstruction = U @ sigma @ Vt
all_user_predicted_ratings = low_rank_reconstruction + user_ratings_mean.reshape(-1, 1)

In [64]:
# Wrap the predicted-rating matrix in a DataFrame whose columns are the
# movieIds of `Ratings`. No index is passed, so the rows get the default
# positional index (0, 1, 2, ...) rather than the userId.
preds = pd.DataFrame(all_user_predicted_ratings, columns = Ratings.columns)
preds.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
0,2.372491,1.10549,0.938741,-0.026981,0.259134,2.142086,0.364751,0.083575,0.142991,2.12629,...,-0.05597,-0.055302,-0.056638,-0.056638,-0.05597,-0.056638,-0.05597,-0.05597,-0.05597,-0.071995
1,0.167941,0.043916,-0.128525,0.002104,-0.039593,0.116747,-0.078574,-0.017696,-0.00658,-0.010213,...,0.010264,0.00972,0.010808,0.010808,0.010264,0.010808,0.010264,0.010264,0.010264,0.015142
2,0.016219,0.002541,0.031266,0.004392,-0.007318,0.08207,-0.009471,0.009514,0.015639,0.065282,...,0.007666,0.007663,0.00767,0.00767,0.007666,0.00767,0.007666,0.007666,0.007666,0.006166
3,1.096902,0.051728,0.236111,0.012459,0.061574,0.790497,0.287764,-0.011903,-0.008722,0.252987,...,-0.019262,-0.018326,-0.020198,-0.020198,-0.019262,-0.020198,-0.019262,-0.019262,-0.019262,-0.017496
4,1.351888,0.844861,0.384379,0.112229,0.457877,0.697474,0.537043,0.079407,0.123215,1.1538,...,0.012675,0.012995,0.012355,0.012355,0.012675,0.012355,0.012675,0.012675,0.012675,0.013733


In [65]:
def recommend_movies(predictions, userID, movies, original_ratings, num_recommendations):
    """Return the movies a user already rated and the top unseen recommendations.

    Parameters
    ----------
    predictions : pandas.DataFrame
        Predicted ratings, one row per user and one column per movieId. The
        column index must be named ``movieId``. Rows are looked up by
        position: the row at position ``userID - 1`` is taken as the row of
        ``userID`` (i.e. user ids are assumed to be 1, 2, 3, ... in row order).
    userID : int
        1-based id of the user to recommend for.
    movies : pandas.DataFrame
        Movie metadata with at least a ``movieId`` column.
    original_ratings : pandas.DataFrame
        Ratings with ``userId``, ``movieId`` and ``rating`` columns.
    num_recommendations : int
        Maximum number of recommendations to return.

    Returns
    -------
    (user_full, recommendations) : tuple of pandas.DataFrame
        ``user_full`` holds the user's ratings joined with the movie metadata,
        highest rating first. ``recommendations`` holds the columns of
        ``movies`` for the unrated movies with the highest predicted rating,
        best first (the predicted value itself is not included).

    Raises
    ------
    IndexError
        If ``userID`` does not map to a row of ``predictions``.
    """
    # User ID starts at 1, row positions start at 0.
    user_row_number = userID - 1

    # Reject ids outside the matrix explicitly: a zero or negative userID would
    # otherwise wrap around through negative positional indexing and silently
    # return another user's predictions.
    if not 0 <= user_row_number < len(predictions):
        raise IndexError(
            "userID {} is out of range for predictions with {} rows".format(userID, len(predictions))
        )

    # Name the Series directly instead of renaming a column keyed on the row
    # label afterwards, so this works whatever index `predictions` carries.
    sorted_user_predictions = (
        predictions.iloc[user_row_number]
        .sort_values(ascending=False)
        .rename('Predictions')
    )

    # Get the user's ratings and merge in the movie information.
    user_data = original_ratings[original_ratings.userId == userID]
    user_full = (
        user_data
        .merge(movies, how='left', on='movieId')
        .sort_values(['rating'], ascending=False)
    )

    # Recommend the highest predicted rating movies that the user hasn't rated yet.
    unseen_movies = movies[~movies['movieId'].isin(user_full['movieId'])]
    recommendations = (
        unseen_movies
        # Left join keeps movies without a prediction; their NaN sorts last.
        .merge(sorted_user_predictions.reset_index(), how='left', on='movieId')
        .sort_values('Predictions', ascending=False)
        # 'Predictions' is the last column after the merge; drop it from the result.
        .iloc[:num_recommendations, :-1]
    )

    return user_full, recommendations

### Recomendações para um usuário

In [66]:
# Recommendations for user 1: `already_rated` is the user's rating history with
# movie details, `predictions` the 20 best-scored movies not rated by them yet.
already_rated, predictions = recommend_movies(preds, 1, movies, ratings, 20)

#### Filmes já avaliados pelo usuário

In [67]:
# Top 20 movies that user 1 (the userID passed to recommend_movies) has rated
already_rated.head(20)

Unnamed: 0,userId,movieId,rating,title,genres
231,1,5060,5.0,M*A*S*H (a.k.a. MASH) (1970),Comedy|Drama|War
185,1,2872,5.0,Excalibur (1981),Adventure|Fantasy
89,1,1291,5.0,Indiana Jones and the Last Crusade (1989),Action|Adventure
90,1,1298,5.0,Pink Floyd: The Wall (1982),Drama|Musical
190,1,2948,5.0,From Russia with Love (1963),Action|Adventure|Thriller
189,1,2947,5.0,Goldfinger (1964),Action|Adventure|Thriller
188,1,2944,5.0,"Dirty Dozen, The (1967)",Action|Drama|War
186,1,2899,5.0,Gulliver's Travels (1939),Adventure|Animation|Children
184,1,2858,5.0,American Beauty (1999),Drama|Romance
179,1,2700,5.0,"South Park: Bigger, Longer and Uncut (1999)",Animation|Comedy|Musical


#### Filmes recomendados pelo sistema para este usuário

In [68]:
# Top 20 movies that user 1 (the userID passed to recommend_movies) hopefully will enjoy
predictions

Unnamed: 0,movieId,title,genres
475,589,Terminator 2: Judgment Day (1991),Action|Sci-Fi
831,1200,Aliens (1986),Action|Adventure|Horror|Sci-Fi
736,1036,Die Hard (1988),Action|Crime|Thriller
615,858,"Godfather, The (1972)",Crime|Drama
445,541,Blade Runner (1982),Action|Sci-Fi|Thriller
28,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
314,380,True Lies (1994),Action|Adventure|Comedy|Romance|Thriller
660,924,2001: A Space Odyssey (1968),Adventure|Drama|Sci-Fi
1896,2762,"Sixth Sense, The (1999)",Drama|Horror|Mystery
844,1221,"Godfather: Part II, The (1974)",Crime|Drama


### Avaliação

In [52]:
# Import libraries from Surprise package
from surprise import Reader, Dataset, SVD, model_selection

# MovieLens ratings range from 0.5 to 5.0 in half-star steps, whereas Reader()
# defaults to rating_scale=(1, 5); declare the real scale so estimates are
# clipped to the correct range.
reader = Reader(rating_scale=(0.5, 5.0))

# Only ratings of movies with movieId > 130 are handed to Surprise; the
# remaining movies (movieId <= 130) are kept out of this dataset.
data = Dataset.load_from_df(ratings[ratings.movieId > 130 ][['userId', 'movieId', 'rating']], reader)

# No explicit fold split is done here: Dataset.split() was deprecated and then
# removed in Surprise 1.1, and model_selection.cross_validate builds its own
# 5 folds by default.

In [59]:
# SVD matrix-factorization model from Surprise with its default hyper-parameters.
svd = SVD()

# Cross-validate the model on `data`, measuring RMSE only. The result is a dict
# with the keys 'test_rmse', 'fit_time' and 'test_time', one entry per fold.
result_rmse = model_selection.cross_validate(
    algo=svd,
    data=data,
    measures=['RMSE'],
)

{'test_rmse': array([0.87148446, 0.87865544, 0.86622688, 0.88035164, 0.87531628]), 'fit_time': (4.967924118041992, 5.7349748611450195, 4.968340873718262, 4.889255046844482, 5.0389015674591064), 'test_time': (0.20512104034423828, 0.26027512550354004, 0.13405203819274902, 0.2688288688659668, 0.13214993476867676)}
Result first fold avaliation:

rmse:  0.8714844591322908


AttributeError: 'dict' object has no attribute 'fit_time'

In [63]:
# Dump the raw cross-validation result, then report each fold separately.
print(result_rmse,"\n")

# Walk the per-fold sequences together instead of assuming exactly 5 folds, so
# the report stays correct if the number of folds changes.
fold_results = zip(result_rmse["test_rmse"], result_rmse["fit_time"], result_rmse["test_time"])
for fold_number, (fold_rmse, fold_fit_time, fold_test_time) in enumerate(fold_results, start=1):
    print("Result ", fold_number, " fold avaliation:")
    print("rmse: ", fold_rmse)
    print("fit time: ", fold_fit_time)
    print("test time: ", fold_test_time, "\n")

{'test_rmse': array([0.87148446, 0.87865544, 0.86622688, 0.88035164, 0.87531628]), 'fit_time': (4.967924118041992, 5.7349748611450195, 4.968340873718262, 4.889255046844482, 5.0389015674591064), 'test_time': (0.20512104034423828, 0.26027512550354004, 0.13405203819274902, 0.2688288688659668, 0.13214993476867676)} 

Result  1  fold avaliation:
rmse:  0.8714844591322908
fit time:  4.967924118041992
test time:  0.20512104034423828 

Result  2  fold avaliation:
rmse:  0.8786554402464454
fit time:  5.7349748611450195
test time:  0.26027512550354004 

Result  3  fold avaliation:
rmse:  0.86622688423767
fit time:  4.968340873718262
test time:  0.13405203819274902 

Result  4  fold avaliation:
rmse:  0.8803516436514902
fit time:  4.889255046844482
test time:  0.2688288688659668 

Result  5  fold avaliation:
rmse:  0.8753162801974724
fit time:  5.0389015674591064
test time:  0.13214993476867676 



In [54]:
# Train the SVD model on every rating contained in `data` (no held-out fold).
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x10e402390>

In [55]:
# Show up to the first 100 ratings given by user 1.
ratings[ratings['userId'] == 1].head(100)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
5,1,70,3.0
6,1,101,5.0
7,1,110,4.0
8,1,151,5.0
9,1,157,5.0


In [56]:
# Ratings user 1 gave to the movies that were left out of the Surprise dataset
# (movieId <= 130). Both conditions are combined into one boolean mask:
# chaining two separate boolean indexers makes pandas reindex the second mask
# and emit a "Boolean Series key will be reindexed" UserWarning.
user1_holdout = ratings[(ratings.movieId <= 130) & (ratings.userId == 1)]
moviesToShow = user1_holdout['movieId']

print("\nSome movies predicted to user 1")

# NOTE(review): the model was trained only on movieId > 130, so none of these
# movies was seen during training; presumably that is why every estimate
# printed below is the same value - confirm this is the intended evaluation.
total = 0
corrects = 0
# Read the movie id and the real rating from the same row instead of filtering
# the whole ratings table again for every movie.
for m, real_rating in zip(user1_holdout['movieId'], user1_holdout['rating']):
    avaliation = svd.predict(1, m).est
    print("\nprediction (movieId",m,"): ",avaliation, " - nota real: ", real_rating)

    # A prediction counts as correct when its rounded value equals the real
    # rating truncated to an integer.
    roundAvaliation = round(avaliation)
    if int(roundAvaliation) == int(real_rating):
        corrects += 1
    total += 1

if total > 0:
    print("\nAcuracy to user 1 predictions: ",corrects/total)
else:
    print("No movies to test")


Some movies predicted to user 1

prediction (movieId 1 ):  4.20757893151167  - nota real:   4.0

prediction (movieId 3 ):  4.20757893151167  - nota real:   4.0

prediction (movieId 6 ):  4.20757893151167  - nota real:   4.0

prediction (movieId 47 ):  4.20757893151167  - nota real:   5.0

prediction (movieId 50 ):  4.20757893151167  - nota real:   5.0

prediction (movieId 70 ):  4.20757893151167  - nota real:   3.0

prediction (movieId 101 ):  4.20757893151167  - nota real:   5.0

prediction (movieId 110 ):  4.20757893151167  - nota real:   4.0

Acuracy to user 1 predictions:  0.5


  """Entry point for launching an IPython kernel.
  if __name__ == '__main__':


Ao carregar o dataset, excluímos os filmes com id ≤ 130; portanto, utilizaremos estes dados, que não foram usados na avaliação com o RMSE, para ver mais explicitamente se as predições estão de acordo com as avaliações reais.

A partir da comparação entre a predição arredondada e a avaliação real do usuário, foi possível calcular uma acurácia.