# Matrix Factorization

In [156]:
# Import libraries
import numpy as np
import pandas as pd

# Reading ratings file
ratings = pd.read_csv('./ml-latest-small/ratings.csv', sep=',', encoding='latin-1', usecols=['userId', 'movieId', 'rating'])

# Reading users file
users = pd.read_csv('./ml-latest-small/tags.csv', sep=',', encoding='latin-1', usecols=['userId'])

# Reading movies file
movies = pd.read_csv('./ml-latest-small/movies.csv', sep=',', encoding='latin-1', usecols=['movieId', 'title', 'genres'])

In [157]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [158]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [159]:
n_users = ratings.userId.unique().shape[0]
n_movies = ratings.movieId.unique().shape[0]
print ('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_movies))

Number of users = 610 | Number of movies = 9724


In [160]:
Ratings = ratings.pivot(index = 'userId', columns ='movieId', values = 'rating').fillna(0)
Ratings.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [161]:
R = Ratings.to_numpy()
user_ratings_mean = np.mean(R, axis = 1)
Ratings_demeaned = R - user_ratings_mean.reshape(-1, 1)

In [162]:
from scipy.sparse.linalg import svds 
U, sigma, Vt = svds(Ratings_demeaned, k = 1.2)

sigma = np.diag(sigma)

all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)

In [163]:
preds = pd.DataFrame(all_user_predicted_ratings, columns = Ratings.columns)
preds.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
0,2.256304,1.192633,0.456102,-0.013961,0.316419,1.248999,0.336774,0.042479,0.020354,1.278537,...,-0.040108,-0.040313,-0.039902,-0.039902,-0.040108,-0.039902,-0.040108,-0.040108,-0.040108,-0.032106
1,0.237791,0.126085,0.048734,-0.000632,0.034065,0.132004,0.036202,0.005296,0.002972,0.135106,...,-0.003378,-0.003399,-0.003356,-0.003356,-0.003378,-0.003356,-0.003378,-0.003378,-0.003378,-0.002537
2,0.033211,0.021625,0.013603,0.008483,0.012081,0.022239,0.012303,0.009098,0.008857,0.022561,...,0.008198,0.008196,0.0082,0.0082,0.008198,0.0082,0.008198,0.008198,0.008198,0.008285
3,1.247695,0.670068,0.270094,0.014826,0.194239,0.700677,0.205293,0.045476,0.033461,0.716718,...,0.000627,0.000515,0.000739,0.000739,0.000627,0.000739,0.000627,0.000627,0.000627,0.004972
4,0.486544,0.254206,0.093326,-0.00935,0.062815,0.266518,0.067261,0.002978,-0.001855,0.27297,...,-0.015062,-0.015107,-0.015017,-0.015017,-0.015062,-0.015017,-0.015062,-0.015062,-0.015062,-0.013314


In [164]:
def recommend_movies(predictions, userID, movies, original_ratings, num_recommendations):
    
    # Get and sort the user's predictions
    user_row_number = userID - 1 # User ID starts at 1, not 0
    sorted_user_predictions = predictions.iloc[user_row_number].sort_values(ascending=False) # User ID starts at 1
    
    # Get the user's data and merge in the movie information.
    user_data = original_ratings[original_ratings.userId == (userID)]
    user_full = (user_data.merge(movies, how = 'left', left_on = 'movieId', right_on = 'movieId').
                     sort_values(['rating'], ascending=False)
                 )
    
    # Recommend the highest predicted rating movies that the user hasn't seen yet.
    recommendations = (movies[~movies['movieId'].isin(user_full['movieId'])].
         merge(pd.DataFrame(sorted_user_predictions).reset_index(), how = 'left',
               left_on = 'movieId',
               right_on = 'movieId').
         rename(columns = {user_row_number: 'Predictions'}).
         sort_values('Predictions', ascending = False).
                       iloc[:num_recommendations, :-1]
                      )

    return user_full, recommendations

### Recomendações para um usuário

In [165]:
already_rated, predictions = recommend_movies(preds, 1, movies, ratings, 20)

#### Filmes já avaliados pelo usuário

In [166]:
# Top 20 movies that User 1310 has rated 
already_rated.head(20)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
231,1,5060,5.0,964984002,M*A*S*H (a.k.a. MASH) (1970),Comedy|Drama|War
185,1,2872,5.0,964981680,Excalibur (1981),Adventure|Fantasy
89,1,1291,5.0,964981909,Indiana Jones and the Last Crusade (1989),Action|Adventure
90,1,1298,5.0,964984086,Pink Floyd: The Wall (1982),Drama|Musical
190,1,2948,5.0,964982191,From Russia with Love (1963),Action|Adventure|Thriller
189,1,2947,5.0,964982176,Goldfinger (1964),Action|Adventure|Thriller
188,1,2944,5.0,964981872,"Dirty Dozen, The (1967)",Action|Drama|War
186,1,2899,5.0,964982703,Gulliver's Travels (1939),Adventure|Animation|Children
184,1,2858,5.0,964980868,American Beauty (1999),Drama|Romance
179,1,2700,5.0,964980985,"South Park: Bigger, Longer and Uncut (1999)",Animation|Comedy|Musical


#### Filmes recomandados pelo sistema para este usuário

In [167]:
# Top 20 movies that User 1310 hopefully will enjoy
predictions

Unnamed: 0,movieId,title,genres
259,318,"Shawshank Redemption, The (1994)",Crime|Drama
615,858,"Godfather, The (1972)",Crime|Drama
475,589,Terminator 2: Judgment Day (1991),Action|Sci-Fi
3407,4993,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy
4568,7153,"Lord of the Rings: The Return of the King, The...",Action|Adventure|Drama|Fantasy
3905,5952,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy
1896,2762,"Sixth Sense, The (1999)",Drama|Horror|Mystery
2910,4226,Memento (2000),Mystery|Thriller
2963,4306,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Ro...
28,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller


### Avaliação

In [177]:
# Import libraries from Surprise package
from surprise import Reader, Dataset, SVD, model_selection
reader = Reader()

data = Dataset.load_from_df(ratings[ratings.movieId > 130 ][['userId', 'movieId', 'rating']], reader)

data.split(n_folds=5)

In [178]:
svd = SVD()
model_selection.cross_validate(svd, data, measures=['RMSE'])

{'test_rmse': array([0.94275465, 0.87904711, 0.92017208, 0.89237104, 0.91662503]),
 'fit_time': (0.18485593795776367,
  0.16380596160888672,
  0.16330194473266602,
  0.16336488723754883,
  0.1695847511291504),
 'test_time': (0.0049610137939453125,
  0.00449681282043457,
  0.004781246185302734,
  0.004937171936035156,
  0.004511117935180664)}

In [172]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0xa272092b0>

In [183]:
ratings[ratings['userId'] == 1].head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
5,1,70,3.0,964982400
6,1,101,5.0,964980868
7,1,110,4.0,964982176
8,1,151,5.0,964984041
9,1,157,5.0,964984100


In [216]:
moviesToShow = ratings[ratings.movieId < 130 ][ratings.userId == 1]['movieId']

print("\nSome movies predicted to user 1")

total = 0
corrects = 0
for m in moviesToShow:
    avaliation = svd.predict(1, m).est
    userRealAvaliation = ratings[ratings.movieId == m ][ratings.userId == 1]['rating']
    print("\nprediction (movieId",m,"): ",avaliation, " - nota real: ", userRealAvaliation.to_string(index=False))
    
    roundAvaliation = round(avaliation)
    
    if(int(roundAvaliation) == int(userRealAvaliation)):
        corrects += 1
    total += 1
    
print("\nAcuracy to user 1 predictions: ",corrects/total)


Some movies predicted to user 1

prediction (movieId 1 ):  4.109711362270893  - nota real:   4.0

prediction (movieId 3 ):  3.5926149896466617  - nota real:   4.0

prediction (movieId 6 ):  4.02950911287846  - nota real:   4.0

prediction (movieId 47 ):  4.407610509265783  - nota real:   5.0

prediction (movieId 50 ):  4.599156626146369  - nota real:   5.0

prediction (movieId 70 ):  3.4295034095125594  - nota real:   3.0

prediction (movieId 101 ):  4.219756919411347  - nota real:   5.0

prediction (movieId 110 ):  4.234922483117044  - nota real:   4.0

Acuracy to user 1 predictions:  0.75


  """Entry point for launching an IPython kernel.
  if __name__ == '__main__':


Ao carregar o dataSet, fizemos a exclusão dos filmes com id < 130, portanto utilizaremos estes dados que não foram usados na avaliação com o rmse, para ver mais explicitamente se as precisões estão de acordo com as avaliações reais.

A partir desta comparação com a predição fe