# Matrix Factorization

In [None]:
# Import libraries
import numpy as np
import pandas as pd

# Ratings: one (userId, movieId, rating) row per rating event.
ratings = pd.read_csv(
    './ml-latest-small/ratings.csv',
    sep=',',
    encoding='latin-1',
    usecols=['userId', 'movieId', 'rating'],
)

# "Users": only the userId column of tags.csv is read here.
users = pd.read_csv(
    './ml-latest-small/tags.csv',
    sep=',',
    encoding='latin-1',
    usecols=['userId'],
)

# Movies: id, title and genre string for every movie.
movies = pd.read_csv(
    './ml-latest-small/movies.csv',
    sep=',',
    encoding='latin-1',
    usecols=['movieId', 'title', 'genres'],
)

In [None]:
# Preview the first rows of the movies table (movieId, title, genres).
movies.head()

In [None]:
# Preview the first rows of the ratings table (userId, movieId, rating).
ratings.head()

In [None]:
# Count the distinct users and movies that appear in the ratings table.
n_users = len(ratings['userId'].unique())
n_movies = len(ratings['movieId'].unique())
print('Number of users = {} | Number of movies = {}'.format(n_users, n_movies))

In [None]:
# Reshape the long ratings table into a user x movie matrix.
# (user, movie) pairs without a rating are filled with 0.
Ratings = (
    ratings
    .pivot(index='userId', columns='movieId', values='rating')
    .fillna(0)
)
Ratings.head()

In [None]:
# Work on a plain ndarray from here on.
R = Ratings.to_numpy()

# Per-user mean over every column of the row (the 0-filled, unrated entries
# are part of the average), then centre each row on its own mean.
user_ratings_mean = R.mean(axis=1)
Ratings_demeaned = R - user_ratings_mean[:, np.newaxis]

In [None]:
from scipy.sparse.linalg import svds

# Truncated SVD of the centred matrix, keeping k = 5 singular triplets.
U, sigma, Vt = svds(Ratings_demeaned, k=5)

# svds returns the singular values as a 1-D array; expand them into a
# diagonal matrix so the factors can be multiplied back together.
sigma = np.diag(sigma)

# Low-rank reconstruction, with each user's mean added back to its row.
all_user_predicted_ratings = (U @ sigma) @ Vt + user_ratings_mean.reshape(-1, 1)

In [None]:
# Wrap the predicted matrix in a DataFrame whose columns are the movieIds of
# the pivoted ratings; rows keep a default positional index.
preds = pd.DataFrame(data=all_user_predicted_ratings, columns=Ratings.columns)
preds.head(5)

In [None]:
def recommend_movies(predictions, userID, movies, original_ratings, num_recommendations):
    """Return a user's rated movies and the top predicted movies they have not rated.

    Parameters
    ----------
    predictions : pandas.DataFrame
        One row per user and one column per movieId. Rows are addressed by
        position: the user with id ``userID`` is read from row ``userID - 1``.
    userID : int
        1-based user id.
    movies : pandas.DataFrame
        Movie metadata; must contain a ``movieId`` column.
    original_ratings : pandas.DataFrame
        Long-format ratings with ``userId``, ``movieId`` and ``rating`` columns.
    num_recommendations : int
        Maximum number of recommended rows to return.

    Returns
    -------
    (user_full, recommendations)
        ``user_full`` holds the user's own ratings joined with the movie
        metadata, sorted by rating (highest first). ``recommendations`` holds
        the metadata of movies the user has not rated, ordered by predicted
        rating (highest first); the predicted value itself is not included.

    Raises
    ------
    IndexError
        If ``userID - 1`` is not a valid row position of ``predictions``.
    """
    # User ids start at 1 while row positions start at 0.
    user_row_number = userID - 1

    # Without this guard a userID <= 0 would wrap around through negative
    # positional indexing and silently return another user's predictions.
    if not 0 <= user_row_number < len(predictions):
        raise IndexError(
            'userID {} is out of range for predictions with {} rows'.format(
                userID, len(predictions)
            )
        )

    # Name the series and its index explicitly so the merge below does not
    # depend on how the rows/columns of `predictions` happen to be labelled.
    user_predictions = (
        predictions.iloc[user_row_number]
        .sort_values(ascending=False)
        .rename('Predictions')
        .rename_axis('movieId')
        .reset_index()
    )

    # The user's own ratings, enriched with the movie information.
    user_data = original_ratings[original_ratings.userId == userID]
    user_full = (
        user_data
        .merge(movies, how='left', on='movieId')
        .sort_values(['rating'], ascending=False)
    )

    # Highest predicted ratings among the movies the user has not rated yet.
    unseen_movies = movies[~movies['movieId'].isin(user_full['movieId'])]
    recommendations = (
        unseen_movies
        .merge(user_predictions, how='left', on='movieId')
        .sort_values('Predictions', ascending=False)
        # The last column is 'Predictions'; it is dropped from the result.
        .iloc[:num_recommendations, :-1]
    )

    return user_full, recommendations

### Recomendações para um usuário

In [None]:
# Rated movies and top-20 recommendations for user 1.
already_rated, predictions = recommend_movies(
    predictions=preds,
    userID=1,
    movies=movies,
    original_ratings=ratings,
    num_recommendations=20,
)

#### Filmes já avaliados pelo usuário

In [None]:
# Top 20 movies that user 1 (the userID passed to recommend_movies) has rated
already_rated.head(20)

#### Filmes recomendados pelo sistema para este usuário

In [None]:
# Top 20 movies that user 1 (the userID passed to recommend_movies) hopefully will enjoy
predictions

### Avaliação

In [None]:
# Import libraries from Surprise package
from surprise import Reader, Dataset, SVD, model_selection

# Reader() defaults to a 1-5 rating scale. MovieLens ratings use half-star
# steps from 0.5 to 5.0, so the scale is given explicitly; otherwise
# estimates are clipped at 1 instead of 0.5.
reader = Reader(rating_scale=(0.5, 5.0))

# Only ratings of movies with movieId > 130 are handed to Surprise; the
# remaining low-id movies are kept aside for the manual check further down.
data = Dataset.load_from_df(ratings[ratings.movieId > 130 ][['userId', 'movieId', 'rating']], reader)

# No explicit fold splitting here: the legacy Dataset.split(n_folds=5) call is
# not available in current Surprise releases, and
# model_selection.cross_validate performs 5-fold cross-validation by default.

In [None]:
# Surprise SVD with default hyper-parameters, cross-validated on RMSE.
svd = SVD()
model_selection.cross_validate(algo=svd, data=data, measures=['RMSE'])

In [None]:
# Build a trainset from all ratings in `data` and fit the SVD model on it,
# so that svd.predict can be queried afterwards.
trainset = data.build_full_trainset()
svd.fit(trainset)

In [None]:
# Show the first 10 ratings recorded for user 1.
ratings[ratings['userId'] == 1].head(10)

In [None]:
# Hold-out sample: user 1's ratings for movies with movieId < 130. These
# movies were left out of the Surprise dataset (which keeps movieId > 130).
# Both conditions are combined into one mask so it aligns with `ratings`
# (chained boolean indexing reindexes the second mask and emits a warning).
heldOutRatings = ratings[(ratings.movieId < 130) & (ratings.userId == 1)]
moviesToShow = heldOutRatings['movieId']

print("\nSome movies predicted to user 1")

total = 0
corrects = 0
# Walk the hold-out rows once instead of re-filtering `ratings` per movie.
for m, userRealAvaliation in zip(heldOutRatings['movieId'], heldOutRatings['rating']):
    avaliation = svd.predict(1, m).est
    print("\nprediction (movieId",m,"): ",avaliation, " - nota real: ", userRealAvaliation)

    roundAvaliation = round(avaliation)

    # A prediction counts as correct when its rounded value equals the
    # integer part of the real rating (half stars are truncated, e.g. 4.5 -> 4).
    if int(roundAvaliation) == int(userRealAvaliation):
        corrects += 1
    total += 1

# Avoid a ZeroDivisionError when user 1 has no rating in the hold-out range.
if total:
    print("\nAcuracy to user 1 predictions: ",corrects/total)
else:
    print("\nNo held-out ratings found for user 1")

Ao carregar o dataset, mantivemos apenas os filmes com id > 130 (ou seja, excluímos os filmes com id ≤ 130). Portanto, utilizaremos os filmes com id < 130, que não foram usados no treinamento nem na avaliação com o RMSE, para ver mais explicitamente se as predições estão de acordo com as avaliações reais.

A partir da comparação entre a predição (arredondada) e a avaliação real do usuário, foi possível calcular uma acurácia.