In [1]:
%run liblecture.py

import math
import numpy as np
from numpy import linalg as LA
import pandas as pd


In [3]:
# Load the movie catalogue and preview its first rows.
moviesCsvPath = "./data/movielens/movielens/movies_w_imgurl.csv"
movies = pd.read_csv(moviesCsvPath)
movies.head()

Unnamed: 0,movieId,imdbId,title,genres,imgurl
0,1,114709,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,https://images-na.ssl-images-amazon.com/images...
1,2,113497,Jumanji (1995),Adventure|Children|Fantasy,https://images-na.ssl-images-amazon.com/images...
2,3,113228,Grumpier Old Men (1995),Comedy|Romance,https://images-na.ssl-images-amazon.com/images...
3,4,114885,Waiting to Exhale (1995),Comedy|Drama|Romance,https://images-na.ssl-images-amazon.com/images...
4,5,113041,Father of the Bride Part II (1995),Comedy,https://images-na.ssl-images-amazon.com/images...


In [13]:
# One row per (movie, genre) pair: split the pipe-delimited `genres` string into
# columns, then stack them into a Series indexed by (movie row, genre position).
# `str.split(expand=True)` replaces the per-row `.apply(pd.Series, 1)`:
# Series.apply has no axis parameter, so the stray positional `1` was never an
# axis, and constructing one Series per movie is needlessly slow.
# `dropna()` discards the padding cells of movies that list fewer genres than
# the widest row, regardless of how `stack()` treats missing values.
movieGenres = (
    movies['genres']
    .str.split('|', expand=True)
    .stack()
    .dropna()
    .to_frame('genre')
)

In [14]:
# Discard the inner (genre-position) index level so that every row is labelled
# only by its movie's row number in `movies`.
movieGenres = movieGenres.reset_index(level=1, drop=True)
movieGenres

Unnamed: 0,genre
0,Adventure
0,Animation
0,Children
0,Comedy
0,Fantasy
...,...
9121,Fantasy
9121,Sci-Fi
9122,Documentary
9123,Comedy


In [17]:
# Number of (movie, genre) rows per genre, i.e. how many movies carry each genre.
genres = movieGenres.groupby('genre')['genre'].count().to_frame('moviecount')

# Catalogue size: the numerator of the IDF ratio.
totalItems = len(movies)

# idf(genre) = log10(total movies / movies carrying the genre); rarer genres get larger weights.
genres['idf'] = genres['moviecount'].map(lambda movieCount: math.log10(totalItems / movieCount))
genres

Unnamed: 0_level_0,moviecount,idf
genre,Unnamed: 1_level_1,Unnamed: 2_level_1
(no genres listed),18,2.70496
Action,1545,0.771304
Adventure,1117,0.91218
Animation,447,1.309925
Children,583,1.194564
Comedy,3315,0.439749
Crime,1100,0.91884
Documentary,495,1.265628
Drama,4365,0.320249
Fantasy,654,1.144655


In [18]:
# Attach the IDF weight of its genre to every (movie, genre) row by looking the
# genre name up in the `genres` table.
movieGenreWeights = movieGenres.assign(idf=movieGenres['genre'].map(genres['idf']))
movieGenreWeights

Unnamed: 0,genre,idf
0,Adventure,0.912180
0,Animation,1.309925
0,Children,1.194564
0,Comedy,0.439749
0,Fantasy,1.144655
...,...,...
9121,Fantasy,1.144655
9121,Sci-Fi,1.061508
9122,Documentary,1.265628
9123,Comedy,0.439749


In [22]:
# Build the movie x genre weight matrix: one column per genre holding that
# genre's IDF for movies that carry it, 0 otherwise.
movieWeights = movies[['movieId']]

for genre in genres.index:
    # Rows of the long table that belong to this genre, reduced to the idf
    # column and relabelled with the genre name.
    isGenre = movieGenreWeights['genre'] == genre
    movieGenreIdf = movieGenreWeights.loc[isGenre, ['idf']].rename(columns={'idf': genre})
    # Index-aligned left join: movies without this genre get NaN for now.
    movieWeights = movieWeights.join(movieGenreIdf)

# A missing weight means the movie does not carry the genre.
movieWeights = movieWeights.fillna(0)

movieWeights

Unnamed: 0,movieId,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0.0,0.000000,0.91218,1.309925,1.194564,0.439749,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
1,2,0.0,0.000000,0.91218,0.000000,1.194564,0.000000,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
2,3,0.0,0.000000,0.00000,0.000000,0.000000,0.439749,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.771304,0.000000,0.0,0.0,0.0
3,4,0.0,0.000000,0.00000,0.000000,0.000000,0.439749,0.0,0.000000,0.320249,...,0.0,0.0,0.0,0.0,0.0,0.771304,0.000000,0.0,0.0,0.0
4,5,0.0,0.000000,0.00000,0.000000,0.000000,0.439749,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9120,162672,0.0,0.000000,0.91218,0.000000,0.000000,0.000000,0.0,0.000000,0.320249,...,0.0,0.0,0.0,0.0,0.0,0.771304,0.000000,0.0,0.0,0.0
9121,163056,0.0,0.771304,0.91218,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,1.061508,0.0,0.0,0.0
9122,163949,0.0,0.000000,0.00000,0.000000,0.000000,0.000000,0.0,1.265628,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
9123,164977,0.0,0.000000,0.00000,0.000000,0.000000,0.439749,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0


In [26]:
# L2 (Euclidean) norm of each movie's genre-weight vector.
# The first column (movieId) is an identifier, not a weight, so it is skipped.
genreWeightMatrix = movieWeights.iloc[:, 1:].values
movieNorm = pd.DataFrame(
    {'norm2': LA.norm(genreWeightMatrix, ord=2, axis=1)},
    index=movieWeights.index,
)

In [27]:
# Display the per-movie L2 norms (column `norm2`).
movieNorm

Unnamed: 0,norm2
0,2.340636
1,1.889257
2,0.887857
3,0.943848
4,0.439749
...,...
9120,1.236746
9121,1.965710
9122,1.265628
9123,0.439749


In [30]:
# Normalize each movie's genre-weight vector to unit length by dividing every
# row by that movie's L2 norm (movieId column excluded).
genreWeights = movieWeights.iloc[:, 1:]
nomalizeMovieWeights = genreWeights.div(movieNorm['norm2'], axis='index')

# Optional: limit displayed precision to two decimal places.
# np.set_printoptions(precision=2)
# pd.set_option('display.precision', 2)

nomalizeMovieWeights

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0.0,0.00,0.39,0.56,0.51,0.19,0.0,0.0,0.00,0.49,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0
1,0.0,0.00,0.48,0.00,0.63,0.00,0.0,0.0,0.00,0.61,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0
2,0.0,0.00,0.00,0.00,0.00,0.50,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.87,0.00,0.0,0.0,0.0
3,0.0,0.00,0.00,0.00,0.00,0.47,0.0,0.0,0.34,0.00,0.0,0.0,0.0,0.0,0.0,0.82,0.00,0.0,0.0,0.0
4,0.0,0.00,0.00,0.00,0.00,1.00,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9120,0.0,0.00,0.74,0.00,0.00,0.00,0.0,0.0,0.26,0.00,0.0,0.0,0.0,0.0,0.0,0.62,0.00,0.0,0.0,0.0
9121,0.0,0.39,0.46,0.00,0.00,0.00,0.0,0.0,0.00,0.58,0.0,0.0,0.0,0.0,0.0,0.00,0.54,0.0,0.0,0.0
9122,0.0,0.00,0.00,0.00,0.00,0.00,0.0,1.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0
9123,0.0,0.00,0.00,0.00,0.00,1.00,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0


In [32]:
# Item-item cosine similarity matrix.
# The rows of nomalizeMovieWeights are unit vectors, so the product of the
# matrix with its transpose holds the cosine similarity of every movie pair.
# The product is taken on plain ndarrays: calling np.matmul on the DataFrames
# themselves emits a warning (visible in this cell's output), because pandas
# aligns DataFrame inputs to NumPy ufuncs by label, and the labels of a frame
# and its transpose do not match.
normalizedMatrix = nomalizeMovieWeights.to_numpy()
sims = pd.DataFrame(data=normalizedMatrix @ normalizedMatrix.T)

# Label both axes with movieId so similarities can be looked up by movie id.
sims.index = movieWeights['movieId']
sims.columns = movieWeights['movieId']

sims

  sims = pd.DataFrame(data=np.matmul(nomalizeMovieWeights, nomalizeMovieWeights.T))


movieId,1,2,3,4,5,6,7,8,9,10,...,161830,161918,161944,162376,162542,162672,163056,163949,164977,164979
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.00,0.81,0.09,0.09,0.19,0.00,0.09,0.64,0.00,0.25,...,0.00,0.19,0.00,0.00,0.00,0.29,0.47,0.0,0.19,0.0
2,0.81,1.00,0.00,0.00,0.00,0.00,0.00,0.80,0.00,0.32,...,0.00,0.23,0.00,0.00,0.00,0.36,0.58,0.0,0.00,0.0
3,0.09,0.00,1.00,0.94,0.50,0.00,1.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.63,0.54,0.00,0.0,0.50,0.0
4,0.09,0.00,0.94,1.00,0.47,0.00,0.94,0.00,0.00,0.00,...,0.08,0.00,0.34,0.34,0.60,0.60,0.00,0.0,0.47,0.0
5,0.19,0.00,0.50,0.47,1.00,0.00,0.50,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,1.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162672,0.29,0.36,0.54,0.60,0.00,0.00,0.54,0.45,0.00,0.48,...,0.06,0.36,0.26,0.26,0.46,1.00,0.34,0.0,0.00,0.0
163056,0.47,0.58,0.00,0.00,0.00,0.22,0.00,0.28,0.39,0.52,...,0.00,0.69,0.00,0.00,0.00,0.34,1.00,0.0,0.00,0.0
163949,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.0,0.00,1.0
164977,0.19,0.00,0.50,0.47,1.00,0.00,0.50,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,1.00,0.0


In [35]:
# Load the ratings and split them into train/test sets using the `type` column.
ratings = pd.read_csv('./data/movielens/movielens/ratings-9_1.csv')

ratingColumns = ['userId', 'movieId', 'rating']
train = ratings.loc[ratings['type'] == 'train', ratingColumns]
test = ratings.loc[ratings['type'] == 'test', ratingColumns]

In [36]:
# Id of the single user whose ratings are used for prediction and evaluation below.
userId = 33

In [41]:
# Ratings given by the selected user in the training split.
isSelectedUser = train['userId'] == userId
userRatings = train.loc[isSelectedUser, ['movieId', 'rating']]

# The user's 20 highest-rated movies.
topRatings = userRatings.sort_values('rating', ascending=False).iloc[:20]

# displaymovies is not defined in this notebook (presumably provided by
# liblecture.py); it receives the catalogue, the movie ids and their ratings.
displaymovies(movies, topRatings['movieId'].values, topRatings['rating'].values)

In [57]:
# Predict a rating for every movie as the similarity-weighted average of the
# selected user's training ratings.

# Similarity rows of the movies the user rated (one row per rated movie,
# one column per candidate movie).
ratedSims = sims.loc[userRatings['movieId'].values, :]

# Denominator: total similarity between each candidate and the rated movies.
# 1 is added so that candidates whose similarities are all 0 do not cause a
# division error.
recSimSums = ratedSims.sum(axis=0).values + 1

# Numerator: for each candidate, sum of (similarity * rating) over the rated movies.
recWeightedRatingSums = ratedSims.T.values @ userRatings['rating'].values

recItemRatings = pd.DataFrame(
    recWeightedRatingSums / recSimSums,
    index=sims.index,
    columns=['pred'],
)

recItemRatings

Unnamed: 0_level_0,pred
movieId,Unnamed: 1_level_1
1,2.99
2,2.72
3,3.21
4,3.22
5,3.22
...,...
162672,3.07
163056,2.75
163949,2.67
164977,3.22


## Check recommended items

- Without `recSimSums = recSimSums + 1` (this output was produced with that line disabled in the prediction cell above)

In [56]:
# The 30 movies with the highest predicted rating for the selected user.
rankedPredictions = recItemRatings.sort_values('pred', ascending=False)
top30Movies = rankedPredictions.iloc[:30]

displaymovies(movies, top30Movies.index, top30Movies['pred'].values)

- With `recSimSums = recSimSums + 1` applied

In [58]:
# The 30 movies with the highest predicted rating for the selected user.
rankedPredictions = recItemRatings.sort_values('pred', ascending=False)
top30Movies = rankedPredictions.iloc[:30]

displaymovies(movies, top30Movies.index, top30Movies['pred'].values)

- `recSimSums = recSimSums + 1`을 적용한 경우와 적용하지 않은 경우의 추천 결과 차이가 크다는 것을 알 수 있음

## Compute MAE and RMSE for the test user

In [54]:
# Held-out ratings of the selected user.
userTestRatings = test.loc[test['userId'] == userId].copy()

# Predicted ratings for exactly the movies in the user's test set, joined to
# the true ratings on movieId.
testPredictions = recItemRatings.loc[userTestRatings['movieId']]
temp = userTestRatings.join(testPredictions, on='movieId')

# getMAE / getRMSE are not defined in this notebook (presumably provided by
# liblecture.py); both receive the true ratings and the predictions.
actualRatings = temp['rating']
predictedRatings = temp['pred']
mae = getMAE(actualRatings, predictedRatings)
rmse = getRMSE(actualRatings, predictedRatings)

print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")

MAE: 0.9682
RMSE: 1.1347


- 지금은 userId가 33인 사용자 한 명만 테스트했지만, 이후에는 모든 사용자에 대해 예측을 수행하고 예측값과 실제 평점의 오차(MAE, RMSE)를 측정해 모델을 개선해야 함