## Item-based Collaborative Filtering

In [1]:
# Import packages / helpers.
# %run executes liblecture.py and loads the names it defines into this notebook.
# NOTE(review): displayMovies, getMAE and getRMSE are used below but never defined
# in this notebook, so they presumably come from this script — confirm.
%run liblecture.py

## Read Data: movies and ratings

In [2]:
# Read the movie metadata table (the output below shows the columns
# movieId, imdbId, title, genres and imgurl).
# NOTE(review): '/content/...' is an absolute path (looks like a Colab working
# directory) — the CSV must exist there for this cell to run.
import pandas as pd
movies = pd.read_csv('/content/movies_w_imgurl.csv')
movies

Unnamed: 0,movieId,imdbId,title,genres,imgurl
0,1,114709,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,https://images-na.ssl-images-amazon.com/images...
1,2,113497,Jumanji (1995),Adventure|Children|Fantasy,https://images-na.ssl-images-amazon.com/images...
2,3,113228,Grumpier Old Men (1995),Comedy|Romance,https://images-na.ssl-images-amazon.com/images...
3,4,114885,Waiting to Exhale (1995),Comedy|Drama|Romance,https://images-na.ssl-images-amazon.com/images...
4,5,113041,Father of the Bride Part II (1995),Comedy,https://images-na.ssl-images-amazon.com/images...
...,...,...,...,...,...
9120,162672,3859980,Mohenjo Daro (2016),Adventure|Drama|Romance,https://images-na.ssl-images-amazon.com/images...
9121,163056,4262980,Shin Godzilla (2016),Action|Adventure|Fantasy|Sci-Fi,https://images-na.ssl-images-amazon.com/images...
9122,163949,2531318,The Beatles: Eight Days a Week - The Touring Y...,Documentary,https://images-na.ssl-images-amazon.com/images...
9123,164977,27660,The Gay Desperado (1936),Comedy,https://images-na.ssl-images-amazon.com/images...


In [3]:
# Load all ratings, then split them into train / test partitions using the
# 'type' column, keeping only the three columns the rest of the notebook needs.
ratings = pd.read_csv('/content/ratings-9_1.csv')

rating_columns = ['userId', 'movieId', 'rating']
is_train = ratings['type'] == 'train'
is_test = ratings['type'] == 'test'

train = ratings.loc[is_train, rating_columns]
test = ratings.loc[is_test, rating_columns]

## Convert Ratings to Item-User Sparse Matrix

Create Index to Id Maps

In [6]:
# Build lookups between movieIds and dense matrix row indices.
# Only movies that appear in the training data get a row.
movieIds = train.movieId.unique()

# row index -> movieId, in order of first appearance in `train`
indexToMovieId = dict(enumerate(movieIds))
# movieId -> row index (inverse of the map above)
movieIdToIndex = {movieId: idx for idx, movieId in indexToMovieId.items()}

# Number of rows (one per movie); used as the matrix height when it is built.
rowIdx = len(movieIds)

    

In [7]:
# Build the user lookups the same way as the movie lookups:
# every user in the training data gets one dense column index.
userIds = train.userId.unique()

# Number of columns (one per user); used as the matrix width when it is built.
colIdx = len(userIds)

userIdToIndex = dict(zip(userIds, range(colIdx)))  # userId -> column index
indexToUserId = dict(zip(range(colIdx), userIds))  # column index -> userId

Create Item-User Sparse Matrix

In [8]:
import scipy.sparse as sp  # used to assemble the sparse rating matrix

# Index mapping: turn every training rating into a (row, col, value) triple,
# where row = index of the rated movie and col = index of the rating user.
rows = [movieIdToIndex[movieId] for movieId in train['movieId']]
cols = [userIdToIndex[userId] for userId in train['userId']]
vals = train['rating'].tolist()

# Assemble the item-user matrix in COO format from the triples;
# `shape` is (number of movies, number of users).
coomat = sp.coo_matrix((vals, (rows, cols)), shape=(rowIdx, colIdx))

# Sparse matrix -> dense matrix (unrated cells become 0).
matrix = coomat.todense()
matrix

matrix([[2.5, 0. , 0. , ..., 0. , 0. , 0. ],
        [3. , 0. , 0. , ..., 0. , 0. , 0. ],
        [2. , 0. , 0. , ..., 0. , 0. , 0. ],
        ...,
        [0. , 0. , 0. , ..., 0. , 0. , 0. ],
        [0. , 0. , 0. , ..., 0. , 0. , 0. ],
        [0. , 0. , 0. , ..., 0. , 0. , 0. ]])

## Compute Item-Item Similarities

Compute $l_2$-norm


In [9]:
from numpy import linalg as LA

# l2 (Euclidean) norm of each row of `matrix` (axis=1), i.e. one value per movie row.
norms = LA.norm(matrix, ord=2, axis=1)
norms

array([20.71, 20.35, 22.94, ...,  3.  ,  1.  ,  1.  ])



Normalize Row Vectors

In [11]:
import numpy as np

# `norms` holds one value per row of `matrix`. Broadcasting lines a 1-D array up
# with the last axis, so divide on the transposed view (rows become the last
# axis) and transpose back afterwards.
scaledTransposed = matrix.T / norms
normat = scaledTransposed.T
normat

matrix([[0.12, 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
        [0.15, 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
        [0.09, 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
        ...,
        [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
        [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
        [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ]])

Compute Similarities ( = inner product)

In [12]:
# Each row of `normat` was divided by its own l2 norm, so the inner product of
# two rows is the cosine similarity of the corresponding movies.
# Rows and columns are both labelled with movieIds.
sims = pd.DataFrame(
    data=normat @ normat.T,
    index=movieIds,
    columns=movieIds,
)
sims

Unnamed: 0,31,1061,1129,1172,1287,1293,1339,1343,1371,1405,...,134528,134783,137595,138204,60832,64997,72380,129,4736,6425
31,1.00,0.20,0.13,0.07,0.09,0.15,0.07,0.18,0.12,0.14,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0
1061,0.20,1.00,0.25,0.18,0.14,0.13,0.25,0.20,0.16,0.23,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.15,0.0
1129,0.13,0.25,1.00,0.17,0.25,0.16,0.26,0.30,0.34,0.31,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0
1172,0.07,0.18,0.17,1.00,0.27,0.33,0.12,0.12,0.11,0.16,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0
1287,0.09,0.14,0.25,0.27,1.00,0.32,0.17,0.07,0.18,0.18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64997,0.00,0.00,0.00,0.00,0.00,0.00,0.15,0.00,0.00,0.14,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.00,0.00,0.0
72380,0.00,0.00,0.00,0.00,0.00,0.00,0.15,0.00,0.00,0.14,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.00,0.00,0.0
129,0.00,0.15,0.00,0.00,0.00,0.00,0.13,0.00,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.00,1.00,0.0
4736,0.00,0.15,0.00,0.00,0.00,0.00,0.13,0.00,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.00,1.00,0.0


## Similarity Example

In [16]:
movieIdx = 6    # row index (not movieId) of the query movie
numSimilar = 5  # how many similar movies to show

queryMovieId = indexToMovieId[movieIdx]

# Remove the query movie by label instead of assuming it sorts first.
# Other movies can tie with it at similarity 1.0 (identical rating vectors), and
# the default sort is not stable, so slicing off "the first row" after sorting
# could drop a genuine neighbour and keep the query movie itself.
rels = (
    sims.iloc[movieIdx, :]
    .drop(labels=queryMovieId)
    .sort_values(ascending=False)
    .head(numSimilar)
)

displayMovies(movies, [queryMovieId])  # show which movie movieIdx refers to
displayMovies(movies, rels.index, rels.values)  # similar movies with their similarity scores

## User Rating Prediction

In [17]:
# Training ratings of the single user we predict for.
userId = 33

isTargetUser = train['userId'] == userId
userRatings = train.loc[isTargetUser, ['movieId', 'rating']]

userRatings

Unnamed: 0,movieId,rating
6176,19,3.0
6177,88,3.0
6178,157,1.0
6179,231,3.0
6180,344,4.0
...,...,...
6309,5282,4.0
6310,5339,4.0
6311,5483,4.0
6312,5669,4.0


### Predict Ratings

In [18]:
# Similarities between the movies this user rated (rows) and every movie (columns).
# Looked up once and reused for both the numerator and the denominator.
userSims = sims.loc[userRatings['movieId'].values, :]

# Denominator: total similarity of each candidate movie to the user's rated movies.
recSimSums = userSims.sum().values

# Numerator: similarity-weighted sum of the user's ratings for each candidate movie.
recWeightedRatingSums = np.matmul(userSims.T.values, userRatings['rating'].values)

# Prediction = weighted rating sum / similarity sum.
# A candidate with zero total similarity has no basis for a prediction: leave it
# as NaN explicitly instead of computing 0/0, which emits a RuntimeWarning.
preds = np.divide(
    recWeightedRatingSums,
    recSimSums,
    out=np.full(recSimSums.shape, np.nan),
    where=recSimSums != 0,
)

recItemRatings = pd.DataFrame(data=preds, index=sims.index, columns=['pred'])

recItemRatings

  """


Unnamed: 0,pred
31,3.43
1061,3.38
1129,3.36
1172,3.49
1287,3.42
...,...
64997,3.23
72380,3.23
129,3.00
4736,3.00


In [19]:
# Rank every movie by predicted rating (highest first) and keep the best 30.
rankedPredictions = recItemRatings.sort_values(by='pred', ascending=False)
top30Movies = rankedPredictions.head(30)

displayMovies(movies, top30Movies.index, top30Movies['pred'].values)

## Compute Errors (MAE, RMSE)

In [20]:
# Held-out ratings of the same user.
userTestRatings = test[test['userId'] == userId]

# Attach the predicted rating to each test row by movieId.
# Join against the full prediction table: pre-selecting rows with .loc raises a
# KeyError as soon as a test movie never appeared in the training data.
# Test movies without a usable prediction (unseen in train, or NaN) are dropped
# so they cannot poison the error metrics.
temp = (
    userTestRatings
    .join(recItemRatings, on='movieId')
    .dropna(subset=['pred'])
)

mae = getMAE(temp['rating'], temp['pred'])
rmse = getRMSE(temp['rating'], temp['pred'])

print(f"MAE : {mae:.4f}")
print(f"RMSE: {rmse:.4f}")

MAE : 0.7424
RMSE: 0.8455


### Compare Logs and Recommendations

In [21]:
# Top 20 of what the user actually rated vs. top 20 of what we predict for them.
ratingsHighToLow = userRatings.sort_values(by='rating', ascending=False)
predsHighToLow = recItemRatings.sort_values(by='pred', ascending=False)

logs = ratingsHighToLow.head(20)
recs = predsHighToLow.head(20)

In [22]:
# First the user's own highest-rated movies (with the ratings they gave) ...
displayMovies(movies, logs['movieId'].values, logs['rating'].values)
# ... then the movies with the highest predicted ratings (with the predictions).
displayMovies(movies, recs.index, recs['pred'].values)