# Item-based Collaborative Filtering

Import Packages

In [1]:
import math
import numpy as np
from numpy import linalg as LA
from scipy.sparse import coo_matrix
import pandas as pd
from IPython.display import HTML, display

## Read Data: movies and ratings

Read Movies and Define `displayMovies`

In [2]:
movies = pd.read_csv('movies_w_imgurl.csv')

In [3]:
def displayMovies(movieIds, ratings=[]):
    html = ""
    for i, movieId in enumerate(movieIds):
        mov = movies[movies['movieId'] == movieId].iloc[0]
        html += """
            <div style="display:inline-block;min-width:150px;max-width:150px;vertical-align: top;">
                <img src="{}" width="120"><br/>
        """.format(mov.imgurl)
        if i < len(ratings):
            html += "<span>{:.4f}</span><br/>".format(ratings[i])
        html += "{}<br/>".format(mov.title)
        if mov.genres:
            ul = "<ul>"
            for genre in mov.genres.split('|'):
                ul += "<li>{}</li>".format(genre)
            ul += "</ul>"
            html += "{}<br/>".format(ul)
        html += "</div>"
    display(HTML(html))

Read Rating Data

In [4]:
ratings = pd.read_csv('ratings-9_1.csv')
train = ratings[ratings['type'] == 'train'][['userId', 'movieId', 'rating']]
test = ratings[ratings['type'] == 'test'][['userId', 'movieId', 'rating']]

## Convert Ratings to Item-User Sparse Matrix
### Create Index to Id Maps

In [5]:
movieIdToIndex = {}
indexToMovieId = {}
rowIdx = 0
for movieId in movies.movieId:
    movieIdToIndex[movieId] = rowIdx
    indexToMovieId[rowIdx] = movieId
    rowIdx += 1

In [6]:
userIdToIndex = {}
indexToUserId = {}
colIdx = 0
for userId in ratings.userId.unique():
    userIdToIndex[userId] = colIdx
    indexToUserId[colIdx] = userId
    colIdx += 1

### Creat Item-User Sparse Matrix

In [7]:
rows = []
cols = []
vals = []
for row in ratings.itertuples():
    rows.append(movieIdToIndex[row.movieId])
    cols.append(userIdToIndex[row.userId])
    vals.append(row.rating)
coomat = coo_matrix((vals, (rows, cols)), shape=(rowIdx, colIdx))

## Compute Item-Item Similarities

Compute $l_2$-norm

In [8]:
norms = LA.norm(coomat.toarray(), ord=2, axis=1)

Normalize Row Vectors

In [9]:
np.seterr(divide='ignore', invalid='ignore')
normmat = np.divide(coomat.transpose().toarray(), norms).T

In [10]:
np.nan_to_num(normmat, 0.0)

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.06380429,  0.07975537],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

Compute Similarities ( = inner product)

In [11]:
sims = pd.DataFrame(data=np.matmul(normmat, normmat.T), index=movies.movieId, columns=movies.movieId)

### Similarity Example

In [12]:
movieIdx = 7
rels = sims.iloc[movieIdx,:].sort_values(ascending=False).head(6)
displayMovies([indexToMovieId[movieIdx]])
displayMovies(rels.index, rels.values)

## User Rating Prediction

In [13]:
userId = 33
userRatings = train[train['userId'] == userId][['movieId', 'rating']]

### Predict Ratings

In [14]:
recSimSums = sims.loc[userRatings['movieId'].values, :].sum().values
recWeightedRatingSums = np.matmul(sims.loc[userRatings['movieId'].values, :].T.values, userRatings['rating'].values)
recItemRatings = pd.DataFrame(data=np.divide(recWeightedRatingSums, recSimSums), index=sims.index)
recItemRatings.columns = ['prediction']

### Compute Errors (MAE, RMSE)

In [15]:
userTestRatings = pd.DataFrame(data=test[test['userId'] == userId])
temp = userTestRatings.join(recItemRatings.loc[userTestRatings['movieId']], on='movieId')
mae = (temp['rating'] - temp['prediction']).abs().mean()
rmse = math.sqrt((temp['rating'] - temp['prediction']).pow(2).mean())
print(" MAE:", mae)
print("RMSE:", rmse)

 MAE: 0.749437923199
RMSE: 0.8521098359089229


### Compare Logs and Recommendations

In [16]:
logs = userRatings.sort_values(by='rating', ascending=False).head(20)
recs = recItemRatings.sort_values(by='prediction', ascending=False).head(20)

In [17]:
displayMovies(logs['movieId'].values, logs['rating'].values)
displayMovies(recs.index, recs['prediction'].values)