item similarity
The dataset consists of 100,000 ratings and 1,300 tag applications applied to 9,066 movies by 671 users.

In [1]:
import pandas as pd
import numpy as np
df = pd.read_csv('C:/Users/Qiulan/kaggle/recommender/ratings.csv', sep=',')
df.head()


Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [2]:
df_id = pd.read_csv('C:/Users/Qiulan/kaggle/recommender/links.csv', sep=',')
df_id.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [3]:
df = pd.merge(df, df_id, on=['movieId'])
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,imdbId,tmdbId
0,1,31,2.5,1260759144,112792,9909.0
1,7,31,3.0,851868750,112792,9909.0
2,31,31,4.0,1273541953,112792,9909.0
3,32,31,4.0,834828440,112792,9909.0
4,36,31,3.0,847057202,112792,9909.0


In [4]:
rating_matrix = np.zeros((df.userId.unique().shape[0], max(df.movieId)))

In [5]:
for row in df.itertuples():
    rating_matrix[row[1]-1, row[2]-1] = row[3]
rating_matrix = rating_matrix[:,:9000]

We examine the sparsity of our rating matrix as follows:

In [6]:
sparsity = float(len(rating_matrix.nonzero()[0]))
sparsity /= (rating_matrix.shape[0] * rating_matrix.shape[1])
sparsity *= 100

Now, let's split the rating matrix to two smaller matrices for the purpose of training and testing. We remove 10 ratings from the rating matrix and place them in the test set.

In [7]:
train_matrix = rating_matrix.copy()
test_matrix = np.zeros(rating_matrix.shape)

for i in range(rating_matrix.shape[0]):
    rating_idx = np.random.choice(
        rating_matrix[i, :].nonzero()[0], 
        size=10, 
        replace=True)
    train_matrix[i, rating_idx] = 0.0
    test_matrix[i, rating_idx] = rating_matrix[i, rating_idx]

The (cosine) similarity among users/movies is calculated based on the following formula.

In [8]:
similarity_user = train_matrix.dot(train_matrix.T) + 1e-9
norms = np.array([np.sqrt(np.diagonal(similarity_user))])
similarity_user = ( similarity_user / (norms * norms.T) )


similarity_movie = train_matrix.T.dot(train_matrix) + 1e-9
norms = np.array([np.sqrt(np.diagonal(similarity_movie))])
similarity_movie = ( similarity_movie / (norms * norms.T) )

Using the similarity among the users, we are able to make a prediction for each user-to-movie rating and also calculate the corresponding MSE of our user-to-movie rating prediction. The prediction is made by considering the ratings that a similar user gives. In particular, we can make a user-to-movie rating prediction based on the following formula.
r_ui = SUM_v(s(u,v)r_vi)/SUM_v(ABS(s(u,v))), where the prediction for user u to movie i is a weighted sum (normalized) of ratings that user v  gives to movie i with the similarity between user u and v as the weight.

In [9]:
from sklearn.metrics import mean_squared_error

In [10]:
prediction = similarity_user.dot(train_matrix) / np.array([np.abs(similarity_user).sum(axis=1)]).T

In [11]:
prediction = similarity_user.dot(train_matrix) / np.array([np.abs(similarity_user).sum(axis=1)]).T
prediction = prediction[test_matrix.nonzero()].flatten()
test_vector = test_matrix[test_matrix.nonzero()].flatten()
mse = mean_squared_error(prediction, test_vector)

print('MSE = ' + str(mse))

MSE = 9.76278903237


We will query a movie of interest and ask our computer agent to recommend a few movies to us. The first thing to do is to get the corresponding movie posters so that we can see what the recommended movies are. We use the IMDB id numbers to get the movie posters from The Movie Database website using its API.

In [12]:
import requests
import json

response = requests.get('http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)')


# Get base url filepath structure. w185 corresponds to size of movie poster.
headers = {'Accept': 'application/json'}
payload = {'api_key': 'bb3beb7ec7af6d1c0c23ca7381b62a89'} 
response = requests.get("http://api.themoviedb.org/3/configuration", params=payload, headers=headers)
response = json.loads(response.text)
base_url = response['images']['base_url'] + 'w185'

def get_poster(imdb_url, base_url):
    # Get IMDB movie ID
    response = requests.get(imdb_url)
    movie_id = response.url.split('/')[-2]
    
    # Query themoviedb.org API for movie poster path.
    movie_url = 'http://api.themoviedb.org/3/movie/{:}/images'.format(movie_id)
    headers = {'Accept': 'application/json'}
    payload = {'api_key': 'INSERT API KEY HERE'} 
    response = requests.get(movie_url, params=payload, headers=headers)
    try:
        file_path = json.loads(response.text)['posters'][0]['file_path']
    except:
        # IMDB movie ID is sometimes no good. Need to get correct one.
        movie_title = imdb_url.split('?')[-1].split('(')[0]
        payload['query'] = movie_title
        response = requests.get('http://api.themoviedb.org/3/search/movie', params=payload, headers=headers)
        movie_id = json.loads(response.text)['results'][0]['id']
        payload.pop('query', None)
        movie_url = 'http://api.themoviedb.org/3/movie/{:}/images'.format(movie_id)
        response = requests.get(movie_url, params=payload, headers=headers)
        file_path = json.loads(response.text)['posters'][0]['file_path']
        
    return base_url + file_path

In [23]:
import requests
import json

from IPython.display import Image
from IPython.display import display
from IPython.display import HTML

idx_to_movie = {}
for row in df_id.itertuples():
    idx_to_movie[row[1]-1] = row[2]
idx_to_movie    

k = 6  
idx = 0
movies = [ idx_to_movie[x] for x in np.argsort(similarity_movie[idx,:])[:-k-1:-1] ] 
movies = filter(lambda imdb: len(str(imdb)) == 6, movies)

n_display = 5
URL = [0]*n_display
IMDB = [0]*n_display
i = 0
for movie in movies:
    (URL[i], IMDB[i]) = get_poster(movie, base_url)
    i += 1 
    
images = ''
for i in range(n_display):
    images += "<img style='width: 100px; margin: 0px; \
                float: left; border: 1px solid black;' src='%s' />" \
                % URL[i]

display(HTML(images))    


MissingSchema: Invalid URL '114709': No schema supplied. Perhaps you meant http://114709?

[114709, 120363, 116629, 107048, 109830]


[0, 0, 0, 0, 0]
[0, 0, 0, 0, 0]


array([  6.85314403e-02,   6.85314403e-02,   1.24193093e-01,
         5.41788606e-07,   5.41788606e-07,   5.41788606e-07])