# Building a Movie Recommender System
Using techniques such as Collaborative filtering and content based filtering.
Evaluation with different metrics.

In [1]:
import pandas as pd
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

You can get the dataset from [here](https://grouplens.org/datasets/movielens/100k/)

# Loading all the data

In [12]:
#Reading User File
u_cols = ['user_id','age','sex','occupation','zip_code']
users = pd.read_csv('./ml-100k/u.user',sep='|',names=u_cols,encoding='latin-1')

#Reading Ratings File
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols,encoding='latin-1')

#Reading items file:
i_cols = ['movie_id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
items = pd.read_csv('ml-100k/u.item', sep='|', names=i_cols,
encoding='latin-1')


In [13]:
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [14]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [15]:
items.head()

Unnamed: 0,movie_id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


Here the genre is separated using binary values.

# Loading the train and test data
The dataset has already been divided into train and test by GroupLens where the test data has 10 ratings for each user, i.e. 9,430 rows in total. We will read both these files into our Python environment.

In [87]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_train = pd.read_csv('ml-100k/ua.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('ml-100k/ua.test', sep='\t', names=r_cols, encoding='latin-1')
ratings_train.shape, ratings_test.shape

((90570, 4), (9430, 4))

# Building the model
We are going to recommend based on user-user similarity and item-item similarity.

In [16]:
n_users = ratings['user_id'].unique().shape[0]
n_items = items['movie_id'].unique().shape[0]
print("Number of unique users: {}".format(n_users))
print("Number of unique movies: {}".format(n_items))

Number of unique users: 943
Number of unique movies: 1682


In [38]:
# Matrix for user and movies to calculate similarity
data_matrix = np.zeros((n_users,n_items))
# for i in range(len(ratings)):
#     data_matrix[ratings['user_id'][i]][ratings['movie_id'][i]] = ratings['rating'][i]
for line in ratings.itertuples():
    data_matrix[line[1]-1, line[2]-1] = line[3]
       

In [39]:
# Calculating similarity 
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(data_matrix, metric='cosine')
#Transposing the matrix in order to get item based similarity
item_similarity = pairwise_distances(data_matrix.T, metric='cosine')

In [80]:
def predict(ratings,similarity,type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #We use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [63]:
data = np.zeros((3,5))
data[0][0] = 5
data[1][0] = 4 
data[2][0] = 3
data [1][1] = 5
data [1][3] = 5
data [2][4] = 3
data [2][1] = 2
data [0] [4]= 1
data

array([[5., 0., 0., 0., 1.],
       [4., 5., 0., 5., 0.],
       [3., 2., 0., 0., 3.]])

In [64]:
print(data.mean(axis=1))  #Taking mean across axis 1 for User similarity
print(data.mean(axis=0))  # Taking mean across axis 0 for item similarity

[1.2 2.8 1.6]
[4.         2.33333333 0.         1.66666667 1.33333333]


In [59]:
u_sim = pairwise_distances(data,metric='cosine')  #user similarity
i_sim = pairwise_distances(data.T,metric='cosine') #item similarity

In [60]:
u_sim,i_sim

(array([[0.        , 0.51719545, 0.24738219],
        [0.51719545, 0.        , 0.42264973],
        [0.24738219, 0.42264973, 0.        ]]),
 array([[0.        , 0.31720655, 1.        , 0.43431458, 0.37390097],
        [0.31720655, 0.        , 1.        , 0.07152331, 0.64766787],
        [1.        , 1.        , 0.        , 1.        , 1.        ],
        [0.43431458, 0.07152331, 1.        , 0.        , 1.        ],
        [0.37390097, 0.64766787, 1.        , 1.        , 0.        ]]))

In [82]:
predict(data,i_sim,'item')

array([[0.17591845, 1.09688819, 1.5       , 1.26567361, 0.61871992],
       [1.76793383, 0.79868619, 3.5       , 0.83599776, 3.2214865 ],
       [0.82624341, 1.42144298, 2.        , 1.77425298, 0.79992837]])

# Using Turicreate to build the same thing

In [89]:
import turicreate
train_data = turicreate.SFrame(ratings_train)
test_data = turicreate.SFrame(ratings_test)

First we’ll build a model which will recommend movies based on the most popular choices, i.e., a model where all the users receive the same recommendation(s). We will use the turicreate recommender function popularity_recommender for this.

In [90]:
popularity_model = turicreate.popularity_recommender.create(train_data, user_id='user_id', item_id='movie_id', target='rating')

# Prediction of Top 5 movies for 5 users

In [97]:
popularity_recomm = popularity_model.recommend(users=[1,2,3],k=5)
popularity_recomm.print_rows(num_rows=25)

+---------+----------+-------+------+
| user_id | movie_id | score | rank |
+---------+----------+-------+------+
|    1    |   1467   |  5.0  |  1   |
|    1    |   1201   |  5.0  |  2   |
|    1    |   1189   |  5.0  |  3   |
|    1    |   1122   |  5.0  |  4   |
|    1    |   814    |  5.0  |  5   |
|    2    |   1467   |  5.0  |  1   |
|    2    |   1201   |  5.0  |  2   |
|    2    |   1189   |  5.0  |  3   |
|    2    |   1122   |  5.0  |  4   |
|    2    |   814    |  5.0  |  5   |
|    3    |   1467   |  5.0  |  1   |
|    3    |   1201   |  5.0  |  2   |
|    3    |   1189   |  5.0  |  3   |
|    3    |   1122   |  5.0  |  4   |
|    3    |   814    |  5.0  |  5   |
+---------+----------+-------+------+
[15 rows x 4 columns]



Recommendations for all users are the same – 1467, 1201, 1189, 1122, 814. And they’re all in the same order! This confirms that all the recommended movies have an average rating of 5, i.e. all the users who watched the movie gave it a top rating. Thus our popularity system works as expected.

In [95]:
rec_mov = popularity_recomm['movie_id']

# Recommended Movies
This model is based on simple Popularity Based Recommendations

In [96]:
for i in rec_mov:
    print(items['movie title'][i-1])

Saint of Fort Washington, The (1993)
Marlene Dietrich: Shadow and Light (1996) 
Prefontaine (1997)
They Made Me a Criminal (1939)
Great Day in Harlem, A (1994)


# Now building a Collaborative Filtering Model

In [98]:
# Item Similarity Model
item_sim_model = turicreate.item_similarity_recommender.create(train_data, user_id='user_id', item_id='movie_id', target='rating', similarity_type='cosine')



In [99]:
# Making Recommedndations
item_sim_recomm = item_sim_model.recommend(users=[1,2,3,4,5],k=5)
item_sim_recomm.print_rows(num_rows=25)

+---------+----------+--------------------+------+
| user_id | movie_id |       score        | rank |
+---------+----------+--------------------+------+
|    1    |   423    | 0.980611449434557  |  1   |
|    1    |   202    | 0.9387859158388531 |  2   |
|    1    |   655    | 0.8079265972585169 |  3   |
|    1    |   568    | 0.7693172377029448 |  4   |
|    1    |   403    | 0.765623665037956  |  5   |
|    2    |    50    | 1.1256258487701416 |  1   |
|    2    |   181    | 1.0272409457426805 |  2   |
|    2    |   121    | 0.8834222119588119 |  3   |
|    2    |    7     | 0.8567936833088214 |  4   |
|    2    |    9     | 0.831989913032605  |  5   |
|    3    |   313    | 0.6353766620159149 |  1   |
|    3    |   328    | 0.6032880300825293 |  2   |
|    3    |   315    | 0.5422587123784152 |  3   |
|    3    |   331    | 0.5355071858926252 |  4   |
|    3    |   332    | 0.5316696112806146 |  5   |
|    4    |    50    | 1.1311477082116264 |  1   |
|    4    |   288    | 1.048715

In [112]:
#Displaying Individual Recommendations of MOvies for 5 different Users
item_mov_rec = item_sim_recomm['movie_id']
count = 0
usr_id = 1
for i in item_mov_rec:
    if count%5 == 0:
        print()
        print("Movie recommendations for {} user:".format(usr_id))
        print()
        usr_id +=1
        count = 0
    print(items['movie title'][i+1])
    count+=1


Movie recommendations for 1 user:

Bob Roberts (1992)
Back to the Future (1985)
Manchurian Candidate, The (1962)
Wyatt Earp (1994)
Mission: Impossible (1996)

Movie recommendations for 2 user:

Madness of King George, The (1994)
Alien (1979)
Frighteners, The (1996)
Dead Man Walking (1995)
Seven (Se7en) (1995)

Movie recommendations for 3 user:

Apt Pupil (1998)
187 (1997)
In the Name of the Father (1993)
Game, The (1997)
U Turn (1997)

Movie recommendations for 4 user:

Madness of King George, The (1994)
Fierce Creatures (1997)
Alien (1979)
Dead Man Walking (1995)
Fly Away Home (1996)

Movie recommendations for 5 user:

Graduate, The (1967)
Back to the Future (1985)
Quiz Show (1994)
Robert A. Heinlein's The Puppet Masters (1994)
Silence of the Lambs, The (1991)


In [None]:
# TODO Matrix Facrtorization