# MOVIE RECOMMENDER SYSTEM - USER BASED COLLABORATIVE FILTERING

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import scipy.stats
from sklearn.metrics.pairwise import cosine_similarity

### Data Processing

In [2]:
# Load the raw ratings table from the Excel workbook.
df = pd.read_excel("test.xlsx")
# Not rendered: a notebook cell only displays its final expression.
df.head()
# Reshape into a userid x title matrix of ratings; a title the user has no
# rating for is NaN. pivot_table aggregates with the mean by default, so
# repeated (userid, title) rows are averaged.
matrix_data = df.pivot_table(
    values="rating",
    index="userid",
    columns="title",
)
matrix_data.head()

title,12 Years a Slave,A Prophet,A Separation,A Serious Man,Amour,Argo,Arrival,Avatar,Beasts of the Southern Wild,Birdman,...,The Tree of Life,The White Ribbon,The Wolf of Wall Street,"Three Billboards Outside Ebbing, Missouri",Toni Erdmann,Toy Story 3,True Grit,Up in the Air,Winter's Bone,Zero Dark Thirty
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2.0,,,,,,,5.0,,4.0,...,,,5.0,,,,,5.0,,
1,,,,,,,5.0,3.0,,4.0,...,,,4.0,3.0,,,,3.0,,
2,,,,,,5.0,,,,,...,,,,,,,,,,
3,,,,,,,,5.0,,,...,,,4.0,5.0,,,,,,
4,,,,,,,,,,,...,,,,,,4.0,,,,


In [3]:
# Centre every user's ratings on that user's own mean rating (mean() skips the
# NaN cells), so each value reads as "above / below this user's average".
mat = matrix_data.sub(matrix_data.mean(axis="columns"), axis="index")
mat.head()

title,12 Years a Slave,A Prophet,A Separation,A Serious Man,Amour,Argo,Arrival,Avatar,Beasts of the Southern Wild,Birdman,...,The Tree of Life,The White Ribbon,The Wolf of Wall Street,"Three Billboards Outside Ebbing, Missouri",Toni Erdmann,Toy Story 3,True Grit,Up in the Air,Winter's Bone,Zero Dark Thirty
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-2.230769,,,,,,,0.769231,,-0.230769,...,,,0.769231,,,,,0.769231,,
1,,,,,,,0.947368,-1.052632,,-0.052632,...,,,-0.052632,-1.052632,,,,-1.052632,,
2,,,,,,1.0,,,,,...,,,,,,,,,,
3,,,,,,,,0.3,,,...,,,-0.7,0.3,,,,,,
4,,,,,,,,,,,...,,,,,,-0.3,,,,


### User similarity matrix using Pearson correlation

In [4]:
# corr() correlates columns pairwise, so transpose first to put users on the
# columns. The result is a userid x userid Pearson matrix; entries pandas
# cannot compute come back as NaN.
user_similarity = mat.transpose().corr(method="pearson")
user_similarity.head()

userid,0,1,2,3,4,5,6,7,8,9,...,368,369,370,371,372,373,374,375,376,377
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,-0.19868,-1.0,-0.316228,0.218218,0.566947,0.57735,-0.522233,0.030002,-0.3333333,...,0.637455,0.580465,-0.375,1.0,0.87831,-0.2041241,0.5875,0.970725,0.07689059,
1,-0.19868,1.0,,-0.174078,0.192308,0.011606,-0.816497,-6.500800000000001e-17,0.205196,0.1668115,...,0.36051,-0.068224,0.227429,0.5,-0.223607,-0.3818813,-0.074444,-0.29277,-0.2750095,-0.301511
2,-1.0,,1.0,,,,,,,,...,-1.0,-1.0,,,,1.0,,,-1.0,
3,-0.316228,-0.174078,,1.0,-0.29277,-0.727607,,0.8807048,-0.478091,0.29277,...,0.4,0.209165,-0.301511,-0.5,,0.3333333,-0.333333,-0.845154,-1.6024690000000003e-17,1.0
4,0.218218,0.192308,,-0.29277,1.0,0.785714,,-0.4,-0.324617,2.4825340000000002e-17,...,0.381881,0.492552,0.426401,-0.5,0.132453,-2.0981240000000003e-17,-0.223607,0.0,-0.3031695,


### User similarity matrix using cosine similarity

In [5]:
# Alternative similarity measure. Missing ratings are replaced with 0 before
# the call, i.e. an unrated title is treated as exactly the user's mean in the
# centred matrix. The result is a plain NumPy array (no userid labels) and is
# not used by the later cells shown here.
user_similarity_cosine = cosine_similarity(
    mat.fillna(value=0)
)
user_similarity_cosine

array([[ 1.        , -0.0994042 ,  0.04651303, ...,  0.23534666,
         0.04518771,  0.10963225],
       [-0.0994042 ,  1.        , -0.20246457, ..., -0.07663063,
        -0.23659147, -0.15111763],
       [ 0.04651303, -0.20246457,  1.        , ...,  0.53785287,
        -0.15122507,  0.        ],
       ...,
       [ 0.23534666, -0.07663063,  0.53785287, ...,  1.        ,
        -0.05422456, -0.12677314],
       [ 0.04518771, -0.23659147, -0.15122507, ..., -0.05422456,
         1.        , -0.10693227],
       [ 0.10963225, -0.15111763,  0.        , ..., -0.12677314,
        -0.10693227,  1.        ]])

In [6]:
# Pick a user ID
picked_userid = 332
# 344 Sri
# 362 Jinny
# 332 Fred
# 342 Ajaiy

# Remove the picked user's row from the candidate list so the user cannot be
# selected as their own most-similar neighbour (the column is kept: it holds
# everyone else's similarity to the picked user).
# drop() returns a new frame and errors="ignore" tolerates an already-missing
# label, so re-running this cell no longer raises KeyError.
# NOTE: when changing picked_userid, re-run the similarity cell first so the
# previously dropped user's row is restored.
user_similarity = user_similarity.drop(index=picked_userid, errors="ignore")

# Take a look at the data
user_similarity.head()

userid,0,1,2,3,4,5,6,7,8,9,...,368,369,370,371,372,373,374,375,376,377
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,-0.19868,-1.0,-0.316228,0.218218,0.566947,0.57735,-0.522233,0.030002,-0.3333333,...,0.637455,0.580465,-0.375,1.0,0.87831,-0.2041241,0.5875,0.970725,0.07689059,
1,-0.19868,1.0,,-0.174078,0.192308,0.011606,-0.816497,-6.500800000000001e-17,0.205196,0.1668115,...,0.36051,-0.068224,0.227429,0.5,-0.223607,-0.3818813,-0.074444,-0.29277,-0.2750095,-0.301511
2,-1.0,,1.0,,,,,,,,...,-1.0,-1.0,,,,1.0,,,-1.0,
3,-0.316228,-0.174078,,1.0,-0.29277,-0.727607,,0.8807048,-0.478091,0.29277,...,0.4,0.209165,-0.301511,-0.5,,0.3333333,-0.333333,-0.845154,-1.6024690000000003e-17,1.0
4,0.218218,0.192308,,-0.29277,1.0,0.785714,,-0.4,-0.324617,2.4825340000000002e-17,...,0.381881,0.492552,0.426401,-0.5,0.132453,-2.0981240000000003e-17,-0.223607,0.0,-0.3031695,


In [7]:
# How many neighbours to keep
n = 3

# Similarity a candidate must strictly exceed to count as a neighbour
user_similarity_threshold = 0.3

# Similarities to the picked user, restricted to rows above the threshold
# (NaN similarities fail the comparison and drop out), ordered from most to
# least similar and cut to the first n.
similar_users = (
    user_similarity.loc[
        user_similarity[picked_userid] > user_similarity_threshold,
        picked_userid,
    ]
    .sort_values(ascending=False)
    .iloc[:n]
)

# Show the selected neighbours and their similarity scores
print(f'The similar users for user {picked_userid} are', similar_users)

The similar users for user 332 are userid
202    1.0
325    1.0
363    1.0
Name: 332, dtype: float64


In [8]:
# Centred ratings row of the picked user, keeping only the titles that user
# has actually rated (columns that are entirely NaN are dropped).
picked_userid_watched = (
    mat.loc[mat.index == picked_userid]
    .dropna(axis="columns", how="all")
)
picked_userid_watched

title,Avatar,Dunkirk,Gravity,Inception,La La Land,Mad Max: Fury Road,Spotlight,The King's Speech,The Shape of Water,The Wolf of Wall Street
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
332,0.8,-0.2,-1.2,0.8,-0.2,-2.2,0.8,0.8,-0.2,0.8


In [9]:
# Centred ratings of the selected neighbours only; titles that none of them
# rated (all-NaN columns) are discarded.
similar_user_movies = (
    mat.loc[mat.index.isin(similar_users.index)]
    .dropna(axis="columns", how="all")
)
similar_user_movies

title,Avatar,Birdman,Gravity,Inception,La La Land,Life of Pi,The Grand Budapest Hotel,The Shape of Water,The Wolf of Wall Street,Toy Story 3
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
202,0.285714,-1.714286,-0.714286,0.285714,,0.285714,1.285714,,0.285714,
325,,,,0.8,-0.2,-0.2,,-0.2,,-0.2
363,0.333333,,,,,,,-0.666667,,0.333333


In [10]:
# Remove the watched movie from the movie list
# NOTE: this step is currently disabled (commented out), so titles the picked
# user has already rated stay in similar_user_movies and are scored and ranked
# together with the unseen ones in the cells below.
#similar_user_movies.drop(picked_userid_watched.columns,axis=1, inplace=True, errors='ignore')

# Take a look at the data
#similar_user_movies

### Movie Scoring

In [11]:
# Score every candidate movie: weight each similar user's mean-centred rating
# by that user's similarity to the picked user, then average the weighted
# ratings over the similar users who actually rated the movie.
#
# mul(..., axis=0) lines the similarity Series up with the rows by userid.
# mean() skips NaN, so an unrated cell adds to neither the sum nor the count.
# A movie with no rating at all yields NaN rather than a ZeroDivisionError.
weighted_ratings = similar_user_movies.mul(similar_users, axis=0)
movie_scores = weighted_ratings.mean(axis=0)

# One row per movie (same order as the columns of similar_user_movies) with a
# fresh 0..k-1 index
item_score = movie_scores.rename_axis('movie').reset_index(name='movie_score')

# Sort the movies by score, best first
ranked_item_score = item_score.sort_values(by='movie_score', ascending=False)

# Select top m movies
m = 50
ranked_item_score.head(m)

Unnamed: 0,movie,movie_score
6,The Grand Budapest Hotel,1.285714
3,Inception,0.542857
0,Avatar,0.309524
8,The Wolf of Wall Street,0.285714
9,Toy Story 3,0.066667
5,Life of Pi,0.042857
4,La La Land,-0.2
7,The Shape of Water,-0.433333
2,Gravity,-0.714286
1,Birdman,-1.714286


In [12]:
# Mean of the picked user's raw (uncentred) ratings; unrated titles are NaN
# and are skipped by mean().
avg_rating = (
    matrix_data.loc[matrix_data.index == picked_userid]
    .mean(axis="columns")[picked_userid]
)

# Print the average movie rating for the picked user
print(f'The average movie rating for user {picked_userid} is {avg_rating:.2f}')

The average movie rating for user 332 is 4.20


### Predicted Rating

In [13]:
# Calculate the predicted rating: shift each centred movie score back onto the
# picked user's own scale by adding that user's average rating.
# NOTE(review): the result is not clipped, so it can exceed the highest rating
# present in the data - confirm whether that is intended.
ranked_item_score = ranked_item_score.assign(
    predicted_rating=ranked_item_score['movie_score'].add(avg_rating)
)

ranked_item_score.head(m)

Unnamed: 0,movie,movie_score,predicted_rating
6,The Grand Budapest Hotel,1.285714,5.485714
3,Inception,0.542857,4.742857
0,Avatar,0.309524,4.509524
8,The Wolf of Wall Street,0.285714,4.485714
9,Toy Story 3,0.066667,4.266667
5,Life of Pi,0.042857,4.242857
4,La La Land,-0.2,4.0
7,The Shape of Water,-0.433333,3.766667
2,Gravity,-0.714286,3.485714
1,Birdman,-1.714286,2.485714
