# User-based collaborative filter

Using the database from: https://www.kaggle.com/rounakbanik/the-movies-dataset 

In [1]:
import pandas as pd
import numpy as np

In [2]:
movies = pd.read_csv('movies.csv',index_col=0)
movies.head()

Unnamed: 0_level_0,title
id,Unnamed: 1_level_1
862,Toy Story
8844,Jumanji
15602,Grumpier Old Men
31357,Waiting to Exhale
11862,Father of the Bride Part II


In [4]:
# ratings was preprocessed to remove ratings for obscure movies that were only watched by a handful of people
ratings = pd.read_csv('ratings_short.csv',usecols=[1,2,3])
ratings.columns = ['user','id','rating']
ratings.head()

Unnamed: 0,user,id,rating
0,1,110,1.0
1,1,147,4.5
2,1,858,5.0
3,1,1221,5.0
4,1,1246,5.0


In [5]:
ratings = ratings.merge(movies,on='id').drop('id',axis=1)
ratings.head()

Unnamed: 0,user,rating,title
0,1,1.0,Three Colors: Red
1,11,3.5,Three Colors: Red
2,22,5.0,Three Colors: Red
3,24,5.0,Three Colors: Red
4,29,3.0,Three Colors: Red


In [6]:
# generate the pivot table for all the ratings (users as rows and movies as columns)
table = pd.pivot_table(ratings,index='user',columns='title')
# prune out users that watched less than 5 movies
table = table[table.count(axis=1)>=5]
# clean the column names
table.columns = [j for i,j in table.columns]

In [7]:
# precalculate the vector size for later
process = pd.DataFrame(index=table.index)
process['length'] = np.sqrt((table.fillna(0)**2).sum(axis=1))
process.head()

Unnamed: 0_level_0,length
user,Unnamed: 1_level_1
1,13.883443
2,14.035669
3,9.0
4,18.165902
5,13.784049


In [91]:
acceptable_rating = 3
min_support = 5
n = 100

In [54]:
# randomly choose target user to recommend movies to
target = table.index[np.random.randint(len(table))]
table.loc[target].dropna().sort_values(ascending=False).head(10)

Human Nature                                    5.0
Star Wars: Episode III - Revenge of the Sith    4.5
American Graffiti                               4.5
Wedlock                                         4.0
The King of Comedy                              4.0
The Marriage of Maria Braun                     3.5
Monsters, Inc.                                  3.5
Changing Lanes                                  1.5
The Island                                      1.0
The Fountain                                    1.0
Name: 201285, dtype: float64

In [55]:
# rank all other users by similarity
# I use cosine similarity multiplied by the fraction of films in common
dot_product = (table.loc[target].fillna(0) * table.fillna(0)).sum(axis=1)
in_common = ((table.loc[target].fillna(0) * table.fillna(0))>0).sum(axis=1)

In [56]:
# divide by vector length (no need to divide by target length since it is always the same)
# multiply by in_common (no need for fraction, because the denominator is the same)
similarity = (dot_product / process['length']) * in_common

In [92]:
# find n most similar users
neighbors = similarity[similarity>0].sort_values(ascending=False).head(neighbors).drop(target)

In [101]:
# count the number of supporting neighbors for each movie
support = table.loc[neighbors.index].count()
# calculated the weighted (by similarity) average rating - off-set by the minimum rating acceptable
means = (table.loc[neighbors.index]-acceptable_rating).apply(lambda x: x * neighbors.values).mean()
# prune movies with not enough support
supported = means[support>min_support]
# removed watched movies
potential = supported[supported.index.difference(table.loc[target].dropna().index)]
# sort by weighted score, show top suggestions
potential[potential>0].sort_values(ascending=False).head(20)

Dead Man                                             20.933815
Hard Target                                          19.812212
The Sicilian Clan                                    19.624623
3-Iron                                               19.255941
Vivere                                               18.483348
The Million Dollar Hotel                             18.222054
Eight Miles High                                     17.846802
Don't Worry, I'm Fine                                17.772976
Galaxy Quest                                         17.522253
Once Were Warriors                                   17.396273
The Sixth Sense                                      17.346172
The Tulse Luper Suitcases, Part 1: The Moab Story    17.071667
Lotte from Gadgetville                               17.028374
Under the Sand                                       16.997765
Red River                                            16.717607
The Mummy Returns                                    16