# Item-based collaborative filter

Using the database from: https://www.kaggle.com/rounakbanik/the-movies-dataset 

In [1]:
import pandas as pd
import numpy as np

In [2]:
movies = pd.read_csv('movies.csv',index_col=0)
movies.head()

Unnamed: 0_level_0,title
id,Unnamed: 1_level_1
862,Toy Story
8844,Jumanji
15602,Grumpier Old Men
31357,Waiting to Exhale
11862,Father of the Bride Part II


In [3]:
# ratings was preprocessed to remove ratings for obscure movies that were only watched by a handful of people
ratings = pd.read_csv('ratings_short.csv',usecols=[1,2,3])
ratings.columns = ['user','id','rating']
ratings = ratings.merge(movies,on='id').drop('id',axis=1)
ratings = ratings[['user','title','rating']]
ratings.head()

Unnamed: 0,user,title,rating
0,1,Three Colors: Red,1.0
1,11,Three Colors: Red,3.5
2,22,Three Colors: Red,5.0
3,24,Three Colors: Red,5.0
4,29,Three Colors: Red,3.0


In [4]:
# generate the pivot table for all the ratings (users as rows and movies as columns)
table = pd.pivot_table(ratings,index='user',columns='title')
table.columns = [a[1] for a in table.columns]
table.head()

Unnamed: 0_level_0,!Women Art Revolution,$5 a Day,'Gator Bait,'Twas the Night Before Christmas,...And the Pursuit of Happiness,10 Items or Less,10 Things I Hate About You,"10,000 BC",11'09''01 - September 11,12 Angry Men,...,Zombie Flesh Eaters,Zombie Holocaust,Zozo,eXistenZ,xXx,¡A volar joven!,À nos amours,Ödipussi,Şaban Oğlu Şaban,Šíleně smutná princezna
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [7]:
# prune table because otherwise this is very slow
table = table[table.count(axis=1)>100]

In [8]:
# check how many ratings there are per movie
movies_stats = ratings.groupby('title').agg({'rating':[np.size,np.mean]})
movies_stats.columns = ['ratings_count','mean_rating']
movies_stats.head()

Unnamed: 0_level_0,ratings_count,mean_rating
title,Unnamed: 1_level_1,Unnamed: 2_level_1
!Women Art Revolution,883.0,2.802945
$5 a Day,156.0,2.246795
'Gator Bait,195.0,3.44359
'Twas the Night Before Christmas,452.0,3.732301
...And the Pursuit of Happiness,167.0,2.826347


### Single item recommendations

In [16]:
# pick one movie at random to recommend similar movies for
target = np.random.choice(table.columns,size=1)[0]
target_rating = table[target]
target

'The Silence of the Lambs'

In [17]:
# correlation matrix for this movie
correlations = table.corrwith(target_rating).dropna()
correlations = pd.DataFrame(correlations)
correlations.columns = ['corr']
correlations.sort_values(by='corr', ascending=False).head(10)

  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)


Unnamed: 0,corr
The Celluloid Closet,1.0
Always,1.0
The Silence of the Lambs,1.0
The Holy Mountain,1.0
The Hoodlum Priest,1.0
Ring of Fire II: Blood and Steel,1.0
The Believers,1.0
The Chamber,1.0
Long Pigs,1.0
Fighting Back,1.0


In [18]:
# check correlation only for fairly popular movies
cutoff = 750
recommendations = movies_stats[movies_stats['ratings_count']>cutoff].join(correlations)
recommendations.sort_values(by='corr', ascending=False).head(10)

Unnamed: 0_level_0,ratings_count,mean_rating,corr
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
The Silence of the Lambs,1266.0,2.846367,1.0
Urban Explorer,1037.0,3.792671,0.814648
Dixie Chicks: Shut Up and Sing,859.0,2.058789,0.76351
The Beales of Grey Gardens,806.0,3.919975,0.759746
Teenage Mutant Ninja Turtles II: The Secret of the Ooze,812.0,2.303571,0.743274
The Bodyguard,780.0,2.617949,0.734512
Dragon Hunter,1783.0,2.437745,0.731258
A Few Good Men,1188.0,2.835017,0.72702
The Human Condition III: A Soldier's Prayer,826.0,2.141041,0.726956
Primal Fear,1594.0,2.341593,0.718455


### Many items recommendations

In [19]:
# cross correlation matrix for all movies (pruned, see above)
correlation_matrix = table.corr(min_periods=cutoff)
correlation_matrix.head()

Unnamed: 0,!Women Art Revolution,$5 a Day,'Gator Bait,'Twas the Night Before Christmas,...And the Pursuit of Happiness,10 Items or Less,10 Things I Hate About You,"10,000 BC",11'09''01 - September 11,12 Angry Men,...,Zombie Flesh Eaters,Zombie Holocaust,Zozo,eXistenZ,xXx,¡A volar joven!,À nos amours,Ödipussi,Şaban Oğlu Şaban,Šíleně smutná princezna
!Women Art Revolution,,,,,,,,,,,...,,,,,,,,,,
$5 a Day,,,,,,,,,,,...,,,,,,,,,,
'Gator Bait,,,,,,,,,,,...,,,,,,,,,,
'Twas the Night Before Christmas,,,,,,,,,,,...,,,,,,,,,,
...And the Pursuit of Happiness,,,,,,,,,,,...,,,,,,,,,,


In [69]:
# made up user, manually constructed
# minimum rating for a movie to be deemed watchable by user
minimum_watch = 3
target = {'Star Wars':5,'12 Angry Men':4,'2001: A Space Odyssey':1,
          'The Lord of the Rings':5,'Terminator 2: Judgment Day':5}

In [99]:
recommendations = pd.Series()
for movie in target:
    recommendations = recommendations.append(correlation_matrix[movie].dropna().map(
        lambda x: x*(target[movie]-minimum_watch)))

In [101]:
# add together duplicates
recommendations.groupby(recommendations.index).sum().sort_values(ascending=False).head(20)

The Lord of the Rings                                     2.000000
Terminator 2: Judgment Day                                1.925744
Star Wars                                                 1.717912
Monsieur Ibrahim                                          0.973142
Aliens vs Predator: Requiem                               0.918620
Eight Miles High                                          0.897916
Notes on a Scandal                                        0.849133
Oliver Twist                                              0.844201
That Man from Rio                                         0.815046
Pirates of the Caribbean: The Curse of the Black Pearl    0.813632
Hitman                                                    0.811026
M                                                         0.810865
Interview with the Vampire                                0.797615
Back to the Future Part II                                0.784315
Judgment Night                                            0.78

In [118]:
# real user (from ratings table) watched movies
target = table.iloc[0].dropna()

# same as above, find all 
recommendations = pd.Series()
for movie in target.index:
    recommendations = recommendations.append(correlation_matrix[movie].dropna().map(
        lambda x: x*(target[movie]-minimum_watch)))
recommendations = recommendations.groupby(recommendations.index).sum().sort_values(ascending=False)

In [123]:
# manually remove seen movies because the correlation matrix array is pruned, so there are missing movies
[(a,round(recommendations[a],2)) for a in recommendations.index if a not in target][:10]

[('The Good Shepherd', 26.19),
 ('Scarface', 25.61),
 ("Pirates of the Caribbean: Dead Man's Chest", 25.2),
 ('Dawn of the Dead', 24.67),
 ('The Sixth Sense', 24.46),
 ('Lonely Hearts', 24.21),
 ('That Man from Rio', 23.93),
 ('Beverly Hills Cop III', 23.61),
 ('Under the Sand', 23.35),
 ('Cold Mountain', 23.3)]