# Collaborative Filtering

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
user_ratings_df = pd.read_csv('user_ratings.csv')

In [3]:
def clean_title(x):
    return re.sub(r'\([0-9].*\)', '',x).strip()

In [4]:
user_ratings_df['title'] = user_ratings_df['title'].apply(clean_title)
user_ratings_df['title']

0                         Toy Story
1                         Toy Story
2                         Toy Story
3                         Toy Story
4                         Toy Story
                    ...            
100831                    Bloodmoon
100832    Sympathy for the Underdog
100833                       Hazard
100834                  Blair Witch
100835                           31
Name: title, Length: 100836, dtype: object

In [5]:
# Transform the table
movie_ratings_table = user_ratings_df.pivot_table(index='title', columns='userId', values='rating')
movie_ratings_table

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,,,,,,,,,,,...,,,,,,,,,,3.5
'71,,,,,,,,,,,...,,,,,,,,,,4.0
'Hellboy': The Seeds of Creation,,,,,,,,,,,...,,,,,,,,,,
'Round Midnight,,,,,,,,,,,...,,,,,,,,,,
'Salem's Lot,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eXistenZ,,,,,,,,,,,...,,,5.0,,,,,4.5,,
xXx,,,,,,,,,1.0,,...,,,,,,,,3.5,,2.0
xXx: State of the Union,,,,,,,,,,,...,,,,,,,,,,1.5
¡Three Amigos!,4.0,,,,,,,,,,...,,,,,,,,,,


In [6]:
# Average rating given for a movie by all users
avg_ratings = movie_ratings_table.mean(axis=1)

In [7]:
movie_ratings_table_centered = np.round(movie_ratings_table.sub(avg_ratings, axis=0),1)
movie_ratings_table_normed = movie_ratings_table_centered.fillna(0)
movie_ratings_table_normed

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.2
'71,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Hellboy': The Seeds of Creation,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eXistenZ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.1,0.0,0.0,0.0,0.0,0.6,0.0,0.0
xXx,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.8,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.7,0.0,-0.8
xXx: State of the Union,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.5
¡Three Amigos!,0.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Create item-item similarity matrix

In [8]:
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
# Assign the arrays to variables
sw_IV = movie_ratings_table_normed.loc['Star Wars: Episode IV - A New Hope', :].values.reshape(1, -1)
sw_V = movie_ratings_table_normed.loc['Star Wars: Episode V - The Empire Strikes Back', :].values.reshape(1, -1)

In [10]:
# Find the similarity between two Star Wars movies
similarity_A = cosine_similarity(sw_IV, sw_V)
print(similarity_A)

[[0.57125693]]


In [11]:
# Generate the similarity matrix
similarities = np.round(cosine_similarity(movie_ratings_table_normed),3)

# Wrap the similarities in a DataFrame
cosine_similarity_df = pd.DataFrame(similarities, index=movie_ratings_table_normed.index, columns=movie_ratings_table_normed.index)
cosine_similarity_df

title,Unnamed: 1_level_0,'71,'Hellboy': The Seeds of Creation,'Round Midnight,'Salem's Lot,'Til There Was You,'Tis the Season for Love,"'burbs, The",'night Mother,*batteries not included,...,Zulu,[REC],[REC]²,[REC]³ 3 Génesis,anohana: The Flower We Saw That Day - The Movie,eXistenZ,xXx,xXx: State of the Union,¡Three Amigos!,À nous la liberté (Freedom for Us)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,1.000,0.0,0.0,0.0,0.0,0.0,0.0,0.053,0.0,0.275,...,0.118,-0.091,0.005,0.0,0.0,-0.071,0.152,0.111,0.041,0.0
'71,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.000,...,0.000,0.000,0.000,0.0,0.0,0.000,0.000,0.000,0.000,0.0
'Hellboy': The Seeds of Creation,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.000,...,0.000,0.000,0.000,0.0,0.0,0.000,0.000,0.000,0.000,0.0
'Round Midnight,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.000,...,0.000,0.000,0.000,0.0,0.0,0.000,0.000,0.000,0.000,0.0
'Salem's Lot,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.000,...,0.000,0.000,0.000,0.0,0.0,0.000,0.000,0.000,0.000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eXistenZ,-0.071,0.0,0.0,0.0,0.0,0.0,0.0,0.070,0.0,0.032,...,-0.005,0.000,0.000,0.0,0.0,1.000,0.034,0.000,-0.032,0.0
xXx,0.152,0.0,0.0,0.0,0.0,0.0,0.0,-0.003,0.0,0.260,...,0.273,-0.203,0.004,0.0,0.0,0.034,1.000,0.248,0.016,0.0
xXx: State of the Union,0.111,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.000,...,0.000,-0.184,0.092,0.0,0.0,0.000,0.248,1.000,0.000,0.0
¡Three Amigos!,0.041,0.0,0.0,0.0,0.0,0.0,0.0,0.015,0.0,0.127,...,0.018,0.000,0.000,0.0,0.0,-0.032,0.016,0.000,1.000,0.0


In [12]:
# Find the similarity values for a specific movie
cosine_similarity_series = cosine_similarity_df.loc['Star Wars: Episode IV - A New Hope']

# Sort these values highest to lowest
ordered_similarities_items = cosine_similarity_series.sort_values(ascending=False)

print(ordered_similarities_items)

title
Star Wars: Episode IV - A New Hope                                         1.000
Star Wars: Episode V - The Empire Strikes Back                             0.571
Star Wars: Episode VI - Return of the Jedi                                 0.536
Indiana Jones and the Last Crusade                                         0.235
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark)    0.235
                                                                           ...  
2 Days in the Valley                                                      -0.190
Atlantic City                                                             -0.191
Career Girls                                                              -0.193
Heavy                                                                     -0.205
Blow Out                                                                  -0.236
Name: Star Wars: Episode IV - A New Hope, Length: 9446, dtype: float64


#### Five items most similar to <font color='brown'> Star Wars: Episode IV - A New Hope </font>

In [13]:
list(ordered_similarities_items[1:6].index)

['Star Wars: Episode V - The Empire Strikes Back',
 'Star Wars: Episode VI - Return of the Jedi',
 'Indiana Jones and the Last Crusade',
 'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark)',
 'Stardust Memories']

In [36]:
movie ='Star Wars: Episode IV - A New Hope'
# users who have rated this movie highly
user_ratings_for_movie = movie_ratings_table_normed.loc[[movie]]
sorted_ratings = user_ratings_for_movie.sort_values(by=movie, axis=1, ascending=False)

userId,1,249,267,276,279,288,290,294,302,304,...,497,549,363,224,370,361,73,466,156,461
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Star Wars: Episode IV - A New Hope,0.8,0.8,0.8,0.8,0.8,0.8,0.8,0.8,0.8,0.8,...,-2.2,-2.2,-2.2,-2.2,-2.2,-2.2,-2.2,-2.7,-3.2,-3.7


In [40]:
recommend_to_users = list(sorted_ratings.columns[0:5])
recommend_to_users

[1, 249, 267, 276, 279]

In [43]:
print("The item-item model recommends '{0}' to users {1}".format(movie,recommend_to_users ))

The item-item model recommends 'Star Wars: Episode IV - A New Hope' to users [1, 249, 267, 276, 279]


## k-NN Approach

In [14]:
def get_target(user,movie):
    # Get the data for the user you are predicting for
    target_movie_x = movie_ratings_table_normed.loc[[movie]]
    target_movie_x.drop(user, axis=1, inplace=True)
    return target_movie_x

In [15]:
def prepare_data(user,movie):
    # Get the data for the user you are predicting for
    target_movie_x = movie_ratings_table_normed.loc[[movie]]
    target_movie_x.drop(user, axis=1, inplace=True)
    movies_data = movie_ratings_table_normed[movie_ratings_table_normed.index!=movie]
    other_movies_y = movies_data[user]
    other_movies_x = movies_data.drop(user, axis=1)
    return other_movies_x, other_movies_y

In [16]:
# Import the regressor
from sklearn.neighbors import KNeighborsRegressor

# Instantiate the user KNN model
movie_knn = KNeighborsRegressor(metric='cosine', n_neighbors=10)

In [21]:
x_q = get_target(1,'Indiana Jones and the Last Crusade')
x_q

userId,2,3,4,5,6,7,8,9,10,11,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Indiana Jones and the Last Crusade,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-1.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,0.5


In [22]:
X, y = prepare_data(1,'Indiana Jones and the Last Crusade')

In [23]:
movie_knn.fit(X, y)
item_item_pred = movie_knn.predict(x_q)
print("The item-item model predicts {}".format(item_item_pred))

The item-item model predicts [0.3]
