## Nearest Neighbours Collaborative Filtering System

In [215]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

In [225]:
# Load the raw tables: the movie catalogue, the user ratings, and the
# per-movie IMDb links (kept as a Series of URLs indexed by movieId).
movies = pd.read_csv('../../Datasets/movies.csv', index_col=[0])
ratings = pd.read_csv('../../Datasets/ratings.csv')
links = (
    pd.read_csv('../../Datasets/links.csv', index_col=[0])
    .set_index('movieId')['imdb_link']
)

# Two one-column lookup tables built from the catalogue:
# title -> movieId and movieId -> title.
movieId = movies.set_index("title")[["movieId"]]
movieTitle = movies.set_index("movieId")[["title"]]

In [226]:
# Look up the catalogue entry for movieId 34048.
movies.loc[movies['movieId'] == 34048]

Unnamed: 0,movieId,title,genres
5931,34048,War of the Worlds (2005),Action|Adventure|Sci-Fi|Thriller


In [227]:
# Preview the links Series: one IMDb URL per movieId.
links.head()

movieId
1    https://www.imdb.com/title/tt0114709
2    https://www.imdb.com/title/tt0113497
3    https://www.imdb.com/title/tt0113228
4    https://www.imdb.com/title/tt0114885
5    https://www.imdb.com/title/tt0113041
Name: imdb_link, dtype: object

In [228]:
# Preview the raw ratings table before it is merged with the catalogue.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [229]:
# Combine the datasets: attach the catalogue columns to every rating row
# (joined on the columns the two frames share), then discard the two columns
# that are not used afterwards.
ratings = movies.merge(ratings).drop(columns=["genres", "timestamp"])

# Spot check: list the merged rows for movieId 34049.
ratings.loc[ratings['movieId'] == 34049]

Unnamed: 0,movieId,title,userId,rating


In [230]:
# Ratings recorded under movieId 64997. Its title is the same as the
# catalogue entry for movieId 34048, so titles are not unique keys.
ratings.loc[ratings['movieId'] == 64997]

Unnamed: 0,movieId,title,userId,rating
88672,64997,War of the Worlds (2005),28,3.5
88673,64997,War of the Worlds (2005),68,2.5


In [231]:
# Keep only columns that hold at least 10 non-missing values, then replace any
# remaining missing values with 0.
# A scalar fill gives the same values whatever the axis, so `axis=1` is not
# passed. NOTE(review): some pandas versions appear to implement a scalar
# fill along axis=1 via a transpose, which would upcast this mixed-dtype frame
# to object columns - confirm against the installed version.
# The former second, in-place fillna was a no-op after the first and is gone.
ratings = ratings.dropna(thresh=10, axis=1).fillna(0)
ratings.head()

Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),1,4.0
1,1,Toy Story (1995),5,4.0
2,1,Toy Story (1995),7,4.5
3,1,Toy Story (1995),15,2.5
4,1,Toy Story (1995),17,4.5


In [232]:
# Reshape the long ratings table into a user x movie matrix: one row per
# userId, one column per movieId, cells holding the rating.
movies_ratings = ratings.pivot_table(
    index=["userId"], columns=["movieId"], values="rating"
)

# Drop movie columns with fewer than 10 ratings, then fill the remaining
# "not rated" cells with 0. One scalar fillna is enough: the fill does not
# depend on the axis, and the former second in-place fillna had nothing left
# to fill.
movies_ratings = movies_ratings.dropna(thresh=10, axis=1).fillna(0)

In [233]:
# Pairwise Pearson correlation between the movie columns of the user x movie
# matrix; both the rows and the columns of the result are labelled by movieId.
corr_matrix = movies_ratings.corr(method="pearson")
corr_matrix

movieId,1,2,3,5,6,7,9,10,11,12,...,166461,166528,166643,168250,168252,174055,176371,177765,179819,187593
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.231327,0.173213,0.192474,0.192686,0.143743,0.177245,0.183382,0.172799,0.159352,...,-0.003950,0.070499,0.111588,0.022451,0.066439,0.059796,0.054642,-0.013607,0.053028,0.022160
2,0.231327,1.000000,0.191945,0.200526,0.158341,0.127569,-0.021045,0.285086,0.217090,0.115290,...,0.047793,0.167779,0.049701,0.036534,0.195111,0.060834,0.150281,-0.020637,0.113709,0.118390
3,0.173213,0.191945,1.000000,0.370171,0.196442,0.351513,0.275812,0.136916,0.174251,0.168038,...,-0.035086,0.000094,0.018855,-0.031249,-0.000959,0.044769,0.033825,-0.039794,0.026881,-0.039206
5,0.192474,0.200526,0.370171,1.000000,0.215503,0.429890,0.308085,0.110833,0.201002,0.173630,...,0.029139,0.006434,-0.005243,-0.004671,-0.022762,-0.003536,0.000280,-0.038865,-0.000983,-0.038291
6,0.192686,0.158341,0.196442,0.215503,1.000000,0.148109,0.167909,0.251343,0.182082,0.115893,...,-0.052982,0.106208,0.006448,0.038345,0.123248,0.078585,0.119709,-0.060090,0.015800,0.038239
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174055,0.059796,0.060834,0.044769,-0.003536,0.078585,0.041336,0.014197,-0.009848,0.042380,0.017650,...,0.096152,0.184902,0.282894,0.341528,0.325772,1.000000,0.689737,0.127251,0.284834,0.128020
176371,0.054642,0.150281,0.033825,0.000280,0.119709,0.029589,-0.001716,0.032775,0.041955,0.083686,...,0.132310,0.434742,0.362638,0.368046,0.476659,0.689737,1.000000,0.169473,0.495133,0.382127
177765,-0.013607,-0.020637,-0.039794,-0.038865,-0.060090,-0.040824,-0.021576,-0.031616,-0.035557,-0.021954,...,0.237951,0.143826,0.085723,0.171565,0.062448,0.127251,0.169473,1.000000,0.362601,0.345148
179819,0.053028,0.113709,0.026881,-0.000983,0.015800,0.010491,-0.020035,-0.028194,0.041405,-0.005968,...,0.181528,0.468824,0.396642,0.299374,0.493593,0.284834,0.495133,0.362601,1.000000,0.466643


In [258]:
def recommend(user_ratings, top_n=30, neutral_rating=2.5):
    """Rank movies by rating-weighted correlation with the movies a user rated.

    Every rated title is resolved to its movie id(s) through the ``movieId``
    lookup table. The matching ``corr_matrix`` column is scaled by
    ``rating - neutral_rating`` and the scaled columns are summed row-wise,
    giving one score per movie.

    Parameters
    ----------
    user_ratings : sequence of (title, rating) pairs
        Titles are looked up in ``movieId``. A title shared by several movie
        ids contributes through every id that has a ``corr_matrix`` column.
        Rating the same movie more than once adds the weights together.
    top_n : int, default 30
        Number of highest-scoring movies kept before the links are joined.
    neutral_rating : float, default 2.5
        Rating that carries zero weight; ratings above it give a positive
        weight, ratings below it a negative one.

    Returns
    -------
    list or pandas.DataFrame
        ``[]`` when ``user_ratings`` is empty. Otherwise a frame indexed by
        movie id, best score first, holding the ``title`` and the ``links``
        value of each top movie that has an entry in ``links``.

    Raises
    ------
    KeyError
        If a title is not in ``movieId``, or none of the ids it maps to has
        a column in ``corr_matrix``.
    """
    if len(user_ratings) == 0:
        return []

    # One accumulated weight per movie id, in first-seen order.
    weights = {}
    for title, rating in user_ratings:
        # A list key makes .loc return a Series even for a unique title, so
        # titles that map to several ids are handled the same way.
        candidate_ids = movieId.loc[[title], "movieId"]
        usable_ids = [mid for mid in candidate_ids if mid in corr_matrix.columns]
        if not usable_ids:
            raise KeyError(f"no correlation data available for {title!r}")
        for mid in usable_ids:
            weights[mid] = weights.get(mid, 0.0) + (rating - neutral_rating)

    # Scale each selected column by its weight without writing into a
    # selection of corr_matrix, then add the columns up per movie.
    scores = (
        corr_matrix[list(weights)]
        .mul(pd.Series(weights), axis="columns")
        .sum(axis=1)
        .sort_values(ascending=False)
    )

    # Resolve titles only for the rows that are actually returned.
    top_ids = scores.head(top_n).index
    top_titles = movieTitle.loc[top_ids, "title"]

    # The inner join keeps only movies that also have a link.
    return pd.concat([top_titles, links], axis=1, join="inner")

In [259]:
# Example: recommendations for a single 5-star rating of Batman Begins.
# The rated title itself appears at the top of the result shown below.
recommend([['Batman Begins (2005)',5]])

Unnamed: 0_level_0,title,imdb_link
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
33794,Batman Begins (2005),https://www.imdb.com/title/tt0372784
32587,Sin City (2005),https://www.imdb.com/title/tt0401792
58559,The Dark Knight (2008),https://www.imdb.com/title/tt0468569
44191,V for Vendetta (2006),https://www.imdb.com/title/tt0434409
6539,Pirates of the Caribbean: The Curse of the Bla...,https://www.imdb.com/title/tt0325980
6365,The Matrix Reloaded (2003),https://www.imdb.com/title/tt0234215
7438,Kill Bill: Vol. 2 (2004),https://www.imdb.com/title/tt0378194
48780,The Prestige (2006),https://www.imdb.com/title/tt0482571
6874,Kill Bill: Vol. 1 (2003),https://www.imdb.com/title/tt0266697
8961,The Incredibles (2004),https://www.imdb.com/title/tt0317705
