In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors as nn


In [2]:
movies_df = pd.read_csv("movies.csv")
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
ratings_df = pd.read_csv("ratings.csv")
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
merged_df = movies_df.merge(ratings_df, on=["movieId"])
merged_df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483


In [5]:
print("Movies", movies_df.shape)
print("Ratings", ratings_df.shape)
print("Merged", merged_df.shape)

Movies (9742, 3)
Ratings (100836, 4)
Merged (100836, 6)


In [6]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100835
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   movieId    100836 non-null  int64  
 1   title      100836 non-null  object 
 2   genres     100836 non-null  object 
 3   userId     100836 non-null  int64  
 4   rating     100836 non-null  float64
 5   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 5.4+ MB


In [7]:
merged_df.isnull().sum()

movieId      0
title        0
genres       0
userId       0
rating       0
timestamp    0
dtype: int64

In [8]:
rating_count = merged_df.groupby(["title", "movieId"])["rating"].count().reset_index().rename(columns={"rating":"total_ratings"})[["movieId", "title","total_ratings"]]
rating_count.head()

Unnamed: 0,movieId,title,total_ratings
0,117867,'71 (2014),1
1,97757,'Hellboy': The Seeds of Creation (2004),1
2,26564,'Round Midnight (1986),2
3,27751,'Salem's Lot (2004),1
4,779,'Til There Was You (1997),2


In [9]:
combined_df = merged_df.merge(rating_count, on=["title", "movieId"], how="left")
combined_df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,total_ratings
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703,215
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962,215
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946,215
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970,215
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483,215


In [10]:
combined_df.describe()

Unnamed: 0,movieId,userId,rating,timestamp,total_ratings
count,100836.0,100836.0,100836.0,100836.0,100836.0
mean,19435.295718,326.127564,3.501557,1205946000.0,58.755801
std,35530.987199,182.618491,1.042529,216261000.0,61.96667
min,1.0,1.0,0.5,828124600.0,1.0
25%,1199.0,177.0,3.0,1019124000.0,13.0
50%,2991.0,325.0,3.5,1186087000.0,39.0
75%,8122.0,477.0,4.0,1435994000.0,84.0
max,193609.0,610.0,5.0,1537799000.0,329.0


In [11]:
#### Total Rating quatile data ####
quantiles = combined_df["total_ratings"].quantile([0.25, 0.5, 0.75])
quantiles

0.25    13.0
0.50    39.0
0.75    84.0
Name: total_ratings, dtype: float64

In [12]:
IQR = quantiles[0.75] - quantiles[0.25]
IQR

71.0

In [13]:
lower_limit = quantiles[0.25] - 1.5*IQR
upper_limit = quantiles[0.75] + 1.5*IQR
print("limits", lower_limit, upper_limit)

limits -93.5 190.5


Quantiles cannot be used in this case as the greater number of reviews grater the popularity

In [14]:
%%time
# Minimum threshold set for a movie to be recomemded
popularity_threshold = 50
popular_movies = combined_df.loc[combined_df["total_ratings"]>=popularity_threshold]
popular_movies.shape

CPU times: user 2.72 ms, sys: 1.2 ms, total: 3.92 ms
Wall time: 2.85 ms


(41360, 7)

Use .loc method only as that is faster than the query method

In [15]:
%%time
popularity_threshold = 50
popular_movies = combined_df.query("total_ratings>=@popularity_threshold")
popular_movies.shape

CPU times: user 4.01 ms, sys: 1.37 ms, total: 5.38 ms
Wall time: 4.13 ms


(41360, 7)

Creating pivot matrix

In [16]:
movie_features_df = popular_movies.pivot_table(index="title", columns="userId", values="rating").fillna(0)
movie_features_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,5.0,0.0,3.0,0.0,4.5
28 Days Later (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,5.0
300 (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,4.0


Creating an array matrix

In [21]:
movie_features_df[:10]

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,5.0,0.0,3.0,0.0,4.5
28 Days Later (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,5.0
300 (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,4.0
"40-Year-Old Virgin, The (2005)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5
A.I. Artificial Intelligence (2001),0.0,0.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,3.5,0.0,4.5,0.0,3.5
"Abyss, The (1989)",4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,4.0
Ace Ventura: Pet Detective (1994),0.0,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,2.0,0.0,0.0,0.0,3.5,0.0,3.0
Ace Ventura: When Nature Calls (1995),0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,1.0,0.0,2.0,0.0,2.0,0.0,0.0


In [22]:
split = len(movie_features_df)//5
array_matrix = []
start = 0
end = 0
for i in range(5):
    end = start+split
    array_matrix.append(csr_matrix(movie_features_df[start:end].values))
    start = end
print(array_matrix)

[<90x606 sparse matrix of type '<class 'numpy.float64'>'
	with 8135 stored elements in Compressed Sparse Row format>, <90x606 sparse matrix of type '<class 'numpy.float64'>'
	with 8255 stored elements in Compressed Sparse Row format>, <90x606 sparse matrix of type '<class 'numpy.float64'>'
	with 8265 stored elements in Compressed Sparse Row format>, <90x606 sparse matrix of type '<class 'numpy.float64'>'
	with 8579 stored elements in Compressed Sparse Row format>, <90x606 sparse matrix of type '<class 'numpy.float64'>'
	with 8126 stored elements in Compressed Sparse Row format>]


In [23]:
model_knn = nn(metric="cosine", algorithm="brute")

In [36]:
query_index = 240
c = 1
for matrix in array_matrix: 
    model_knn.fit(matrix)
    distance, index = model_knn.kneighbors(movie_features_df.iloc[query_index,:].values.reshape(1,-1), n_neighbors=6)
    print("ITERATION: ", c)
    for i in range(0, len(distance.flatten())):
        if i == 0:
            print("recommendation for {}".format(movie_features_df.index[query_index]))
        else:
            print("{}: {}, with distance of {}".format(i, movie_features_df.index[index[0][i]], distance[0][i]))
    c+=1
    

ITERATION:  1
recommendation for Leaving Las Vegas (1995)
1: Casino (1995), with distance of 0.6095470215889399
2: Broken Arrow (1996), with distance of 0.6369844465111143
3: Babe (1995), with distance of 0.6833780185835203
4: American President, The (1995), with distance of 0.6989988021718019
5: Batman (1989), with distance of 0.7064491684787317
ITERATION:  2
recommendation for Leaving Las Vegas (1995)
1: Blade Runner (1982), with distance of 0.5974010696406491
2: Beauty and the Beast (1991), with distance of 0.6271457555496478
3: Caddyshack (1980), with distance of 0.6656163603756377
4: Ace Ventura: Pet Detective (1994), with distance of 0.6882892318188077
5: Casablanca (1942), with distance of 0.6955507186838936
ITERATION:  3
recommendation for Leaving Las Vegas (1995)
1: American Beauty (1999), with distance of 0.5603442623696098
2: Austin Powers: International Man of Mystery (1997), with distance of 0.6041558605062269
3: 10 Things I Hate About You (1999), with distance of 0.697039

In [31]:
movie_features_df.shape

(450, 606)

In [26]:
query_index = 240 #np.random.choice(movie_features_df.shape[0])
query_index

240

In [31]:
distance, index = model_knn.kneighbors(movie_features_df.iloc[query_index,:].values.reshape(1,-1), n_neighbors=6)

In [32]:
distance

array([[0.53928512, 0.63774575, 0.64787755, 0.67592755, 0.67937041,
        0.69517793]])

In [33]:
for i in range(0, len(distance.flatten())):
    if i == 0:
        print("recommendation for {}".format(movie_features_df.index[query_index]))
    else:
        print("{}: {}, with distance of {}".format(i, movie_features_df.index[index[0][i]], distance[0][i]))

recommendation for Leaving Las Vegas (1995)
1: Blair Witch Project, The (1999), with distance of 0.6377457543575933
2: Blade (1998), with distance of 0.6478775503187729
3: Beauty and the Beast (1991), with distance of 0.6759275523721651
4: Bruce Almighty (2003), with distance of 0.6793704133863514
5: Blues Brothers, The (1980), with distance of 0.6951779321400062
