In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [2]:
#Loading datset from CSV files - movies.csv and ratings.csv

movie_dataset = pd.read_csv('movies.csv')
movie_dataset['movieId'] = movie_dataset['movieId'].apply(pd.to_numeric)

ratings_dataset = pd.read_csv('ratings.csv')
movie_dataset.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
#Combining both tables- movies and ratings 

merge_data = pd.merge(ratings_dataset, movie_dataset, on='movieId')
merge_data.head(6)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,2,3.5,1112486027,Jumanji (1995),Adventure|Children|Fantasy
1,5,2,3.0,851527569,Jumanji (1995),Adventure|Children|Fantasy
2,13,2,3.0,849082742,Jumanji (1995),Adventure|Children|Fantasy
3,29,2,3.0,835562174,Jumanji (1995),Adventure|Children|Fantasy
4,34,2,3.0,846509384,Jumanji (1995),Adventure|Children|Fantasy
5,54,2,3.0,974918176,Jumanji (1995),Adventure|Children|Fantasy


In [4]:
# These columns are not required so, dropping Timestamp and Genre columns
columns = ['timestamp', 'genres']
data= merge_data.drop(columns, axis=1)
data.head(5)

Unnamed: 0,userId,movieId,rating,title
0,1,2,3.5,Jumanji (1995)
1,5,2,3.0,Jumanji (1995)
2,13,2,3.0,Jumanji (1995)
3,29,2,3.0,Jumanji (1995)
4,34,2,3.0,Jumanji (1995)


In [5]:
#Grouping the data on the basis of ratings and creating a new column for total count of ratings, 'TotalRatingCount'

rating =  data.dropna(axis = 0, subset = ['title'])
rating_count=(rating.groupby(by = ['title'])['rating'].count().reset_index())
rating_count = rating_count.rename(columns = {'rating': 'TotalRatingCount'})[['title', 'TotalRatingCount']]
    
rating_count.head()

Unnamed: 0,title,TotalRatingCount
0,"""Great Performances"" Cats (1998)",155
1,#chicagoGirl: The Social Network Takes on a Di...,3
2,$ (Dollars) (1971),24
3,$5 a Day (2008),39
4,$9.99 (2008),55


In [6]:
#Merging the calculated total count with original data

final_dataset = rating.merge(rating_count, left_on = 'title', right_on = 'title', how = 'left')
final_dataset.head()

Unnamed: 0,userId,movieId,rating,title,TotalRatingCount
0,1,2,3.5,Jumanji (1995),22243
1,5,2,3.0,Jumanji (1995),22243
2,13,2,3.0,Jumanji (1995),22243
3,29,2,3.0,Jumanji (1995),22243
4,34,2,3.0,Jumanji (1995),22243


In [7]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)
print(rating_count['TotalRatingCount'].describe())

count   26729.000
mean      748.261
std      3086.673
min         1.000
25%         3.000
50%        18.000
75%       205.000
max     67310.000
Name: TotalRatingCount, dtype: float64


In [8]:
#Drop duplicate data

rating_data = final_dataset.drop_duplicates(['userId','title'])
rating_data.head(5)

Unnamed: 0,userId,movieId,rating,title,TotalRatingCount
0,1,2,3.5,Jumanji (1995),22243
1,5,2,3.0,Jumanji (1995),22243
2,13,2,3.0,Jumanji (1995),22243
3,29,2,3.0,Jumanji (1995),22243
4,34,2,3.0,Jumanji (1995),22243


In [9]:
#Many movies have more than 50 rating, so limiting it to the top 1%
popularity_limit = 50
popular_movie= rating_data.query('TotalRatingCount >= @popularity_limit')
popular_movie.head()

Unnamed: 0,userId,movieId,rating,title,TotalRatingCount
0,1,2,3.5,Jumanji (1995),22243
1,5,2,3.0,Jumanji (1995),22243
2,13,2,3.0,Jumanji (1995),22243
3,29,2,3.0,Jumanji (1995),22243
4,34,2,3.0,Jumanji (1995),22243


In [10]:
popular_movie.shape

(19847742, 5)

In [12]:
## Creating Pivot matrix from sparse matrix and filling missing values with zeros 

feature_matrix=popular_movie.pivot_table(index='title',columns='userId',values='rating').fillna(0)
feature_matrix.head()

userId,1,2,3,4,5,6,7,8,9,10,...,138484,138485,138486,138487,138488,138489,138490,138491,138492,138493
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Great Performances"" Cats (1998)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
$9.99 (2008),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
matrix_transpose = feature_matrix.T
matrix_transpose.shape

(138493, 10523)

In [14]:
#Creating scipy sparse matrix
from scipy.sparse import csr_matrix

moviefeature_matrix = csr_matrix(feature_matrix.values)

# Applying K Nearest Algorithm with Cosine similarity

from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(moviefeature_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

In [15]:
feature_matrix.shape

(10523, 138493)

In [16]:
query_index = np.random.choice(feature_matrix.shape[0])
print(query_index)
query_index =2

5102


In [17]:
#Calculating distance using nearest neibhbor
distances, indices = model_knn.kneighbors(feature_matrix.iloc[query_index,:].values.reshape(1, -1), n_neighbors = 7)

In [18]:
for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(feature_matrix.index[query_index]))
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, feature_matrix.index[indices.flatten()[i]], distances.flatten()[i]))

Recommendations for 'Hellboy': The Seeds of Creation (2004):

1: 'Salem's Lot (2004), with distance of 0.9399856526332447:
2: Run Ronnie Run (2002), with distance of 0.9469909022691365:
3: Avengers, The (2012), with distance of 0.9536943656569585:
4: Abbott and Costello Meet Dr. Jekyll and Mr. Hyde (1953), with distance of 0.9566089577119974:
5: Batman: The Dark Knight Returns, Part 1 (2012), with distance of 0.9569720067227591:
6: Amazing Spider-Man, The (2012), with distance of 0.9575318541312836:


In [19]:
feature_matrix.shape

(10523, 138493)

In [20]:
# Applying SVD  and fitting model for dimensionality reduction

import sklearn
from sklearn.decomposition import TruncatedSVD

SVD_model = TruncatedSVD(n_components=12, random_state=17)
final_matrix = SVD_model.fit_transform(feature_matrix)
final_matrix.shape

(10523, 12)

In [21]:
# Calculating the Pearson’s R correlation coefficient

import warnings
warnings.filterwarnings("ignore",category =RuntimeWarning)
coefficient = np.corrcoef(final_matrix)
coefficient.shape

(10523, 10523)

In [24]:
# Generating movie recommendations

mv_title = feature_matrix.index
mv_title_list = list(mv_title)
mv_index = mv_title_list.index("Avengers, The (2012)")
print(mv_index)

706


In [25]:
result = coefficient[mv_index]
list(mv_title[(result < 1.0) & (result > 0.9)])

["'Hellboy': The Seeds of Creation (2004)",
 '(500) Days of Summer (2009)',
 '10,000 BC (2008)',
 '12 Years a Slave (2013)',
 '127 Hours (2010)',
 '13 (2010)',
 '2 Guns (2013)',
 '2012 (2009)',
 '21 (2008)',
 '21 Jump Street (2012)',
 '21 and Over (2013)',
 '22 Jump Street (2014)',
 '3 Idiots (2009)',
 '30 Days of Night: Dark Days (2010)',
 '30 Minutes or Less (2011)',
 '300: Rise of an Empire (2014)',
 '47 Ronin (2013)',
 '5 Centimeters per Second (Byôsoku 5 senchimêtoru) (2007)',
 '50/50 (2011)',
 '9 (2009)',
 'A-Team, The (2010)',
 'Abduction (2011)',
 'About Time (2013)',
 'Abraham Lincoln: Vampire Hunter (2012)',
 'Adjustment Bureau, The (2011)',
 'Adventureland (2009)',
 'Adventures of Tintin, The (2011)',
 'After Earth (2013)',
 'Agora (2009)',
 'Alan Partridge: Alpha Papa (2013)',
 'Alice in Wonderland (2010)',
 'All-Star Superman (2011)',
 'Alone in the Wilderness (2004)',
 'Amazing Spider-Man, The (2012)',
 'American Hustle (2013)',
 'American Pie Presents Beta House (America