#### Steps Involved

1. Import all the libraries
2. Import dataset(s)
3. Merge the datasets on their common feature (`movieId`)
4. Create Pivot table
5. Create a Sparse Matrix
6. Build a nearest-neighbor model based on cosine similarity between movies

## 1. Import Modules

In [1]:
import pandas as pd
import numpy as np

## 2. Load dataset

In [2]:
# Read the movie catalogue and the user ratings from CSV files in the working
# directory; both frames share the `movieId` column used for the merge later on.
movies_df, rating_df = (
    pd.read_csv(csv_name) for csv_name in ('movies.csv', 'ratings.csv')
)

In [3]:
# Preview the first rows of the movie catalogue (movieId, title, genres).
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Preview the first rows of the ratings table (userId, movieId, rating, timestamp).
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
# Report (rows, columns) of each frame as a quick sanity check on the load.
print('Movies Dataframe shape : ', movies_df.shape)
print('Rating Dataframe shape : ', rating_df.shape)

Movies Dataframe shape :  (9742, 3)
Rating Dataframe shape :  (100836, 4)


## 3. Merge the movie and rating dataset

In [6]:
# Attach title/genres to every rating row by joining on the shared `movieId`
# key (default inner join: only ratings with a matching movie row are kept).
merged_df = rating_df.merge(movies_df, on='movieId')
merged_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [7]:
# Keep only the columns the recommender needs (userId, movieId, rating, title).
# Re-assigning the result instead of using inplace=True, together with
# errors='ignore', makes this cell idempotent: running it a second time no
# longer raises KeyError because the columns are already gone.
merged_df = merged_df.drop(columns=['timestamp', 'genres'], errors='ignore')

In [8]:
# Confirm the column drop: the row count is unchanged and four columns remain.
merged_df.shape

(100836, 4)

## 4. Create Pivot Table

In [9]:
# Build the movie x user rating matrix: one row per title, one column per userId.
# NOTE: pivot_table aggregates with its default (mean), so any rows that share
# the same title and userId are averaged into a single cell.
# Movies a user has not rated come out as NaN and are replaced with 0.
movie_features_df = pd.pivot_table(
    merged_df,
    values='rating',
    index='title',
    columns='userId',
).fillna(0)
movie_features_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# (number of distinct titles, number of users) in the pivot table.
movie_features_df.shape

(9719, 610)

In [11]:
# Underlying NumPy array of the pivot table (rows = titles, columns = userIds).
movie_features_df.values

array([[0. , 0. , 0. , ..., 0. , 0. , 4. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 0. , 0. , 1.5],
       [4. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ]])

## 5. Create Sparse Matrix from Pivot table

In [12]:
from scipy.sparse import csr_matrix

# Most entries of the rating matrix are 0 (unrated), so store it in
# compressed-sparse-row form before handing it to the neighbour search.
movie_features_df_matrix = csr_matrix(movie_features_df.to_numpy())

## 6. Build NearestNeighbors model based on cosine similarity

In [13]:
from sklearn.neighbors import NearestNeighbors

# Brute-force search with cosine distance between the rows (movies) of the
# sparse rating matrix.
nearest_neighbor_model = NearestNeighbors(algorithm='brute', metric='cosine')
# fit() returns the estimator itself, so leaving it as the last expression
# shows the model's repr as the cell output.
nearest_neighbor_model.fit(movie_features_df_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

## 7. Select a random movie

In [14]:
# Number of rows (movies) in the pivot table; valid positional indices are
# 0 .. total_no_of_moives - 1.
total_no_of_moives = movie_features_df.shape[0]
print('Total Movies in our pivot table : ', total_no_of_moives)
print()

# Draw the index from a seeded generator so that Restart Kernel -> Run All
# selects the same movie (and therefore the same recommendations) every time.
# Change the seed to sample a different movie.
random_movie_rng = np.random.default_rng(42)
random_movie_index = int(random_movie_rng.integers(total_no_of_moives))
print('Random Moive Index : ', random_movie_index)

Total Movies in our pivot table :  9719

Random Moive Index :  136


In [15]:
# Show the selected movie's rating vector (one entry per userId, 0 = unrated).
movie_features_df.iloc[random_movie_index]

userId
1      0.0
2      0.0
3      0.0
4      0.0
5      0.0
      ... 
606    0.0
607    0.0
608    0.0
609    0.0
610    0.0
Name: 7th Voyage of Sinbad, The (1958), Length: 610, dtype: float64

## 8. One-dimensional vector representation of the random movie

In [16]:
# kneighbors() expects a 2-D array of shape (n_queries, n_features). Selecting
# the row with a one-element list keeps it two-dimensional, giving a (1, n_users)
# array for the single query movie.
one_dimensional_representation_of_movie_vector = (
    movie_features_df.iloc[[random_movie_index]].to_numpy()
)
one_dimensional_representation_of_movie_vector

array([[0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 4. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. 

## 9. Similar movies for our random movie based on cosine distance

In [17]:
# Ask for 6 neighbours: the query movie is itself part of the fitted data and
# normally comes back as the closest match, leaving 5 other movies.
distances, indices = nearest_neighbor_model.kneighbors(
    X=one_dimensional_representation_of_movie_vector,
    n_neighbors=6,
)

In [18]:
# Raw kneighbors() result: both arrays have shape (1, 6), one row per query,
# and position j in `distances` corresponds to position j in `indices`.
print('Distance :', distances)
print('Indices :', indices)

Distance : [[0.         0.43223711 0.44417378 0.4686611  0.49024154 0.49620728]]
Indices : [[ 136  269   60 2372 4231 3869]]


In [19]:
# Collapse the (1, k) kneighbors() results into flat length-k arrays.
indices = indices.flatten()
distances = distances.flatten()

print('Recommendations for {0}:\n'.format(movie_features_df.index[random_movie_index]))

# Exclude the query movie by its index rather than assuming it sits at
# position 0: when several movies tie at cosine distance 0 (identical rating
# patterns) the query may be returned at another position, or not at all.
# Skipping position 0 blindly would then drop a real neighbour and could list
# the query movie as its own recommendation.
max_recommendations = len(indices) - 1
recommendations = [
    (neighbor_index, neighbor_distance)
    for neighbor_index, neighbor_distance in zip(indices, distances)
    if neighbor_index != random_movie_index
][:max_recommendations]

for rank, (neighbor_index, neighbor_distance) in enumerate(recommendations, start=1):
    print('{0}: {1}, with distance of {2}'.format(rank, movie_features_df.index[neighbor_index], neighbor_distance))

Recommendations for 7th Voyage of Sinbad, The (1958):

1: After the Thin Man (1936), with distance of 0.43223710904091805
2: 1984 (Nineteen Eighty-Four) (1984), with distance of 0.44417378279602737
3: Devil's Playground (2002), with distance of 0.468661104162323
4: Ice Castles (1978), with distance of 0.4902415398760306
5: Hell in the Pacific (1968), with distance of 0.49620727814012167
