In [1]:
#import important libraries
import numpy as np
import pandas as pd

In [2]:
# Load the user-movie ratings through the public pd.read_csv entry point.
# The file has no header row and uses the multi-character separator '::', which
# needs the Python parsing engine. The 'time' column is dropped immediately
# (without inplace mutation) so `data` only holds user_id, movie_id and rating.
data = pd.read_csv(
    'ratings.dat',
    sep='::',
    engine='python',
    header=None,
    names=['user_id', 'movie_id', 'rating', 'time'],
).drop(columns=['time'])
data.head()

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [3]:
# Check the shape of the ratings frame as (rows, columns)
data.shape

(1000209, 3)

In [4]:
# Per-movie 'count' and 'mean' of every other column (user_id and rating); reference: pandas documentation.
# `f` is the list of aggregation names and is reused by the later groupby cells.
# NOTE(review): the 'mean' of user_id is an average of identifiers and carries no
# meaning; only the rating columns are informative here.
f=['count','mean']
data_summary=data.groupby('movie_id').agg(f)
data_summary

Unnamed: 0_level_0,user_id,user_id,rating,rating
Unnamed: 0_level_1,count,mean,count,mean
movie_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,2077,3053.819933,2077,4.146846
2,701,3027.977175,701,3.201141
3,478,2632.156904,478,3.016736
4,170,3268.841176,170,2.729412
5,296,3143.152027,296,3.006757
...,...,...,...,...
3948,862,2063.107889,862,3.635731
3949,304,2289.046053,304,4.115132
3950,54,2123.370370,54,3.666667
3951,40,1687.925000,40,3.900000


In [5]:
# Number of ratings ('count') and average rating ('mean') for each movie_id,
# using the aggregation list `f` defined in an earlier cell
data_movie_summary=data.groupby('movie_id')['rating'].agg(f)
data_movie_summary

Unnamed: 0_level_0,count,mean
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2077,4.146846
2,701,3.201141
3,478,3.016736
4,170,2.729412
5,296,3.006757
...,...,...
3948,862,3.635731
3949,304,4.115132
3950,54,3.666667
3951,40,3.900000


In [6]:
# 0.7 quantile (70th percentile) of the per-movie rating counts
data_movie_summary["count"].quantile(0.7)

280.0

In [7]:
# Movie benchmark: the 0.7 quantile of per-movie rating counts, rounded to 0 decimals.
# Movies with fewer ratings than this are put on the drop list in a later cell.
movie_benchmark=round(data_movie_summary['count'].quantile(0.7),0)
movie_benchmark

280.0

In [8]:
# Index of movie_ids whose rating count is strictly below the movie benchmark
# (these movies are removed from `data` in a later cell)
drop_movie_list=data_movie_summary[data_movie_summary['count']<movie_benchmark].index
drop_movie_list

Index([   4,    8,    9,   12,   13,   14,   15,   18,   20,   23,
       ...
       3940, 3941, 3942, 3943, 3944, 3945, 3946, 3947, 3950, 3951],
      dtype='int64', name='movie_id', length=2593)

In [9]:
# Number of ratings ('count') and average rating ('mean') for each user_id,
# using the aggregation list `f` defined in an earlier cell
data_user_summary=data.groupby('user_id')['rating'].agg(f)
data_user_summary

Unnamed: 0_level_0,count,mean
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,53,4.188679
2,129,3.713178
3,51,3.901961
4,21,4.190476
5,198,3.146465
...,...,...
6036,888,3.302928
6037,202,3.717822
6038,20,3.800000
6039,123,3.878049


In [10]:
# User benchmark: the 0.7 quantile of per-user rating counts, rounded to 0 decimals.
# Users with fewer ratings than this are put on the drop list in the next cell.
cust_benchmark=round(data_user_summary['count'].quantile(0.7),0)
cust_benchmark

173.0

In [11]:
# Index of user_ids whose rating count is strictly below the user benchmark
# (these users are removed from `data` in a later cell)
drop_user_list=data_user_summary[data_user_summary['count']<cust_benchmark].index
drop_user_list

Index([   1,    2,    3,    4,    6,    7,    8,    9,   11,   12,
       ...
       6027, 6028, 6029, 6030, 6031, 6032, 6033, 6034, 6038, 6039],
      dtype='int64', name='user_id', length=4225)

In [12]:
# Report the size of the ratings frame before any movies or users are filtered out
print('The original dataframe has: ', data.shape, 'shape')

The original dataframe has:  (1000209, 3) shape


In [13]:
# Keep only the ratings whose movie and user are both absent from the drop lists,
# then report the size of the trimmed frame.
# A row is removed when its movie OR its user is on a drop list, so a single
# combined mask selects the same rows as two successive filters.
data = data[~(data['movie_id'].isin(drop_movie_list) | data['user_id'].isin(drop_user_list))]
print('After the triming, the shape is: {}'.format(data.shape))

After the triming, the shape is: (528192, 3)


In [14]:
# Load the movie catalogue through the public pd.read_csv entry point.
# The file has no header row, uses the multi-character separator '::' (which
# needs the Python parsing engine) and is decoded as latin-1.
movie_data = pd.read_csv(
    'movies.dat',
    sep='::',
    engine='python',
    header=None,
    names=['movie_id', 'title', 'genre'],
    encoding='latin-1',
)
movie_data.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [15]:
# Build the dense rating matrix: row = movie_id - 1, column = user_id - 1
# (ids are 1-based, hence the -1 offsets when writing the ratings).
# The matrix is allocated with np.zeros so every (movie, user) pair without a
# rating is exactly 0; np.ndarray(shape=...) only reserves memory and would leave
# those cells holding arbitrary values that then leak into the row means and the SVD.
ratings_mat = np.zeros(
    shape=(np.max(data.movie_id.values), np.max(data.user_id.values)),
    dtype=np.uint8,
)
ratings_mat[data.movie_id.values - 1, data.user_id.values - 1] = data.rating.values

In [16]:
# Inspect the movie ids of the trimmed ratings (used above as row positions, minus 1)
data.movie_id.values

array([2987, 2333, 1175, ...,  562, 1096, 1097], dtype=int64)

In [17]:
# Inspect the user ids of the trimmed ratings (used above as column positions, minus 1)
data.user_id.values

array([   5,    5,    5, ..., 6040, 6040, 6040], dtype=int64)

In [18]:
# Inspect the rating values that were written into the rating matrix
data.rating

254        4
255        4
256        5
257        3
258        2
          ..
1000204    1
1000205    5
1000206    5
1000207    4
1000208    4
Name: rating, Length: 528192, dtype: int64

In [19]:
# Centre the rating matrix row by row: subtract from every movie row the mean of
# that row taken over all user columns (cells without a rating take part in the mean).
# keepdims=True keeps the means as a column vector so they broadcast across the row.
normalised_mat = ratings_mat - ratings_mat.mean(axis=1, keepdims=True)
normalised_mat

array([[-0.79354305, -0.79354305, -0.79354305, ..., -0.79354305,
        -0.79354305,  2.20645695],
       [-0.27665563, -0.27665563, -0.27665563, ..., -0.27665563,
        -0.27665563, -0.27665563],
       [-0.17086093, -0.17086093, -0.17086093, ..., -0.17086093,
        -0.17086093, -0.17086093],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.14453642, -0.14453642, -0.14453642, ..., -0.14453642,
        -0.14453642, -0.14453642]])

In [20]:
# Decomposition using SVD, keeping k=50 latent dimensions to reduce the dimensionality of the problem.
# normalised_mat has one row per movie and one column per user, so A (its transpose,
# scaled by 1/sqrt(number of movie rows - 1)) has one row per user and one column per movie.
# np.linalg.svd returns (U, S, Vh): U holds the user-side factors, S the singular
# values, and the third value (named V here) holds the movie-side factors with one
# right singular vector per ROW. V.T[:, :k] therefore gives, for each movie (row),
# its coordinates on the first k right singular vectors; this movie-based matrix is
# what the cosine-similarity search uses.
# NOTE(review): only V is used in the cells shown here; with the default
# full_matrices=True a square U is also built. Consider full_matrices=False if U
# is not needed elsewhere - confirm before changing.
# Reference: numpy documentation and ChatGPT.
A = normalised_mat.T / np.sqrt(ratings_mat.shape[0] - 1)
U, S, V = np.linalg.svd(A)
k = 50
sliced = V.T[:, :k] 

In [23]:
#Cosine similarity between one movie's latent vector and every movie's latent vector, reference from chatgpt
def top_cosine_similarity(data, movie_id, top_n=5):
    """Return the row indexes of the ``top_n`` rows of ``data`` most similar to one movie.

    Similarity is the cosine of the angle between row ``movie_id - 1`` of
    ``data`` and every row of ``data``.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array with one latent vector per row.
    movie_id : int
        1-based id of the reference movie; its vector is row ``movie_id - 1``.
    top_n : int, optional
        Number of row indexes to return (default 5).

    Returns
    -------
    numpy.ndarray
        0-based row indexes ordered from most to least similar. Add 1 to turn
        them back into movie ids. Rows whose similarity is undefined (a
        zero-length vector on either side) are ranked last.

    Raises
    ------
    IndexError
        If ``movie_id`` is outside ``1 .. data.shape[0]``.
    """
    # Without this check movie_id <= 0 would silently wrap around to a row
    # counted from the end of the array.
    if not 1 <= movie_id <= data.shape[0]:
        raise IndexError(
            'movie_id {} is out of range 1..{}'.format(movie_id, data.shape[0])
        )
    index = movie_id - 1
    movie_row = data[index, :]
    # Row-wise Euclidean norms: sqrt of each row's dot product with itself.
    magnitude = np.sqrt(np.einsum('ij, ij -> i', data, data))
    denominator = magnitude[index] * magnitude
    # Divide only where the denominator is non-zero. A zero-length vector would
    # give 0/0 = NaN plus a RuntimeWarning; such rows keep -inf so they sort last.
    similarity = np.full(denominator.shape, -np.inf)
    np.divide(np.dot(movie_row, data.T), denominator, out=similarity, where=denominator > 0)
    # Stable sort keeps tied rows in ascending index order, so results are deterministic.
    sort_indexes = np.argsort(-similarity, kind='stable')
    return sort_indexes[:top_n]

In [78]:
# Precompute the recommendations: for every row of movie_data, in row order, store
# the row indexes of the movies most similar to that row's movie_id in the latent space.
vector = [
    top_cosine_similarity(sliced, current_movie_id)
    for current_movie_id in movie_data['movie_id']
]
vector

  similarity = np.dot(movie_row, data.T) / (magnitude[index] * magnitude)


[array([   0, 3113, 2354, 2686, 2320], dtype=int64),
 array([   1, 3488,   59,  316,  454], dtype=int64),
 array([   2, 3449,  369,  829,  585], dtype=int64),
 array([   3, 1721, 1911,    9, 1428], dtype=int64),
 array([   4,   18, 2952,  585,  584], dtype=int64),
 array([   5, 1911, 2277,  162,  473], dtype=int64),
 array([   6,  235,  338, 1887, 2670], dtype=int64),
 array([   7, 1634,  673,  453, 3260], dtype=int64),
 array([   8,   54,   10,  439, 2856], dtype=int64),
 array([   9, 1721, 3081, 3638, 3634], dtype=int64),
 array([  10,  439,  338, 2670,    6], dtype=int64),
 array([  11,  341, 1373, 2389, 2639], dtype=int64),
 array([  12, 1579, 1710,  470, 1072], dtype=int64),
 array([  13,  149, 2790,  952,   90], dtype=int64),
 array([  14, 2727,   78, 3268, 3467], dtype=int64),
 array([  15, 1342,  430,  453,  480], dtype=int64),
 array([  16,  837,  341,  264, 1679], dtype=int64),
 array([  17,   22, 2093, 3094,  953], dtype=int64),
 array([  18,  343,  783, 2334,    4], dtype=i

In [123]:
# Title-based lookup of the precomputed recommendations; per the original note,
# this is the function used by the streamlit code.
movies = movie_data
def print_similar_movies(movie):
    """Print and return the titles recommended for the movie titled ``movie``.

    Looks the title up in the module-level ``movies`` frame, reads that row's
    precomputed index array from the module-level ``vector`` list, prints a
    header line and the raw indexes, and returns the matching titles.

    Parameters
    ----------
    movie : str
        Exact title to look up in ``movies['title']``.

    Returns
    -------
    list
        Titles of the recommended movies, in the order stored in ``vector``.

    Raises
    ------
    IndexError
        If no row of ``movies`` has the given title.
    """
    # Filter once and reuse the result for both the id and the row label.
    matches = movies[movies['title'] == movie]
    if matches.empty:
        raise IndexError('No movie titled {!r} was found in the movie list'.format(movie))
    # First column of the first matching row (movie_id is the first column).
    movieid = matches.values[0][0]
    # The row label is used as a position in `vector`, which was built in row order.
    movieind = matches.index[0]
    print('Recommendations for {0}: \n'.format(movies[movies.movie_id == movieid].title.values[0]))
    recommend_id = vector[movieind]
    print(recommend_id)
    # Stored values are 0-based row indexes; movie ids are 1-based, hence i + 1.
    recommend_movie = [
        movies[movies.movie_id == i + 1].title.values[0] for i in recommend_id
    ]
    return recommend_movie

In [121]:
# Index-based variant: prints the titles for a given movie id and index array; not used in the streamlit code
def print_similar_movies1(movie_data, movie_id, top_indexes):
    """Print the reference movie's title, the recommended movie ids, and their titles.

    ``top_indexes`` holds 0-based indexes; each is shifted by one to obtain the
    movie id that is looked up in ``movie_data``. Nothing is returned.
    """
    def title_for(wanted_id):
        # Title of the first row whose movie_id equals wanted_id.
        return movie_data[movie_data.movie_id == wanted_id].title.values[0]

    print('Recommendations for {0}: \n'.format(title_for(movie_id)))
    recommended_ids = top_indexes + 1
    print(recommended_ids)
    for recommended_id in recommended_ids:
        print(title_for(recommended_id))

In [107]:
# Spot check for a single movie: take the first k=50 latent dimensions again
# (k and sliced repeat the assignments made in the SVD cell) and print the row
# indexes most similar to movie_id 1. Indexes are 0-based, i.e. movie_id - 1.
k = 50
movie_id = 1
sliced = V.T[:, :k] 
indexes = top_cosine_similarity(sliced, movie_id)
print(indexes)

[   0 3113 2354 2686 2320]


  similarity = np.dot(movie_row, data.T) / (magnitude[index] * magnitude)


In [124]:
# Title-based lookup: prints the header and the stored row indexes, then returns the recommended titles
print_similar_movies('Toy Story (1995)')

Recommendations for Toy Story (1995): 

[   0 3113 2354 2686 2320]


['Toy Story (1995)',
 'Toy Story 2 (1999)',
 "Bug's Life, A (1998)",
 'Tarzan (1999)',
 'Pleasantville (1998)']

In [122]:
# Same check through the index-based helper, using movie_id and indexes from the spot-check cell
print_similar_movies1(movie_data, movie_id, indexes)

Recommendations for Toy Story (1995): 

[   1 3114 2355 2687 2321]
Toy Story (1995)
Toy Story 2 (1999)
Bug's Life, A (1998)
Tarzan (1999)
Pleasantville (1998)


In [125]:
#import the pickle module, used to store the movie data and the generated array of movie indexes; reference: GeeksforGeeks documentation
import pickle

In [126]:
# Store the movie data in pickle form.
# The file handle is deliberately not named `f`: `f` is the ['count', 'mean']
# aggregation list used by the groupby cells, and rebinding it to a file object
# here would break those cells when they are re-run.
with open('movies_list.pkl', 'wb') as movies_file:
    pickle.dump(movie_data, movies_file)

In [127]:
# Store the precomputed recommendations (`vector`, the list of similar-movie index
# arrays built earlier) in pickle form
with open('similarity.pkl', 'wb') as g:
    pickle.dump(vector, g)