## Importing Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
movie = pd.read_csv("movies.csv")
rating = pd.read_csv("ratings.csv")

## Analysing the dataset

In [7]:
movie.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
movie = movie.iloc[:,0:-1]

In [None]:
movie.isnull().sum()

movieId    0
title      0
dtype: int64

In [None]:
movie.duplicated().sum()

0

In [None]:
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
rating.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [None]:
rating.duplicated().sum()

0

In [None]:
rating = rating.iloc[:,0:-1]

In [None]:
movie.shape

(9742, 2)

In [None]:
rating.shape

(100836, 3)

In [None]:
no_of_ratings_for_each_movie = rating.groupby('movieId').count()['userId'].reset_index()
no_of_ratings_for_each_movie.rename(columns={'userId':'No. of ratings'},inplace=True)
no_of_ratings_for_each_movie.head()

Unnamed: 0,movieId,No. of ratings
0,1,215
1,2,110
2,3,52
3,4,7
4,5,49


In [None]:
avg_rating_for_each_movie=rating.groupby('movieId').mean()['rating'].reset_index()
avg_rating_for_each_movie.rename(columns={'rating':'Avg. Rating'},inplace=True)
avg_rating_for_each_movie.head()

Unnamed: 0,movieId,Avg. Rating
0,1,3.92093
1,2,3.431818
2,3,3.259615
3,4,2.357143
4,5,3.071429


In [None]:
movie_info = no_of_ratings_for_each_movie.merge(avg_rating_for_each_movie,on='movieId')
movie_info.head()

Unnamed: 0,movieId,No. of ratings,Avg. Rating
0,1,215,3.92093
1,2,110,3.431818
2,3,52,3.259615
3,4,7,2.357143
4,5,49,3.071429


## Colaborative Model

In [76]:
pt = movie.merge(rating, on='movieId')
pt.head()

Unnamed: 0,movieId,title,userId,rating,timestamp
0,1,Toy Story (1995),1,4.0,964982703
1,1,Toy Story (1995),5,4.0,847434962
2,1,Toy Story (1995),7,4.5,1106635946
3,1,Toy Story (1995),15,2.5,1510577970
4,1,Toy Story (1995),17,4.5,1305696483


In [77]:
for_similar_movies = pt.pivot_table(index='title', columns='userId', values = 'rating')
for_similar_movies.fillna(0,inplace=True)
for_similar_movies.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [78]:
for_similar_movies.shape

(9719, 610)

In [79]:
from sklearn.metrics.pairwise import cosine_similarity
similarity_matrix = cosine_similarity(for_similar_movies)

In [80]:
def knn_recommend(movie_name , k=5):
  index = np.where(for_similar_movies.index == movie_name)[0][0]   # getting the index of the movie which i want to find similar books for
  similar_movies = similarity_matrix[index]                        # getting the similarity scores for that movie
  top_similar_indices_of_movies = np.argsort(similar_movies)[::-1][:k+1]  #gettig the top k similar movies, we take k+1 similar movies as the 1st movie (with max similarity) will be the movie itself
  top_similar_movies = for_similar_movies.index[top_similar_indices_of_movies]  #  # Getting the index of the top k similar movies from the 'for_similar_movies' table
  return top_similar_movies.tolist()

In [93]:
user_movie = input("Enter the name of the movie for which you want to find similar movies: ")
no_of_movies = int(input("Enter how many similar movies you want to watch: "))

Enter the name of the movie for which you want to find similar movies: 10 Things I Hate About You (1999)
Enter how many similar movies you want to watch: 5


In [97]:
recommend_movies = knn_recommend(user_movie,no_of_movies)
recommend_movies.pop(0)               # as the 1st movie is the movie itself so we dont need to include that in the similar movies. So we remove that movie
print("Movies similar to {} are: \n".format(user_movie))
for movie in recommend_movies:
  print(movie)

print('\nThese are the movies which users similar to you have watched.')

Movies similar to 10 Things I Hate About You (1999) are: 

Bring It On (2000)
She's All That (1999)
Wedding Singer, The (1998)
Never Been Kissed (1999)
Easy A (2010)

These are the movies which users similar to you have watched.


## Genre Based Recommendation

In [83]:
movie2 = pd.read_csv('movies.csv')

In [84]:
movie2.head(1)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [85]:
movie2['genres'] = movie2['genres'].str.replace('|',' ')      # to separate the different genres by a space instead of a '|'.
movie2.head(1)

  movie2['genres'] = movie2['genres'].str.replace('|',' ')      # to separate the different genres by a space instead of a '|'.


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy


In [86]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 30, stop_words = 'english')
vector = cv.fit_transform(movie2['genres']).toarray()
vector.shape

(9742, 23)

In [87]:
print(vector)

[[0 1 1 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [1 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [88]:
# To visualize what the feature extracted vector looks like any movie
movie_list = movie2['title'].tolist()  # As 'title' is the column containing movie names
for title, movie_vector in zip(movie_list[:5], vector[:5]):     # i am doing movie_list[:5] sp that only the 1st five movies and their features are printed
    print(f"Movie: {title}, Features: {movie_vector}")

Movie: Toy Story (1995), Features: [0 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Movie: Jumanji (1995), Features: [0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Movie: Grumpier Old Men (1995), Features: [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
Movie: Waiting to Exhale (1995), Features: [0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
Movie: Father of the Bride Part II (1995), Features: [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


This finds the similarity between the rows , i.e similarity between the different movies based on genre.

In [89]:
from sklearn.metrics.pairwise import cosine_similarity
similarity_between_eachMovies_genres = cosine_similarity(vector)
print(similarity_between_eachMovies_genres)

[[1.         0.77459667 0.31622777 ... 0.         0.31622777 0.4472136 ]
 [0.77459667 1.         0.         ... 0.         0.         0.        ]
 [0.31622777 0.         1.         ... 0.         0.         0.70710678]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.31622777 0.         0.         ... 0.         1.         0.        ]
 [0.4472136  0.         0.70710678 ... 0.         0.         1.        ]]


In [90]:
def recommend(movie, k):
    index = np.where(movie2['title'] == movie)[0][0]
    similar_movies = similarity_between_eachMovies_genres[index]
    # print(similar_movies.shape)        # this will print (9742,1)
    top_k_indices = np.argsort(similar_movies)[::-1][:k+1]
    top_movies = movie2['title'].iloc[top_k_indices]
    return top_movies.tolist()

# def recommend(movie,k):
#   index = np.where(movie2.index == movie)[0][0]
#   print(index)
#   similar_movies = similarity_between_eachMovies_genres[index]
#   top_k_indices = np.argsort(similar_movies)[::-1][:k]
#   top_movies = movie2.index[top_k_indices]

#   return top_movies.tolist()



Break down the expression np.argsort(similar_movies)[::-1][:k+1] step by step:

np.argsort(similar_movies): The function np.argsort() returns the indices that would sort the array similar_movies in ascending order. In this case, it will give the indices that sort similar_movies in increasing order of similarity scores.

[::-1]: The slice notation [::-1] is used to reverse the order of the array obtained from np.argsort(). This results in an array of indices sorted in descending order of similarity scores.

[:k+1]: Finally, we use slicing to select the first k+1 elements from the reversed array of indices. The k+1 elements are taken because we want to include the first k most similar movies along with the movie itself in the recommendations.

The code line top_movies = movie2['title'].iloc[top_k_indices] selects the movie titles from the 'title' column of the DataFrame movie2 corresponding to the indices stored in the top_k_indices array. As a result, top_movies will be a pandas Series containing the titles of the movies that are considered the most similar based on their genre similarity scores.

In [91]:
user_movie = input("Enter the movie for which you want to find similar movies: ")
no_of_movies = int(input("Enter how many similar movies do you want to know: "))

Enter the movie for which you want to find similar movies: 10 Things I Hate About You (1999)
Enter how many similar movies do you want to know: 5


In [98]:
movies_which_are_similar = recommend(user_movie, no_of_movies)
movies_which_are_similar.pop(0)
print("Movies with similar genre to {} are: \n".format(user_movie))
for movie in movies_which_are_similar:
  print(movie)

print('These are some movies which are similar to the movie which you input')

Movies with similar genre to 10 Things I Hate About You (1999) are: 

Down with Love (2003)
Playing It Cool (2014)
Bachelor, The (1999)
For Love or Money (1993)
Pursuit of Happiness (2001)
These are some movies which are similar to the movie which you input


## Hybrid Model

In [115]:
movie3 = pd.read_csv('movies.csv')
rating3 = pd.read_csv('ratings.csv')

In [116]:
movie3.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [117]:
rating3.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247


In [118]:
rating3=rating3.iloc[:,0:-1]

In [119]:
#For collaborative model

#Make a pivot table
pt = movie3.merge(rating3, on='movieId')
collab = pt.pivot_table(index = 'title', columns = 'userId', values = 'rating')
collab.fillna(0,inplace = True)

#find the similarity matrix between the different movies (based on the ratings given by each user)
from sklearn.metrics.pairwise import cosine_similarity
collab_sim = cosine_similarity(collab)

In [120]:
#For making a genre based similarity matrix: which will give that how similar movies are to each other based on their properties
movie3['genres'] = movie3['genres'].str.replace('|',' ')

from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer(max_features = 20, stop_words = 'english')
genre_vector = count.fit_transform(movie3['genres']).toarray()

  movie3['genres'] = movie3['genres'].str.replace('|',' ')


In [121]:
# movie3.head(1)
# print(genre_vector)

In [122]:
from sklearn.metrics.pairwise import cosine_similarity
genre_sim = cosine_similarity(genre_vector)

In [123]:
def hybrid_model(movie_name, k=5, weight_collab=0.5, weight_genre=0.5):

  total_weight = weight_collab + weight_genre
  weight_collab /= total_weight
  weight_genre /= total_weight

  index_collab = np.where(collab.index == movie_name)[0][0]
  scores_collab = collab_sim[index_collab]                      # this gives the similarity scores of the movie which user has input
  top_indices_collab = np.argsort(scores_collab)[::-1][:k+1]    # this gives the top indices i.e. indices for the k+1 most similar movies

  index_genre = np.where(movie3['title']==movie_name)[0][0]
  scores_genre = genre_sim[index_genre]
  top_indices_genre = np.argsort(scores_genre)[::-1][:k+1]

  # combined_scores = weight_collab*scores_collab[top_indices_collab] + weight_genre*scores_genre[top_indices_genre]   ->
  combined_scores = np.zeros_like(scores_collab)  # Initialize with zeros
  combined_scores[top_indices_collab] += weight_collab * scores_collab[top_indices_collab]
  combined_scores[top_indices_genre] += weight_genre * scores_genre[top_indices_genre]
  top_indices_hybrid = np.argsort(combined_scores)[::-1][:k+1]

  top_movies = movie3['title'].iloc[top_indices_hybrid]
  return top_movies.tolist()

 weight_collab*scores_collab[top_indices_collab] : this is the multiplying the weight you want of the collaborative model and the similarity score of the top k+1 movies according to collaborative model. BUt there is something wrong in this. It assumes that top_indices_collab and top_indices_genre are aligned, i.e., they refer to the same set of movies in the same order. This might not always be the case. So this needs to get adjusted to ensure that you're combining scores for the same movies.

The code uses the + operator to add the two sets of weighted scores together. This operation is performed element-wise for each corresponding pair of scores from the two models. The result, combined_scores, is an array of scores where each score is a combination of the collaborative and genre-based similarity scores for each movie.

In [124]:
user_input = input("Enter movie name: ")
k = int(input('Enter no. of suggestions wanted: '))

Enter movie name: 10 Things I Hate About You (1999)
Enter no. of suggestions wanted: 5


In [126]:
top_movies = hybrid_model(user_input,k)
top_movies.pop(0)
print('Movies similar to {} are: \n'.format(user_input))
for x in top_movies:
  print(x)

Movies similar to 10 Things I Hate About You (1999) are: 

Fraternity Vacation (1985)
Seems Like Old Times (1980)
Mr. Deeds Goes to Town (1936)
Impromptu (1991)
What Happens in Vegas... (2008)
