## Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Load Dataset

In [2]:
# Load the prepared movie table from a pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load a file you produced or trust.
movies = pd.read_pickle('Final_MovieData.pkl')

In [4]:
# Preview the loaded table (a long frame is rendered truncated, with the middle rows elided).
movies

Unnamed: 0,tconst,title,year,genres,actor_1,actor_2,actor_3,director,rating,combined
0,tt0035423,Kate & Leopold,2001,"Comedy,Fantasy,Romance",Meg Ryan,Hugh Jackman,Liev Schreiber,James Mangold,6.4,comedy fantasy romance megryan hughjackman lie...
1,tt0064730,Japan Organized Crime Boss,2000,"Action,Crime",Kôji Tsuruta,Tomisaburô Wakayama,Bunta Sugawara,Kinji Fukasaku,7.0,action crime kôjitsuruta tomisaburôwakayama bu...
2,tt0069049,The Other Side of the Wind,2018,Drama,John Huston,Oja Kodar,Peter Bogdanovich,Orson Welles,6.8,drama johnhuston ojakodar peterbogdanovich ors...
3,tt0081145,Me and the Kid,1993,"Comedy,Crime,Drama",Danny Aiello,Alex Zuckerman,Joe Pantoliano,Dan Curtis,5.4,comedy crime drama dannyaiello alexzuckerman j...
4,tt0094900,Committed,1991,"Drama,Thriller",Jennifer O'Neill,Robert Forster,William Windom,William A. Levey,5.1,drama thriller jennifero'neill robertforster w...
...,...,...,...,...,...,...,...,...,...,...
45594,tt9904670,Mountain Quest,2018,Documentary,Willem Dafoe,,,Jennifer Peedom,9.1,documentary willemdafoe jenniferpeedom
45595,tt9904802,Enemy Lines,2020,War,Ed Westwick,John Hannah,Tom Wisdom,Anders Banke,4.9,war edwestwick johnhannah tomwisdom andersbanke
45596,tt9904820,American Terror Story,2019,Horror,Rim Basma,Laurie Cummings,Winnie Du,Terry Spears,3.0,horror rimbasma lauriecummings winniedu terrys...
45597,tt9906644,Manoharam,2019,"Comedy,Drama",Vineeth Sreenivasan,Aparna Das,Basil Joseph,Anvar Sadik,6.8,comedy drama vineethsreenivasan aparnadas basi...


### Final data is ready
## Applying algorithms

In [5]:
# Instantiate the vectorizer and build the token-count matrix for the 'combined' column,
# then compute the pairwise cosine similarity between all movies.
# NOTE(review): cosine_similarity returns a dense n x n array by default; for the ~45.6k
# rows previewed above that is on the order of 8 GB if float32 is preserved -- confirm
# the machine has enough RAM before running this cell.

cv = CountVectorizer(dtype=np.float32)
movie_matrix = cv.fit_transform(movies['combined'])
cos_similarity = cosine_similarity(movie_matrix)

In [6]:
# Save the cosine similarity matrix to disk in NumPy's .npy format,
# so it can be reloaded without recomputing it.

np.save('cosine_matrix.npy', cos_similarity)

In [7]:
# Load the cosine similarity matrix back from the .npy file.
# NOTE(review): if the cells above were run in this session, `cos_similarity` is still
# bound, so two copies of the matrix are held in memory -- consider `del cos_similarity`.

cos_sim = np.load('cosine_matrix.npy')

The following function takes the index of the movie given by the user,
pairs every movie index with its cosine similarity score to that movie,
sorts these pairs by cosine similarity score in descending order,
skips the top entry (normally the given movie itself) and keeps the next 50 most similar movies,
and returns them as a list of (index, cosine similarity score) tuples.

In [8]:
def get_cosine_scores(ind, top_n=50):
    """Return the movies most similar to the movie at row ``ind``.

    Reads row ``ind`` of the module-level ``cos_sim`` matrix, ranks all other
    rows by cosine similarity (highest first) and keeps the best ``top_n``.

    Parameters
    ----------
    ind : int
        Row position of the query movie in ``cos_sim``.
    top_n : int, default 50
        Maximum number of similar movies to return.

    Returns
    -------
    list of (int, float)
        ``(row position, cosine score)`` tuples, most similar first.
        Movies with equal scores keep ascending row order.
    """
    row = np.asarray(cos_sim[ind])
    # A stable sort on the negated scores gives descending order while leaving
    # tied scores in ascending row order.
    ranked = np.argsort(-row, kind='stable')
    # Exclude the query movie by position instead of assuming it sorts first:
    # another movie with an identical score (e.g. the same feature string) and
    # a lower row number would otherwise take its place, so the query movie
    # would be returned as its own recommendation.
    ranked = ranked[ranked != ind][:top_n]
    return [(int(i), row[i]) for i in ranked]

The following function takes the list of (index, cosine score) tuples returned by the previous function.
For every index it looks up the movie's rating and computes a weighted average with a total weight of 3: a weight of 2 for the cosine score and a weight of 1 for the rating (divided by 10, which puts a 0–10 rating on a 0–1 scale), i.e. $\text{average} = \dfrac{2 \cdot \text{score} + \text{rating}/10}{3}$.
Each (index, average) pair is appended to a list, the list is sorted by average in descending order, and the top 10 records are kept.
Finally, only the indexes are extracted from those records and returned.

In [9]:
def weight_average_cosineScore_rating(cos_scores, top_n=10, score_weight=2, rating_weight=1):
    """Re-rank similar movies by a weighted mix of cosine score and rating.

    For every ``(row position, cosine score)`` pair the rating is read from
    the module-level ``movies`` frame and combined as::

        (score_weight * score + rating_weight * rating / 10) / (score_weight + rating_weight)

    The rating is divided by 10 to bring it onto the same scale as the cosine
    score (presumably ratings are on a 0-10 scale -- confirm against the data).

    Parameters
    ----------
    cos_scores : iterable of (int, float)
        Pairs as returned by ``get_cosine_scores``.
    top_n : int, default 10
        Number of best-ranked row positions to return.
    score_weight, rating_weight : number, default 2 and 1
        Relative weights of the cosine score and of the scaled rating.

    Returns
    -------
    list of int
        Row positions of the ``top_n`` movies, best first.

    Raises
    ------
    ValueError
        If the two weights do not add up to a positive number.
    """
    total_weight = score_weight + rating_weight
    if total_weight <= 0:
        raise ValueError('score_weight + rating_weight must be positive')

    # The incoming indexes are row positions of the similarity matrix, so the
    # ratings are addressed by position rather than by index label.
    ratings = movies['rating'].to_numpy()

    averages = []
    for i, score in cos_scores:
        rating = ratings[i]
        if pd.isna(rating):
            # A missing rating would turn the average into NaN, and NaN sort
            # keys make the ordering unreliable; count it as no rating bonus.
            rating = 0.0
        average = (score_weight * score + rating_weight * (rating / 10)) / total_weight
        averages.append((i, average))

    averages.sort(key=lambda item: item[1], reverse=True)
    return [i for i, _ in averages[:top_n]]
    

The following function takes the list of indexes returned by the previous function.
It iterates through the indexes, builds a string with each movie's title and release year,
appends these strings to a list,
and returns that list of movie titles with years.

In [10]:
def get_movie_names(ind):
    """Turn row positions into display strings of the form ``'Title (year)'``.

    Parameters
    ----------
    ind : iterable of int
        Row positions in the module-level ``movies`` frame, in the order the
        titles should be returned.

    Returns
    -------
    list of str
        One ``'Title (year)'`` string per position, in the same order.
    """
    # The positions come from the similarity matrix, so rows are addressed by
    # position (.iloc) rather than by index label.
    title_col = movies['title']
    year_col = movies['year']
    return [f"{title_col.iloc[i]} ({year_col.iloc[i]})" for i in ind]
    

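The following function is the MAIN function: it takes the title given by the user and looks up its index,
then gets the cosine scores for that index,
then ranks those candidates by the weighted average of cosine score and rating,
then builds the list of movie titles and years,
and returns that list to the user.
If the title is not in the dataset, it returns a message saying so instead.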

In [11]:
# Row position of every movie, keyed by title. Positions (not index labels)
# are stored because they are used to address rows of the similarity matrix.
indexes = pd.Series(np.arange(len(movies)), index=movies['title'])


def recommendations(title):
    """Recommend movies similar to ``title``.

    Looks the title up, fetches its most similar movies, re-ranks them by the
    weighted average of cosine score and rating, and formats the result.

    Parameters
    ----------
    title : str
        Movie title exactly as it appears in the ``title`` column.

    Returns
    -------
    list of str or str
        A list of ``'Title (year)'`` strings, or an explanatory message
        string when the title is not in the dataset.
    """
    # Membership test on the index avoids recomputing the unique titles on
    # every call.
    if title not in indexes.index:
        return ('This movie is not in our database. Please confirm if you spelled it correctly.')

    index = indexes.loc[title]
    if isinstance(index, pd.Series):
        # Several movies share this title, so the lookup yields a Series
        # instead of a scalar; use the first matching row.
        index = index.iloc[0]

    records = get_cosine_scores(int(index))
    records = weight_average_cosineScore_rating(records)
    records = get_movie_names(records)
    return records
    

In [12]:
# Example query for a title that is present in the dataset.
recommendations("Iron Man")

['Iron Man 2 (2010)',
 'Iron Man 3 (2013)',
 'Spider-Man: Homecoming (2017)',
 'Avengers: Age of Ultron (2015)',
 'Jurassic Park (1993)',
 'Serenity (2005)',
 'The Fifth Element (1997)',
 'X-Men (2000)',
 'X2: X-Men United (2003)',
 'Spider-Man (2002)']