# Simple Recommender

This is a simple [movie recommendation model](https://www.datacamp.com/tutorial/recommender-systems-python) built to mimic the IMDB top 250. The data used in this notebook was downloaded from [here](https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset?resource=download).

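This recommender uses each movie's vote count and vote average, together with the mean vote average across all movies. Only movies whose vote count is at or above the 90th percentile (roughly the most-voted 10%) of `movies_metadata.csv` are considered.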

In [None]:
import pandas as pd

In [None]:
# Load the movies metadata table. low_memory=False makes pandas parse the file
# in one pass instead of in chunks, which avoids mixed-type column inference.
metadata = pd.read_csv('./data/movies_metadata.csv', low_memory=False)
# Report (rows, columns), then preview the first three rows
print(metadata.shape)
metadata.head(3)

In [None]:
# C: the average of the vote_average column over every movie (ratings run 0 to 10)
vote_averages = metadata['vote_average']
C = vote_averages.mean()
print(f'mean vote score: {C}')

In [None]:
# m: minimum number of votes a movie needs to be considered, taken as the
# 90th percentile of vote_count (so only the most-voted ~10% of movies qualify)
m = metadata['vote_count'].quantile(0.90)
print(f'# of votes required to be considered: {m}')

# Keep only the movies that reach the vote threshold. .copy() makes the result
# an independent DataFrame, so adding columns to it does not trigger
# SettingWithCopyWarning or depend on whether pandas returned a view.
filtered_movies = metadata[metadata['vote_count'] >= m].copy()
print(filtered_movies.shape)
filtered_movies.head(3)

In [None]:
def weighted_rating(X, m=m, C=C):
    """Return the IMDB-style weighted rating for one movie.

    X must expose 'vote_count' and 'vote_average' by key (for example a
    DataFrame row). m is the weight given to the prior rating C; both default
    to the values the globals m and C hold when this function is defined.
    """
    votes = X['vote_count']
    rating = X['vote_average']
    total = votes + m

    # Blend the movie's own rating with C: more votes shift the weight towards
    # the movie's rating, fewer votes shift it towards C
    return (votes / total * rating) + (m / total * C)

In [None]:
# Compute the weighted rating for every row and attach it as a 'score' column.
# assign() returns a new DataFrame instead of writing into the existing one,
# which avoids SettingWithCopyWarning when filtered_movies is a slice of
# another frame and keeps this cell safe to re-run.
filtered_movies = filtered_movies.assign(
    score=filtered_movies.apply(weighted_rating, axis=1)
)
filtered_movies['score']

In [None]:
# Rank the qualifying movies from highest to lowest weighted score
filtered_movies = filtered_movies.sort_values(by='score', ascending=False)

# Show the 20 best-scoring movies with the columns that explain their rank
filtered_movies.loc[:, ['title', 'vote_count', 'vote_average', 'score']].head(20)

# Content Based Recommender

This recommender uses the cosine similarity of movie overviews to determine which movies are similar.

In [None]:
# Start the content-based section from a freshly loaded copy of the metadata
metadata = pd.read_csv('./data/movies_metadata.csv', low_memory=False)

# Show the plot overviews of the first 5 movies
metadata['overview'].head(5)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Replace NaNs in the overview column with an empty string so the vectorizer
# only ever receives text
metadata['overview'] = metadata['overview'].fillna('')

# TF-IDF vectorizer that removes all English stop words
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and build the TF-IDF matrix in a single step
tfidf_matrix = tfidf.fit_transform(metadata['overview'])

# (number of movies, number of words in the vocabulary)
tfidf_matrix.shape

In [None]:
# Peek at ten consecutive terms of the learned vocabulary (positions 5000-5009)
tfidf.get_feature_names_out()[5000:5010]

In [None]:
# import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# compute the cosine similarity matrix
# linear_kernel returns plain dot products; these equal cosine similarities
# as long as the TF-IDF rows are L2-normalised (TfidfVectorizer's default norm).
# NOTE(review): with the default dense output the result has one row and one
# column per movie, so memory grows quadratically with the number of movies -
# confirm it fits in RAM for the full dataset.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# expected to be square: (number of movies, number of movies)
cosine_sim.shape

In [None]:
# Similarity of the movie in row 1 to every movie in the dataset
cosine_sim[1]

In [None]:
# Construct a reverse map from movie title to its row index in `metadata`.
indices = pd.Series(metadata.index, index=metadata['title'])

# Series.drop_duplicates() compares the *values* (the row indices, which are
# already unique), so it would never remove a repeated title. Deduplicate on
# the index instead, keeping the first occurrence, so that indices[title]
# always yields a single row index rather than a Series of several.
indices = indices[~indices.index.duplicated(keep='first')]

indices[:10]

In [None]:
# function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim):
    """Return the titles of the (up to) 10 movies most similar to `title`.

    Parameters
    ----------
    title : str
        Movie title to look up in the global `indices` map.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix indexed by row position in `metadata`.
        Defaults to the global `cosine_sim` as it exists when this function
        is defined.

    Returns
    -------
    pandas.Series
        Titles from `metadata`, ordered from most to least similar.

    Raises
    ------
    KeyError
        If `title` is not present in `indices`.
    """
    # get the index of the movie that matches the title
    idx = indices[title]

    # A title shared by several movies makes the lookup return a Series of
    # row indices; use the first match so the code below sees a scalar.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pair every movie's row position with its similarity to the query movie,
    # leaving out the query movie itself. Excluding it explicitly (rather than
    # dropping whatever sorts first) stays correct when the query does not
    # rank first, e.g. for an empty overview or an identical overview elsewhere.
    candidates = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # sort by similarity, highest first; sorted() is stable, so ties keep
    # their original row order
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    # row positions of the 10 most similar movies
    movie_indices = [position for position, _ in candidates[:10]]

    # return the top 10 most similar movies
    return metadata['title'].iloc[movie_indices]

In [None]:
# Example: movies whose overviews are most similar to 'The Dark Knight Rises'
get_recommendations('The Dark Knight Rises')

In [None]:
# Example: movies whose overviews are most similar to 'The Godfather'
get_recommendations('The Godfather')