In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

**Demographic Filtering**

In [None]:
# Read the two TMDB CSV files: the movie metadata and the matching credits.
movies_df = pd.read_csv('../input/tmdb-movie-metadata/tmdb_5000_movies.csv')
credits_df = pd.read_csv('../input/tmdb-movie-metadata/tmdb_5000_credits.csv')

# Relabel the four credits columns by position so that the join key is
# called 'id' and the credits title gets its own name ('title_x').
credits_df.columns = ['id', 'title_x', 'cast', 'crew']

# Join the credits onto the metadata using the shared 'id' column.
movies_df = pd.merge(movies_df, credits_df, on='id')

In [None]:
# Show the first merged row to check which columns the join produced.
movies_df.head(1)

Weighted_Rating(Formula) = (v / (v + m)) * R + (m / (v + m)) * C 

v - number of votes for the movie
m - minimum number of votes required for a movie to be eligible for listing
R - average rating of the movie
C - average of the ratings of all the movies

We already have the number of votes for each movie (v) and its average rating (R). Next we calculate C and m.

In [None]:
# C: mean of the vote_average column over every movie in the merged frame.
C = movies_df['vote_average'].mean()
# m: the 0.9 quantile of vote_count, used as the minimum-votes cutoff.
m = movies_df['vote_count'].quantile(0.9)
# Display both values as the cell output.
C, m

In [None]:
# Keep only the movies whose vote count reaches the cutoff m.
qualified_movies = movies_df.loc[movies_df['vote_count'].ge(m)]
qualified_movies.shape

In [None]:
def weighted_rating(movie, C=C, m=m):
    """Return the weighted rating ``(v / (v + m)) * R + (m / (v + m)) * C``.

    Parameters
    ----------
    movie : mapping-like
        Anything indexable by ``'vote_count'`` (v) and ``'vote_average'`` (R).
    C : float
        Mean rating across all movies; defaults to the notebook-level ``C``
        as it was when this function was defined.
    m : float
        Minimum-votes cutoff; defaults to the notebook-level ``m`` as it was
        when this function was defined.
    """
    votes = movie['vote_count']
    rating = movie['vote_average']
    # Both terms share the same denominator, so compute it once.
    total = votes + m
    return (votes / total * rating) + (m / total * C)

In [None]:
# Score all qualified movies in one vectorised pass. `assign` calls
# weighted_rating with the whole frame, so the formula runs on entire
# columns instead of once per row via apply(axis=1). The result is then
# ordered from best to worst score.
qualified_movies = (
    qualified_movies
    .assign(score=weighted_rating)
    .sort_values('score', ascending=False)
)
qualified_movies.shape

In [None]:
# Show the first 15 rows of qualified_movies with the columns that feed the score.
qualified_movies[['title', 'vote_count', 'vote_average', 'score']].head(15)

In [None]:
# Rank every movie by the 'popularity' column, highest first.
popular_movies = movies_df.sort_values('popularity', ascending=False)
# Slice the six most popular rows once and reuse them for both axes.
top_popular = popular_movies.head(6)

# Explicit figure/axes objects instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(12, 4))
ax.barh(top_popular['title'], top_popular['popularity'], align='center',
        color='skyblue')
# Flip the y axis so the most popular movie is drawn at the top.
ax.invert_yaxis()
ax.set_xlabel("Popularity")
# Trailing semicolon keeps the Text(...) repr out of the cell output.
ax.set_title("Popular Movies");

**Content Based Filtering**

In [None]:
# Vectoriser that ignores English stop words when building the vocabulary.
tfidf_generator = TfidfVectorizer(stop_words='english')

# Missing overviews become empty strings before they are vectorised.
movies_df['overview'] = movies_df['overview'].fillna('')

# Fit on the overviews and transform them into the TF-IDF matrix in one step.
tfidf_matrix = tfidf_generator.fit_transform(movies_df['overview'])

# Shape of the resulting matrix.
tfidf_matrix.shape

In [None]:
# Pairwise linear-kernel similarity between every pair of overview vectors.
content_similarity = linear_kernel(tfidf_matrix, tfidf_matrix)

# Reverse lookup: movie title -> row label in movies_df.
movies_indexed_by_title = pd.Series(movies_df.index, index=movies_df['title'])
# Series.drop_duplicates() compares the *values* (the row labels), so it never
# removed repeated titles and a lookup for such a title returned several rows.
# Deduplicate on the index instead, keeping the first movie for each title.
movies_indexed_by_title = movies_indexed_by_title[
    ~movies_indexed_by_title.index.duplicated(keep='first')
]
# Preview only; the full Series has one entry per distinct title.
movies_indexed_by_title.head()

In [None]:
def get_content_recommendations(movie_title, similarity, top_n=10):
    """Return the titles of the movies most similar to ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Title to look up in ``movies_indexed_by_title``.
    similarity : 2-D array
        Pairwise similarity matrix whose row/column ``i`` corresponds to
        row ``i`` of ``movies_df``.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles from ``movies_df`` ordered from most to least similar.

    Raises
    ------
    KeyError
        If ``movie_title`` is not present in ``movies_indexed_by_title``.
    """
    movie_index = movies_indexed_by_title[movie_title]
    # A title shared by several movies yields a Series of row labels;
    # use the first match so the similarity lookup selects a single row.
    if isinstance(movie_index, pd.Series):
        movie_index = movie_index.iloc[0]

    scores = np.asarray(similarity[movie_index])
    # Stable descending sort: ties keep their dataset order.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the query movie explicitly rather than assuming it ranks first;
    # with tied or all-zero scores it may appear anywhere in the ranking.
    ranked = ranked[ranked != movie_index][:top_n]

    return movies_df['title'].iloc[ranked]


In [None]:
# Example: recommendations ranked by the overview-based similarity matrix.
get_content_recommendations('The Dark Knight Rises', content_similarity)
# get_content_recommendations('The Godfather', content_similarity)

In [None]:
# Peek at the first 'genres' cell before parsing (presumably still the raw
# string read from the CSV — confirm by checking its type).
movies_df['genres'][0]



In [None]:
# Convert the stringified columns into Python objects.
# `literal_eval` is already imported in the notebook's first cell.
# Only string cells are parsed, so re-running this cell after the columns
# have been converted (or hitting a non-string cell) is a no-op instead of
# an exception raised by literal_eval.
features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    movies_df[feature] = movies_df[feature].apply(
        lambda value: literal_eval(value) if isinstance(value, str) else value
    )

In [None]:
def get_director(x):
    """Return the name of the first crew member whose job is ``'Director'``.

    Parameters
    ----------
    x : list of dict
        Crew entries; each is expected to carry ``'job'`` and ``'name'`` keys.

    Returns
    -------
    str or float
        The director's name, or ``np.nan`` when no entry has the job
        ``'Director'`` or when ``x`` is not a list (mirrors how ``get_list``
        treats missing data instead of raising).
    """
    if not isinstance(x, list):
        return np.nan
    # .get() tolerates crew entries that lack a 'job' key.
    return next(
        (member['name'] for member in x if member.get('job') == 'Director'),
        np.nan,
    )

def get_list(x):
    """Return at most the first three ``'name'`` values from a list of dicts.

    Used for actors, keywords and genres. Anything that is not a list
    (for example a missing value) yields an empty list.
    """
    if not isinstance(x, list):
        return []
    # Collect every name first, then keep the leading three.
    return [entry['name'] for entry in x][:3]

def clean_data(x):
    """Remove spaces from ``x`` and lower-case it.

    A string is returned cleaned; a list is returned with each item
    cleaned; any other value becomes an empty string.
    """
    if isinstance(x, str):
        return str.lower(x.replace(" ", ""))
    if isinstance(x, list):
        return [str.lower(item.replace(" ", "")) for item in x]
    return ''


In [None]:
# Reduce cast, keywords and genres to short lists of names.
features = ['cast', 'keywords', 'genres']
for feature in features:
    movies_df[feature] = movies_df[feature].apply(get_list)

# The director comes from the crew column, which the loop above does not touch.
movies_df['director'] = movies_df['crew'].apply(get_director)

In [None]:
# Inspect the extracted director and the trimmed cast / keywords / genres lists.
movies_df[['title','director','cast','keywords','genres']].head(3)

In [None]:
# Columns that feed the metadata text; each one is passed through clean_data,
# which lower-cases values and strips their spaces.
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    movies_df[feature] = movies_df[feature].apply(clean_data)
    
# Same preview as before, now showing the cleaned values.
movies_df[['title','director','cast','keywords','genres']].head(3)

In [None]:
def merge_movie_data(x):
    """Build one space-separated string from a movie's metadata.

    ``x`` must be indexable by ``'keywords'``, ``'cast'`` and ``'genres'``
    (iterables of strings) and by ``'director'`` (a string). The parts are
    joined in the order keywords, cast, director, genres.
    """
    pieces = (
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    )
    return ' '.join(pieces)

In [None]:
# Build the per-movie metadata string row by row.
# Assign with a scalar column key: a list key (`[['merged_data']]`) asks pandas
# to match the right-hand side against a list of columns, which is not what a
# single Series of strings is.
movies_df['merged_data'] = movies_df.apply(merge_movie_data, axis=1)
# Show the merged string of the first movie.
movies_df['merged_data'].iloc[0]

In [None]:
# Token counts over each movie's merged metadata string, ignoring English
# stop words.
cv_generator = CountVectorizer(stop_words='english')
cv_matrix = cv_generator.fit_transform(movies_df['merged_data'])
# Pairwise cosine similarity between every pair of rows of cv_matrix.
metadata_similarity = cosine_similarity(cv_matrix, cv_matrix)


In [None]:
# Example: overview-based recommendations for another title.
get_content_recommendations('The Avengers', content_similarity)

In [None]:
# Example: recommendations ranked by the metadata-based similarity matrix
# (keywords, cast, director, genres) instead of the overview text.
get_content_recommendations('The Dark Knight Rises', metadata_similarity)
# get_content_recommendations('The Godfather', metadata_similarity)
