In [None]:
import pandas as pd
import numpy as np

## Get the data

In [None]:
# Load the movie metadata table from the CSV file (path is relative to the working directory).
metadata = pd.read_csv('movies_metadata.csv')

In [None]:
metadata.head()

In [None]:
metadata.columns

## Clean the data
For now, I am focusing only on the `genres`, `title`, `overview` and `tagline` columns.

In [None]:
# Keep only the first row seen for each `id`, so every movie id appears once.
metadata = metadata.drop_duplicates(subset=['id'])

In [None]:
# Work on an independent copy of the four text columns. Later cells assign new
# columns into `movie_dt`; without `.copy()` those writes target a slice of
# `metadata` and trigger pandas' SettingWithCopyWarning.
movie_dt = metadata[['genres', 'title', 'overview', 'tagline']].copy()

In [None]:
movie_dt.head()

In [None]:
movie_dt.isna().sum()

In [None]:
movie_dt.shape

Clean the `genres` column, keeping a list of genre names for each movie.

In [None]:
movie_dt.genres[0]

In [None]:
from ast import literal_eval


def _genre_names(raw):
    """Turn one raw `genres` cell into a list of genre names.

    Strings are parsed with `literal_eval`; every dict in the resulting list
    contributes its 'name' value. Anything that is not a list after parsing
    (for example a missing value) yields an empty list. Entries that are not
    dicts are kept unchanged, so re-running this cell on an already cleaned
    column is a no-op instead of a `literal_eval` failure.

    Raises whatever `literal_eval` raises for a malformed string, and
    `KeyError` for a dict entry without a 'name' key.
    """
    if isinstance(raw, str):
        raw = literal_eval(raw)
    if not isinstance(raw, list):
        return []
    return [entry['name'] if isinstance(entry, dict) else entry for entry in raw]


# `assign` builds a new frame rather than writing through attribute access
# into a frame derived from `metadata`.
movie_dt = movie_dt.assign(genres=movie_dt['genres'].apply(_genre_names))

In [None]:
movie_dt.genres[0]

In [None]:
movie_dt.head()

## Create feature vectors
Combine `tagline` and `overview` into a single column named `content`. <br/> Convert the `genres` column from a list to a single space-separated string. <br/> Then apply TF-IDF to the `content` column and CountVectorizer to the `genres` column.

In [None]:
# Missing taglines/overviews become empty strings so concatenation never yields NaN.
tagline_text = movie_dt['tagline'].fillna('')
overview_text = movie_dt['overview'].fillna('')

# Join with a space: plain `tagline + overview` fuses the last word of the
# tagline with the first word of the overview into one bogus token.
# `.str.strip()` removes the stray space when either side is empty.
movie_dt = movie_dt.assign(
    tagline=tagline_text,
    overview=overview_text,
    content=(tagline_text + ' ' + overview_text).str.strip(),
)

In [None]:
movie_dt.head()

In [None]:
# Build the modelling frame in one chain so nothing is assigned into (or
# dropped in place from) a slice of `movie_dt`:
#   1. keep the three columns the recommender uses,
#   2. flatten each list of genre names into one space-separated string,
#   3. drop rows with a missing value in any remaining column.
final_df = (
    movie_dt[['genres', 'title', 'content']]
    .assign(genres=lambda frame: frame['genres'].apply(' '.join))
    .dropna()
)
final_df.head()

In [None]:
# Show every row whose title occurs more than once, grouped by title.
# `duplicated(keep=False)` flags all members of each duplicate set in one
# vectorised pass and returns an empty frame when there are none, whereas
# concatenating per-group frames raises ValueError on an empty selection.
# The stable sort keeps the original row order inside each title.
final_df[final_df.duplicated(subset='title', keep=False)].sort_values('title', kind='stable')

There are some duplicate titles. For the sake of this task, I am assuming that rows sharing a title refer to the same movie with different descriptions, so I keep only the first row for each title.

In [None]:
# Keep the first row for each title and discard later repeats.
final_df = final_df.loc[~final_df.duplicated(subset=['title'])]
final_df.head()

## Create the recommendation system
Give equal (50/50) weight to the genre tags and the content.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

In [None]:
# TF-IDF over unigrams and bigrams of the combined tagline/overview text,
# ignoring English stop words.
# `min_df=1` keeps every term, which is what `min_df=0` was meant to do; an
# integer 0 is rejected by scikit-learn's parameter validation (integer
# values must be >= 1).
tf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    min_df=1,
    stop_words='english',
)
tfidf_content = tf.fit_transform(final_df['content'])

In [None]:
# Pairwise dot products between all TF-IDF rows (Y defaults to X when omitted).
# TfidfVectorizer L2-normalises rows by default, so these are cosine similarities.
cosine_sim_content = linear_kernel(tfidf_content)

In [None]:
# Persist the content similarity matrix so it can be reloaded without recomputing it.
np.save('content.npy', cosine_sim_content)

In [None]:
# Bag-of-words counts (unigrams and bigrams) over the space-joined genre names.
# `min_df=1` keeps every term, which is what `min_df=0` was meant to do; an
# integer 0 is rejected by scikit-learn's parameter validation (integer
# values must be >= 1).
count_vectorizer = CountVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    min_df=1,
    stop_words='english',
)
count_genre = count_vectorizer.fit_transform(final_df['genres'])

In [None]:
# Pairwise cosine similarity between all genre count vectors (Y defaults to X when omitted).
cosine_sim_genre = cosine_similarity(count_genre)

In [None]:
# Persist the genre similarity matrix so it can be reloaded without recomputing it.
np.save('genre.npy', cosine_sim_genre)

In [None]:
# Switch to a 0..n-1 index so a row's label equals its position in the
# similarity matrices; the previous labels are kept in a new 'index' column.
# NOTE: not re-runnable as written - a second reset_index() would try to
# insert another 'index' column and fail.
final_df = final_df.reset_index()
titles = final_df.loc[:, 'title']
# Lookup table: movie title -> row position.
indices = pd.Series(data=final_df.index, index=final_df.loc[:, 'title'])

In [None]:
# Pickle the title -> row-position lookup so it can be reloaded alongside the saved matrices.
indices.to_pickle('movie_titles.pkl')

In [None]:
# Reload the persisted artifacts. `get_recommendations` reads `content` and
# `genre`, so they must actually be defined here; with the loads commented out
# the function raises NameError on a fresh kernel.
# mmap_mode='r' memory-maps the .npy files read-only, so rows are read from
# disk on demand instead of loading both full square matrices into memory.
content = np.load('content.npy', mmap_mode='r')
genre = np.load('genre.npy', mmap_mode='r')
movie_titles = pd.read_pickle('movie_titles.pkl')

In [None]:
def get_recommendations(title, top_n=20, content_weight=0.5):
    """Return the titles of the movies most similar to ``title``.

    Each candidate's score is a weighted average of its content similarity
    and its genre similarity to the query movie. The original implementation
    concatenated the two score lists, which ranked every movie twice and let
    the query movie appear in its own recommendations.

    Relies on the module-level names ``movie_titles`` (title -> row position),
    ``content`` and ``genre`` (square similarity matrices indexed by row
    position) and ``titles`` (row position -> title).

    Parameters
    ----------
    title : str
        Title of the movie to find recommendations for.
    top_n : int, default 20
        Maximum number of recommendations to return.
    content_weight : float, default 0.5
        Weight given to content similarity; genre similarity receives
        ``1 - content_weight``. The default is the 50/50 blend.

    Returns
    -------
    pandas.Series
        Recommended titles, best match first.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``movie_titles``.
    """
    idx = movie_titles[title]

    # One blended score per movie.
    blended = (
        content_weight * np.asarray(content[idx], dtype=float)
        + (1.0 - content_weight) * np.asarray(genre[idx], dtype=float)
    )

    # Best first; the stable sort keeps tied movies in row order.
    ranked = np.argsort(-blended, kind='stable')

    # Remove the query movie explicitly instead of assuming it sorts first.
    ranked = ranked[ranked != idx][:top_n]

    return titles.iloc[ranked]

In [None]:
get_recommendations('Interstellar')

In [None]:
get_recommendations('The Dark Knight')

## convert