# Movie Suggestion Algorithm

When I am looking for a new movie to watch, I can usually think of past movies that fit what I would like, but I need suggestions for similar ones. This algorithm clusters movies based on genre and tags and chooses movies from the same cluster as the movie suggested by the user. The cluster method constrains the suggestions to similar movies but still allows some randomness.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.cluster import KMeans
from sklearn.preprocessing import MultiLabelBinarizer

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# import data
# Three CSV tables are read: IMDb ratings, IMDb movies, and the MPST tag data.
movie_tags = pd.read_csv('/kaggle/input/mpst-movie-plot-synopses-with-tags/mpst_full_data.csv')
movie_data = pd.read_csv('/kaggle/input/imdb-extensive-dataset/IMDb movies.csv')
review_data = pd.read_csv('/kaggle/input/imdb-extensive-dataset/IMDb ratings.csv')

# Key the movie table by its IMDb title id (as a string) so rows can be
# looked up by id; the 'imdb_title_id' column itself is kept.
movie_data = movie_data.set_index(movie_data['imdb_title_id'].astype(str))

In [None]:
# Convert each movie's comma-separated tag string into a list of tags.
tag_append = [tag.split(', ') for tag in movie_tags.tags]

# One-hot encode the tags.
# The binarizer must be fitted on the *lists* of tags: fitting on the raw
# strings would iterate each string character by character, so the learned
# classes would be single characters instead of tags.
tag_mlb = MultiLabelBinarizer()
tag_labels = pd.DataFrame(
    tag_mlb.fit_transform(tag_append),
    index=movie_tags['imdb_id'].astype(str),
    # Name each indicator column after its tag so the frame has string
    # column labels throughout.
    columns=tag_mlb.classes_,
)

# Convert each movie's comma-separated genre string into a list of genres.
genre_append = [g.split(', ') for g in movie_data.genre]

# One-hot encode the genres, fitting on the genre lists themselves (not on
# the tag data) so the classes are the genres present in movie_data.
genre_mlb = MultiLabelBinarizer()
genre_labels = pd.DataFrame(
    genre_mlb.fit_transform(genre_append),
    index=movie_data['imdb_title_id'].astype(str),
    columns=genre_mlb.classes_,
)

In [None]:
# filter values
# User-tunable settings for the filter step. Only `user_review` and
# `movie_language` are read by the query below.
# NOTE(review): `genre`, `critic_review` and `filter_columns` are assigned
# here but not referenced anywhere in the visible notebook - confirm intent.
genre = "Comedy"
user_review = 6.5
critic_review = 6.5
movie_language = 'English'

filter_columns = ['language', 'reviews_from_users', 'reviews_from_critics', 'avg_vote']
value_columns = ['reviews_from_users', 'reviews_from_critics', 'avg_vote']

# Work on a fresh frame keyed by the string IMDb title id.
filtered_movie_data = movie_data.set_index(movie_data['imdb_title_id'].astype(str))

# Keep rows whose avg_vote exceeds the threshold and whose language matches
# exactly, then retain only the numeric columns used for clustering.
passing_rows = filtered_movie_data.query('avg_vote > @user_review & language == @movie_language')
filtered_movie_data = passing_rows[value_columns]

# Restrict to movie ids present in the filtered movies, the tag labels and
# the genre labels.
# NOTE(review): genre_labels only narrows the id set here; its columns are
# not part of the concatenated feature matrix below.
intersection = filtered_movie_data.index.intersection(tag_labels.index)
intersection = intersection.intersection(genre_labels.index)

# Feature matrix: numeric review columns side by side with the tag
# indicators; rows containing any missing value are dropped.
feature_frames = [filtered_movie_data.loc[intersection], tag_labels.loc[intersection]]
movie_cluster_data = pd.concat(feature_frames, axis=1).dropna()

In [None]:
#create movie clusters
# Preset sizes derived from the number of rows in the feature matrix.
# NOTE(review): neither `desired_suggestions` nor `desired_suggestions_input`
# is read by the visible code below - confirm how they are meant to be used.
n_cluster_rows = len(movie_cluster_data)
desired_suggestions = {
    "very similar": int(n_cluster_rows / 15),
    "somewhat similar": int(n_cluster_rows / 30),
    "keep me guessing": int(n_cluster_rows / 60),
}
desired_suggestions_input = 'very similar'

# Fit a 20-cluster KMeans model (fixed seed) and get, for every movie, its
# distance to each cluster centre.
kmeans = KMeans(20, random_state=0).fit(movie_cluster_data)
movie_cluster_dimensions = kmeans.transform(movie_cluster_data)

# A movie's cluster is the centre it is closest to (first one on ties).
top_cluster = list(np.argmin(movie_cluster_dimensions, axis=1))
movie_cluster = pd.DataFrame({'cluster': top_cluster, 'movie_id' : movie_cluster_data.index})

In [None]:
# choose movie
# Look up the cluster of the basis movie and print up to five randomly
# sampled titles from that same cluster.
movie_basis = 'tt1403865' # True Grit

# Cluster assignment(s) recorded for the basis movie. This is empty when the
# movie is not in movie_cluster at all.
basis_clusters = movie_cluster.loc[movie_cluster['movie_id'] == movie_basis, 'cluster']

if basis_clusters.empty:
    # Without a cluster for the basis movie there is nothing to suggest;
    # keep the result variables defined but empty.
    suggested_movie_cluster = None
    suggested_movie_ids = movie_cluster['movie_id'].iloc[0:0]
    suggested_movies = movie_data.iloc[0:0]
else:
    # Take the scalar explicitly; calling int() directly on a Series is
    # deprecated in pandas and fails on an empty selection.
    suggested_movie_cluster = int(basis_clusters.iloc[0])
    suggested_movie_ids = movie_cluster.loc[
        movie_cluster['cluster'] == suggested_movie_cluster, 'movie_id'
    ]
    suggested_movies = movie_data.loc[suggested_movie_ids]
    if len(suggested_movie_ids) > 5:
        # Random sample of five keeps suggestions varied between runs.
        suggested_movies = suggested_movies.sample(n=5)

if suggested_movies.empty:
    print("No similar movies found in the cluster")
else:
    print(list(suggested_movies['original_title']))