# Description: Build a movie recommendation engine
Reference: https://medium.com/analytics-vidhya/build-a-movie-recommendation-engine-using-python-scikit-learn-machine-learning-e68ba297e163

In [0]:
# Import the libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the data
# Uploading again when the CSV is already present makes Colab save a renamed
# copy (e.g. "tmdb_movies_data (1).csv"), and the interactive prompt blocks
# "Restart & Run All", so only ask for an upload when the file is missing.
import os

if os.path.exists('tmdb_movies_data.csv'):
  uploaded = {}  # nothing to upload; keep the name defined for later cells
else:
  from google.colab import files
  uploaded = files.upload()

Saving tmdb_movies_data.csv to tmdb_movies_data (1).csv


In [0]:
# Read tmdb_movies_data.csv from the working directory into a DataFrame
df = pd.read_csv('tmdb_movies_data.csv')

In [4]:
# Print the first 6 rows of data to see which columns are available
df.head(6)

Unnamed: 0,id,imdb_id,popularity,budget,revenue,original_title,cast,homepage,director,tagline,keywords,overview,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year,budget_adj,revenue_adj
0,135397,tt0369610,32.985763,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,http://www.jurassicworld.com/,Colin Trevorrow,The park is open.,monster|dna|tyrannosaurus rex|velociraptor|island,Twenty-two years after the events of Jurassic ...,124,Action|Adventure|Science Fiction|Thriller,Universal Studios|Amblin Entertainment|Legenda...,6/9/2015,5562,6.5,2015,137999939.3,1392446000.0
1,76341,tt1392190,28.419936,150000000,378436354,Mad Max: Fury Road,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,http://www.madmaxmovie.com/,George Miller,What a Lovely Day.,future|chase|post-apocalyptic|dystopia|australia,An apocalyptic story set in the furthest reach...,120,Action|Adventure|Science Fiction|Thriller,Village Roadshow Pictures|Kennedy Miller Produ...,5/13/2015,6185,7.1,2015,137999939.3,348161300.0
2,262500,tt2908446,13.112507,110000000,295238201,Insurgent,Shailene Woodley|Theo James|Kate Winslet|Ansel...,http://www.thedivergentseries.movie/#insurgent,Robert Schwentke,One Choice Can Destroy You,based on novel|revolution|dystopia|sequel|dyst...,Beatrice Prior must confront her inner demons ...,119,Adventure|Science Fiction|Thriller,Summit Entertainment|Mandeville Films|Red Wago...,3/18/2015,2480,6.3,2015,101199955.5,271619000.0
3,140607,tt2488496,11.173104,200000000,2068178225,Star Wars: The Force Awakens,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,http://www.starwars.com/films/star-wars-episod...,J.J. Abrams,Every generation has a story.,android|spaceship|jedi|space opera|3d,Thirty years after defeating the Galactic Empi...,136,Action|Adventure|Science Fiction|Fantasy,Lucasfilm|Truenorth Productions|Bad Robot,12/15/2015,5292,7.5,2015,183999919.0,1902723000.0
4,168259,tt2820852,9.335014,190000000,1506249360,Furious 7,Vin Diesel|Paul Walker|Jason Statham|Michelle ...,http://www.furious7.com/,James Wan,Vengeance Hits Home,car race|speed|revenge|suspense|car,Deckard Shaw seeks revenge against Dominic Tor...,137,Action|Crime|Thriller,Universal Pictures|Original Film|Media Rights ...,4/1/2015,2947,7.3,2015,174799923.1,1385749000.0
5,281957,tt1663202,9.1107,135000000,532950503,The Revenant,Leonardo DiCaprio|Tom Hardy|Will Poulter|Domhn...,http://www.foxmovies.com/movies/the-revenant,Alejandro GonzÃ¡lez IÃ±Ã¡rritu,"(n. One who has returned, as if from the dead.)",father-son relationship|rape|based on novel|mo...,"In the 1820s, a frontiersman, Hugh Glass, sets...",156,Western|Drama|Adventure|Thriller,Regency Enterprises|Appian Way|CatchPlay|Anony...,12/25/2015,3929,7.2,2015,124199945.4,490314200.0


In [5]:
# Get a count of the number of rows and columns, as a (rows, columns) tuple
df.shape

(10866, 21)

In [6]:
# Create a list of important columns to keep
# (the same four columns that combine_features concatenates per movie)
features = ['keywords', 'cast', 'genres', 'director']
# Preview only those columns
df[features].head()

Unnamed: 0,keywords,cast,genres,director
0,monster|dna|tyrannosaurus rex|velociraptor|island,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,Action|Adventure|Science Fiction|Thriller,Colin Trevorrow
1,future|chase|post-apocalyptic|dystopia|australia,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,Action|Adventure|Science Fiction|Thriller,George Miller
2,based on novel|revolution|dystopia|sequel|dyst...,Shailene Woodley|Theo James|Kate Winslet|Ansel...,Adventure|Science Fiction|Thriller,Robert Schwentke
3,android|spaceship|jedi|space opera|3d,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,Action|Adventure|Science Fiction|Fantasy,J.J. Abrams
4,car race|speed|revenge|suspense|car,Vin Diesel|Paul Walker|Jason Statham|Michelle ...,Action|Crime|Thriller,James Wan


In [0]:
# Clean and process the data
# Replace missing values in all feature columns with the empty string in a
# single vectorized assignment, so every feature holds text
df[features] = df[features].fillna('')

In [0]:
# Create a function to combine the values of the important columns into a single string
def combine_features(row):
  """Return the row's keywords, cast, genres and director joined by single spaces."""
  parts = (row['keywords'], row['cast'], row['genres'], row['director'])
  return " ".join(parts)

In [9]:
# Apply the function to each row in the data set to store the combined strings into a new column
df['combined_features'] = df.apply(combine_features, axis = 1)

# Store each row's index label in a regular 'index' column
df['index'] = df.index

# Print the data frame (first 5 rows, now including the two new columns)
df.head()


Unnamed: 0,id,imdb_id,popularity,budget,revenue,original_title,cast,homepage,director,tagline,keywords,overview,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year,budget_adj,revenue_adj,combined_features,index
0,135397,tt0369610,32.985763,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,http://www.jurassicworld.com/,Colin Trevorrow,The park is open.,monster|dna|tyrannosaurus rex|velociraptor|island,Twenty-two years after the events of Jurassic ...,124,Action|Adventure|Science Fiction|Thriller,Universal Studios|Amblin Entertainment|Legenda...,6/9/2015,5562,6.5,2015,137999939.3,1392446000.0,monster|dna|tyrannosaurus rex|velociraptor|isl...,0
1,76341,tt1392190,28.419936,150000000,378436354,Mad Max: Fury Road,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,http://www.madmaxmovie.com/,George Miller,What a Lovely Day.,future|chase|post-apocalyptic|dystopia|australia,An apocalyptic story set in the furthest reach...,120,Action|Adventure|Science Fiction|Thriller,Village Roadshow Pictures|Kennedy Miller Produ...,5/13/2015,6185,7.1,2015,137999939.3,348161300.0,future|chase|post-apocalyptic|dystopia|austral...,1
2,262500,tt2908446,13.112507,110000000,295238201,Insurgent,Shailene Woodley|Theo James|Kate Winslet|Ansel...,http://www.thedivergentseries.movie/#insurgent,Robert Schwentke,One Choice Can Destroy You,based on novel|revolution|dystopia|sequel|dyst...,Beatrice Prior must confront her inner demons ...,119,Adventure|Science Fiction|Thriller,Summit Entertainment|Mandeville Films|Red Wago...,3/18/2015,2480,6.3,2015,101199955.5,271619000.0,based on novel|revolution|dystopia|sequel|dyst...,2
3,140607,tt2488496,11.173104,200000000,2068178225,Star Wars: The Force Awakens,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,http://www.starwars.com/films/star-wars-episod...,J.J. Abrams,Every generation has a story.,android|spaceship|jedi|space opera|3d,Thirty years after defeating the Galactic Empi...,136,Action|Adventure|Science Fiction|Fantasy,Lucasfilm|Truenorth Productions|Bad Robot,12/15/2015,5292,7.5,2015,183999919.0,1902723000.0,android|spaceship|jedi|space opera|3d Harrison...,3
4,168259,tt2820852,9.335014,190000000,1506249360,Furious 7,Vin Diesel|Paul Walker|Jason Statham|Michelle ...,http://www.furious7.com/,James Wan,Vengeance Hits Home,car race|speed|revenge|suspense|car,Deckard Shaw seeks revenge against Dominic Tor...,137,Action|Crime|Thriller,Universal Pictures|Original Film|Media Rights ...,4/1/2015,2947,7.3,2015,174799923.1,1385749000.0,car race|speed|revenge|suspense|car Vin Diesel...,4


In [0]:
# Convert a collection of text to a matrix of token counts
# (built from the combined_features string of every row in df)
count_matrix = CountVectorizer().fit_transform(df['combined_features'])

In [11]:
# Get the cosine similarity matrix from the count matrix
# Entry [i][j] compares row i with row j of count_matrix, so the diagonal
# holds each movie compared with itself
cosine_sim =  cosine_similarity(count_matrix)
print(cosine_sim)

[[1.         0.20833333 0.16666667 ... 0.         0.05270463 0.        ]
 [0.20833333 1.         0.25       ... 0.         0.05270463 0.04351941]
 [0.16666667 0.25       1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         0.05773503 0.        ]
 [0.05270463 0.05270463 0.         ... 0.05773503 1.         0.05504819]
 [0.         0.04351941 0.         ... 0.         0.05504819 1.        ]]


In [12]:
# Get the number of rows and columns in the cosine_sim
# (cosine_similarity was called with a single matrix, so the result is square)
cosine_sim.shape

(10866, 10866)

In [0]:
# Helper function to get the title from the index
def get_title_from_index(index):
  """Look up the original_title of the first row whose index label equals `index`."""
  row_mask = df.index == index
  matching_titles = df.loc[row_mask, 'original_title']
  return matching_titles.values[0]

# Helper function to get the index from the title
def get_index_from_title(title):
  """Return the index label of the first row whose original_title equals `title`.

  The label is read from df.index directly, so the lookup does not depend on
  a separate 'index' column having been created first.

  Raises:
    IndexError: if no row has that title. The message names the title
      instead of the generic "index 0 is out of bounds" error.
  """
  title_mask = (df.original_title == title).values
  matches = df.index[title_mask]
  if len(matches) == 0:
    raise IndexError(
      "No movie titled {!r} found in df['original_title']".format(title)
    )
  return matches[0]

In [14]:
# Get the title of the movie that the user likes
movie_user_likes = 'Jurassic World'

# Find that movie's index
movie_index = get_index_from_title(movie_user_likes)
movie_index

0

In [15]:
# Enumerate through all the similarity scores of the liked movie's row to make
# a tuple of movie index and similarity score for every movie
# NOTE: We will return a list of tuples in the form (movie index, similarity score)

similar_movies = list(enumerate(cosine_sim[movie_index]))
# Show the first 20 (movie index, similarity score) pairs
similar_movies[:20]

[(0, 0.9999999999999997),
 (1, 0.2083333333333334),
 (2, 0.1666666666666667),
 (3, 0.17407765595569785),
 (4, 0.08512565307587487),
 (5, 0.07715167498104596),
 (6, 0.19641855032959654),
 (7, 0.1276884796138123),
 (8, 0.04166666666666668),
 (9, 0.0),
 (10, 0.08333333333333336),
 (11, 0.1543033499620919),
 (12, 0.08908708063747481),
 (13, 0.17025130615174974),
 (14, 0.23570226039551584),
 (15, 0.0),
 (16, 0.08703882797784893),
 (17, 0.16012815380508716),
 (18, 0.0),
 (19, 0.12009611535381537)]

In [16]:
# Sort the list of similar movies according to the similarity scores in descending order
# The liked movie is removed by its index rather than by slicing off the first
# element: it is not guaranteed to rank first when another row ties with it,
# or when its own feature string is empty (an all-zero count vector scores 0
# against itself).
sorted_similar_movies = sorted(
  (pair for pair in similar_movies if pair[0] != movie_index),
  key = lambda x:x[1],
  reverse=True,
)

# Print the 30 most similar (movie index, similarity score) pairs
sorted_similar_movies[:30]

[(2820, 0.4166666666666668),
 (5391, 0.38306543884143684),
 (10223, 0.3046358979224712),
 (2576, 0.297939785765562),
 (1392, 0.28022426915890253),
 (2784, 0.2738612787525831),
 (3802, 0.26352313834736496),
 (7973, 0.2553769592276246),
 (9205, 0.2553769592276246),
 (2216, 0.2551551815399144),
 (1895, 0.25000000000000006),
 (5235, 0.25000000000000006),
 (5553, 0.25000000000000006),
 (4804, 0.2475368857441686),
 (659, 0.24494897427831788),
 (4383, 0.24494897427831788),
 (9258, 0.24494897427831788),
 (978, 0.24056261216234412),
 (3574, 0.24056261216234412),
 (4030, 0.24056261216234412),
 (14, 0.23570226039551584),
 (1178, 0.23570226039551584),
 (1612, 0.23570226039551584),
 (5233, 0.23570226039551584),
 (510, 0.2341464528954235),
 (2696, 0.2341464528954235),
 (3132, 0.2341464528954235),
 (5915, 0.2341464528954235),
 (630, 0.23145502494313788),
 (4361, 0.23145502494313788)]

In [17]:
# Print the titles of the first 5 entries from the sorted similar movies list
# (the space after "to" keeps the liked title from running into the sentence)
print('The top 5 similar movies to ' + movie_user_likes + ' are:')
for element in sorted_similar_movies[:5]:
  print(get_title_from_index(element[0]))

The top 5 similar movies toJurassic World are:
Jurassic Park III
The Lost World: Jurassic Park
Jurassic Park
Beowulf
Terminator Salvation
