In [None]:
#Description: Build a content-based movie recommendation engine using Python

In [None]:
#Import the libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
#Load the data
#On Google Colab, prompt the user to upload movie_data.csv. Outside Colab the
#google.colab package is not importable, so skip the upload and leave `uploaded`
#as an empty dict; the CSV is then expected to already be in the working directory.
try:
  from google.colab import files
except ImportError:
  uploaded = {}
else:
  uploaded = files.upload()

Saving movie_data.csv to movie_data.csv


In [None]:
#Store the data
df = pd.read_csv('movie_data.csv')
#Give every movie an id equal to its row position. The range is sized from the
#data itself so the notebook still works when the CSV does not have exactly 1000 rows
#(a fixed range(0,1000) fails with a length-mismatch error in that case).
df['Movie_id'] = range(len(df))
#Show the first 3 rows of data
df.head(3)

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,Movie_id
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0,0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0,1
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0,2


In [None]:
#Get the size of the data set as a (rows, columns) tuple; each row is one movie
df.shape

(1000, 13)

In [None]:
#Create a list of important columns for the recommendation engine
#(the same four columns that get_important_features combines into one string)
columns = ['Actors','Director','Genre','Title']

In [None]:
#Preview the first 3 rows, restricted to the important columns
df[columns].head(3)

Unnamed: 0,Actors,Director,Genre,Title
0,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",James Gunn,"Action,Adventure,Sci-Fi",Guardians of the Galaxy
1,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",Ridley Scott,"Adventure,Mystery,Sci-Fi",Prometheus
2,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",M. Night Shyamalan,"Horror,Thriller",Split


In [None]:
#Flag every missing entry in the important columns, then report whether any exist
missing_mask = df[columns].isnull()
missing_mask.values.any()

False

In [None]:
#create a function to combine the values of the important columns into a single string
def get_important_features(data):
  """Combine each row's Actors, Director, Genre and Title into one string.

  Parameters:
    data: DataFrame with string columns 'Actors', 'Director', 'Genre' and 'Title'.

  Returns:
    A list with one string per row, in row order, where the four values are
    joined by single spaces in the order Actors, Director, Genre, Title.

  Raises:
    KeyError: if one of the four columns is missing.
    TypeError: if a value is not a string (for example a missing value).
  """
  # Walk the column values by position rather than looking rows up by the
  # label i, so the result is correct for any index (e.g. after filtering),
  # not only for the default 0..n-1 index.
  rows = zip(data['Actors'], data['Director'], data['Genre'], data['Title'])
  return [' '.join(parts) for parts in rows]

In [None]:
#Build the combined feature string for every movie and store it in a new column
combined_features = get_important_features(df)
df['important_features'] = combined_features

#Preview the first 3 rows, now including the new column
df.head(3)

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,Movie_id,important_features
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0,0,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S..."
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0,1,"Noomi Rapace, Logan Marshall-Green, Michael Fa..."
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0,2,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar..."


In [None]:
#Turn each movie's combined feature string into a row of token counts
vectorizer = CountVectorizer()
cm = vectorizer.fit_transform(df['important_features'])

In [None]:
#Get the cosine similarity matrix from the count matrix:
#entry [i][j] is the similarity between the count rows of movie i and movie j
cs = cosine_similarity(cm)
#Print the cosine similarity matrix
print(cs)

[[1.         0.1767767  0.06085806 ... 0.0571662  0.06537205 0.        ]
 [0.1767767  1.         0.         ... 0.         0.06933752 0.        ]
 [0.06085806 0.         1.         ... 0.         0.         0.        ]
 ...
 [0.0571662  0.         0.         ... 1.         0.06726728 0.        ]
 [0.06537205 0.06933752 0.         ... 0.06726728 1.         0.07161149]
 [0.         0.         0.         ... 0.         0.07161149 1.        ]]


In [None]:
#Get the shape of the cosine similarity matrix (one row and one column per movie)
cs.shape

(1000, 1000)

In [None]:
#Get the title of the movie that the user likes
title = 'Cars'

#Find the movie's id. Fail with a clear message when the title is not in the data,
#instead of the opaque IndexError that indexing an empty selection would raise.
matching_ids = df.loc[df.Title == title, 'Movie_id']
if matching_ids.empty:
  raise ValueError('No movie titled ' + repr(title) + ' was found in the data set')
#If several movies share the title, the first match is used
movie_id = matching_ids.values[0]

In [None]:
#Pair every similarity score in the liked movie's row with its position: [ (movie_id, similarity score), (...) ]
scores = [(other_id, score) for other_id, score in enumerate(cs[movie_id])]

In [None]:
#Sort the list from most to least similar, leaving out the liked movie itself.
#The movie is excluded by its id rather than by dropping the first sorted entry:
#when another movie ties on the top score (e.g. a duplicate entry earlier in the
#data), the liked movie is not guaranteed to sort first and the wrong entry
#would be discarded.
sorted_scores = sorted(
  (pair for pair in scores if pair[0] != movie_id),
  key = lambda pair: pair[1],
  reverse = True,
)

In [None]:
#Print only the 10 best sorted scores; the full list holds an entry for every
#other movie in the data and would flood the cell output
print(sorted_scores[:10])

[(775, 0.75), (629, 0.3061862178478973), (730, 0.2773500981126146), (843, 0.2727723627949905), (242, 0.2672612419124244), (499, 0.2672612419124244), (834, 0.2672612419124244), (412, 0.2581988897471611), (104, 0.25), (305, 0.25), (563, 0.25), (15, 0.24253562503633297), (296, 0.24253562503633297), (712, 0.24253562503633297), (447, 0.23570226039551587), (846, 0.20801257358446093), (13, 0.2004459314343183), (30, 0.2004459314343183), (74, 0.2004459314343183), (174, 0.2004459314343183), (288, 0.2004459314343183), (330, 0.2004459314343183), (407, 0.2004459314343183), (409, 0.2004459314343183), (589, 0.2004459314343183), (883, 0.2004459314343183), (313, 0.19611613513818404), (23, 0.19364916731037082), (40, 0.19364916731037082), (119, 0.19364916731037082), (179, 0.19364916731037082), (241, 0.19364916731037082), (274, 0.19364916731037082), (403, 0.19364916731037082), (559, 0.19364916731037082), (597, 0.19364916731037082), (663, 0.19364916731037082), (688, 0.19364916731037082), (707, 0.1936491673

In [None]:
#Print the most similar movies, best match first
top_n = 7  #how many recommendations to show; used for both the header and the loop
print('The', top_n, 'most recommended movies to', title, 'are:\n')
#Slice the sorted list and number the entries from 1 instead of keeping a manual counter
for j, item in enumerate(sorted_scores[:top_n], start = 1):
  #item is (movie_id, similarity score); look the title up by its id
  movie_title = df[df.Movie_id == item[0]]['Title'].values[0]
  print(j, movie_title)

The 7 most recommended movies to Cars are:

1 Cars 2
2 Night at the Museum: Secret of the Tomb
3 The Internship
4 The Princess and the Frog
5 Rock Dog
6 Up
7 The Dictator
