In [2]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load the movie metadata table from movies.csv (path is relative to the working directory)
movie_data = pd.read_csv("movies.csv")

In [5]:
# (number of rows, number of columns) of the loaded dataset

movie_data.shape

(4803, 24)

In [6]:
# Text columns that are combined and vectorised to describe each movie for the recommendation
relevant_features = [
    "genres",
    "keywords",
    "tagline",
    "cast",
    "director",
]


In [7]:
# Replace missing values in the selected text columns with an empty string ('')
# so that the later text concatenation works on every row.

movie_data[relevant_features] = movie_data[relevant_features].fillna('')


In [8]:
# Build one text blob per movie by joining the selected columns with single spaces

combined_features = (
    movie_data["genres"]
    + " " + movie_data["keywords"]
    + " " + movie_data["tagline"]
    + " " + movie_data["cast"]
    + " " + movie_data["director"]
)

combined_features

0       Action Adventure Fantasy Science Fiction cultu...
1       Adventure Fantasy Action ocean drug abuse exot...
2       Action Adventure Crime spy based on novel secr...
3       Action Crime Drama Thriller dc comics crime fi...
4       Action Adventure Science Fiction based on nove...
                              ...                        
4798    Action Crime Thriller united states\u2013mexic...
4799    Comedy Romance  A newlywed couple's honeymoon ...
4800    Comedy Drama Romance TV Movie date love at fir...
4801      A New Yorker in Shanghai Daniel Henney Eliza...
4802    Documentary obsession camcorder crush dream gi...
Length: 4803, dtype: object

In [9]:
# Create a TF-IDF vectorizer (default settings) used to turn the combined text into numeric feature vectors

vectorizer = TfidfVectorizer()

In [68]:
# Fit the vectorizer on the combined text and transform every movie into its TF-IDF feature vector
feature_vectors = vectorizer.fit_transform(combined_features)
# print(feature_vectors)  # left disabled: the printed output is extremely long

In [59]:
# Compute the pairwise cosine similarity between all movie feature vectors

similarity = cosine_similarity(feature_vectors)
# print(similarity)  # left disabled: the printed output is extremely long

In [12]:
print(similarity.shape)

# Prints (4803, 4803): one row and one column per movie, because each of the 4803 movies is compared with every movie in the dataset

(4803, 4803)


In [23]:
# Ask the user for a movie title; it is fuzzy-matched against the dataset titles in a later step

movie_name = input("Enter your favourite movie: ")

Enter your favourite movie: avatar


In [64]:
# Collect every movie title in the dataset as a plain Python list (the form difflib expects)

list_of_all_titles = movie_data["title"].tolist()
# print(list_of_all_titles)  # left disabled: the printed output is extremely long

In [15]:
# Find the dataset titles that most closely resemble what the user typed.
# difflib returns the best matches first, and an empty list when nothing is close enough.

find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)
print(find_close_match)

['Iron Man', 'Iron Man 3', 'Iron Man 2']


In [16]:
# Take the best match. difflib returns an empty list when no title is close
# enough, so fail with a clear message instead of a bare IndexError.
if not find_close_match:
    raise ValueError(f"No movie title resembling {movie_name!r} was found in the dataset")

close_match = find_close_match[0]
close_match

'Iron Man'

In [17]:
# Select the row(s) whose title equals the matched title and read the value of the dataset's "index" column.
# If several movies share that title, the first one is used (.values[0]).
# NOTE(review): this value is later used as a row number into `similarity`, which assumes the
# "index" column equals each movie's row position in the CSV — confirm.

index_of_movie = movie_data[movie_data.title == close_match]["index"].values[0]
index_of_movie

68

In [60]:
# Pair every movie's row position with its similarity to the chosen movie:
# [(position, score), ...] (the core of the recommender)

similarity_score = list(enumerate(similarity[index_of_movie]))
similarity_score[:5]  # preview only; the full list has one entry per movie and is very long

In [19]:
# Sanity check: number of (position, score) pairs produced for the chosen movie
len(similarity_score)

4803

In [69]:
# Order the (position, score) pairs from most to least similar, leaving similarity_score untouched

sorted_similar_movies = list(similarity_score)
sorted_similar_movies.sort(key=lambda pair: pair[1], reverse=True)
# print(sorted_similar_movies)  # left disabled: the printed output is extremely long

In [33]:
#converting each similar movie's row position to its title and printing the top suggestions

print("Movies suggested: \n")

max_suggestions = 29  # how many titles to list

# Only the top entries are printed, so slice first instead of walking all
# movies and filtering the whole DataFrame on every iteration.
for i, (index, _score) in enumerate(sorted_similar_movies[:max_suggestions], start=1):
    # rows of the similarity matrix follow row position, hence a positional lookup
    title_from_index = movie_data["title"].iloc[index]
    print(i,".",title_from_index)

Movies suggested: 

1 . Iron Man
2 . Iron Man 2
3 . Iron Man 3
4 . Avengers: Age of Ultron
5 . The Avengers
6 . Captain America: Civil War
7 . Captain America: The Winter Soldier
8 . Ant-Man
9 . X-Men
10 . Made
11 . X-Men: Apocalypse
12 . X2
13 . The Incredible Hulk
14 . The Helix... Loaded
15 . X-Men: First Class
16 . X-Men: Days of Future Past
17 . Captain America: The First Avenger
18 . Kick-Ass 2
19 . Guardians of the Galaxy
20 . Deadpool
21 . Thor: The Dark World
22 . G-Force
23 . X-Men: The Last Stand
24 . Duets
25 . Mortdecai
26 . The Last Airbender
27 . Southland Tales
28 . Zathura: A Space Adventure
29 . Sky Captain and the World of Tomorrow


In [71]:
#FINAL PRODUCT
# End-to-end recommendation for one user-supplied title. Reuses `movie_data`
# and the precomputed `similarity` matrix from the cells above.

max_suggestions = 10  # how many titles to list

movie_name = input("Enter your favourite movie: ")

list_of_all_titles = movie_data["title"].tolist()

# Fuzzy-match the typed name against the dataset titles (best match first)
find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)

# difflib returns an empty list when no title is close enough, so fail with a
# clear message instead of a bare IndexError.
if not find_close_match:
    raise ValueError(f"No movie title resembling {movie_name!r} was found in the dataset")

close_match = find_close_match[0]

# NOTE(review): assumes the "index" column equals the movie's row position in `similarity` — confirm
index_of_movie = movie_data[movie_data.title == close_match]["index"].values[0]

# (row position, similarity to the chosen movie) for every movie
similarity_score = list(enumerate(similarity[index_of_movie]))

# Most similar first
sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True)

print("Movies suggested: \n")

# Only the top entries are printed, so slice first instead of walking all
# movies and filtering the whole DataFrame on every iteration.
for i, (index, _score) in enumerate(sorted_similar_movies[:max_suggestions], start=1):
    # rows of the similarity matrix follow row position, hence a positional lookup
    title_from_index = movie_data["title"].iloc[index]
    print(i,".",title_from_index)



Enter your favourite movie: batman
Movies suggested: 

1 . Batman
2 . Batman Returns
3 . Batman & Robin
4 . The Dark Knight Rises
5 . Batman Begins
6 . The Dark Knight
7 . A History of Violence
8 . Superman
9 . Beetlejuice
10 . Bedazzled


In [53]:
import pickle

In [56]:
# Persist the computed artefacts to disk.
# `with` guarantees each file is flushed and closed, even if pickling fails.
# NOTE(review): 'movie_data.pkl' receives `combined_features`, not the `movie_data` frame — confirm this is intended.
with open('movie_data.pkl', 'wb') as movie_data_file:
    pickle.dump(combined_features, movie_data_file)

with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)