In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import difflib # for string matching, for a example, the user inputs "transforms" instead of "transformers", with this library we can find the closest match
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity #for finding the similarity between the user input and the movie titles

In [2]:
#popularity based recommendation system
data = pd.read_csv('movies.csv')
data.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [3]:
data.shape

(4803, 24)

In [4]:
#selecting the relevant features for recommendation
selected_features = ['genres', 'keywords', 'tagline', 'cast', 'director']
print(selected_features)

['genres', 'keywords', 'tagline', 'cast', 'director']


In [5]:
#replacing the NaN values with an empty string
for feature in selected_features:
    data[feature] = data[feature].fillna('')

In [6]:
data[selected_features].head()

Unnamed: 0,genres,keywords,tagline,cast,director
0,Action Adventure Fantasy Science Fiction,culture clash future space war space colony so...,Enter the World of Pandora.,Sam Worthington Zoe Saldana Sigourney Weaver S...,James Cameron
1,Adventure Fantasy Action,ocean drug abuse exotic island east india trad...,"At the end of the world, the adventure begins.",Johnny Depp Orlando Bloom Keira Knightley Stel...,Gore Verbinski
2,Action Adventure Crime,spy based on novel secret agent sequel mi6,A Plan No One Escapes,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,Sam Mendes
3,Action Crime Drama Thriller,dc comics crime fighter terrorist secret ident...,The Legend Ends,Christian Bale Michael Caine Gary Oldman Anne ...,Christopher Nolan
4,Action Adventure Science Fiction,based on novel mars medallion space travel pri...,"Lost in our world, found in another.",Taylor Kitsch Lynn Collins Samantha Morton Wil...,Andrew Stanton


In [7]:
#combining all 5 selected features into a single column
combined_features = data['genres'] + ' ' + data['keywords'] + ' ' + data['tagline'] + ' ' + data['cast'] + ' ' + data['director']

In [8]:
#convert the text to a matrix of TF-IDF features
vectorizer = TfidfVectorizer()
feature_vectors = vectorizer.fit_transform(combined_features)

In [9]:
print(feature_vectors)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 124266 stored elements and shape (4803, 17318)>
  Coords	Values
  (0, 201)	0.07860022416510505
  (0, 274)	0.09021200873707368
  (0, 5274)	0.11108562744414445
  (0, 13599)	0.1036413987316636
  (0, 5437)	0.1036413987316636
  (0, 3678)	0.21392179219912877
  (0, 3065)	0.22208377802661425
  (0, 5836)	0.1646750903586285
  (0, 14378)	0.33962752210959823
  (0, 16587)	0.12549432354918996
  (0, 3225)	0.24960162956997736
  (0, 14271)	0.21392179219912877
  (0, 4945)	0.24025852494110758
  (0, 15261)	0.07095833561276566
  (0, 16998)	0.1282126322850579
  (0, 11192)	0.09049319826481456
  (0, 11503)	0.27211310056983656
  (0, 13349)	0.15021264094167086
  (0, 17007)	0.23643326319898797
  (0, 17290)	0.20197912553916567
  (0, 13319)	0.2177470539412484
  (0, 14064)	0.20596090415084142
  (0, 16668)	0.19843263965100372
  (0, 14608)	0.15150672398763912
  (0, 8756)	0.22709015857011816
  :	:
  (4801, 403)	0.17727585190343229
  (4801, 4835)	0.247137650

In [10]:
# getting the cosine similarity score
similarity = cosine_similarity(feature_vectors)
print(similarity)

[[1.         0.07219487 0.037733   ... 0.         0.         0.        ]
 [0.07219487 1.         0.03281499 ... 0.03575545 0.         0.        ]
 [0.037733   0.03281499 1.         ... 0.         0.05389661 0.        ]
 ...
 [0.         0.03575545 0.         ... 1.         0.         0.02651502]
 [0.         0.         0.05389661 ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.02651502 0.         1.        ]]


In [11]:
similarity.shape

(4803, 4803)

In [16]:
#getting the title of the movie that the user likes
movie_name = input("Enter the movie name: ")
print(movie_name)

iron man


In [17]:
#creating a list of all the movie titles in the dataset to match the user input
movie_list = data['title'].tolist()

In [18]:
#finding the closest match of the user input in the movie list
best_match = difflib.get_close_matches(movie_name, movie_list)
print(best_match)

['Iron Man', 'Iron Man 3', 'Iron Man 2']


In [19]:
close_match = best_match[0]
print(close_match)

Iron Man


In [None]:
#finding the index of the movie with title (for finding the similarity score)
index = data[data['title'] == close_match].index[0]
print(index)

68


In [None]:
#getting a list of similar movies
similarity_score = list(enumerate(similarity[index])) #enumarate is a function that adds a counter to an iterable and returns it in a form of enumerate object and it is like a loop counter
print(similarity_score)

[(0, np.float64(0.033570748780675445)), (1, np.float64(0.0546448279236134)), (2, np.float64(0.013735500604224325)), (3, np.float64(0.006468756104392058)), (4, np.float64(0.03268943310073387)), (5, np.float64(0.013907256685755475)), (6, np.float64(0.07692837576335508)), (7, np.float64(0.23944423963486416)), (8, np.float64(0.007882387851851008)), (9, np.float64(0.07599206098164224)), (10, np.float64(0.07536074882460439)), (11, np.float64(0.01192606921174529)), (12, np.float64(0.013707618139948932)), (13, np.float64(0.01237607492508997)), (14, np.float64(0.09657127116284187)), (15, np.float64(0.007286271383816743)), (16, np.float64(0.22704403782296806)), (17, np.float64(0.013112928084103857)), (18, np.float64(0.04140526820609594)), (19, np.float64(0.07883282546834255)), (20, np.float64(0.07981173664799916)), (21, np.float64(0.011266873271064948)), (22, np.float64(0.006892575895462364)), (23, np.float64(0.006599097891242659)), (24, np.float64(0.012665208122549735)), (25, np.float64(0.0)), 

In [None]:
#sorting the list based on the similarity score in descending order
sorted_score = sorted(similarity_score, key = lambda x:x[1], reverse = True) #reverse = True for descending order
print(sorted_score)

[(68, np.float64(1.0)), (79, np.float64(0.40890433998005965)), (31, np.float64(0.3146705244947752)), (7, np.float64(0.23944423963486416)), (16, np.float64(0.22704403782296806)), (26, np.float64(0.21566241096831162)), (85, np.float64(0.20615862984665334)), (182, np.float64(0.19573956139611612)), (511, np.float64(0.16702973947860683)), (3623, np.float64(0.1609246088135586)), (64, np.float64(0.1529992413944514)), (203, np.float64(0.1481866794866512)), (174, np.float64(0.1471993120942043)), (4401, np.float64(0.14505971470107848)), (101, np.float64(0.14401677581826292)), (46, np.float64(0.14216268867232232)), (169, np.float64(0.1380947013224906)), (1740, np.float64(0.1362438264169076)), (94, np.float64(0.13616819579029016)), (788, np.float64(0.13305895074229218)), (126, np.float64(0.13263982780511063)), (131, np.float64(0.13137698586006535)), (33, np.float64(0.13089810941050173)), (2487, np.float64(0.12309731939910509)), (783, np.float64(0.12162995562040377)), (138, np.float64(0.11846458075

In [30]:
#printing the top 5 similar movies
print("Top 10 similar movies to " + movie_name + " are: ")
i = 1
for movie in sorted_score:
    index = movie[0] #getting the index of the movie from the sorted list bcz in all the movies the first element is the index
    title = data[data.index == index]['title'].values[0] #getting the title of the movie from the dataset
    if (i < 11): #printing the top 30 movies
        print(i, title)
        i = i + 1
    

Top 10 similar movies to avatar are: 
1 Avatar
2 Alien
3 Aliens
4 Guardians of the Galaxy
5 Star Trek Beyond
6 Star Trek Into Darkness
7 Galaxy Quest
8 Alien³
9 Cargo
10 Trekkies


include all previous codes in a one cell to run the recommendation system easily

In [32]:
#getting the title of the movie that the user likes
movie_name = input("Enter the movie name: ")
print(movie_name)
#creating a list of all the movie titles in the dataset to match the user input
movie_list = data['title'].tolist()
#finding the closest match of the user input in the movie list
best_match = difflib.get_close_matches(movie_name, movie_list)
print(best_match)
close_match = best_match[0]
print(close_match)
#finding the index of the movie with title (for finding the similarity score)
index = data[data['title'] == close_match].index[0]
print(index)
#getting a list of similar movies
similarity_score = list(enumerate(similarity[index])) #enumarate is a function that adds a counter to an iterable and returns it in a form of enumerate object and it is like a loop counter
print(similarity_score)
#sorting the list based on the similarity score in descending order
sorted_score = sorted(similarity_score, key = lambda x:x[1], reverse = True) #reverse = True for descending order
print(sorted_score)
#printing the top 5 similar movies
print("Top 10 similar movies to " + movie_name + " are: ")
i = 1
for movie in sorted_score:
    index = movie[0] #getting the index of the movie from the sorted list bcz in all the movies the first element is the index
    title = data[data.index == index]['title'].values[0] #getting the title of the movie from the dataset
    if (i < 11): #printing the top 30 movies
        print(i, title)
        i = i + 1
    

hulk
['Hulk']
Hulk
165
[(0, np.float64(0.061063213309123024)), (1, np.float64(0.020526860692304)), (2, np.float64(0.032494750708450926)), (3, np.float64(0.017749699819554335)), (4, np.float64(0.029763513697224533)), (5, np.float64(0.10265491230361133)), (6, np.float64(0.0)), (7, np.float64(0.03085235772851647)), (8, np.float64(0.0)), (9, np.float64(0.006732271450558831)), (10, np.float64(0.040637705757654)), (11, np.float64(0.006064558003533321)), (12, np.float64(0.04465264609921927)), (13, np.float64(0.011422525454431494)), (14, np.float64(0.03589437184334987)), (15, np.float64(0.0)), (16, np.float64(0.029254593410653507)), (17, np.float64(0.006668090847895653)), (18, np.float64(0.08060540273172108)), (19, np.float64(0.015570117642843162)), (20, np.float64(0.012339213546642577)), (21, np.float64(0.005729348476658153)), (22, np.float64(0.005025065887495757)), (23, np.float64(0.06838988118941305)), (24, np.float64(0.0202458906822748)), (25, np.float64(0.00352994278849995)), (26, np.floa