In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer #convert text to numerical values
from sklearn.metrics.pairwise import cosine_similarity
import difflib #used to find the closest match based on the given input

#### Data Collection and Preprocessing

In [2]:
# Read the movie metadata; the path is resolved relative to the working directory.
df = pd.read_csv('movies.csv')

# Show the first rows as a quick sanity check of the load.
df.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(4803, 24)

In [4]:
# Text columns used to describe each movie for the recommender.
selected_features = [
    'genres',
    'keywords',
    'tagline',
    'cast',
    'director',
]
print(selected_features)

['genres', 'keywords', 'tagline', 'cast', 'director']


In [5]:
# Swap missing values in the selected text columns for empty strings so that
# later string concatenation does not propagate NaN.
df[selected_features] = df[selected_features].fillna('')

In [6]:
# Build one space-separated text blob per movie from the selected columns.
# The columns come from `selected_features` rather than being listed a second
# time, so editing that list automatically changes what is combined here.
# Every selected column must already hold strings (missing values replaced by '').
features_combined = df[selected_features].apply(' '.join, axis=1)

In [7]:
# Display the combined per-movie text (pandas truncates long Series output).
features_combined

0       Action Adventure Fantasy Science Fiction cultu...
1       Adventure Fantasy Action ocean drug abuse exot...
2       Action Adventure Crime spy based on novel secr...
3       Action Crime Drama Thriller dc comics crime fi...
4       Action Adventure Science Fiction based on nove...
                              ...                        
4798    Action Crime Thriller united states\u2013mexic...
4799    Comedy Romance  A newlywed couple's honeymoon ...
4800    Comedy Drama Romance TV Movie date love at fir...
4801      A New Yorker in Shanghai Daniel Henney Eliza...
4802    Documentary obsession camcorder crush dream gi...
Length: 4803, dtype: object

In [8]:
# TF-IDF vectorizer (default settings) used to turn the combined text into
# numerical feature vectors.
vectorizer = TfidfVectorizer()

In [9]:
# Fit the vectorizer on the combined text and encode every movie in one call.
features_vector = vectorizer.fit_transform(features_combined)

In [11]:
# Pairwise cosine similarity score between every pair of movie vectors;
# row i holds the scores of movie i against all movies.
similarity = cosine_similarity(features_vector)

In [13]:
# Shape of the similarity matrix as (rows, columns).
similarity.shape

(4803, 4803)

In [29]:
# Ask the user which movie the recommendations should be based on.
movie_name = input('Enter the movie name : ')

Enter the movie name :  batman


In [30]:
# All titles in the dataset, in row order, as a plain Python list
# (difflib works on ordinary sequences of strings).
list_of_movie_name = df['title'].tolist()

In [31]:
# Fuzzy-match the user's text against the known titles. The limits are spelled
# out explicitly; they are the same values difflib uses by default
# (at most 3 candidates, similarity ratio of at least 0.6), best match first.
find_close_match = difflib.get_close_matches(
    movie_name,
    list_of_movie_name,
    n=3,
    cutoff=0.6,
)
print(find_close_match)

['Batman', 'Batman', 'Catwoman']


In [32]:
# Take the best-ranked candidate. difflib returns an empty list when nothing is
# similar enough, so fail with an explanatory message instead of a bare
# IndexError from indexing an empty list.
if not find_close_match:
    raise ValueError(
        f'No movie title close to {movie_name!r} was found; try a different spelling.'
    )
close_match = find_close_match[0]
close_match

'Batman'

In [33]:
# Row position of the first movie whose title equals the matched title.
# The similarity matrix is addressed by row position, so the position is
# computed directly from the frame instead of being read from the CSV's
# 'index' column, which would go stale if the frame were filtered or reordered.
find_index = np.flatnonzero((df['title'] == close_match).to_numpy())[0]
print(find_index)

1359


In [34]:
# Pair every movie's row position with its similarity to the chosen movie.
similarity_score = [
    (position, score)
    for position, score in enumerate(similarity[find_index])
]

In [35]:
# Number of (position, score) pairs held in similarity_score.
len(similarity_score)

4803

In [36]:
# Rank the (position, score) pairs from most to least similar. The sort is
# stable, so ties keep their original relative order.
sorted_similar_movies = list(similarity_score)
sorted_similar_movies.sort(key=lambda pair: pair[1], reverse=True)


In [37]:
# Print the titles of the 29 highest-ranked movies, numbered from 1.

print('Movies suggestions for you : \n')

# Only the first 29 entries are ever shown, so slice them off up front instead
# of walking the whole ranking and filtering the full frame on every pass.
for i, movies in enumerate(sorted_similar_movies[:29], start=1):
    # movies is a (row position, score) pair; the position came from
    # enumerating a similarity row, so positional lookup is the right accessor.
    index = movies[0]
    title_of_movies = df['title'].iloc[index]
    print(i, '.', title_of_movies)

Movies suggestions for you : 

1 . Batman
2 . Batman Returns
3 . Batman & Robin
4 . The Dark Knight Rises
5 . Batman Begins
6 . The Dark Knight
7 . A History of Violence
8 . Superman
9 . Beetlejuice
10 . Bedazzled
11 . Mars Attacks!
12 . The Sentinel
13 . Planet of the Apes
14 . Man of Steel
15 . Suicide Squad
16 . The Mask
17 . Salton Sea
18 . Spider-Man 3
19 . The Postman Always Rings Twice
20 . Hang 'em High
21 . Spider-Man 2
22 . Dungeons & Dragons: Wrath of the Dragon God
23 . Superman Returns
24 . Jonah Hex
25 . Exorcist II: The Heretic
26 . Superman II
27 . Green Lantern
28 . Superman III
29 . Something's Gotta Give


In [39]:
def recommend_movies(movie_name, movies_df, similarity_matrix, top_n=29):
    """Return the titles most similar to the movie the user asked for.

    Args:
        movie_name: Free-text title typed by the user; it is fuzzy-matched
            against the known titles with difflib.
        movies_df: DataFrame with a 'title' column whose row order matches the
            rows of ``similarity_matrix``.
        similarity_matrix: Square matrix where entry [i][j] is the similarity
            between the movies at row positions i and j.
        top_n: Maximum number of titles to return.

    Returns:
        A list of up to ``top_n`` titles ordered from most to least similar
        (the matched movie itself normally comes first), or an empty list when
        no title is close enough to ``movie_name``.
    """
    titles = movies_df['title'].tolist()

    matches = difflib.get_close_matches(movie_name, titles)
    if not matches:
        # difflib found nothing similar enough; let the caller decide what to show.
        return []

    # Row position of the first movie carrying the best-matching title.
    position = titles.index(matches[0])

    # Rank every movie by its similarity to the matched one (stable sort, so
    # ties keep their original row order).
    ranked = sorted(
        enumerate(similarity_matrix[position]),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [titles[row] for row, _score in ranked[:top_n]]


movie_name = input('Enter the movie name : ')

suggestions = recommend_movies(movie_name, df, similarity)

if suggestions:
    print('Movies suggestions for you : \n')
    for i, title_of_movies in enumerate(suggestions, start=1):
        print(i, '.', title_of_movies)
else:
    print(f'No movie title close to {movie_name!r} was found.')

Enter the movie name :  avatar


Movies suggestions for you : 

1 . Avatar
2 . Alien
3 . Aliens
4 . Guardians of the Galaxy
5 . Star Trek Beyond
6 . Star Trek Into Darkness
7 . Galaxy Quest
8 . Alien³
9 . Cargo
10 . Trekkies
11 . Gravity
12 . Moonraker
13 . Jason X
14 . Pocahontas
15 . Space Cowboys
16 . The Helix... Loaded
17 . Lockout
18 . Event Horizon
19 . Space Dogs
20 . Machete Kills
21 . Gettysburg
22 . Clash of the Titans
23 . Star Wars: Clone Wars: Volume 1
24 . The Right Stuff
25 . Terminator Salvation
26 . The Astronaut's Wife
27 . Planet of the Apes
28 . Star Trek
29 . Wing Commander
