In [1]:
import pandas as pd
import numpy as np

import difflib

# for converting text vectors into feature vectors
from sklearn.feature_extraction.text import TfidfVectorizer

#for finding cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

# Load the movie metadata table used throughout the notebook.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# own copy of movies.csv before running the notebook elsewhere.
MOVIES_CSV_PATH = r"C:\Users\dell\Downloads\movies.csv"
movies_df = pd.read_csv(MOVIES_CSV_PATH)



In [2]:
# Collect every movie title into a plain Python list and report how many
# movies the dataset contains.
title_list = movies_df['title'].tolist()
print(len(title_list))

4803


In [3]:
# Preview the first rows to see which columns the dataset provides.
movies_df.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [4]:
# Text columns that describe a movie; they are cleaned and merged into a
# single string per movie before vectorization.
selected_features = [
    'genres',
    'keywords',
    'tagline',
    'cast',
    'director',
]
print(selected_features)

['genres', 'keywords', 'tagline', 'cast', 'director']


In [5]:
# Replace missing values in the selected text columns with empty strings so
# that string concatenation in the next step never meets a NaN.
movies_df[selected_features] = movies_df[selected_features].fillna('')


In [6]:
# Build one text "document" per movie by joining the selected feature columns
# with single spaces, in the order given by `selected_features`.
# The columns are taken from `selected_features` instead of being spelled out
# a second time, so the cleaned columns and the combined columns cannot drift
# apart when the feature list is edited.
combined_features = movies_df[selected_features[0]]
for column_name in selected_features[1:]:
    combined_features = combined_features + ' ' + movies_df[column_name]

In [7]:
# Inspect the combined text (one string per movie) that will be vectorized.
print(combined_features)

0       Action Adventure Fantasy Science Fiction cultu...
1       Adventure Fantasy Action ocean drug abuse exot...
2       Action Adventure Crime spy based on novel secr...
3       Action Crime Drama Thriller dc comics crime fi...
4       Action Adventure Science Fiction based on nove...
                              ...                        
4798    Action Crime Thriller united states\u2013mexic...
4799    Comedy Romance  A newlywed couple's honeymoon ...
4800    Comedy Drama Romance TV Movie date love at fir...
4801      A New Yorker in Shanghai Daniel Henney Eliza...
4802    Documentary obsession camcorder crush dream gi...
Length: 4803, dtype: object


In [8]:
# Convert the combined text of each movie into a numeric TF-IDF feature vector.
# The vectorizer is kept in its own variable so it stays available after fitting.

vectorizer = TfidfVectorizer()

# Fit on all movie texts and transform them in one step; the result has one
# row per entry of `combined_features`.
feature_vectors = vectorizer.fit_transform(combined_features)

In [9]:
# Inspect the TF-IDF matrix; each printed entry is (row, feature column) followed by its weight.
print(feature_vectors)

  (0, 2432)	0.17272411194153
  (0, 7755)	0.1128035714854756
  (0, 13024)	0.1942362060108871
  (0, 10229)	0.16058685400095302
  (0, 8756)	0.22709015857011816
  (0, 14608)	0.15150672398763912
  (0, 16668)	0.19843263965100372
  (0, 14064)	0.20596090415084142
  (0, 13319)	0.2177470539412484
  (0, 17290)	0.20197912553916567
  (0, 17007)	0.23643326319898797
  (0, 13349)	0.15021264094167086
  (0, 11503)	0.27211310056983656
  (0, 11192)	0.09049319826481456
  (0, 16998)	0.1282126322850579
  (0, 15261)	0.07095833561276566
  (0, 4945)	0.24025852494110758
  (0, 14271)	0.21392179219912877
  (0, 3225)	0.24960162956997736
  (0, 16587)	0.12549432354918996
  (0, 14378)	0.33962752210959823
  (0, 5836)	0.1646750903586285
  (0, 3065)	0.22208377802661425
  (0, 3678)	0.21392179219912877
  (0, 5437)	0.1036413987316636
  :	:
  (4801, 17266)	0.2886098184932947
  (4801, 4835)	0.24713765026963996
  (4801, 403)	0.17727585190343226
  (4801, 6935)	0.2886098184932947
  (4801, 11663)	0.21557500762727902
  (4801, 1672

In [10]:
# Pairwise cosine similarity between all movie feature vectors.
# similarity[i][j] is the score between movie i and movie j (rows follow the
# row order of `feature_vectors`).

similarity = cosine_similarity(feature_vectors)



In [11]:
# Inspect the similarity matrix (NumPy abbreviates the middle rows and columns).
print(similarity)

[[1.         0.07219487 0.037733   ... 0.         0.         0.        ]
 [0.07219487 1.         0.03281499 ... 0.03575545 0.         0.        ]
 [0.037733   0.03281499 1.         ... 0.         0.05389661 0.        ]
 ...
 [0.         0.03575545 0.         ... 1.         0.         0.02651502]
 [0.         0.         0.05389661 ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.02651502 0.         1.        ]]


In [12]:
#`similarity` holds the pairwise scores: row i contains the similarity of movie i with every movie in the dataset

#for example, similarity[0][2] is the similarity of movie 0 with movie 2

In [13]:
# Ask the user for a movie title; the text does not have to match a dataset title exactly.
movie_name = input('enter your favorite movie\n')

enter your favorite movie
iron man


In [14]:
# Candidate titles (in dataset row order) that the user's input is matched against.
list_of_titles = movies_df['title'].tolist()

In [15]:
# Fuzzy-match the user's input against all known titles.
# n=3 and cutoff=0.6 are difflib's defaults, written out so the limits are visible:
# at most three candidates are returned, best match first.
close_matches = difflib.get_close_matches(
    movie_name,
    list_of_titles,
    n=3,
    cutoff=0.6,
)
print(close_matches)

['Iron Man', 'Iron Man 3', 'Iron Man 2']


In [16]:
# Take the best (first) fuzzy match as the movie the user meant.
# difflib returns an empty list when nothing is similar enough, so fail with
# an explicit message instead of an opaque "list index out of range".
if not close_matches:
    raise ValueError(
        'no movie title resembles {!r}; try a different spelling'.format(movie_name)
    )

close_match = close_matches[0]

print(close_match)


Iron Man


In [17]:
# Find the row position of the matched movie.
# The rows of `similarity` follow the row order of the dataframe, so the
# position must be positional. `list_of_titles` is in that same row order, so
# its index is used here instead of the CSV's own 'index' column, which is
# only correct while that column happens to equal the row number.
# For duplicate titles the first occurrence is used.
index_of_movie = list_of_titles.index(close_match)

print(index_of_movie)

68


In [18]:
# Pair every movie position with its similarity to the chosen movie:
# a list of (position, score) tuples in dataset order.
similarity_score = [
    (position, score)
    for position, score in enumerate(similarity[index_of_movie])
]

In [19]:
#print(similarity_score)

In [20]:
# Rank the (position, score) pairs from most to least similar.
# A copy is sorted so `similarity_score` keeps its original dataset order.
sorted_list = list(similarity_score)
sorted_list.sort(key=lambda pair: pair[1], reverse=True)



In [21]:
#(sorted_list)

In [22]:
# Collect the titles of the top-ranked movies for the user.
#
# NUM_RECOMMENDATIONS is 22 because the previous hand-written loop checked
# `i > 20` only after appending, which produced 22 titles; the count is kept
# so the results are unchanged. The first entry is the queried movie itself
# (it has similarity 1.0 with itself).
NUM_RECOMMENDATIONS = 22

# Positions in `sorted_list` are row positions, so titles are looked up
# positionally with .iloc rather than by scanning the CSV's 'index' column.
# A dedicated loop variable is used so the user's `movie_name` is not
# overwritten by the last recommended title.
recommended_list = [
    movies_df['title'].iloc[movie_index]
    for movie_index, _score in sorted_list[:NUM_RECOMMENDATIONS]
]

    


In [23]:
# Show the recommendations under a short header, one title per line.
print('movies suggested for  you are\n')

if recommended_list:
    print(*recommended_list, sep='\n')

movies suggested for  you are

Iron Man
Iron Man 2
Iron Man 3
Avengers: Age of Ultron
The Avengers
Captain America: Civil War
Captain America: The Winter Soldier
Ant-Man
X-Men
Made
X-Men: Apocalypse
X2
The Incredible Hulk
The Helix... Loaded
X-Men: First Class
X-Men: Days of Future Past
Captain America: The First Avenger
Kick-Ass 2
Guardians of the Galaxy
Deadpool
Thor: The Dark World
G-Force


In [25]:
def recommend_movies(query, movies, similarity_matrix, top_n=22):
    """Return the titles most similar to the movie the user asked for.

    Parameters
    ----------
    query : str
        Title typed by the user; it is fuzzy-matched against the known titles.
    movies : pandas.DataFrame
        Movie table with a 'title' column, in the same row order as
        `similarity_matrix`.
    similarity_matrix : 2-D array
        similarity_matrix[i][j] is the similarity between movie i and movie j.
    top_n : int, default 22
        Number of titles to return. The default matches the count produced by
        the earlier step-by-step cells.

    Returns
    -------
    list of str
        Titles ordered from most to least similar; the matched movie itself
        normally comes first because it is maximally similar to itself.

    Raises
    ------
    ValueError
        If no title is close enough to `query`.
    """
    titles = movies['title'].to_list()

    matches = difflib.get_close_matches(query, titles)
    if not matches:
        raise ValueError(
            'no movie title resembles {!r}; try a different spelling'.format(query)
        )

    # Row position of the best match; rows of the similarity matrix follow the
    # row order of `movies`, so a positional lookup is what is needed here.
    row = titles.index(matches[0])

    ranked = sorted(
        enumerate(similarity_matrix[row]),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [titles[position] for position, _score in ranked[:top_n]]


movie_name = input('enter your favorite movie\n')

# Compute first so that a failed lookup does not print a dangling header.
recommendations = recommend_movies(movie_name, movies_df, similarity)

print('movies suggested for you are\n')
for title in recommendations:
    print(title)

    


enter your favorite movie
Avatar
movies suggested for you are

Avatar
Alien
Aliens
Guardians of the Galaxy
Star Trek Beyond
Star Trek Into Darkness
Galaxy Quest
Alien³
Cargo
Trekkies
Gravity
Moonraker
Jason X
Pocahontas
Space Cowboys
The Helix... Loaded
Lockout
Event Horizon
Space Dogs
Machete Kills
Gettysburg
Clash of the Titans
