In [3]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
movies_data=pd.read_csv("C:\\Users\\Nithiyanand\\Downloads\\movies.csv")
movies_data.head

<bound method NDFrame.head of       index     budget                                    genres  \
0         0  237000000  Action Adventure Fantasy Science Fiction   
1         1  300000000                  Adventure Fantasy Action   
2         2  245000000                    Action Adventure Crime   
3         3  250000000               Action Crime Drama Thriller   
4         4  260000000          Action Adventure Science Fiction   
...     ...        ...                                       ...   
4798   4798     220000                     Action Crime Thriller   
4799   4799       9000                            Comedy Romance   
4800   4800          0             Comedy Drama Romance TV Movie   
4801   4801          0                                       NaN   
4802   4802          0                               Documentary   

                                               homepage      id  \
0                           http://www.avatarmovie.com/   19995   
1          http://d

In [7]:
movies_data.shape

(4803, 24)

In [8]:
selected_features = ['genres','keywords','tagline','cast','director']
print(selected_features)

['genres', 'keywords', 'tagline', 'cast', 'director']


In [9]:
for feature in selected_features:
  movies_data[feature] = movies_data[feature].fillna('')

In [11]:
# combining all the 5 selected features

combined_features = movies_data['genres']+' '+movies_data['keywords']+' '+movies_data['tagline']+' '+movies_data['cast']+' '+movies_data['director']
combined_features

0       Action Adventure Fantasy Science Fiction cultu...
1       Adventure Fantasy Action ocean drug abuse exot...
2       Action Adventure Crime spy based on novel secr...
3       Action Crime Drama Thriller dc comics crime fi...
4       Action Adventure Science Fiction based on nove...
                              ...                        
4798    Action Crime Thriller united states\u2013mexic...
4799    Comedy Romance  A newlywed couple's honeymoon ...
4800    Comedy Drama Romance TV Movie date love at fir...
4801      A New Yorker in Shanghai Daniel Henney Eliza...
4802    Documentary obsession camcorder crush dream gi...
Length: 4803, dtype: object

In [14]:
# converting the text data to feature vectors

vectorizer = TfidfVectorizer()
feature_vectors = vectorizer.fit_transform(combined_features)


In [15]:
print(feature_vectors)

  (0, 2432)	0.17272411194153
  (0, 7755)	0.1128035714854756
  (0, 13024)	0.1942362060108871
  (0, 10229)	0.16058685400095302
  (0, 8756)	0.22709015857011816
  (0, 14608)	0.15150672398763912
  (0, 16668)	0.19843263965100372
  (0, 14064)	0.20596090415084142
  (0, 13319)	0.2177470539412484
  (0, 17290)	0.20197912553916567
  (0, 17007)	0.23643326319898797
  (0, 13349)	0.15021264094167086
  (0, 11503)	0.27211310056983656
  (0, 11192)	0.09049319826481456
  (0, 16998)	0.1282126322850579
  (0, 15261)	0.07095833561276566
  (0, 4945)	0.24025852494110758
  (0, 14271)	0.21392179219912877
  (0, 3225)	0.24960162956997736
  (0, 16587)	0.12549432354918996
  (0, 14378)	0.33962752210959823
  (0, 5836)	0.1646750903586285
  (0, 3065)	0.22208377802661425
  (0, 3678)	0.21392179219912877
  (0, 5437)	0.1036413987316636
  :	:
  (4801, 17266)	0.2886098184932947
  (4801, 4835)	0.24713765026963996
  (4801, 403)	0.17727585190343226
  (4801, 6935)	0.2886098184932947
  (4801, 11663)	0.21557500762727902
  (4801, 1672

In [16]:
# getting the similarity scores using cosine similarity

similarity = cosine_similarity(feature_vectors)
similarity

array([[1.        , 0.07219487, 0.037733  , ..., 0.        , 0.        ,
        0.        ],
       [0.07219487, 1.        , 0.03281499, ..., 0.03575545, 0.        ,
        0.        ],
       [0.037733  , 0.03281499, 1.        , ..., 0.        , 0.05389661,
        0.        ],
       ...,
       [0.        , 0.03575545, 0.        , ..., 1.        , 0.        ,
        0.02651502],
       [0.        , 0.        , 0.05389661, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.02651502, 0.        ,
        1.        ]])

In [17]:
similarity.shape

(4803, 4803)

In [34]:
# getting the movie name from the user

movie_name = input(' Enter your favourite movie name : ')

 Enter your favourite movie name : the orphan


In [36]:
# creating a list with all the movie names given in the dataset

list_of_all_titles = movies_data['title'].tolist()


In [37]:
# finding the close match for the movie name given by the user

find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)
print(find_close_match)

['The Orphanage', 'The Company', 'The Forsaken']


In [38]:
close_match = find_close_match[0]
print(close_match)

The Orphanage


In [39]:
# finding the index of the movie with title

index_of_the_movie = movies_data[movies_data.title == close_match]['index'].values[0]
print(index_of_the_movie)

3761


In [40]:
# getting a list of similar movies

similarity_score = list(enumerate(similarity[index_of_the_movie]))
print(similarity_score)

[(0, 0.014263838708104846), (1, 0.034870276946509994), (2, 0.0), (3, 0.008689828513323188), (4, 0.0), (5, 0.02661102951103471), (6, 0.03292580156996512), (7, 0.0), (8, 0.014732579291529161), (9, 0.0), (10, 0.0), (11, 0.014607817948157072), (12, 0.0), (13, 0.0), (14, 0.0), (15, 0.0), (16, 0.0), (17, 0.0), (18, 0.0), (19, 0.012369586143507446), (20, 0.030615284615131702), (21, 0.0), (22, 0.012882570968508752), (23, 0.0), (24, 0.01606338947293382), (25, 0.007835814567363663), (26, 0.0), (27, 0.0045724142395646055), (28, 0.005019971245809017), (29, 0.005251637402341122), (30, 0.040265707098339744), (31, 0.0), (32, 0.0), (33, 0.005034943568942893), (34, 0.0), (35, 0.0), (36, 0.0), (37, 0.0), (38, 0.0), (39, 0.0), (40, 0.0), (41, 0.004835237357818837), (42, 0.0), (43, 0.0051021413602687525), (44, 0.0), (45, 0.024294359675821768), (46, 0.03158468113615194), (47, 0.0), (48, 0.03333169705594271), (49, 0.014248461668448877), (50, 0.0), (51, 0.0), (52, 0.0), (53, 0.0), (54, 0.0), (55, 0.0), (56, 

In [41]:
len(similarity_score)

4803

In [42]:
# sorting the movies based on their similarity score

sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True) 
print(sorted_similar_movies)

[(3761, 0.9999999999999999), (2725, 0.13271065322077266), (2627, 0.13259158826907022), (1255, 0.12694711441500764), (2561, 0.09948239167004043), (1799, 0.09690622954538182), (4556, 0.09566120684186956), (4032, 0.09234318343769238), (3794, 0.09056521606476839), (2973, 0.089227434586344), (2684, 0.08918297274575741), (1556, 0.08901389101430447), (4309, 0.08300011152052825), (1654, 0.08169516293202736), (1804, 0.07709097828962821), (493, 0.0770222712044967), (3496, 0.0762629721454739), (81, 0.07524923209635588), (1563, 0.07495520320718564), (2910, 0.0748476886792971), (2914, 0.07478827605952817), (632, 0.07229078193646594), (4416, 0.07212139559851251), (646, 0.07199768760744991), (2224, 0.06953007793988358), (1668, 0.06872801888566653), (1410, 0.06840673904280352), (2064, 0.0680464075875237), (3529, 0.06772538021926958), (4234, 0.06686080730649094), (4415, 0.06630910569673434), (2485, 0.06600434390253228), (1522, 0.06423034764747415), (3343, 0.06416841100965136), (881, 0.0641672070811196)

In [43]:
# print the name of similar movies based on the index

print('Movies suggested for you : \n')

i = 1

for movie in sorted_similar_movies:
  index = movie[0]
  title_from_index = movies_data[movies_data.index==index]['title'].values[0]
  if (i<30):
    print(i, '.',title_from_index)
    i+=1

Movies suggested for you : 

1 . The Orphanage
2 . The Sea Inside
3 . Mulholland Drive
4 . The Impossible
5 . 28 Weeks Later
6 . Original Sin
7 . El Rey de Najayo
8 . Sleep Dealer
9 . Mondays in the Sun
10 . For Greater Glory - The True Story of Cristiada
11 . Vampire in Brooklyn
12 . Mystic River
13 . The Blade of Don Juan
14 . Godsend
15 . Snow White: A Tale of Terror
16 . A Beautiful Mind
17 . The Flower of Evil
18 . Maleficent
19 . The Prince of Tides
20 . A Tale of Three Cities
21 . Doctor Zhivago
22 . Dreamcatcher
23 . Hidden Away
24 . The Kid
25 . Sweet Charity
26 . Miss Potter
27 . Mirrors
28 . Underdogs
29 . Shine
