In [1]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer #to covert text into num
from sklearn.metrics.pairwise import cosine_similarity #for giving list of movies,comparing the datas

### data collection and preprocessing

In [2]:
movies_data=pd.read_csv("movie_dataset.csv")

In [3]:
movies_data.head()#displays first 5 rows

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [4]:
movies_data.shape#rows and columns

(4803, 24)

In [5]:
selected_features=['genres','keywords','tagline','cast','director']
selected_features

['genres', 'keywords', 'tagline', 'cast', 'director']

In [6]:
#replacing null values with the help of null string
for feature in selected_features:
    movies_data[feature]=movies_data[feature].fillna('')

In [7]:
#combining all the 5 selected features

combined_features=movies_data['genres']+''+movies_data['keywords']+''+movies_data['tagline']+''+movies_data['cast']+''+movies_data['director']


In [8]:
combined_features

0       Action Adventure Fantasy Science Fictioncultur...
1       Adventure Fantasy Actionocean drug abuse exoti...
2       Action Adventure Crimespy based on novel secre...
3       Action Crime Drama Thrillerdc comics crime fig...
4       Action Adventure Science Fictionbased on novel...
                              ...                        
4798    Action Crime Thrillerunited states\u2013mexico...
4799    Comedy RomanceA newlywed couple's honeymoon is...
4800    Comedy Drama Romance TV Moviedate love at firs...
4801    A New Yorker in ShanghaiDaniel Henney Eliza Co...
4802    Documentaryobsession camcorder crush dream gir...
Length: 4803, dtype: object

In [9]:
vectorizer=TfidfVectorizer()

In [10]:
feature_vectors=vectorizer.fit_transform(combined_features)

In [11]:
feature_vectors

<4803x27580 sparse matrix of type '<class 'numpy.float64'>'
	with 111820 stored elements in Compressed Sparse Row format>

### Cosine similarity

In [12]:
#getting the similarity scores using cosine similarity

similarity=cosine_similarity(feature_vectors)

In [13]:
similarity

array([[1.        , 0.06865296, 0.01492221, ..., 0.        , 0.        ,
        0.        ],
       [0.06865296, 1.        , 0.02799128, ..., 0.01243107, 0.        ,
        0.        ],
       [0.01492221, 0.02799128, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.01243107, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [14]:
similarity.shape

(4803, 4803)

In [15]:
#getting the movie name from the user

movie_name=input('enter your favourite movie name:')

enter your favourite movie name:Twillight


In [16]:
# creating a list with all the movie names in the dataset

list_of_all_titles=movies_data['title'].tolist()
list_of_all_titles

['Avatar',
 "Pirates of the Caribbean: At World's End",
 'Spectre',
 'The Dark Knight Rises',
 'John Carter',
 'Spider-Man 3',
 'Tangled',
 'Avengers: Age of Ultron',
 'Harry Potter and the Half-Blood Prince',
 'Batman v Superman: Dawn of Justice',
 'Superman Returns',
 'Quantum of Solace',
 "Pirates of the Caribbean: Dead Man's Chest",
 'The Lone Ranger',
 'Man of Steel',
 'The Chronicles of Narnia: Prince Caspian',
 'The Avengers',
 'Pirates of the Caribbean: On Stranger Tides',
 'Men in Black 3',
 'The Hobbit: The Battle of the Five Armies',
 'The Amazing Spider-Man',
 'Robin Hood',
 'The Hobbit: The Desolation of Smaug',
 'The Golden Compass',
 'King Kong',
 'Titanic',
 'Captain America: Civil War',
 'Battleship',
 'Jurassic World',
 'Skyfall',
 'Spider-Man 2',
 'Iron Man 3',
 'Alice in Wonderland',
 'X-Men: The Last Stand',
 'Monsters University',
 'Transformers: Revenge of the Fallen',
 'Transformers: Age of Extinction',
 'Oz: The Great and Powerful',
 'The Amazing Spider-Man 2',

In [17]:
# finding the close match for the movie name given by the user

find_close_match=difflib.get_close_matches(movie_name,list_of_all_titles)
find_close_match


['Twilight', 'Flight']

In [18]:
close_match=find_close_match[0]
close_match

'Twilight'

In [19]:
# find the index of the movie with title

index_of_the_movie=movies_data[movies_data.title == close_match]['index'].values[0]
index_of_the_movie

1337

In [20]:
# getting the list of similar movies

similarity_score=list(enumerate(similarity[index_of_the_movie]))
similarity_score

[(0, 0.021709189424478567),
 (1, 0.037792734168230424),
 (2, 0.008352257906998912),
 (3, 0.004824833909037518),
 (4, 0.032201605369644955),
 (5, 0.027765112928565),
 (6, 0.008340473105189521),
 (7, 0.023529160212755685),
 (8, 0.022899768514248076),
 (9, 0.008176976347444184),
 (10, 0.02508178032028724),
 (11, 0.060338655748514256),
 (12, 0.02214646968423631),
 (13, 0.007849236810890687),
 (14, 0.03993866788898445),
 (15, 0.007847706612697247),
 (16, 0.014640870948396528),
 (17, 0.043652515417416965),
 (18, 0.0),
 (19, 0.007106182599039277),
 (20, 0.00783806490267079),
 (21, 0.0),
 (22, 0.007753166380355805),
 (23, 0.006990416024909137),
 (24, 0.011743677047201289),
 (25, 0.03054813777519388),
 (26, 0.02502593437611437),
 (27, 0.020443937535716345),
 (28, 0.007888070381958559),
 (29, 0.00858379688451077),
 (30, 0.021533487617818),
 (31, 0.023252665060210558),
 (32, 0.025123447726793244),
 (33, 0.03920739658498082),
 (34, 0.025908382920797685),
 (35, 0.0),
 (36, 0.0),
 (37, 0.04847506162

In [21]:
len(similarity_score)

4803

In [22]:
#sorting the movie based on there similarity scores

sorted_similar_movies=sorted(similarity_score,key=lambda x:x[1],reverse=True)
sorted_similar_movies


[(1337, 1.0000000000000002),
 (898, 0.34261706114668267),
 (612, 0.31969719861053586),
 (172, 0.2843110995375613),
 (2577, 0.18972678694453665),
 (994, 0.17376918089642357),
 (2191, 0.17180187106160127),
 (1132, 0.15758253042701076),
 (825, 0.14239332146355738),
 (3538, 0.13200316297668868),
 (2333, 0.1313821924447543),
 (1632, 0.11884440856606464),
 (992, 0.11482004834418291),
 (600, 0.11453792018169318),
 (960, 0.11401868725767446),
 (912, 0.11239241354035194),
 (3143, 0.10910876208520645),
 (4200, 0.10836960866639372),
 (1086, 0.1080241418260009),
 (3505, 0.10798396114935566),
 (80, 0.10659315922568637),
 (2460, 0.1057441562754669),
 (1714, 0.10484425255574742),
 (2171, 0.10386427037721006),
 (3847, 0.1037571138794828),
 (2389, 0.1036558438478879),
 (4203, 0.09929535999974173),
 (796, 0.09926140218963456),
 (4643, 0.09741486806610805),
 (2878, 0.0971781586339095),
 (4434, 0.09657836311459231),
 (1110, 0.09449223062014918),
 (624, 0.09439915365204013),
 (2475, 0.09025024938894127),
 

In [23]:
#print the name of the similar movies based on the index
print('Movies suggested for you : \n')

i = 1

for movie in sorted_similar_movies:
  index = movie[0]
  title_from_index = movies_data[movies_data.index==index]['title'].values[0]
  if (i<30):
    print(i, '.',title_from_index)
    i+=1

Movies suggested for you : 

1 . Twilight
2 . The Twilight Saga: New Moon
3 . The Twilight Saga: Eclipse
4 . The Twilight Saga: Breaking Dawn - Part 2
5 . Tuck Everlasting
6 . Gamer
7 . Capitalism: A Love Story
8 . Red Riding Hood
9 . Flightplan
10 . Do the Right Thing
11 . Peggy Sue Got Married
12 . The Next Three Days
13 . Domino
14 . Killer Elite
15 . The Adventures of Sharkboy and Lavagirl
16 . Interview with the Vampire
17 . You Only Live Twice
18 . You Can't Take It With You
19 . Aliens in the Attic
20 . Men of War
21 . Snow White and the Huntsman
22 . The Unborn
23 . Serendipity
24 . My Best Friend's Girl
25 . Winter Passing
26 . Renaissance
27 . Grace Unplugged
28 . The Ridiculous 6
29 . Like Crazy


In [24]:
movi_name = 'Twilight'

list_of_all_titles = movies_data['title'].tolist()

find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)

close_match = find_close_match[0]

index_of_the_movie = movies_data[movies_data.title == close_match]['index'].values[0]

similarity_score = list(enumerate(similarity[index_of_the_movie]))

sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True) 

print('Movies suggested for you : \n')

i = 1

for movie in sorted_similar_movies:
      index = movie[0]
      title_from_index = movies_data[movies_data.index==index]['title'].values[0]
      if (i<30):
        print(i, '.',title_from_index)
        i+=1

Movies suggested for you : 

1 . Twilight
2 . The Twilight Saga: New Moon
3 . The Twilight Saga: Eclipse
4 . The Twilight Saga: Breaking Dawn - Part 2
5 . Tuck Everlasting
6 . Gamer
7 . Capitalism: A Love Story
8 . Red Riding Hood
9 . Flightplan
10 . Do the Right Thing
11 . Peggy Sue Got Married
12 . The Next Three Days
13 . Domino
14 . Killer Elite
15 . The Adventures of Sharkboy and Lavagirl
16 . Interview with the Vampire
17 . You Only Live Twice
18 . You Can't Take It With You
19 . Aliens in the Attic
20 . Men of War
21 . Snow White and the Huntsman
22 . The Unborn
23 . Serendipity
24 . My Best Friend's Girl
25 . Winter Passing
26 . Renaissance
27 . Grace Unplugged
28 . The Ridiculous 6
29 . Like Crazy
